Mercurial > hg > GearsTemplate

Binary file doc/semaphoreSequence.graffle has changed
--- a/src/parallel_execution/CUDAExecutor.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/CUDAExecutor.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -1,10 +1,13 @@
 #include "../context.h"
 #interface "Executor.h"
+#interface "Timer.h"
 #include <stdio.h>
+#include <math.h>

 Executor* createCUDAExecutor(struct Context* context, CUdevice device) {
     struct Executor* executor = new Executor();
     struct CUDAExecutor* cudaExecutor = new CUDAExecutor();
+    cudaExecutor->timer = createTimerImpl(context);
     checkCudaErrors(cuDeviceGetAttribute(&cudaExecutor->maxThreadPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device));
     executor->executor = (union Data*)cudaExecutor;
     executor->read  = C_readCUDAExecutor;
@@ -28,21 +31,47 @@
     }
     // TODO: Implements pipeline
     // goto next(...);
-    goto meta(context, C_execCUDAExecutor);
+    struct Timer* timer = executor->timer;
+    goto timer->start(execCUDAExecutor);
 }

 int computeblockDim(int count, int maxThreadPerBlock) {
     return count < maxThreadPerBlock ? count : maxThreadPerBlock;
 }

+void calcBlockMaxThread(struct MultiDimIterator* iterator, struct CUDAExecutor* executor) { // fills executor->maxThreadPerBlock{X,Y,Z} from the iterator extents
+    executor->maxThreadPerBlockX = 1; // defaults; each branch below assigns all three fields again
+    executor->maxThreadPerBlockY = 1;
+    executor->maxThreadPerBlockZ = 1;
+    // Invariant kept by every branch: maxThreadPerBlockX * maxThreadPerBlockY * maxThreadPerBlockZ <= maxThreadPerBlock
+    if (iterator->x > 1 && iterator->y == 1 && iterator->z == 1) { // only x is larger than 1: x gets the whole budget
+        executor->maxThreadPerBlockX = executor->maxThreadPerBlock;
+        executor->maxThreadPerBlockY = 1;
+        executor->maxThreadPerBlockZ = 1;
+    } else if (iterator->x > 1 && iterator->y > 1 && iterator->z == 1) { // x and y larger than 1, z == 1: split evenly between x and y
+        int ln_2 = log2(executor->maxThreadPerBlock); // double result truncated to int, i.e. floor(log2) for a positive budget
+        int maxThread = 1 << (ln_2/2); // largest power of two whose square does not exceed the budget
+        executor->maxThreadPerBlockX = maxThread;
+        executor->maxThreadPerBlockY = maxThread;
+        executor->maxThreadPerBlockZ = 1;
+    } else { // every remaining combination of extents: spread the budget over all three axes
+        int ln_2 = log2(executor->maxThreadPerBlock); // floor(log2) of the budget, as above
+        int maxThread = 1 << (ln_2/3); // equal power-of-two share per axis
+        executor->maxThreadPerBlockX = maxThread * (1 << (ln_2%3)); // x also takes the leftover factor 2^(ln_2 % 3)
+        executor->maxThreadPerBlockY = maxThread;
+        executor->maxThreadPerBlockZ = maxThread;
+    }
+}
+
 __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
-    // Asynchronous launch kernel
     task->num_exec = 1;
     if (task->iterate) {
         struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
-        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock);
-        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock);
-        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock);
+        calcBlockMaxThread(iterator, executor);
+        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlockX);
+        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlockY);
+        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlockZ);
+        // launch kernel
         checkCudaErrors(cuLaunchKernel(task->function,
                     iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ,
                     blockDimX, blockDimY, blockDimZ,
@@ -55,12 +84,18 @@
     }
     // TODO: Implements pipeline
     // goto next(...);
-    goto meta(context, C_writeCUDAExecutor);
+    goto writeCUDAExecutor();
 }

 __code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
+    // Block until outstanding work in the current CUDA context has completed, so timer->end runs after the launched kernel is done
+    checkCudaErrors(cuCtxSynchronize());
+    struct Timer* timer = executor->timer;
+    goto timer->end(writeCUDAExecutor1); // writeCUDAExecutor1 is handed to timer->end as the continuation
+}
+
+__code writeCUDAExecutor1(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
     //結果を取ってくるコマンドを入力する
-    //コマンドの終了待ちを行う
     struct CUDABuffer* buffer = executor->buffer;
     int paramLen = buffer->inputLen + buffer->outputLen;
     for (int i = 0; i < paramLen; i++) {
@@ -69,7 +104,5 @@
         checkCudaErrors(cuMemcpyDtoH(data, deviceptr, GET_SIZE(data)));
         cuMemFree(deviceptr);
     }
-    // wait for stream
-    checkCudaErrors(cuCtxSynchronize());
     goto next(...);
 }
--- a/src/parallel_execution/Executor.h	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/Executor.h	Mon Mar 19 21:02:50 2018 +0900
@@ -1,6 +1,7 @@
-typedef struct Executor<Type, Impl>{
-    Type* Executor;
+typedef struct Executor<Impl>{
+    union Data* Executor;
     struct Context* task;
+    __code next(...);
     __code read(Impl* executor, struct Context* task, __code next(...));
     __code exec(Impl* executor, struct Context* task, __code next(...));
     __code write(Impl* executor, struct Context* task, __code next(...));
--- a/src/parallel_execution/MultiDimIterator.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/MultiDimIterator.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -57,7 +57,7 @@
 __code execMultiDimIterator(struct MultiDimIterator* iterator, struct Context* task, int numGPU, __code next(...)) {
     // No GPU device
     if (numGPU == 0) {
-        goto meta(context, C_execMultiDimIterator1);
+        goto execMultiDimIterator1();
     }
     task->iterate = 1;
     task->gpu = 1;
@@ -85,7 +85,7 @@
             }
         }
     }
-    goto meta(context, C_execMultiDimIterator1);
+    goto execMultiDimIterator1();
 }

 __code barrierMultiDimIterator(struct MultiDimIterator* iterator, struct Context* task, __code next(...), __code whenWait(...)) {
--- a/src/parallel_execution/RedBlackTreeReWright.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/RedBlackTreeReWright.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -211,7 +211,7 @@
     tree->current->right->color = Black;
     goto stack->pop(insertBalance);

-  } else if(tree->current->color == Black && tree->current->left->color == Red && tree->current->left->right->color == Red) {
+  } else if (tree->current->color == Black && tree->current->left->color == Red && tree->current->left->right->color == Red) {
     struct Node* tmpCurrent   = tree->current;
     struct Node* tmpLeft      = tree->current->left;
     struct Node* tmpLeftRight = tree->current->left->right;
--- a/src/parallel_execution/SemaphoreImpl.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/SemaphoreImpl.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -19,7 +19,7 @@
 }

 __code pOperationSemaphoreImpl1(struct SemaphoreImpl* semaphore, __code next(...)) {
-    if(semaphore->value == 0) {
+    if (semaphore->value == 0) {
         pthread_cond_wait(&semaphore->cond, &semaphore->mutex);
         goto meta(context, C_pOperationSemaphoreImpl1);
     }
--- a/src/parallel_execution/SynchronizedQueue.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/SynchronizedQueue.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -11,7 +11,7 @@
 Queue* createSynchronizedQueue(struct Context* context) {
     struct Queue* queue = new Queue();
     struct SynchronizedQueue* synchronizedQueue = new SynchronizedQueue();
-    synchronizedQueue->top = new Element();
+    synchronizedQueue->top = new Element(); // allocate a free node
     synchronizedQueue->top->next = NULL;
     synchronizedQueue->last = synchronizedQueue->top;
     synchronizedQueue->atomic = createAtomicReference(context);
--- a/src/parallel_execution/TaskManagerImpl.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/TaskManagerImpl.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -99,7 +99,7 @@

 __code setWaitTaskTaskManagerImpl(struct TaskManagerImpl* taskManager, struct Context* task, __code next(...)) {
     int i = taskManager->loopCounter;
-    if(task->idg+i < task->maxIdg) {
+    if (task->idg+i < task->maxIdg) {
         struct Queue* queue = GET_WAIT_LIST(task->data[task->idg + i]);
         taskManager->loopCounter++;
         goto queue->put(task, setWaitTaskTaskManagerImpl);
@@ -122,7 +122,7 @@
     task->taskManager = taskManager;
     if (task->idgCount == 0) {
         // iterator task is normal task until spawned
-        if(task->iterator != NULL && task->iterate == 0) {
+        if (task->iterator != NULL && task->iterate == 0) {
             pthread_mutex_unlock(&taskManagerImpl->mutex);
             struct Iterator* iterator = task->iterator;
             goto iterator->exec(task, taskManagerImpl->cpu - taskManagerImpl->gpu, next(...));
@@ -148,18 +148,29 @@
 __code taskSend(struct TaskManagerImpl* taskManager, struct Context* task, __code next(...)) {
     // set workerId
     if (task->gpu) {
-        task->workerId = taskManager->sendGPUWorkerIndex;
-        if(++taskManager->sendGPUWorkerIndex >= taskManager->cpu) {
-            taskManager->sendGPUWorkerIndex = taskManager->gpu;
-        }
+        goto taskSend1(); // task->gpu set: taskSend1 selects the worker through sendGPUWorkerIndex
     } else {
-        task->workerId = taskManager->sendCPUWorkerIndex;
-        if(++taskManager->sendCPUWorkerIndex >= taskManager->maxCPU) {
-            taskManager->sendCPUWorkerIndex = taskManager->cpu;
-        }
+        goto taskSend2(); // otherwise: taskSend2 selects the worker through sendCPUWorkerIndex
+    }
+}
+
+__code taskSend1(struct TaskManagerImpl* taskManager, struct Context* task, __code next(...)) { // GPU branch of taskSend; NOTE(review): taskManager->mutex appears to be held on entry (lock site not visible here) - confirm
+    int workerId = taskManager->sendGPUWorkerIndex; // worker chosen for this task, read before the cursor advances
+    if (++taskManager->sendGPUWorkerIndex >= taskManager->cpu) { // advance the cursor; once it reaches taskManager->cpu ...
+        taskManager->sendGPUWorkerIndex = taskManager->gpu; // ... wrap it back to taskManager->gpu
     }
     pthread_mutex_unlock(&taskManager->mutex);
-    struct Queue* queue = taskManager->workers[task->workerId]->tasks;
+    struct Queue* queue = taskManager->workers[workerId]->tasks; // uses the local copy taken before the unlock
+    goto queue->put(task, next(...)); // enqueue the task on that worker's queue, then continue with next
+}
+
+__code taskSend2(struct TaskManagerImpl* taskManager, struct Context* task, __code next(...)) { // non-GPU branch of taskSend; NOTE(review): taskManager->mutex appears to be held on entry (lock site not visible here) - confirm
+    int workerId = taskManager->sendCPUWorkerIndex; // worker chosen for this task, read before the cursor advances
+    if (++taskManager->sendCPUWorkerIndex >= taskManager->maxCPU) { // advance the cursor; once it reaches taskManager->maxCPU ...
+        taskManager->sendCPUWorkerIndex = taskManager->cpu; // ... wrap it back to taskManager->cpu
+    }
+    pthread_mutex_unlock(&taskManager->mutex); // released before the task is put on the worker queue
+    struct Queue* queue = taskManager->workers[workerId]->tasks; // uses the local copy taken before the unlock
     goto queue->put(task, next(...));
 }
--- a/src/parallel_execution/context.h	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/context.h	Mon Mar 19 21:02:50 2018 +0900
@@ -96,16 +96,18 @@
     struct TaskManager* taskManager;
     int codeNum;
     __code (**code) (struct Context*);
+    union Data **data;
     void* heapStart;
     void* heap;
     long heapLimit;
     int dataNum;
+
+    // task parameter
     int idgCount; //number of waiting dataGear
     int idg;
     int maxIdg;
     int odg;
     int maxOdg;
-    int workerId;
     int gpu; // GPU task
     struct Context* task;
     struct Element* taskList;
@@ -114,8 +116,6 @@
     CUmodule module;
     CUfunction function;
 #endif
-    union Data **data;
-
     /* multi dimension parameter */
     int iterate;
     struct Iterator* iterator;
@@ -378,6 +378,10 @@
         CUdeviceptr** kernelParams;
         struct CUDABuffer* buffer;
         int maxThreadPerBlock;
+        int maxThreadPerBlockX;
+        int maxThreadPerBlockY;
+        int maxThreadPerBlockZ;
+        struct Timer* timer;
     } CUDAExecutor;
     struct CUDABuffer {
         int inputLen;
--- a/src/parallel_execution/examples/bitonicSort/bitonicSort.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/bitonicSort/bitonicSort.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -14,11 +14,6 @@
 int CPU_ANY = -1;
 int CPU_CUDA = -1;

-void *start_taskManager(struct Context *context) {
-    goto initDataGears(context, Gearef(context, LoopCounter), Gearef(context, TaskManager));
-    return 0;
-}
-
 __code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
     // loopCounter->tree = createRedBlackTree(context);
     loopCounter->i = 0;
@@ -26,13 +21,6 @@
     goto code1();
 }

-__code initDataGears_stub(struct Context* context) {
-    struct TaskManager* taskManager =  Gearef(context, TaskManager);
-    taskManager->taskManager = 0;
-    struct LoopCounter* loopCounter = Gearef(context, LoopCounter);
-    goto initDataGears(context, loopCounter, taskManager);
-}
-
 __code code1(struct LoopCounter* loopCounter) {
     printf("cpus:\t\t%d\n", cpu_num);
     printf("gpus:\t\t%d\n", gpu_num);
--- a/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -6,7 +6,6 @@
 #endif

 __code bitonicSwap(struct SortArray* inputArray, struct MultiDim* multiDim, __code next(struct SortArray* output, ...), struct LoopCounter* loopCounter) {
-    struct SortArray* output = *O_output;
     int block = inputArray->block;
     int first = inputArray->first;
     if (loopCounter->i < inputArray->prefix) {
@@ -21,12 +20,11 @@
             array[index2] = tmp;
         }
         loopCounter->i++;
-        goto meta(context, C_bitonicSwap);
+        goto bitonicSwap();
     }
     loopCounter->i = 0;
     output->array = inputArray->array;
-    *O_output = output;
-    goto meta(context, next);
+    goto next(output, ...);
 }

 __code bitonicSwap_stub(struct Context* context) {
--- a/src/parallel_execution/examples/bitonicSort/makeArray.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/bitonicSort/makeArray.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -4,8 +4,6 @@

 extern int length;
 __code makeArray(__code next(struct SortArray* output, struct Timer* output1, ...)){
-    struct SortArray* output = *O_output;
-    struct Timer* output1 = *O_output1;
     if (output->loopCounter == 0){
         output->array = (Integer*)ALLOCATE_ARRAY(context, Integer, length);
         srand((unsigned) time(NULL));
@@ -13,23 +11,10 @@
     if (output->loopCounter == GET_LEN(output->array)){
         printf("created Array\n");
         output->loopCounter = 0;
-        *O_output = output;
-        *O_output1 = output1;
-        goto output1->start(next(...));
+        goto output1->start(next(output1, ...));
     }
     output->array[output->loopCounter].value = rand() % 1000;
     //printf("%d\n", output->array[output->loopCounter]->value);
     output->loopCounter++;
-    *O_output = output;
-    *O_output1 = output1;
-    goto meta(context, C_makeArray);
+    goto makeArray();
 }
-
-__code makeArray_stub(struct Context* context) {
-    SortArray** O_output = (struct SortArray**)&context->data[context->odg];
-    Timer** O_output1 = (struct Timer**)&context->data[context->odg+1];
-    goto makeArray(context,
-                   context->next,
-                   O_output,
-                   O_output1);
-}
--- a/src/parallel_execution/examples/bitonicSort/printArray.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/bitonicSort/printArray.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -6,30 +6,17 @@
     goto inputTimer->end(printArray1);
 }

-__code printArray_stub(struct Context* context) {
-    goto printArray(context,
-                   &context->data[context->idg]->SortArray,
-                   &context->data[context->idg+1]->Timer,
-                   context->next);
-}
-
 __code printArray1(struct SortArray* inputArray, __code next(...)){
     //printf("%d\n", inputArray->array[inputArray->loopCounter].value);
     inputArray->loopCounter++;
     if (inputArray->loopCounter == GET_LEN(inputArray->array)){
         printf("sort completed\n");
         inputArray->loopCounter = 0;
-        goto meta(context, next);
+        goto next(...); // counter reached the array length: checking is finished, continue with next
     }
     if (inputArray->array[inputArray->loopCounter-1].value > inputArray->array[inputArray->loopCounter].value) {
         printf("wrong result\n");
-        goto meta(context, next);
+        goto next(...); // previous element is greater than the current one: stop checking and continue with next
     }
-    goto meta(context, C_printArray1);
+    goto printArray1(); // re-enter this code gear to compare the next adjacent pair
 }
-
-__code printArray1_stub(struct Context* context) {
-    goto printArray1(context,
-                   &context->data[context->idg]->SortArray,
-                   context->next);
-}
--- a/src/parallel_execution/examples/bitonicSort/sort.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/bitonicSort/sort.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -36,7 +36,7 @@
 __code print(struct SortArray* sortArray){//配列表示
     if (sortArray->sortArray->loop_counter == MAX){//ループの終了→ソートへ
         printf("\n");
-        if(sortArray->sortArray->sort_finish == 1){//ソート終わってたら終了
+        if (sortArray->sortArray->sort_finish == 1){//ソート終わってたら終了
             goto meta(context, C_exit_code);
         }
         sortArray->sortArray->loop_counter = 0;
--- a/src/parallel_execution/examples/boundedBuffer/BoundedBuffer.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/boundedBuffer/BoundedBuffer.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -46,6 +46,7 @@
 __code putBoundedBuffer4(struct BoundedBuffer* buffer, union Data* data, __code next(...)) {
     goto next(...);
 }
+
 __code takeBoundedBuffer(struct BoundedBuffer* buffer, __code next(union Data* data, ...)) {
     struct Semaphore* semaphore = buffer->fullCount;
     goto semaphore->p(takeBoundedBuffer1);
--- a/src/parallel_execution/examples/boundedBuffer/SemaphoreImpl.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/boundedBuffer/SemaphoreImpl.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -22,7 +22,7 @@
 }

 __code pOperationSemaphoreImpl1(struct SemaphoreImpl* semaphore, __code next(...)) {
-    if(semaphore->value == 0) {
+    if (semaphore->value == 0) {
         context->next= C_pOperationSemaphoreImpl;
         struct Queue* queue = semaphore->waitThreadQueue;
         goto queue->put(context, pOperationSemaphoreImpl2); // put this context(thread, process)
--- a/src/parallel_execution/examples/boundedBuffer/consumer.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/boundedBuffer/consumer.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -2,24 +2,16 @@
 #include <stdio.h>
 #interface "Buffer.h"

-__code consumer(struct Buffer* buffer, Int length, __code next(...), struct LoopCounter* loopCounter) {
+__code consumer(struct Buffer* buffer, Int* length, __code next(...), struct LoopCounter* loopCounter) { // length is received by pointer and dereferenced below
     int i = loopCounter->i;
-    if (i < length) {
+    if (i < *length) { // loopCounter->i is still below *length: bump it and take one more item
         loopCounter->i++;
         goto buffer->take(consumer1);
     }
     goto next(...);
 }

-__code consumer_stub(struct Context* context) {
-    goto consumer(context,
-                  &context->data[context->idg]->Buffer,
-                  context->data[context->idg+1]->Int,
-                  context->next,
-                  Gearef(context, LoopCounter));
-}
-
-__code consumer1(struct Buffer* buffer, Int length, __code next(...), struct Node* node) {
+__code consumer1(struct Buffer* buffer, Int* length, __code next(...), struct Node* node) { // continuation given to buffer->take by consumer: prints node->value->Int, then jumps back to consumer
     printf("getData %d\n", node->value->Int);
     goto consumer();
 }
@@ -27,7 +19,7 @@
 __code consumer1_stub(struct Context* context) {
     goto consumer1(context,
                    &context->data[context->idg]->Buffer,
-                   context->data[context->idg+1]->Int,
+                   &context->data[context->idg+1]->Int, // address of the Int input, matching consumer1's Int* length parameter
                    context->next,
                    &Gearef(context, Buffer)->data->Node);
 }
--- a/src/parallel_execution/examples/boundedBuffer/initBuffer.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/boundedBuffer/initBuffer.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -1,16 +1,5 @@
 #include "../../../context.h"

 __code initBuffer(__code next(struct Buffer* output, Int* output1, ...)) {
-    struct Buffer* output = *O_output;
-    Int* output1 = *O_output1;
     goto next(output, output1, ...);
 }
-
-__code initBuffer_stub(struct Context* context) {
-    struct Buffer** O_output = (struct Buffer**)&context->data[context->odg];
-    Int** O_output1 = (Int**)&context->data[context->odg+1];
-    goto initBuffer(context,
-                  context->next,
-                  O_output,
-                  O_output1);
-}
--- a/src/parallel_execution/examples/boundedBuffer/main.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/boundedBuffer/main.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -14,11 +14,6 @@
 int CPU_ANY = -1;
 int CPU_CUDA = -1;

-void *start_taskManager(struct Context *context) {
-    goto initDataGears(context, Gearef(context, LoopCounter), Gearef(context, TaskManager));
-    return 0;
-}
-
 __code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
     // loopCounter->tree = createRedBlackTree(context);
     loopCounter->i = 0;
@@ -26,13 +21,6 @@
     goto code1();
 }

-__code initDataGears_stub(struct Context* context) {
-    struct TaskManager* taskManager =  Gearef(context, TaskManager);
-    taskManager->taskManager = 0;
-    struct LoopCounter* loopCounter = Gearef(context, LoopCounter);
-    goto initDataGears(context, loopCounter, taskManager);
-}
-
 __code code1(struct Timer* timer) {
     printf("cpus:\t\t%d\n", cpu_num);
     printf("gpus:\t\t%d\n", gpu_num);
@@ -54,6 +42,8 @@
     *len = length;
     par goto producer(buffer, len, __exit);
     par goto producer(buffer, len, __exit);
+    par goto producer(buffer, len, __exit);
+    par goto consumer(buffer, len, __exit);
     par goto consumer(buffer, len, __exit);
     par goto consumer(buffer, len, __exit);
     par goto initBuffer(buffer, len, __exit);
--- a/src/parallel_execution/examples/boundedBuffer/producer.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/boundedBuffer/producer.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -1,9 +1,9 @@
 #include "../../../context.h"
 #interface "Buffer.h"

-__code producer(struct Buffer* buffer, Int length, __code next(...), struct LoopCounter* loopCounter) {
+__code producer(struct Buffer* buffer, Int* length, __code next(...), struct LoopCounter* loopCounter) {
     int i = loopCounter->i;
-    if (i < length) {
+    if (i < *length) {
         Node* node = new Node();
         node->value = (union Data*)new Int();
         node->value->Int = i;
@@ -12,11 +12,3 @@
     }
     goto next(...);
 }
-
-__code producer_stub(struct Context* context) {
-    goto producer(context,
-                  &context->data[context->idg]->Buffer,
-                  context->data[context->idg+1]->Int,
-                  context->next,
-                  Gearef(context, LoopCounter));
-}
--- a/src/parallel_execution/examples/calc/add.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/calc/add.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -1,18 +1,7 @@
 #include "../../../context.h"
 #include <stdio.h>
 __code add(struct Integer* input1, struct Integer* input2, __code next(struct Integer* output, ...)) {
-    struct Integer* output = *O_output;
     output->value = input1->value + input2->value;
     printf("%d + %d = %d\n", input1->value, input2->value, output->value);
-    *O_output = output;
-    goto meta(context, next);
+    goto next(output, ...); // continue with next, passing output (now holding the sum) as its first argument
 }
-
-__code add_stub(struct Context* context) {
-    Integer** O_output = (struct Integer **)&context->data[context->odg];
-    goto add(context,
-            &context->data[context->idg]->Integer,
-            &context->data[context->idg + 1]->Integer,
-            context->next,
-            O_output);
-}
--- a/src/parallel_execution/examples/calc/calc.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/calc/calc.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -13,11 +13,6 @@
 int CPU_ANY = -1;
 int CPU_CUDA = -1;

-void *start_taskManager(struct Context *context) {
-    goto initDataGears(context, Gearef(context, LoopCounter), Gearef(context, TaskManager));
-    return 0;
-}
-
 __code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
     // loopCounter->tree = createRedBlackTree(context);
     loopCounter->i = 0;
@@ -25,13 +20,6 @@
     goto meta(context, C_code1);
 }

-__code initDataGears_stub(struct Context* context) {
-    struct TaskManager* taskManager =  Gearef(context, TaskManager);
-    taskManager->taskManager = 0;
-    struct LoopCounter* loopCounter = Gearef(context, LoopCounter);
-    goto initDataGears(context, loopCounter, taskManager);
-}
-
 __code code1(struct Timer* timer) {
     printf("cpus:\t\t%d\n", cpu_num);
     printf("gpus:\t\t%d\n", gpu_num);
@@ -65,7 +53,7 @@
     goto meta(context, taskManager->taskManager->TaskManager.shutdown);
 }

-__code createTask2(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
+__code createTask2(struct LoopCounter* loopCounter) {
     Integer* integer1 = new Integer();
     Integer* integer2 = new Integer();
     Integer* integer3 = new Integer();
--- a/src/parallel_execution/examples/calc/initIntegerDataGears.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/calc/initIntegerDataGears.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -1,25 +1,8 @@
 #include "../../../context.h"
 #include <stdio.h>
 __code initIntegerDataGears(__code next(struct Integer* output1, struct Integer* output2, struct Integer* output3, ...)) {
-    struct Integer* output1 = *O_output1;
-    struct Integer* output2 = *O_output2;
-    struct Integer* output3 = *O_output3;
     output1->value = 1;
     output2->value = 2;
     output3->value = 3;
-    *O_output1 = output1;
-    *O_output2 = output2;
-    *O_output3 = output3;
-    goto meta(context, next);
+    goto next(output1, output2, output3, ...); // continue with next, passing the three Integers set to 1, 2 and 3 above
 }
-
-__code initIntegerDataGears_stub(struct Context* context) {
-    Integer** O_output1 = (struct Integer **)&context->data[context->odg];
-    Integer** O_output2 = (struct Integer **)&context->data[context->odg+1];
-    Integer** O_output3 = (struct Integer **)&context->data[context->odg+2];
-    goto initIntegerDataGears(context,
-            context->next,
-            O_output1,
-            O_output2,
-            O_output3);
-}
--- a/src/parallel_execution/examples/calc/mult.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/calc/mult.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -1,18 +1,7 @@
 #include "../../../context.h"
 #include <stdio.h>
 __code mult(struct Integer* input1, struct Integer* input2, __code next(struct Integer* output, ...)) {
-    struct Integer* output = *O_output;
     output->value = input1->value * input2->value;
     printf("%d * %d = %d\n", input1->value, input2->value, output->value);
-    *O_output = output;
-    goto meta(context, next);
+    goto next(output, ...); // continue with next, passing output (now holding the product) as its first argument
 }
-
-__code mult_stub(struct Context* context) {
-    Integer** O_output = (struct Integer **)&context->data[context->odg];
-    goto mult(context,
-            &context->data[context->idg]->Integer,
-            &context->data[context->idg + 1]->Integer,
-            context->next,
-            O_output);
-}
--- a/src/parallel_execution/examples/twice/createArray.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/twice/createArray.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -6,8 +6,6 @@
 extern int split;

 __code createArray(__code next(struct Array* output, struct Timer* output1, ...), struct LoopCounter* loopCounter) {
-    struct Array* output = *O_output;
-    struct Timer* output1 = *O_output1;
     int i = loopCounter->i;
     if (i == 0){
         output->array = (Int*)ALLOCATE_ARRAY(context, Int, length);
@@ -16,23 +14,9 @@
     if (i == GET_LEN(output->array)){
         printf("created Array\n");
         loopCounter->i = 0;
-        *O_output = output;
-        *O_output1 = output1;
-        goto output1->start(next(...));
+        goto output1->start(next(output, output1, ...));
     }
     output->array[i] = i;
     loopCounter->i++;
-    *O_output = output;
-    *O_output1 = output1;
-    goto meta(context, C_createArray);
+    goto createArray();
 }
-
-__code createArray_stub(struct Context* context) {
-    Array** O_output = (struct Array **)&context->data[context->odg];
-    Timer** O_output1 = (struct Timer**)&context->data[context->odg+1];
-    goto createArray(context,
-            context->next,
-            O_output,
-            O_output1,
-            Gearef(context, LoopCounter));
-}
--- a/src/parallel_execution/examples/twice/main.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/twice/main.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -14,11 +14,6 @@
 int CPU_ANY = -1;
 int CPU_CUDA = -1;

-void *start_taskManager(struct Context *context) {
-    goto initDataGears(context, Gearef(context, LoopCounter), Gearef(context, TaskManager));
-    return 0;
-}
-
 __code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
     // loopCounter->tree = createRedBlackTree(context);
     loopCounter->i = 0;
@@ -26,13 +21,6 @@
     goto code1();
 }

-__code initDataGears_stub(struct Context* context) {
-    struct TaskManager* taskManager =  Gearef(context, TaskManager);
-    taskManager->taskManager = 0;
-    struct LoopCounter* loopCounter = Gearef(context, LoopCounter);
-    goto initDataGears(context, loopCounter, taskManager);
-}
-
 __code code1(struct LoopCounter* loopCounter) {
     printf("cpus:\t\t%d\n", cpu_num);
     printf("gpus:\t\t%d\n", gpu_num);
--- a/src/parallel_execution/examples/twice/printArray.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/twice/printArray.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -6,31 +6,17 @@
     goto inputTimer->end(printArray1);
 }

-__code printArray_stub(struct Context* context) {
-    goto printArray(context,
-                   &context->data[context->idg]->Array,
-                   &context->data[context->idg+1]->Timer,
-                   context->next);
-}
-
 __code printArray1(struct Array* array, __code next(...), struct LoopCounter* loopCounter){
     int i = loopCounter->i;
     //printf("%d\n", array->array[i]);
-    if(i < GET_LEN(array->array)) {
+    if (i < GET_LEN(array->array)) { // index i is still inside the array: verify element i
         if (array->array[i] == i*2) {
             loopCounter->i++;
-            goto meta(context, C_printArray1);
+            goto printArray1(); // element equals i*2: counter was advanced, re-enter to check the next index
         } else {
             printf("wrong result\n");
         }
     }
    loopCounter->i = 0;
-    goto meta(context, next);
+    goto next(...); // reached after the last index or after a mismatch; the counter was reset on the line above
 }
-
-__code printArray1_stub(struct Context* context) {
-    goto printArray1(context,
-            &context->data[context->idg]->Array,
-            context->next,
-            Gearef(context, LoopCounter));
-}
--- a/src/parallel_execution/examples/twice/twice.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/examples/twice/twice.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -7,7 +7,6 @@
 #endif

 __code twice(struct Array* array, struct MultiDim* multiDim, __code next(struct Array* output, ...), struct LoopCounter* loopCounter) {
-    struct Array* output = *O_output;
     int i = loopCounter->i;
     int index = multiDim->x;
     if (i < array->prefix) {
@@ -19,7 +18,7 @@

     loopCounter->i = 0;
     output->array = array->array;
-    goto meta(context, context->next);
+    goto next(output, ...);
 }

 __code twice_stub(struct Context* context) {
--- a/src/parallel_execution/generate_stub.pl	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/generate_stub.pl	Mon Mar 19 21:02:50 2018 +0900
@@ -118,6 +118,14 @@
                     &getDataGear("$interfaceHeader");
                     &getCodeGear("$interfaceHeader");
                 }
+            } elsif (/^\_\_code (\w+)\((.*)\)(.*)/) {
+                my $codeGearName = $1;
+                if ($filename =~ /^(.*)\/(.*)/) {
+                    $codeGearName = "$1/$codeGearName";
+                }
+                if ( -f "$codeGearName.cbc") {
+                    &getCodeGear("$codeGearName.cbc");
+                }
             }
             next;
         }
@@ -185,13 +193,22 @@
                 if ($args =~ s/(^\s*,\s*)//) {
                 }
                 if ($args =~ s/^(\s)*\_\_code\s+(\w+)\((.*?)\)//) {
+                    $codeGear{$codeGearName}->{"code"}->{$2} = "\_\_code";
                     $inputIncFlag = 0;
-                    $outputCount = split(/,/,$3);
-                    $outputCount--;
-                } elsif ($args =~ s/^(struct|union)?\s*(\w+)(\*)?+\s(\w+)//) {
-                    if($inputIncFlag) {
-                        $inputCount++;
+                    my @outputs = split(/,/,$3);
+                    for my $output (@outputs) {
+                        if ($output =~ /\s*(struct|union)?\s*(\w+)(\*)?+\s(\w+)/) {
+                            my $type = $2;
+                            my $varName = $4;
+                            $codeGear{$codeGearName}->{"var"}->{$varName} = "$type $outputCount";
+                            $outputCount++;
+                        }
                     }
+                } elsif ($args =~ s/^(struct|union)?\s*(\w+)(\*)?+\s(\w+)// && $inputIncFlag) {
+                    my $type = $2;
+                    my $varName = $4;
+                    $codeGear{$codeGearName}->{"var"}->{$varName} = "$type $inputCount";
+                    $inputCount++;
                 } elsif ($args =~ s/(.*,)//) {
                 } else {
                     last;
@@ -224,6 +241,7 @@
         # get implementation
         $dataGearName{$codeGearName} .= "\t$typeName* $varName = ($typeName*)GearImpl(context, $interface, $varName);\n";
     } else {
+        # interface var
         for my $ivar (keys %{$var{$interface}}) {
             #  input data gear field
             if ($varName eq $ivar) {
@@ -233,12 +251,13 @@
                         $outputVar{$codeGearName} .= "\t$typeName$ptrType $varName = *O_$varName;\n";
                         return 1;
                     }
-
                     $dataGearName{$codeGearName} .= "\t$typeName$ptrType $varName = Gearef(context, $interface)->$varName;\n";
                     return 1;
                 }
             }
         }
+
+        # interface continuation
         for my $cName (keys %{$code{$interface}}) {
             if ($varName eq $cName) {
                 # continuation field
@@ -246,6 +265,34 @@
                 return 1;
             }
         }
+
+        # par goto var: data gear recorded from the code gear's own argument list
+        for my $var (keys %{$codeGear{$codeGearName}->{"var"}}) {
+            #  input data gear field
+            if ($varName eq $var) {
+                my ($type, $count) = split(/\s/, $codeGear{$codeGearName}->{"var"}->{$var});
+                if ($typeName eq $type) {
+                    if ($output) {
+                        $dataGearName{$codeGearName} .= "\t$typeName$ptrType* O_$varName = ($typeName $ptrType*)&context->data[context->odg + $count];\n";
+                        $outputVar{$codeGearName} .= "\t$typeName$ptrType $varName = *O_$varName;\n";
+                        return 1;
+                    }
+                    $dataGearName{$codeGearName} .= "\t$typeName$ptrType $varName = &context->data[context->idg + $count]->$typeName;\n";
+                    return 1;
+                }
+            }
+        }
+
+        # par goto continuation
+        for my $cName (keys %{$codeGear{$codeGearName}->{"code"}}) {
+            if ($varName eq $cName) {
+                # continuation field
+                $dataGearName{$codeGearName} .= "\tenum Code $varName = context->next;\n";
+                return 1;
+            }
+        }
+
+        # reached only when none of the lookups above returned
         # global or local variable case
         if ($typeName eq "Code") {
             $dataGearName{$codeGearName} .= "\tenum $typeName$ptrType $varName = Gearef(context, $interface)->$varName;\n";
@@ -333,6 +380,9 @@
                 if ($args=~/^struct Context\s*\*\s*context/) {
                     $newArgs = "";
                 }
+                if (!$args){
+                    $newArgs = "struct Context *context";
+                }
                 while($args) {
                     if ($args =~ s/(^\s*,\s*)//) {
                         $newArgs .= $1;
@@ -396,13 +446,15 @@
                 my $prev = $1;
                 my $next = $2;
                 my $method = $3;
-                my @args = split(/,/,$4);
+                my $tmpArgs = $4;
+                $tmpArgs =~ s/\(.*\)/\(\)/;
+                my @args = split(/,/,$tmpArgs);
                 my @types = @{$dataGearVarType{$codeGearName}};
                 my $ntype;
                 my $ftype;
                 for my $v (@{$dataGearVar{$codeGearName}}) {
                     my $t = shift @types;
-                    if ($v eq $next) {
+                    if ($v eq $next || $v eq "O_$next") {
                         $ntype = $t;
                         $ftype = lcfirst($ntype);
                     }
@@ -505,7 +557,7 @@
                 # convert it to the meta call form with two arguments, that is context and enum Code
                 my $prev = $1;
                 my $next = $2;
-                my @args = split(/, /,$3);
+                my @args = split(/,/, $3);
                 my $v = 0;
                 for my $n ( @{$dataGearVar{$codeGearName}} ) {
                     # continuation arguments
--- a/src/parallel_execution/test/multiDimIterator_test.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/test/multiDimIterator_test.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -2,7 +2,7 @@
 #include <string.h>
 #include <stdlib.h>
 #include <unistd.h>
-#interface "Iterator.h"
+#interface "TaskManager.h"

 #include "../../context.h"

@@ -16,14 +16,7 @@
     // loopCounter->tree = createRedBlackTree(context);
     loopCounter->i = 0;
     taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, gpu_num, 0);
-    goto meta(context, C_code1);
-}
-
-__code initDataGears_stub(struct Context* context) {
-    struct TaskManager* taskManager =  Gearef(context, TaskManager);
-    taskManager->taskManager = 0;
-    struct LoopCounter* loopCounter = Gearef(context, LoopCounter);
-    goto initDataGears(context, loopCounter, taskManager);
+    goto code1();
 }

 __code code1(struct LoopCounter* loopCounter) {
@@ -44,18 +37,26 @@

     if (i < length) {
         loopCounter->i++;
-        goto meta(context, C_createTask2);
+        goto createTask2();
     }

     loopCounter->i = 0;
     taskManager->next = C_exit_code;
-    goto meta(context, taskManager->taskManager->TaskManager.shutdown);
+    goto code2();
+}
+
+__code code2(struct TaskManager* taskManager) {
+    goto taskManager->shutdown(exit_code);
+}
+
+__code code2_stub(struct Context* context) {
+    goto code2(context, &Gearef(context, TaskManager)->taskManager->TaskManager);
 }

 __code createTask2(struct TaskManager* taskManager) {
-    par goto printIterator(iterate(2), exit);
-    par goto printIterator(iterate(2, 2), exit);
-    par goto printIterator(iterate(2, 2, 2), exit);
+    par goto printIterator(iterate(2), __exit);
+    par goto printIterator(iterate(2, 2), __exit);
+    par goto printIterator(iterate(2, 2, 2), __exit);
     goto createTask1();
 }

@@ -73,9 +74,5 @@
 }

 int main(int argc, char** argv) {
-    init(argc, argv);
-    struct Context* main_context = NEW(struct Context);
-    initContext(main_context);
-    main_context->next = C_initDataGears;
-    goto start_code(main_context);
+    goto initDataGears();
 }
--- a/src/parallel_execution/test/printIterator.cbc	Fri Jan 05 09:41:27 2018 +0900
+++ b/src/parallel_execution/test/printIterator.cbc	Mon Mar 19 21:02:50 2018 +0900
@@ -2,11 +2,5 @@
 #include <stdio.h>
 __code printIterator(struct MultiDim* multiDim, __code next(...)) {
     printf("x: %d, y: %d, z: %d\n", multiDim->x, multiDim->y, multiDim->z);
-    goto meta(context, next);
+    goto next(...);
 }
-
-__code printIterator_stub(struct Context* context) {
-    goto printIterator(context,
-                       &context->data[context->idg]->MultiDim,
-                       context->next);
-}