changeset 331:69cdda536487

Merge
author Tatsuki IHA <e125716@ie.u-ryukyu.ac.jp>
date Tue, 18 Apr 2017 18:59:36 +0900
parents d9887056ae5b (current diff) a258505bf9fd (diff)
children e46d9910640b
files src/parallel_execution/CUDAtwice.cbc src/parallel_execution/CUDAtwice.cu
diffstat 12 files changed, 574 insertions(+), 113 deletions(-) [+]
line wrap: on
line diff
--- a/Dockerfile	Fri Apr 14 19:52:26 2017 +0900
+++ b/Dockerfile	Tue Apr 18 18:59:36 2017 +0900
@@ -12,7 +12,7 @@
 RUN /root/CbC_llvm/configure --enable-assertions
 RUN make -j 2
 RUN make install
-
+ENV CBC_COMPILER /usr/local/bin/clang
 WORKDIR /root
 RUN git clone https://github.com/choller/llcov
 
--- a/src/parallel_execution/CMakeLists.txt	Fri Apr 14 19:52:26 2017 +0900
+++ b/src/parallel_execution/CMakeLists.txt	Tue Apr 18 18:59:36 2017 +0900
@@ -7,6 +7,7 @@
 
 set(CMAKE_C_COMPILER $ENV{CBC_COMPILER})
 add_definitions("-Wall -g")
+set(CMAKE_C_FLAGS "-lpthread")
 
 
 if (${USE_CUDA})
@@ -59,7 +60,14 @@
   TARGET
       twice
   SOURCES 
-      main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc
+      examples/twice.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc
+)
+
+GearsCommand(
+  TARGET
+      calc
+  SOURCES 
+      examples/calc.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc
 )
 
 if (${USE_CUDA})
@@ -67,7 +75,7 @@
       TARGET
           CUDAtwice
       SOURCES 
-          main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc  CUDAWorker.cbc CUDAtwice.cbc CUDAtwice.cu cuda.c
+          main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc  CUDAWorker.cbc examples/CUDAtwice.cbc examples/CUDAtwice.cu cuda.c
     )
     set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1 -DUSE_CUDA_MAIN_THREAD")
 endif()
--- a/src/parallel_execution/CPUWorker.cbc	Fri Apr 14 19:52:26 2017 +0900
+++ b/src/parallel_execution/CPUWorker.cbc	Tue Apr 18 18:59:36 2017 +0900
@@ -1,5 +1,3 @@
-#include <libkern/OSAtomic.h>
-
 #include "../context.h"
 
 static void start_worker(Worker* worker);
@@ -49,12 +47,10 @@
     goto getTask(context, worker, task);
 }
 
-__code odgCommit(struct LoopCounter* loopCounter, struct Queue* queue, struct Context* task) {
+__code odgCommit(struct LoopCounter* loopCounter, struct Context* task) {
     int i = loopCounter->i ;
     if(task->odg + i < task->maxOdg) {
-        queue->queue = (union Data*)GET_WAIT_LIST(task->data[task->odg+i]);
-        queue->next = C_odgCommit1;
-        goto meta(context, queue->queue->Queue.take);
+        goto meta(task, C_odgCommit1);
     }
     loopCounter->i = 0;
     goto meta(context, C_taskReceiveWorker);
@@ -63,30 +59,64 @@
 __code odgCommit_stub(struct Context* context) {
     struct Context* workerContext = context->worker->worker->CPUWorker.context;
     goto odgCommit(workerContext,
-                   Gearef(workerContext, LoopCounter),
-                   Gearef(workerContext, Queue),
+                   Gearef(context, LoopCounter),
                    context);
 }
 
-__code odgCommit1(struct TaskManager* taskManager, struct Context* task) {
+__code odgCommit1(struct LoopCounter* loopCounter, struct Queue* queue) {
+    int i = loopCounter->i ;
+    queue->queue = (union Data*)GET_WAIT_LIST(context->data[context->odg+i]);
+    queue->whenEmpty = C_odgCommit4;
+    queue->next = C_odgCommit2;
+    goto meta(context, queue->queue->Queue.isEmpty);
+}
+
+__code odgCommit1_stub(struct Context* context) {
+    goto odgCommit1(context,
+                   Gearef(context, LoopCounter),
+                   Gearef(context, Queue));
+}
+
+__code odgCommit2(struct Queue* queue) {
+    queue->next = C_odgCommit3;
+    goto meta(context, queue->queue->Queue.take);
+}
+
+__code odgCommit2_stub(struct Context* context) {
+    goto odgCommit2(context,
+                   Gearef(context, Queue));
+}
+
+__code odgCommit3(struct TaskManager* taskManager, struct Context* task) {
     if(__sync_fetch_and_sub(&task->idgCount, 1)) {
         if(task->idgCount == 0) {
             taskManager->taskManager = (union Data*)task->taskManager;
             taskManager->context = task;
-            taskManager->next = C_odgCommit;
+            taskManager->next = C_odgCommit1;
             goto meta(context, task->taskManager->spawn);
+        } else {
+            goto meta(context, C_odgCommit1);
         }
     } else {
-        goto meta(context, C_odgCommit1);
+        goto meta(context, C_odgCommit3);
     }
 }
 
-__code odgCommit1_stub(struct Context* context) {
+__code odgCommit3_stub(struct Context* context) {
     struct Context* task = &Gearef(context, Queue)->data->Context;
-    goto odgCommit1(context,
+    goto odgCommit3(context,
                     Gearef(context, TaskManager),
                     task);
-                 
+}
+
+__code odgCommit4(struct LoopCounter* loopCounter) {
+    loopCounter->i++;
+    goto meta(context, C_odgCommit);
+}
+
+__code odgCommit4_stub(struct Context* context) {
+    goto odgCommit4(context,
+                    Gearef(context, LoopCounter));
 }
 
 __code shutdownWorker(struct CPUWorker* worker) {
--- a/src/parallel_execution/CUDAtwice.cbc	Fri Apr 14 19:52:26 2017 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,28 +0,0 @@
-#include <stdio.h>
-#include "../context.h"
-
-
-extern void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter);
-
-__code CUDAtwice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) {
-    int i = loopCounter->i;
-    if (i < prefix) {
-        array[i+index*prefix] = array[i+index*prefix]*2;
-        loopCounter->i++;
-
-        goto meta(context, C_twice);
-    }
-
-    loopCounter->i = 0;
-    goto meta(workerContext, workerContext->next);
-}
-
-__code CUDAtwice_stub(struct Context* context) {
-printf("CUdAtwice stub\n");
-    struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter;
-    struct Array* array = &context->data[context->dataNum+1]->Array;
-    CUDAExec(context,array,loopCounter);
-
-    //continuationにそってGPUworkerに戻る
-    goto meta(context, context->next);
-}
--- a/src/parallel_execution/CUDAtwice.cu	Fri Apr 14 19:52:26 2017 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-extern "C" {
-
-#include <stdio.h>
-
-//    __global__ void twice(struct LoopCounter* loopCounter, int prefix ,int* array) {
-//         int index = blockIdx.x * blockDim.x + threadIdx.x;
-//         printf("array %p, blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n");
-//         int i = 0;
-//         while (i < prefix) {
-//              array[i+index*prefix] = array[i+index*prefix]*2;
-//         }
-//    }
-
-    struct LoopCounter {
-        int i;
-    } LoopCounter;
-
-    __global__ void twice(struct LoopCounter* loopCounter, int index, int prefix, int* array) {
-         printf("array %p, index = %d, prefix = %d loopCounter->i %d\n",array,index,prefix,loopCounter->i);
-C_twice:
-        int i = loopCounter->i;
-        if (i < prefix) {
-            array[i+index*prefix] = array[i+index*prefix]*2;
-            loopCounter->i++;
-
-            goto C_twice;
-        }
-
-        loopCounter->i = 0;
-    }
-
-
-}
--- a/src/parallel_execution/TaskManager.cbc	Fri Apr 14 19:52:26 2017 +0900
+++ b/src/parallel_execution/TaskManager.cbc	Tue Apr 18 18:59:36 2017 +0900
@@ -2,6 +2,7 @@
     union Data* taskManager;
     __code createTask(struct TaskManager* taskManager);
     __code spawn(Impl* taskManager, struct Queue* queue, struct Context* task, __code next(...));
+    __code setWaitTask(struct Queue* queue, struct Context* task, __code next(...));
     __code shutdown(struct LoopCounter* loopCounter, struct TaskManager* taskManager, Impl* taskManagerImpl, struct Queue* queue, __code next(...));
     __code next(...);
     __code task(...);
--- a/src/parallel_execution/TaskManagerImpl.cbc	Fri Apr 14 19:52:26 2017 +0900
+++ b/src/parallel_execution/TaskManagerImpl.cbc	Tue Apr 18 18:59:36 2017 +0900
@@ -6,7 +6,7 @@
 
 TaskManager* createTaskManagerImpl(struct Context* context, int numCPU, int numGPU, int numIO) {
     struct TaskManager* taskManager = new TaskManager();
-    // 0...numIO-1 IOProcessor 
+    // 0...numIO-1 IOProcessor
     // numIO...numIO+numGPU-1 GPUProcessor
     // numIO+numGPU...numIO+numGPU+numCPU-1 CPUProcessor
     taskManager->io = 0;
@@ -15,6 +15,7 @@
     taskManager->maxCPU = numIO+numGPU+numCPU;
     taskManager->createTask = C_createTask;
     taskManager->spawn = C_spawnTaskManager;
+    taskManager->setWaitTask = C_setWaitTask;
     taskManager->shutdown  = C_shutdownTaskManager;
     struct TaskManagerImpl* taskManagerImpl = new TaskManagerImpl();
     taskManagerImpl -> activeQueue = createSingleLinkedQueue(context);
@@ -43,7 +44,7 @@
 #else
         Queue* queue = createSynchronizedQueue(context);
         taskManagerImpl->workers[i] = (Worker*)createCPUWorker(context, i, queue);
-#endif        
+#endif
     }
     for (;i<taskManager->maxCPU;i++) {
         Queue* queue = createSynchronizedQueue(context);
@@ -54,7 +55,8 @@
 __code createTask(struct TaskManager* taskManager) {
     taskManager->context = NEW(struct Context);
     initContext(taskManager->context);
-    taskManager->context->taskManager = taskManager;
+    taskManager->context->taskManager = (struct TaskManager*)taskManager->taskManager;
+    taskManager->context->idg = taskManager->context->dataNum;
     goto meta(context, C_setWorker);
 }
 
@@ -71,17 +73,24 @@
     goto setWorker(context, taskManager, Gearef(context, TaskManager)->context, Gearef(context, TaskManager)->next);
 }
 
+__code setWaitTask(struct Queue* queue, struct Context* task, Data* data, __code next(...)) {
+    queue->queue = (Data *)GET_WAIT_LIST(data);
+    queue->next = next;
+    queue->data = (Data *)task;
+    goto meta(context, queue->queue->Queue.put);
+}
+
+__code setWaitTask_stub(struct Context* context) {
+    goto setWaitTask(context, Gearef(context, Queue), Gearef(context, TaskManager)->context, Gearef(context, TaskManager)->data, Gearef(context, TaskManager)->next);
+}
+
 __code spawnTaskManager(struct TaskManagerImpl* taskManager, struct Queue* queue, struct Context* task, __code next(...)) {
     if (task->idgCount == 0) {
-        // enqueue activeQueue
-        queue->queue = (union Data*)taskManager->activeQueue;
+        goto meta(context, C_taskSend);
     } else {
-        // enqueue waitQueue
-        queue->queue = (union Data*)taskManager->taskQueue;
+        pthread_mutex_unlock(&taskManager->mutex);
+        goto next(...);
     }
-    queue->data = (union Data*)task;
-    queue->next = C_spawnTaskManager1;
-    goto meta(context, queue->queue->Queue.put);
 }
 
 __code spawnTaskManager_stub(struct Context* context) {
@@ -94,35 +103,18 @@
                           Gearef(context, TaskManager)->next);
 }
 
-
-__code spawnTaskManager1(struct TaskManagerImpl* taskManager) {
-    pthread_mutex_unlock(&taskManager->mutex);
-    goto meta(context, C_taskSend);
-}
-
-__code spawnTaskManager1_stub(struct Context* context) {
-    TaskManagerImpl* taskManager = (TaskManagerImpl*)GearImpl(context, TaskManager, taskManager);
-    goto spawnTaskManager1(context,
-                           taskManager);
-}
-
-__code taskSend(struct TaskManagerImpl* taskManager, struct Queue* queue) {
-    queue->queue = (union Data*)taskManager->activeQueue;
-    queue->next = C_taskSend1;
-    goto meta(context, taskManager->activeQueue->take);
-}
-
-__code taskSend1(struct TaskManagerImpl* taskManager, struct Queue* queue, struct Context* task, __code next(...)) {
+__code taskSend(struct TaskManagerImpl* taskManager, struct Queue* queue, struct Context* task, __code next(...)) {
     struct Queue* tasks = taskManager->workers[task->workerId]->tasks;
     queue->queue = (union Data*)tasks;
     queue->data = (union Data*)task;
     queue->next = next;
+    pthread_mutex_unlock(&taskManager->mutex);
     goto meta(context, tasks->put);
 }
 
-__code taskSend1_stub(struct Context* context) {
+__code taskSend_stub(struct Context* context) {
     TaskManagerImpl* taskManager = (TaskManagerImpl*)GearImpl(context, TaskManager, taskManager);
-    goto taskSend1(context, taskManager, Gearef(context, Queue), Gearef(context, TaskManager)->context, Gearef(context, TaskManager)->next);
+    goto taskSend(context, taskManager, Gearef(context, Queue), Gearef(context, TaskManager)->context, Gearef(context, TaskManager)->next);
 }
 
 __code shutdownTaskManager(struct LoopCounter* loopCounter, struct TaskManager* taskManager, struct TaskManagerImpl* taskManagerImpl, struct Queue* queue, __code next(...)) {
--- a/src/parallel_execution/context.h	Fri Apr 14 19:52:26 2017 +0900
+++ b/src/parallel_execution/context.h	Tue Apr 18 18:59:36 2017 +0900
@@ -22,7 +22,7 @@
     struct Meta* meta = (struct Meta*)context->heap;\
     meta->type = D_##t;\
     meta->size = 1;\
-    context->heap += sizeof(struct Meta);                               \
+    context->heap += sizeof(struct Meta);\
     context->data[D_##dseg] = context->heap; context->heap += sizeof(struct t); (struct t *)context->data[D_##dseg]; })
 
 #define ALLOCATE(context, t) ({ \
@@ -45,6 +45,12 @@
     meta->size = len; \
     data; })
 
+#define ALLOCATE_DATA_GEAR(context, t) ({ \
+        union Data* data = ALLOCATE(context, t); \
+        struct Meta* meta = GET_META(data); \
+        meta->wait = createSingleLinkedQueue(context); \
+        data; })
+
 #define GET_META(dseg) ((struct Meta*)(((void*)dseg) - sizeof(struct Meta)))
 #define GET_TYPE(dseg) (GET_META(dseg)->type)
 #define GET_WAIT_LIST(dseg) (GET_META(dseg)->wait)
@@ -76,6 +82,8 @@
     long heapLimit;
     int dataNum;
     int idgCount; //number of waiting dataGear
+    int idg;
+    int maxIdg;
     int odg;
     int maxOdg;
     int workerId;
@@ -108,11 +116,13 @@
         union Data* taskManager;
         enum Code createTask; // create NEW  contexts for execution & argument
         enum Code spawn;      // start NEW context on the worker
+        enum Code setWaitTask;
         enum Code shutdown;
 
         enum Code next;
         enum Code task;
         struct Context* context;
+        union Data* data;
         int worker;
         int cpu;
         int gpu;
@@ -274,9 +284,9 @@
         enum Code next;
         long size;
     } Allocate;
-    struct OutPutDataSegments {
-        union Data **data;
-    } Ods;
+    struct Integer {
+        int value;
+    } Integer;
 }; // union Data end       this is necessary for context generator
 
 typedef union Data Data;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/examples/CUDAtwice.cbc	Tue Apr 18 18:59:36 2017 +0900
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include "../../context.h"
+
+
+extern void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter);
+
+__code CUDAtwice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) {
+    int i = loopCounter->i;
+    if (i < prefix) {
+        array[i+index*prefix] = array[i+index*prefix]*2;
+        loopCounter->i++;
+
+        goto meta(context, C_twice);
+    }
+
+    loopCounter->i = 0;
+    goto meta(workerContext, workerContext->next);
+}
+
+__code CUDAtwice_stub(struct Context* context) {
+printf("CUdAtwice stub\n");
+    struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter;
+    struct Array* array = &context->data[context->dataNum+1]->Array;
+    CUDAExec(context,array,loopCounter);
+
+    //continuationにそってGPUworkerに戻る
+    goto meta(context, context->next);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/examples/CUDAtwice.cu	Tue Apr 18 18:59:36 2017 +0900
@@ -0,0 +1,33 @@
+extern "C" {
+
+#include <stdio.h>
+
+//    __global__ void twice(struct LoopCounter* loopCounter, int prefix ,int* array) {
+//         int index = blockIdx.x * blockDim.x + threadIdx.x;
+//         printf("array %p, blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n");
+//         int i = 0;
+//         while (i < prefix) {
+//              array[i+index*prefix] = array[i+index*prefix]*2;
+//         }
+//    }
+
+    struct LoopCounter {
+        int i;
+    } LoopCounter;
+
+    __global__ void twice(struct LoopCounter* loopCounter, int index, int prefix, int* array) {
+         printf("array %p, index = %d, prefix = %d loopCounter->i %d\n",array,index,prefix,loopCounter->i);
+C_twice:
+        int i = loopCounter->i;
+        if (i < prefix) {
+            array[i+index*prefix] = array[i+index*prefix]*2;
+            loopCounter->i++;
+
+            goto C_twice;
+        }
+
+        loopCounter->i = 0;
+    }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/examples/calc.cbc	Tue Apr 18 18:59:36 2017 +0900
@@ -0,0 +1,226 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+
+#include "../../context.h"
+
+int cpu_num = 1;
+int length = 100;
+int gpu_num = 0;
+int CPU_ANY = -1;
+int CPU_CUDA = -1;
+
+void print_queue(struct Element* element) {
+    while (element) {
+        printf("%p\n", ((struct Task *)(element->data)));
+        element = element->next;
+    }
+}
+
+void print_tree(struct Node* node) {
+    if (node != 0) {
+        printf("%d\n", node->value->Array.index);
+        print_tree(node->left);
+        print_tree(node->right);
+    }
+}
+
+void *start_taskManager(struct Context *context) {
+    goto initDataGears(context, Gearef(context, LoopCounter), Gearef(context, TaskManager));
+    return 0;
+}
+
+#ifdef USE_CUDAWorker
+#ifdef USE_CUDA_MAIN_THREAD
+extern volatile int cuda_initialized;
+#endif
+#endif
+
+__code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
+    // loopCounter->tree = createRedBlackTree(context);
+    loopCounter->i = 0;
+    taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, gpu_num, 0);
+#ifdef USE_CUDAWorker
+#ifdef USE_CUDA_MAIN_THREAD
+    while(! cuda_initialized) {};
+#endif
+#endif
+    goto meta(context, C_createTask1);
+}
+
+__code initDataGears_stub(struct Context* context) {
+    struct TaskManager* taskManager =  Gearef(context, TaskManager);
+    taskManager->taskManager = 0;
+#if (! defined(USE_CUDAWorker) || ! defined(USE_CUDA_MAIN_THREAD))
+    struct LoopCounter* loopCounter = Gearef(context, LoopCounter);
+    goto initDataGears(context, loopCounter, taskManager);
+#else
+    cuda_initialized = 0;
+    pthread_t thread;
+    pthread_create(&thread, NULL, (void*)&start_taskManager, context);
+    while (taskManager->taskManager == 0);
+    TaskManager *t = (TaskManager*)taskManager->taskManager;
+    TaskManagerImpl *im = (TaskManagerImpl*)t->taskManager;
+    struct Queue *q = (Queue *)im->workers[0];
+    createCUDAWorker(context,0,q, im);
+    pthread_join(thread,0);
+    exit(0);
+#endif
+}
+
+__code code1(struct Time* time) {
+    printf("cpus:\t\t%d\n", cpu_num);
+    printf("gpus:\t\t%d\n", gpu_num);
+    printf("length:\t\t%d\n", length);
+    /* puts("queue"); */
+    /* print_queue(context->data[ActiveQueue]->queue.first); */
+    /* puts("tree"); */
+    /* print_tree(context->data[Tree]->tree.root); */
+    /* puts("result"); */
+
+    //time->next = C_code2;
+    goto meta(context, C_exit_code);
+    //goto meta(context, C_start_time);
+}
+
+__code code1_stub(struct Context* context) {
+    goto code1(context, Gearef(context, Time));
+}
+
+
+__code createTask1(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
+    int i = loopCounter->i;
+
+    if (i < length) {
+        loopCounter->i++;
+        taskManager->next = C_createTask2;
+        goto meta(context, taskManager->taskManager->TaskManager.createTask);
+    }
+
+    loopCounter->i = 0;
+    taskManager->next = C_code1;
+    sleep(3);
+    goto meta(context, taskManager->taskManager->TaskManager.shutdown);
+}
+
+__code createTask2(LoopCounter* loopCounter, TaskManager* taskManager, struct Context *task, Integer *integer1, Integer *integer2, Integer *integer3) {
+    int i = loopCounter->i;
+    task->idgCount = 1;
+    task->next = C_mult;
+    integer2->value = i;
+    task->data[task->idg] = (union Data*)integer1;
+    task->data[task->idg+1] = (union Data*)integer2;
+    task->maxIdg = task->idg + 2;
+    task->odg = task->maxIdg;
+    task->data[task->odg] = (union Data*)integer3;
+    task->maxOdg = task->odg + 1;
+    taskManager->next = C_createTask3;
+    taskManager->data = (union Data*)integer1;
+    goto meta(context, taskManager->taskManager->TaskManager.setWaitTask);
+}
+
+__code createTask2_stub(struct Context* context) {
+    Integer* integer1 = &ALLOCATE_DATA_GEAR(context, Integer)->Integer;
+    Integer* integer2 = &ALLOCATE_DATA_GEAR(context, Integer)->Integer;
+    Integer* integer3 = &ALLOCATE_DATA_GEAR(context, Integer)->Integer;
+    goto createTask2(context,
+            Gearef(context, LoopCounter),
+            Gearef(context, TaskManager),
+            Gearef(context, TaskManager)->context,
+            integer1,
+            integer2,
+            integer3);
+}
+
+__code createTask3(struct TaskManager* taskManager) {
+    taskManager->next = C_createTask4;
+    goto meta(context, taskManager->taskManager->TaskManager.spawn);
+}
+
+__code createTask4(struct TaskManager* taskManager) {
+    taskManager->next = C_createTask5;
+    goto meta(context, taskManager->taskManager->TaskManager.createTask);
+}
+
+__code createTask5(LoopCounter* loopCounter, TaskManager* taskManager, struct Context* task, Integer *integer1, Integer *integer2, Integer *integer3) {
+    int i = loopCounter->i;
+    task->next = C_add;
+    task->idgCount = 0;
+    integer1->value = i;
+    integer2->value = i+1;
+    task->data[task->idg] = (union Data*)integer1;
+    task->data[task->idg+1] = (union Data*)integer2;
+    task->maxIdg = task->idg + 2;
+    task->odg = task->maxIdg;
+    task->data[task->odg] = (union Data*)integer3;
+    task->maxOdg = task->odg + 1;
+    taskManager->next = C_createTask1;
+    goto meta(context, taskManager->taskManager->TaskManager.spawn);
+}
+
+__code createTask5_stub(struct Context* context) {
+    Integer* integer1 = &ALLOCATE_DATA_GEAR(context, Integer)->Integer;
+    Integer* integer2 = &ALLOCATE_DATA_GEAR(context, Integer)->Integer;
+    goto createTask5(context,
+            Gearef(context, LoopCounter),
+            Gearef(context, TaskManager),
+            Gearef(context, TaskManager)->context,
+            integer1,
+            integer2,
+            &Gearef(context, TaskManager)->data->Integer);
+}
+
+
+void init(int argc, char** argv) {
+    for (int i = 1; argv[i]; ++i) {
+        if (strcmp(argv[i], "-cpu") == 0)
+            cpu_num = (int)atoi(argv[i+1]);
+        else if (strcmp(argv[i], "-l") == 0)
+            length = (int)atoi(argv[i+1]);
+        else if (strcmp(argv[i], "-cuda") == 0) {
+            gpu_num = 1;
+            CPU_CUDA = 0;
+        }
+    }
+}
+
+
+int main(int argc, char** argv) {
+    init(argc, argv);
+    struct Context* main_context = NEW(struct Context);
+    initContext(main_context);
+    main_context->next = C_initDataGears;
+
+    goto start_code(main_context);
+}
+
+__code add(struct Integer* input1, struct Integer* input2, struct Integer* output) { 
+    output->value = input1->value + input2->value; 
+    printf("%d + %d = %d\n", input1->value, input2->value, output->value); 
+    goto meta(context, context->next); 
+}
+
+__code add_stub(struct Context* context) { 
+    goto add(context, 
+            &context->data[context->idg]->Integer,
+            &context->data[context->idg + 1]->Integer,
+            &context->data[context->odg]->Integer);
+
+}
+
+__code mult(struct Integer* input1, struct Integer* input2, struct Integer* output) { 
+    output->value = input1->value * input2->value; 
+    printf("%d * %d = %d\n", input1->value, input2->value, output->value); 
+    assert(output->value == 2 * (input2->value * input2->value) + input2->value);
+    goto meta(context, context->next);
+}
+
+__code mult_stub(struct Context* context) { 
+    goto mult(context, 
+            &context->data[context->idg]->Integer,
+            &context->data[context->idg + 1]->Integer,
+            &context->data[context->odg]->Integer);
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/examples/twice.cbc	Tue Apr 18 18:59:36 2017 +0900
@@ -0,0 +1,194 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "../../context.h"
+
+int cpu_num = 1;
+int length = 102400;
+int split = 8;
+int* array_ptr;
+int gpu_num = 0;
+int CPU_ANY = -1;
+int CPU_CUDA = -1;
+
+void print_queue(struct Element* element) {
+    while (element) {
+        printf("%p\n", ((struct Task *)(element->data)));
+        element = element->next;
+    }
+}
+
+void print_tree(struct Node* node) {
+    if (node != 0) {
+        printf("%d\n", node->value->Array.index);
+        print_tree(node->left);
+        print_tree(node->right);
+    }
+}
+
+void *start_taskManager(struct Context *context) {
+    goto initDataGears(context, Gearef(context, LoopCounter), Gearef(context, TaskManager));
+    return 0;
+}
+
+#ifdef USE_CUDAWorker
+#ifdef USE_CUDA_MAIN_THREAD
+extern volatile int cuda_initialized;
+#endif
+#endif
+
+__code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
+    // loopCounter->tree = createRedBlackTree(context);
+    loopCounter->i = 0;
+    taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, gpu_num, 0);
+#ifdef USE_CUDAWorker
+#ifdef USE_CUDA_MAIN_THREAD
+    while(! cuda_initialized) {};
+#endif
+#endif
+    goto meta(context, C_createTask1);
+}
+
+__code initDataGears_stub(struct Context* context) {
+    struct TaskManager* taskManager =  Gearef(context, TaskManager);
+    taskManager->taskManager = 0;
+#if (! defined(USE_CUDAWorker) || ! defined(USE_CUDA_MAIN_THREAD))
+    struct LoopCounter* loopCounter = Gearef(context, LoopCounter);
+    goto initDataGears(context, loopCounter, taskManager);
+#else
+    cuda_initialized = 0;
+    pthread_t thread;
+    pthread_create(&thread, NULL, (void*)&start_taskManager, context);
+    while (taskManager->taskManager == 0);
+    TaskManager *t = (TaskManager*)taskManager->taskManager;
+    TaskManagerImpl *im = (TaskManagerImpl*)t->taskManager;
+    struct Queue *q = (Queue *)im->workers[0];
+    createCUDAWorker(context,0,q, im);
+    pthread_join(thread,0);
+    exit(0);
+#endif
+}
+
+__code code1(struct Time* time) {
+    printf("cpus:\t\t%d\n", cpu_num);
+    printf("gpus:\t\t%d\n", gpu_num);
+    printf("length:\t\t%d\n", length);
+    printf("length/task:\t%d\n", length/split);
+    /* puts("queue"); */
+    /* print_queue(context->data[ActiveQueue]->queue.first); */
+    /* puts("tree"); */
+    /* print_tree(context->data[Tree]->tree.root); */
+    /* puts("result"); */
+
+    time->next = C_code2;
+    goto meta(context, C_code2);
+    //goto meta(context, C_exit_code);
+    //goto meta(context, C_start_time);
+}
+
+__code code1_stub(struct Context* context) {
+    goto code1(context, Gearef(context, Time));
+}
+
+__code code2(struct LoopCounter* loopCounter) {
+    int i = loopCounter->i;
+
+    if (i < length) {
+        //printf("%d\n", array_ptr[i]);
+        if (array_ptr[i] == (i*2)) {
+            loopCounter->i++;
+            goto meta(context, C_code2);
+        } else
+            puts("wrong result");
+
+    }
+
+    goto meta(context, C_exit_code);
+}
+
+__code createTask1(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
+    int i = loopCounter->i;
+
+    if ((length/split*i) < length) {
+        taskManager->next = C_createTask2;
+        goto meta(context, taskManager->taskManager->TaskManager.createTask);
+    }
+
+    loopCounter->i = 0;
+    taskManager->next = C_code1;
+#if ( defined(USE_CUDAWorker) && defined(USE_CUDA_MAIN_THREAD))
+sleep(5);
+#endif
+    goto meta(context, taskManager->taskManager->TaskManager.shutdown);
+}
+
+__code createTask2(LoopCounter* loopCounter, TaskManager* taskManager,struct Context* task, LoopCounter* loopCounter2, Array* array) {
+    int i = loopCounter->i;
+    array->index = i;
+    array->prefix = length/split;
+    array->array = array_ptr;
+    array->size = length;
+    loopCounter2->i = 0;
+    task->idgCount = 0;
+    if (gpu_num) {
+#ifdef USE_CUDAWorker
+        task->next = C_CUDAtwice;
+        task->workerId = CPU_CUDA;
+#else
+        task->next = C_twice;
+#endif
+    } else {
+        task->next = C_twice;
+    }
+    task->data[task->dataNum] = (union Data*)loopCounter2;
+    task->data[task->dataNum+1] = (union Data*)array;
+    task->odg = task->dataNum + 2;
+    task->maxOdg = task->odg;
+    taskManager->next = C_createTask1;
+    loopCounter->i++;
+    goto meta(context, taskManager->taskManager->TaskManager.spawn);
+}
+
+__code createTask2_stub(struct Context* context) {
+    LoopCounter* loopCounter = &ALLOCATE(context, LoopCounter)->LoopCounter;
+    Array* array = &ALLOCATE(context, Array)->Array;
+    goto createTask2(context,
+            Gearef(context, LoopCounter),
+            Gearef(context, TaskManager),
+            Gearef(context, TaskManager)->context,
+            loopCounter,
+            array);
+}
+
+void init(int argc, char** argv) {
+    for (int i = 1; argv[i]; ++i) {
+        if (strcmp(argv[i], "-cpu") == 0)
+            cpu_num = (int)atoi(argv[i+1]);
+        else if (strcmp(argv[i], "-l") == 0)
+            length = (int)atoi(argv[i+1]);
+        else if (strcmp(argv[i], "-s") == 0)
+            split = (int)atoi(argv[i+1]);
+        else if (strcmp(argv[i], "-cuda") == 0) {
+            gpu_num = 1;
+            CPU_CUDA = 0;
+        }
+    }
+}
+
+
+int main(int argc, char** argv) {
+    init(argc, argv);
+
+    array_ptr = NEWN(length, int);
+
+    for(int i=0; i<length; i++)
+        array_ptr[i]=i;
+
+    struct Context* main_context = NEW(struct Context);
+    initContext(main_context);
+    main_context->next = C_initDataGears;
+
+    goto start_code(main_context);
+}