changeset 410:85b0ddbf458e

Fix CUDAWorker: commit iterated GPU tasks through the Iterator barrier, route completion back via the common taskReceiveWorker, and split TaskManagerImpl's send index into separate CPU and GPU worker ranges
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Thu, 14 Sep 2017 02:35:20 +0900
parents 4d1e3697a6b8
children 0eba9a04633f
files src/parallel_execution/CMakeLists.txt src/parallel_execution/CPUWorker.cbc src/parallel_execution/CUDAWorker.cbc src/parallel_execution/Iterator.cbc src/parallel_execution/MultiDimIterator.cbc src/parallel_execution/TaskManagerImpl.cbc src/parallel_execution/context.h src/parallel_execution/cuda.c src/parallel_execution/examples/twice/CUDAtwice.cbc src/parallel_execution/examples/twice/CUDAtwice.cu src/parallel_execution/examples/twice/createArray.cbc src/parallel_execution/examples/twice/main.cbc src/parallel_execution/generate_context.pl src/parallel_execution/main.cbc
diffstat 14 files changed, 252 insertions(+), 237 deletions(-)
--- a/src/parallel_execution/CMakeLists.txt	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/CMakeLists.txt	Thu Sep 14 02:35:20 2017 +0900
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.8)
 
 set(USE_CUDA,0)
 # -DUSE_CUDA
@@ -7,6 +7,8 @@
 set(CMAKE_C_COMPILER $ENV{CBC_COMPILER})
 add_definitions("-Wall -g")
 
+# -DCMAKE_BUILD_TYPE=Debug
+set(CMAKE_C_FLAGS_DEBUG "-O0")
 
 if (${USE_CUDA})
     include_directories("/usr/local/cuda/include")
@@ -81,9 +83,9 @@
       TARGET
           CUDAtwice
       SOURCES 
-          main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc  CUDAWorker.cbc examples/twice/CUDAtwice.cbc examples/twice/CUDAtwice.cu cuda.c
+        examples/twice/main.cbc examples/twice/CUDAtwice.cbc examples/twice/CUDAtwice.cu examples/twice/createArray.cbc CPUWorker.cbc TimeImpl.cbc examples/twice/twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc CUDAWorker.cbc cuda.c MultiDimIterator.cbc
     )
-    set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1 -DUSE_CUDA_MAIN_THREAD")
+    set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1")
 endif()
 
 GearsCommand(
--- a/src/parallel_execution/CPUWorker.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/CPUWorker.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -1,6 +1,6 @@
 #include "../context.h"
 
-static void start_worker(Worker* worker);
+static void startWorker(Worker* worker);
 
 Worker* createCPUWorker(struct Context* context, int id, Queue* queue) {
     struct Worker* worker = new Worker();
@@ -10,11 +10,11 @@
     cpuWorker->id = id;
     worker->taskReceive = C_taskReceiveWorker;
     worker->shutdown = C_shutdownWorker;
-    pthread_create(&worker->worker->CPUWorker.thread, NULL, (void*)&start_worker, worker);
+    pthread_create(&worker->worker->CPUWorker.thread, NULL, (void*)&startWorker, worker);
     return worker;
 }
 
-static void start_worker(Worker* worker) {
+static void startWorker(Worker* worker) {
     CPUWorker* cpuWorker = (CPUWorker*)worker->worker;
     cpuWorker->context = NEW(struct Context);
     initContext(cpuWorker->context);
--- a/src/parallel_execution/CUDAWorker.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -1,14 +1,8 @@
-#include <stdio.h>
-#include <sys/time.h>
-#include <string.h>
-#include <stdlib.h>
-#include <libkern/OSAtomic.h>
-
 #include "../context.h"
 
 extern void cudaInit(struct CUDAWorker *cudaWorker,int phase) ;
 
-static void start_CUDAworker(Worker* worker);
+static void startCUDAWorker(Worker* worker);
 
 #ifndef USE_CUDA_MAIN_THREAD
 volatile 
@@ -19,32 +13,31 @@
     struct Worker* worker = ALLOC(context, Worker);
     struct CUDAWorker* cudaWorker = new CUDAWorker();
 
-    cudaInit(cudaWorker,0);
+    cudaInit(cudaWorker, 0);
 
     worker->worker = (union Data*)cudaWorker;
     worker->tasks = queue;
     cudaWorker->id = id;
+    worker->taskReceive = C_taskReceiveWorker;
     worker->shutdown = C_shutdownCUDAWorker;
 #ifndef USE_CUDA_MAIN_THREAD
-    pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker);
+    pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&startCUDAWorker, worker);
 #else
     if (im) {
         im->workers[0] = worker;
     }
     cuda_initialized = 1;
-    start_CUDAworker(worker);
+    startCUDAWorker(worker);
 #endif
     return worker;
 }
 
-
-static void start_CUDAworker(Worker* worker) {
+static void startCUDAWorker(Worker* worker) {
     CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
     cudaWorker->context = NEW(struct Context);
     initContext(cudaWorker->context);
     Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker;
-
-    goto meta(cudaWorker->context, C_taskReceiveCUDAWorker);
+    goto meta(cudaWorker->context, worker->taskReceive);
 }
 
 __code taskReceiveCUDAWorker(struct Worker* worker,struct Queue* queue) {
@@ -60,10 +53,13 @@
 __code getTaskCUDA(struct Worker* worker, struct Context* task) {
     if (!task)
         return; // end thread
-    worker->taskReceive = C_taskReceiveCUDAWorker;
     task->worker = worker;
     enum Code taskCg = task->next;
-    task->next = C_odgCommitCUDA; // set CG after task exec
+    if (task->iterate) {
+        task->next = C_iterateCommitCUDA;
+    } else {
+        task->next = C_odgCommitCUDA; // set CG after task exec
+    }
     goto meta(task, taskCg);
 }
 
@@ -73,50 +69,97 @@
     goto getTaskCUDA(context, worker, task);
 }
 
-__code odgCommitCUDA(struct LoopCounter* loopCounter, struct Queue* queue, struct Context* task) {
+__code iterateCommitCUDA(struct Iterator* iterator) {
+    iterator->iterator = (union Data*)context->iterator;
+    iterator->task = context;
+    iterator->next = C_odgCommitCUDA;
+    iterator->whenWait = C_iterateCommitCUDA1;
+    goto meta(context, context->iterator->barrier);
+}
+
+__code iterateCommitCUDA1(struct Context* task) {
+    goto meta(context, C_taskReceiveWorker);
+}
+
+__code iterateCommitCUDA1_stub(struct Context* context) {
+    struct Context* workerContext = context->worker->worker->CUDAWorker.context;
+    goto iterateCommitCUDA1(workerContext, context);
+}
+
+__code odgCommitCUDA(struct LoopCounter* loopCounter, struct Context* task, struct TaskManager* taskManager) {
     int i = loopCounter->i ;
-    if(task->odg + i < task->maxOdg) {
-        queue->queue = (union Data*)GET_WAIT_LIST(task->data[task->odg+i]);
-        queue->next = C_odgCommitCUDA1;
-        goto meta(context, queue->queue->Queue.take);
+    if (task->odg+i < task->maxOdg) {
+        goto meta(task, C_odgCommitCUDA1);
     }
     loopCounter->i = 0;
-    goto meta(context, C_taskReceiveCUDAWorker);
+    taskManager->taskManager = (union Data*)task->taskManager;
+    taskManager->next = C_taskReceiveWorker;
+    goto meta(context, task->taskManager->decrementTaskCount);
 }
 
 __code odgCommitCUDA_stub(struct Context* context) {
     struct Context* workerContext = context->worker->worker->CUDAWorker.context;
     goto odgCommitCUDA(workerContext,
-                   Gearef(workerContext, LoopCounter),
-                   Gearef(workerContext, Queue),
-                   context);
+            Gearef(context, LoopCounter),
+            context,
+            Gearef(workerContext, TaskManager));
 }
 
-__code odgCommitCUDA1(struct TaskManager* taskManager, struct Context* task) {
-    if(__sync_fetch_and_sub(&task->idgCount, 1)) {
-        if(task->idgCount == 0) {
-            taskManager->taskManager = (union Data*)task->taskManager;
-            taskManager->context = task;
-            taskManager->next = C_odgCommitCUDA;
-            goto meta(context, task->taskManager->spawn);
-        }
-    } else {
-        goto meta(context, C_odgCommitCUDA1);
-    }
+__code odgCommitCUDA1(struct LoopCounter* loopCounter, struct Queue* queue) {
+    int i = loopCounter->i ;
+    queue->queue = (union Data*)GET_WAIT_LIST(context->data[context->odg+i]);
+    queue->whenEmpty = C_odgCommitCUDA4;
+    queue->next = C_odgCommitCUDA2;
+    goto meta(context, queue->queue->Queue.isEmpty);
 }
 
 __code odgCommitCUDA1_stub(struct Context* context) {
+    goto odgCommitCUDA1(context,
+            Gearef(context, LoopCounter),
+            Gearef(context, Queue));
+}
+
+__code odgCommitCUDA2(struct Queue* queue) {
+    queue->next = C_odgCommitCUDA3;
+    goto meta(context, queue->queue->Queue.take);
+}
+
+__code odgCommitCUDA2_stub(struct Context* context) {
+    goto odgCommitCUDA2(context,
+            Gearef(context, Queue));
+}
+
+__code odgCommitCUDA3(struct TaskManager* taskManager, struct Context* task) {
+    if (__sync_fetch_and_sub(&task->idgCount, 1) == 1) { // atomically decrement the idg counter; __sync_fetch_and_sub returns the value task->idgCount held before the decrement
+        taskManager->taskManager = (union Data*)task->taskManager;
+        taskManager->context = task;
+        taskManager->next = C_odgCommitCUDA1;
+        goto meta(context, task->taskManager->spawn);
+    }
+    goto meta(context, C_odgCommitCUDA1);
+}
+
+__code odgCommitCUDA3_stub(struct Context* context) {
     struct Context* task = &Gearef(context, Queue)->data->Context;
-    goto odgCommitCUDA1(context,
-                    Gearef(context, TaskManager),
-                    task);
-                 
+    goto odgCommitCUDA3(context,
+            Gearef(context, TaskManager),
+            task);
+}
+
+__code odgCommitCUDA4(struct LoopCounter* loopCounter) {
+    loopCounter->i++;
+    goto meta(context, C_odgCommitCUDA);
+}
+
+__code odgCommitCUDA4_stub(struct Context* context) {
+    goto odgCommitCUDA4(context,
+            Gearef(context, LoopCounter));
 }
 
 extern void cudaShutdown( CUDAWorker *cudaWorker) ;
 
 __code shutdownCUDAWorker(struct Context* context, CUDAWorker* worker) {
-    cudaShutdown( worker) ;
+    cudaShutdown(worker);
 }
 
 __code shutdownCUDAWorker_stub(struct Context* context) {
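
For reference, the reworked odgCommit chain above (odgCommitCUDA through odgCommitCUDA4) computes the following, written here as a plain-C sketch; isEmpty, take, spawn, and decrementTaskCount are hypothetical stand-ins for the Queue and TaskManager code gears that the real code reaches through goto meta:

    // For each output data gear, wake every task waiting on it; a waiter
    // whose idgCount drops to zero has all of its inputs and is spawned.
    for (int i = 0; task->odg + i < task->maxOdg; i++) {          // odgCommitCUDA / CUDA4
        struct Queue* waitList = GET_WAIT_LIST(task->data[task->odg + i]);
        while (!isEmpty(waitList)) {                              // odgCommitCUDA1
            struct Context* waiter = take(waitList);              // odgCommitCUDA2
            if (__sync_fetch_and_sub(&waiter->idgCount, 1) == 1)  // odgCommitCUDA3
                spawn(waiter);
        }
    }
    decrementTaskCount(taskManager);  // then continue at taskReceiveWorker
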
--- a/src/parallel_execution/Iterator.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/Iterator.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -1,7 +1,8 @@
 typedef struct Iterator<Impl>{
         union Data* iterator;
         struct Context* task;
-        __code exec(Impl* iterator, struct TaskManager* taskManager, struct Context* task, __code next(...));
+        int numGPU;
+        __code exec(Impl* iterator, struct TaskManager* taskManager, struct Context* task, int numGPU, __code next(...));
         __code barrier(Impl* iterator, struct Context* task, __code next(...), __code whenWait(...));
         __code whenWait(...);
         __code next(...);
--- a/src/parallel_execution/MultiDimIterator.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/MultiDimIterator.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -53,14 +53,16 @@
     return task1;
 }
 
-__code execMultiDimIterator(struct MultiDimIterator* iterator, struct TaskManager* taskManager, struct Context* task, __code next(...)) {
-    int x = iterator->counterX;
-    int y = iterator->counterY;
-    int z = iterator->counterZ;
-    struct Context* iterateTask = createMultiDimIterateTask(task, x, y, z);
+__code execMultiDimIterator(struct MultiDimIterator* iterator, struct TaskManager* taskManager, struct Context* task, int numGPU, __code next(...)) {
+    // No GPU device
+    if (numGPU == 0) {
+        goto meta(context, C_execMultiDimIterator1);
+    }
+    task->iterate = 1;
+    task->gpu = 1;
     taskManager->taskManager = (union Data*)task->taskManager;
-    taskManager->context = iterateTask;
-    taskManager->next = C_execMultiDimIterator1;
+    taskManager->context = task;
+    taskManager->next = next;
     goto meta(context, task->taskManager->spawn);
 }
 
@@ -68,11 +70,31 @@
     MultiDimIterator* iterator = (MultiDimIterator*)GearImpl(context, Iterator, iterator);
     TaskManager* taskManager = Gearef(context, TaskManager);
     Context* task = Gearef(context, Iterator)->task;
+    int numGPU = Gearef(context, Iterator)->numGPU;
     enum Code next = Gearef(context, Iterator)->next;
-    goto execMultiDimIterator(context, iterator, taskManager, task, next);
+    goto execMultiDimIterator(context, iterator, taskManager, task, numGPU, next);
 } 
 
-__code execMultiDimIterator1(struct MultiDimIterator* iterator, struct Context* task, __code next(...)) {
+__code execMultiDimIterator1(struct MultiDimIterator* iterator, struct TaskManager* taskManager, struct Context* task, __code next(...)) {
+    int x = iterator->counterX;
+    int y = iterator->counterY;
+    int z = iterator->counterZ;
+    struct Context* iterateTask = createMultiDimIterateTask(task, x, y, z);
+    taskManager->taskManager = (union Data*)task->taskManager;
+    taskManager->context = iterateTask;
+    taskManager->next = C_execMultiDimIterator2;
+    goto meta(context, task->taskManager->spawn);
+}
+
+__code execMultiDimIterator1_stub(struct Context* context) {
+    MultiDimIterator* iterator = (MultiDimIterator*)GearImpl(context, Iterator, iterator);
+    TaskManager* taskManager = Gearef(context, TaskManager);
+    Context* task = Gearef(context, Iterator)->task;
+    enum Code next = Gearef(context, Iterator)->next;
+    goto execMultiDimIterator1(context, iterator, taskManager, task, next);
+} 
+
+__code execMultiDimIterator2(struct MultiDimIterator* iterator, struct Context* task, __code next(...)) {
     if (++iterator->counterX >= iterator->x) {
         iterator->counterX = 0;
         if (++iterator->counterY >= iterator->y) {
@@ -83,7 +105,7 @@
             }
         }
     }
-    goto meta(context, C_execMultiDimIterator);
+    goto meta(context, C_execMultiDimIterator1);
 }
 
 __code barrierMultiDimIterator(struct MultiDimIterator* iterator, struct Context* task, __code next(...), __code whenWait(...)) {
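
Summarized as a sketch (spawn again standing in for the TaskManager spawn code gear): with at least one GPU worker the whole iteration space becomes a single GPU task, otherwise one CPU task is spawned per grid point, x varying fastest:

    if (numGPU > 0) {                  // execMultiDimIterator
        task->iterate = 1;             // the kernel receives the full (x, y, z) range
        task->gpu = 1;                 // taskSend routes it to a GPU worker
        spawn(task);
    } else {                           // execMultiDimIterator1 / 2
        for (int z = 0; z < iterator->z; z++)
            for (int y = 0; y < iterator->y; y++)
                for (int x = 0; x < iterator->x; x++)
                    spawn(createMultiDimIterateTask(task, x, y, z));
    }
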
--- a/src/parallel_execution/TaskManagerImpl.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/TaskManagerImpl.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -22,7 +22,8 @@
     struct TaskManagerImpl* taskManagerImpl = new TaskManagerImpl();
     taskManagerImpl->taskQueue = createSingleLinkedQueue(context);
     taskManagerImpl->numWorker = taskManager->maxCPU;
-    taskManagerImpl->sendWorkerIndex = 0;
+    taskManagerImpl->sendGPUWorkerIndex = taskManager->gpu;
+    taskManagerImpl->sendCPUWorkerIndex = taskManager->cpu;
     taskManagerImpl->taskCount = 0;
     taskManagerImpl->loopCounter = new LoopCounter();
     taskManagerImpl->loopCounter -> i = 0;
@@ -162,18 +163,19 @@
     goto next(...);
 }
 
-__code spawnTaskManagerImpl(struct TaskManagerImpl* taskManager, struct Iterator* iterator, struct Context* task, __code next(...)) {
+__code spawnTaskManagerImpl(struct TaskManagerImpl* taskManagerImpl, struct Iterator* iterator, struct TaskManager* taskManager, struct Context* task, __code next(...)) {
     if (task->idgCount == 0) {
         if(task->iterator != NULL && task->iterate == 0) {
             iterator->iterator = (union Data*)task->iterator;
             iterator->task = task;
             iterator->next = next;
-            pthread_mutex_unlock(&taskManager->mutex);
+            iterator->numGPU = taskManager->cpu - taskManager->gpu;
+            pthread_mutex_unlock(&taskManagerImpl->mutex);
             goto meta(context, task->iterator->exec);
         }
         goto meta(context, C_taskSend);
     }
-    pthread_mutex_unlock(&taskManager->mutex);
+    pthread_mutex_unlock(&taskManagerImpl->mutex);
     goto next(...);
 }
 
@@ -181,30 +183,40 @@
     TaskManagerImpl* taskManager = (TaskManagerImpl*)GearImpl(context, TaskManager, taskManager);
     pthread_mutex_lock(&taskManager->mutex);
     goto spawnTaskManagerImpl(context,
-            taskManager,
-            Gearef(context, Iterator),
-            Gearef(context, TaskManager)->context,
-            Gearef(context, TaskManager)->next);
+                              taskManager,
+                              Gearef(context, Iterator),
+                              &Gearef(context, TaskManager)->taskManager->TaskManager,
+                              Gearef(context, TaskManager)->context,
+                              Gearef(context, TaskManager)->next);
 }
 
-__code taskSend(struct TaskManagerImpl* taskManager, struct Queue* queue, struct Context* task, __code next(...)) {
+__code taskSend(struct TaskManagerImpl* taskManagerImpl, struct Queue* queue, struct TaskManager* taskManager, struct Context* task, __code next(...)) {
     // set workerId
-    task->workerId = taskManager->sendWorkerIndex;
-    if(++taskManager->sendWorkerIndex >= taskManager->numWorker) {
-        taskManager->sendWorkerIndex = 0;
+    if (task->gpu) {
+        task->workerId = taskManagerImpl->sendGPUWorkerIndex;   
+        if(++taskManagerImpl->sendGPUWorkerIndex >= taskManager->cpu) {
+            taskManagerImpl->sendGPUWorkerIndex = taskManager->gpu;
+        }
+    } else {
+        task->workerId = taskManagerImpl->sendCPUWorkerIndex;
+        if(++taskManagerImpl->sendCPUWorkerIndex >= taskManager->maxCPU) {
+            taskManagerImpl->sendCPUWorkerIndex = taskManager->cpu;
+        }
     }
-    struct Queue* tasks = taskManager->workers[task->workerId]->tasks;
+    struct Queue* tasks = taskManagerImpl->workers[task->workerId]->tasks;
     queue->queue = (union Data*)tasks;
     queue->data = (union Data*)task;
     queue->next = next;
-    pthread_mutex_unlock(&taskManager->mutex);
+    pthread_mutex_unlock(&taskManagerImpl->mutex);
     goto meta(context, tasks->put);
 }
 
 __code taskSend_stub(struct Context* context) {
     TaskManagerImpl* taskManager = (TaskManagerImpl*)GearImpl(context, TaskManager, taskManager);
     goto taskSend(context,
-            taskManager, Gearef(context, Queue),
+            taskManager,
+            Gearef(context, Queue),
+            &Gearef(context, TaskManager)->taskManager->TaskManager,
             Gearef(context, TaskManager)->context,
             Gearef(context, TaskManager)->next);
 }
@@ -230,10 +242,10 @@
 __code shutdownTaskManagerImpl_stub(struct Context* context) {
     TaskManagerImpl* taskManagerImpl = (TaskManagerImpl*)GearImpl(context, TaskManager, taskManager);
     goto shutdownTaskManagerImpl(context,
-                                 taskManagerImpl,
-                                 Gearef(context, TaskManager)->next,
-                                 &Gearef(context, TaskManager)->taskManager->TaskManager,
-                                 Gearef(context, Queue));
+            taskManagerImpl,
+            Gearef(context, TaskManager)->next,
+            &Gearef(context, TaskManager)->taskManager->TaskManager,
+            Gearef(context, Queue));
 }
 
 __code shutdownTaskManagerImpl1(TaskManagerImpl* taskManager) {
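
Worker ids are now partitioned by class: GPU workers occupy ids [gpu, cpu) and CPU workers ids [cpu, maxCPU), and taskSend round-robins inside the range that matches task->gpu. A concrete reading under an assumed configuration (not taken from this diff):

    // Assumed: gpu = 0, cpu = 1, maxCPU = 4 -- one CUDA worker (id 0)
    // and three CPU workers (ids 1..3).
    //   GPU tasks: workerId = 0, 0, 0, ...        range [gpu, cpu)    = [0, 1)
    //   CPU tasks: workerId = 1, 2, 3, 1, 2, ...  range [cpu, maxCPU) = [1, 4)
    // spawnTaskManagerImpl passes numGPU = cpu - gpu (= 1 here) to the
    // Iterator, which is how execMultiDimIterator chooses the GPU path.
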
--- a/src/parallel_execution/context.h	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/context.h	Thu Sep 14 02:35:20 2017 +0900
@@ -97,6 +97,7 @@
     int odg;
     int maxOdg;
     int workerId;
+    int gpu; // GPU task
     struct Context* task;
     struct Queue* tasks;
 #ifdef USE_CUDAWorker
@@ -155,7 +156,8 @@
     struct TaskManagerImpl {
         enum Code next;
         int numWorker;
-        int sendWorkerIndex;
+        int sendCPUWorkerIndex;
+        int sendGPUWorkerIndex;
         int taskCount;
         pthread_mutex_t mutex;
         struct Queue* activeQueue;
@@ -316,6 +318,7 @@
     struct Iterator {
         union Data* iterator;
         struct Context* task;
+        int numGPU;
         enum Code exec;
         enum Code barrier;
         enum Code whenWait;
--- a/src/parallel_execution/cuda.c	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/cuda.c	Thu Sep 14 02:35:20 2017 +0900
@@ -11,8 +11,9 @@
 #include "helper_cuda.h"
 #include "pthread.h"
 
-// #include "context.h"
+#include "context.h"
 
+/*
 struct Context {
     int next;
     struct Worker* worker;
@@ -24,85 +25,99 @@
     long heapLimit;
     int dataNum;
     int idgCount; //number of waiting dataGear
+    int idg;
+    int maxIdg;
     int odg;
     int maxOdg;
     int workerId;
+    struct Context* task;
+    struct Queue* tasks;
     int num_exec;
     CUmodule module;
     CUfunction function;
     union Data **data;
+
+    // multi dimension parameter
+    int iterate;
+    struct Iterator* iterator;
 };
 
-    struct CUDAWorker {
-        CUdevice device;
-        CUcontext cuCtx;
-        pthread_t thread;
-        struct Context* context;
-        int id;
-        struct Queue* tasks;
-        int runFlag;
-        int next;
-        int num_stream;
-        CUstream *stream;
-    } CUDAWorker;
+struct CUDAWorker {
+    CUdevice device;
+    CUcontext cuCtx;
+    pthread_t thread;
+    struct Context* context;
+    int id;
+    struct Queue* tasks;
+    int runFlag;
+    int next;
+    int num_stream;
+    CUstream *stream;
+} CUDAWorker;
 
-    struct LoopCounter {
-        int i;
-    } LoopCounter;
+struct LoopCounter {
+    int i;
+} LoopCounter;
 
-    struct Array {
-        int size;
-        int index;
-        int prefix;
-        int* array;
-    } Array;
-
-
+struct Array {
+    int size;
+    int index;
+    int prefix;
+    int* array;
+} Array;
+*/
 
 void cudaInit(struct CUDAWorker *cudaWorker,int phase) {
     // initialize and load kernel
     cudaWorker->num_stream = 1; // number of stream
-//    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
-   if (phase==0)
-    checkCudaErrors(cuInit(0));
-   if (phase==0)
-    checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
-   if (phase==0)
-    checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
-//    if (cudaWorker->num_stream) {
-//        for (int i=0;i<cudaWorker->num_stream;i++)
-//            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
-//    }
+    //    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+    if (phase==0)
+        checkCudaErrors(cuInit(0));
+    if (phase==0)
+        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+    if (phase==0)
+        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
+    //    if (cudaWorker->num_stream) {
+    //        for (int i=0;i<cudaWorker->num_stream;i++)
+    //            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
+    //    }
+    printf("cuda Init: Done\n");
 }
 
 
-void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) {
+void CUDAExec(struct Context* context, struct Array* array) {
+    printf("cuda exec start\n");
     // Worker *worker = context->worker;
     // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
-  // memory allocate
+    // memory allocate
     CUdeviceptr devA;
-    CUdeviceptr devLoopCounter;
 
-    checkCudaErrors(cuMemAlloc(&devA, array->size));
-    checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
+    checkCudaErrors(cuMemAlloc(&devA, sizeof(int)*array->size));
 
     // Load the twice kernel if it is not already defined
-    checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx"));
+    checkCudaErrors(cuModuleLoad(&context->module, "c/examples/twice/CUDAtwice.ptx"));
     checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
 
     // Send the input DataGear to the GPU via a buffer
     // Synchronous data transfer (host to device)
-    checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter)));
     checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));
 
-  // Asynchronous launch kernel
-     context->num_exec = 1;
-     void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA};
-     checkCudaErrors(cuLaunchKernel(context->function,
-                       1, 1, 1,
-                       1, 1, 1,
-                       0, NULL, args, NULL));
+    // Asynchronous launch kernel
+    context->num_exec = 1;
+    void* args[] = {&devA};
+    if (context->iterate) {
+        struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
+        checkCudaErrors(cuLaunchKernel(context->function,
+                    iterator->x, iterator->y, iterator->z,
+                    1, 1, 1,
+                    0, NULL, args, NULL));
 
+    } else {
+        checkCudaErrors(cuLaunchKernel(context->function,
+                    1, 1, 1,
+                    1, 1, 1,
+                    0, NULL, args, NULL));
+    }
     // Issue the command that fetches the result
     // Wait for the command to finish
     checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));
@@ -112,7 +127,7 @@
 }
 
 void cudaShutdown( struct CUDAWorker *worker) {
-//    for (int i=0;i<worker->num_stream;i++)
-//        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
+    //    for (int i=0;i<worker->num_stream;i++)
+    //        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
     checkCudaErrors(cuCtxDestroy(worker->cuCtx));
 }
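
With the iterate path the grid itself now carries the iteration space: one block per element, one thread per block. A sketch of the correspondence for a one-dimensional iterator, assuming x = N and y = z = 1:

    // cuLaunchKernel(context->function,
    //                N, 1, 1,   // grid  = iterator->x/y/z : N blocks
    //                1, 1, 1,   // block = a single thread each
    //                0, NULL, args, NULL);
    // Inside the kernel blockIdx.x ranges over 0..N-1, so twice() indexes
    // the array directly; this is why the LoopCounter argument and the
    // per-thread loop could be dropped from CUDAtwice.cu below.
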
--- a/src/parallel_execution/examples/twice/CUDAtwice.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/examples/twice/CUDAtwice.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -2,7 +2,7 @@
 #include "../../../context.h"
 
 
-extern void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter);
+extern void CUDAExec(struct Context* context, Array* array);
 
 __code CUDAtwice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) {
     int i = loopCounter->i;
@@ -18,10 +18,9 @@
 }
 
 __code CUDAtwice_stub(struct Context* context) {
-printf("CUdAtwice stub\n");
-    struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter;
+printf("CUDAtwice stub\n");
     struct Array* array = &context->data[context->dataNum+1]->Array;
-    CUDAExec(context,array,loopCounter);
+    CUDAExec(context,array);
 
     // Follow the continuation back to the GPU worker
     goto meta(context, context->next);
--- a/src/parallel_execution/examples/twice/CUDAtwice.cu	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/examples/twice/CUDAtwice.cu	Thu Sep 14 02:35:20 2017 +0900
@@ -1,33 +1,8 @@
 extern "C" {
 
 #include <stdio.h>
-
-//    __global__ void twice(struct LoopCounter* loopCounter, int prefix ,int* array) {
-//         int index = blockIdx.x * blockDim.x + threadIdx.x;
-//         printf("array %p, blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n");
-//         int i = 0;
-//         while (i < prefix) {
-//              array[i+index*prefix] = array[i+index*prefix]*2;
-//         }
-//    }
-
-    struct LoopCounter {
-        int i;
-    } LoopCounter;
-
-    __global__ void twice(struct LoopCounter* loopCounter, int index, int prefix, int* array) {
-         printf("array %p, index = %d, prefix = %d loopCounter->i %d\n",array,index,prefix,loopCounter->i);
-C_twice:
-        int i = loopCounter->i;
-        if (i < prefix) {
-            array[i+index*prefix] = array[i+index*prefix]*2;
-            loopCounter->i++;
-
-            goto C_twice;
-        }
-
-        loopCounter->i = 0;
+    __global__ void twice(int* array) {
+        printf("array %p",array);
+        array[blockIdx.x] = array[blockIdx.x]*2;
     }
-
-
 }
--- a/src/parallel_execution/examples/twice/createArray.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/examples/twice/createArray.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -12,6 +12,7 @@
     output->array  = array_ptr;
     output->size   = length;
     *O_output = output;
+    printf("created Array\n");
     goto meta(context, context->next);
 }
 
--- a/src/parallel_execution/examples/twice/main.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/examples/twice/main.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -98,7 +98,7 @@
 
     par goto createArray(array, __exit);
 
-    par goto twice(array, iterate(split), __exit);
+    par goto CUDAtwice(array, iterate(split), __exit);
     goto code2();
 }
 
--- a/src/parallel_execution/generate_context.pl	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/generate_context.pl	Thu Sep 14 02:35:20 2017 +0900
@@ -139,6 +139,8 @@
         $code_init .=  "    context->code[C_${code}]    = ${code}_stub;\n";
     }
 
+    my $data_num = keys(%dataGear);
+    $data_num++;
 my $context_c = << "EOFEOF";
 #include <stdlib.h>
 
@@ -155,7 +157,7 @@
 $code_init
 
 #include "dataGearInit.c"
-    context->dataNum = D_Worker;
+    context->dataNum = $data_num;
     context->tasks = createSingleLinkedQueue(context);
 }
 EOFEOF
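
The generated initContext now sizes dataNum from the scanned data gears instead of pinning it to the enum value D_Worker, presumably so that per-task data appended at dataNum and dataNum+1 lands past every shared data gear. With a hypothetical count of 42 entries in %dataGear, the emitted fragment would read:

    // context.c fragment emitted by generate_context.pl (hypothetical count):
    context->dataNum = 43;
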
--- a/src/parallel_execution/main.cbc	Wed Sep 06 22:01:27 2017 +0900
+++ b/src/parallel_execution/main.cbc	Thu Sep 14 02:35:20 2017 +0900
@@ -13,21 +13,6 @@
 int CPU_ANY = -1;
 int CPU_CUDA = -1;
 
-void print_queue(struct Element* element) {
-    while (element) {
-        printf("%p\n", ((struct Task *)(element->data)));
-        element = element->next;
-    }
-}
-
-void print_tree(struct Node* node) {
-    if (node != 0) {
-        printf("%d\n", node->value->Array.index);
-        print_tree(node->left);
-        print_tree(node->right);
-    }
-}
-
 void *start_taskManager(struct Context *context) {
     goto initDataGears(context, Gearef(context, LoopCounter), Gearef(context, TaskManager));
     return 0;
@@ -48,7 +33,7 @@
     while(! cuda_initialized) {};
 #endif
 #endif
-    goto meta(context, C_createTask1);
+    goto meta(context, C_code1);
 }
 
 __code initDataGears_stub(struct Context* context) {
@@ -81,25 +66,25 @@
     /* puts("tree"); */
     /* print_tree(context->data[Tree]->tree.root); */
     /* puts("result"); */
-
-    time->next = C_code2;
-    goto meta(context, C_code2);
-    //goto meta(context, C_exit_code);
-    //goto meta(context, C_start_time);
+    time->time = (union Data*)createTimeImpl(context);
+    time->next = C_createTask1;
+    goto meta(context, time->time->Time.start);
 }
 
-__code code1_stub(struct Context* context) {
-    goto code1(context, Gearef(context, Time));
+__code code2(struct Time* time, struct TaskManager* taskManager) {
+    time->next = C_code3;
+    taskManager->next = time->time->Time.end;
+    goto meta(context, taskManager->taskManager->TaskManager.shutdown);
 }
 
-__code code2(struct LoopCounter* loopCounter) {
+__code code3(struct LoopCounter* loopCounter) {
     int i = loopCounter->i;
 
     if (i < length) {
         //printf("%d\n", array_ptr[i]);
         if (array_ptr[i] == (i*2)) {
             loopCounter->i++;
-            goto meta(context, C_code2);
+            goto meta(context, C_code3);
         } else
             puts("wrong result");
 
@@ -109,57 +94,12 @@
 }
 
 __code createTask1(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
-    int i = loopCounter->i;
-
-    if ((length/split*i) < length) {
-        taskManager->next = C_createTask2;
-        goto meta(context, taskManager->taskManager->TaskManager.createTask);
-    }
-
-    loopCounter->i = 0;
-    taskManager->next = C_code1;
-#if ( defined(USE_CUDAWorker) && defined(USE_CUDA_MAIN_THREAD))
-sleep(5);
-#endif
-    goto meta(context, taskManager->taskManager->TaskManager.shutdown);
-}
+    Array* array = &ALLOCATE_DATA_GEAR(context, Array)->Array;
 
-__code createTask2(LoopCounter* loopCounter, TaskManager* taskManager,struct Context* task, LoopCounter* loopCounter2, Array* array) {
-    int i = loopCounter->i;
-    array->index = i;
-    array->prefix = length/split;
-    array->array = array_ptr;
-    array->size = length;
-    loopCounter2->i = 0;
-    task->idgCount = 0;
-    if (gpu_num) {
-#ifdef USE_CUDAWorker
-        task->next = C_CUDAtwice;
-        task->workerId = CPU_CUDA;
-#else
-        task->next = C_twice;
-#endif
-    } else {
-        task->next = C_twice;
-    }
-    task->data[task->dataNum] = (union Data*)loopCounter2;
-    task->data[task->dataNum+1] = (union Data*)array;
-    task->odg = task->dataNum + 2;
-    task->maxOdg = task->odg;
-    taskManager->next = C_createTask1;
-    loopCounter->i++;
-    goto meta(context, taskManager->taskManager->TaskManager.spawn);
-}
+    par goto createArray(array, __exit);
 
-__code createTask2_stub(struct Context* context) {
-    LoopCounter* loopCounter = &ALLOCATE(context, LoopCounter)->LoopCounter;
-    Array* array = &ALLOCATE(context, Array)->Array;
-    goto createTask2(context,
-            Gearef(context, LoopCounter),
-            Gearef(context, TaskManager),
-            Gearef(context, TaskManager)->context,
-            loopCounter,
-            array);
+    par goto twice(array, iterate(split), __exit);
+    goto code2();
 }
 
 void init(int argc, char** argv) {
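
The resulting main flow, sketched as the chain the code gears above now form:

    // initDataGears -> code1 : create TimeImpl, Time.start, then createTask1
    // createTask1           : par goto createArray(array, __exit);
    //                         par goto twice(array, iterate(split), __exit);
    //                         goto code2();
    // code2                 : TaskManager.shutdown, completion continues at Time.end
    // code3                 : check array_ptr[i] == i*2 for every i < length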