changeset 436:08a93fc2f0d3

Fix CUDAExecutor (still not working)
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Sat, 04 Nov 2017 06:52:32 +0900
parents af0ec811b20e
children 2c1b1d56bf1e
files src/parallel_execution/CMakeLists.txt src/parallel_execution/CUDAExecutor.cbc src/parallel_execution/CUDAWorker.cbc src/parallel_execution/context.h src/parallel_execution/cuda.c src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc src/parallel_execution/generate_context.pl
diffstat 7 files changed, 57 insertions(+), 126 deletions(-) [+]
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt	Sat Nov 04 04:14:36 2017 +0900
+++ b/src/parallel_execution/CMakeLists.txt	Sat Nov 04 06:52:32 2017 +0900
@@ -91,7 +91,7 @@
       TARGET
         CUDAbitonicSort
       SOURCES 
-        examples/bitonicSort/bitonicSort.cbc examples/bitonicSort/bitonicSwap.cbc examples/bitonicSort/CUDAbitonicSwap.cu examples/bitonicSort/makeArray.cbc examples/bitonicSort/printArray.cbc CPUWorker.cbc CUDAWorker.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc cuda.c MultiDimIterator.cbc TimeImpl.cbc
+        examples/bitonicSort/bitonicSort.cbc examples/bitonicSort/bitonicSwap.cbc examples/bitonicSort/CUDAbitonicSwap.cu examples/bitonicSort/makeArray.cbc examples/bitonicSort/printArray.cbc CPUWorker.cbc CUDAWorker.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc cuda.c MultiDimIterator.cbc TimeImpl.cbc CUDAExecutor.cbc
     )
     set_target_properties(CUDAbitonicSort PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1")
 endif()
--- a/src/parallel_execution/CUDAExecutor.cbc	Sat Nov 04 04:14:36 2017 +0900
+++ b/src/parallel_execution/CUDAExecutor.cbc	Sat Nov 04 06:52:32 2017 +0900
@@ -5,7 +5,8 @@
 #include <driver_types.h>
 #include <cuda_runtime.h>
 #include <cuda.h>
-#include "helper_cuda.h"
+#include "../helper_cuda.h"
+#include "pthread.h"
 
 Executor* createCUDAExecutor(struct Context* context) {
     struct Executor* executor = new Executor();
@@ -17,57 +18,57 @@
     return executor;
 }
 
-__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
+    struct CUDABuffer* buffer = executor->buffer;
     int paramLen = buffer->inputLen + buffer->outputLen;
-    struct CUDABuffer buffer = executor->buffer;
-    buffer->kernelParams = ALLOCATE_PTR_ARRAY(context, CudevicePtr, paramLen);
-    struct CUDABuffer buffer = executor->buffer;
-    CUdeviceptr* deviceptrs = ALLOCATE_ARRAY(context, CudevicePtr, paramLen);
+    executor->kernelParams = (CUdeviceptr**)ALLOCATE_PTR_ARRAY(context, CUdeviceptr, paramLen);
+    CUdeviceptr* deviceptrs = (CUdeviceptr*)ALLOCATE_ARRAY(context, CUdeviceptr, paramLen);
     for (int i = 0; i < paramLen; i++) {
-        CUdeviceptr deviceptr = deviceptrs[i];
-        // memory allocate
-        union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen];
-        checkCUDAErrors(cuMemAlloc(deviceptr, GET_SIZE(data)));
-        checkCUDAErrors(cuMemcpyHtoD(deviceptr, data, GET_SIZE(data)));
+        // allocate into the deviceptrs[i] slot (not a loop-local copy): kernelParams keeps this address for the exec/write steps
+        CUdeviceptr* deviceptr = &deviceptrs[i];
+        union Data* data = i < buffer->inputLen? buffer->inputData[i] : buffer->outputData[i-buffer->inputLen];
+        checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(data)));
+        checkCudaErrors(cuMemcpyHtoD(*deviceptr, data, GET_SIZE(data)));
         // Synchronous data transfer(host to device)
-        buffer->kernelParams[paramCount++] = &deviceptr;
+        executor->kernelParams[i] = deviceptr;
     }
+    // TODO: Implements pipeline
+    // goto next(...);
+    goto meta(context, C_execCUDAExecutor);
 }
 
-void cudaLoadFunction(struct Context* context, char* filename, char* function) {
-    checkCUDAErrors(cuModuleLoad(&context->module, filename));
-    checkCUDAErrors(cuModuleGetFunction(&context->function, context->module, function));
-}
-
-__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
     // Asynchronous launch kernel
     task->num_exec = 1;
-    struct CUDABuffer buffer = executor->buffer;
     if (task->iterate) {
         struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
-        checkCUDAErrors(cuLaunchKernel(task->function,
+        checkCudaErrors(cuLaunchKernel(task->function,
                     iterator->x, iterator->y, iterator->z,
                     1, 1, 1,
-                    0, NULL, (void**)buffer->kernelParams, NULL));
+                    0, NULL, (void**)executor->kernelParams, NULL));
     } else {
-        checkCUDAErrors(cuLaunchKernel(task->function,
+        checkCudaErrors(cuLaunchKernel(task->function,
                     1, 1, 1,
                     1, 1, 1,
-                    0, NULL, (void**)buffer->kernelParams, NULL));
+                    0, NULL, (void**)executor->kernelParams, NULL));
     }
+    // TODO: Implements pipeline
+    // goto next(...);
+    goto meta(context, C_writeCUDAExecutor);
 }
 
-__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
-    //結果を取ってくるコマンドを入力する
-    //コマンドの終了待ちを行う   
+    // Issue the commands that fetch the results back from the device,
+    // then wait for those commands to finish.
+    struct CUDABuffer* buffer = executor->buffer;
     int paramLen = buffer->inputLen + buffer->outputLen;
-    struct CUDABuffer buffer = executor->buffer;
     for (int i = 0; i < paramLen; i++) {
-        CUdeviceptr* deviceptr =  buffer->kernelParams[i];
-        union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen];
-        checkCUDAErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data)));
+        CUdeviceptr* deviceptr =  executor->kernelParams[i];
+        union Data* data = i < buffer->inputLen? buffer->inputData[i] : buffer->outputData[i-buffer->inputLen];
+        checkCudaErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data)));
-        cuMemFree(*deviceptr);
+        checkCudaErrors(cuMemFree(*deviceptr));
     }
     // wait for stream
-    checkCUDAErrors(cuCtxSynchronize());
+    checkCudaErrors(cuCtxSynchronize());
+    goto next(...);
 }
--- a/src/parallel_execution/CUDAWorker.cbc	Sat Nov 04 04:14:36 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc	Sat Nov 04 06:52:32 2017 +0900
@@ -4,11 +4,6 @@
 
 static void startCUDAWorker(Worker* worker);
 
-#ifndef USE_CUDA_MAIN_THREAD
-volatile 
-#endif
-int cuda_initialized = 0;
-
 Worker* createCUDAWorker(struct Context* context, int id, Queue* queue, TaskManagerImpl *im) {
     struct Worker* worker = new Worker();
     struct CUDAWorker* cudaWorker = new CUDAWorker();
@@ -17,23 +12,16 @@
     cudaWorker->id = id;
     worker->taskReceive = C_taskReceiveCUDAWorker;
     worker->shutdown = C_shutdownCUDAWorker;
-#ifndef USE_CUDA_MAIN_THREAD
     pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&startCUDAWorker, worker);
-#else
-    if (im) {
-        im->workers[0] = worker;
-    }
-    cuda_initialized = 1;
-    startCUDAWorker(worker);
-#endif
     return worker;
 }
 
 static void startCUDAWorker(Worker* worker) {
     struct CUDAWorker* cudaWorker = &worker->worker->CUDAWorker;
     cudaInit(cudaWorker, 0);
-    cudaWorker->context = NEW(struct Context);
+    cudaWorker->context  = NEW(struct Context);
     initContext(cudaWorker->context);
+    cudaWorker->executor = createCUDAExecutor(cudaWorker->context);
     Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker;
     goto meta(cudaWorker->context, worker->taskReceive);
 }
--- a/src/parallel_execution/context.h	Sat Nov 04 04:14:36 2017 +0900
+++ b/src/parallel_execution/context.h	Sat Nov 04 06:52:32 2017 +0900
@@ -198,6 +198,7 @@
         int runFlag;
         enum Code next;
         int num_stream;
+        struct Executor* executor;
         CUstream *stream;
     } CUDAWorker;
 #else
@@ -346,7 +347,7 @@
         int z;
     } MultiDim;
     struct Executor {
-        struct Executor* executor;
+        union Data* executor;
         struct Context* task;
         enum Code read;
         enum Code exec;
@@ -355,10 +356,10 @@
     } Executor;
 #ifdef USE_CUDAWorker
     struct CUDAExecutor {
-        void** kernelParams;
-        CUDABuffer* buffer;
+        CUdeviceptr** kernelParams;
+        struct CUDABuffer* buffer;
     } CUDAExecutor;
-    CudevicePtr CudevicePtr;
+    CUdeviceptr CUdeviceptr;
 #else
     struct CUDAExecutor {
     } CUDAExecutor;
--- a/src/parallel_execution/cuda.c	Sat Nov 04 04:14:36 2017 +0900
+++ b/src/parallel_execution/cuda.c	Sat Nov 04 06:52:32 2017 +0900
@@ -84,82 +84,12 @@
     printf("cuda Init: Done\n");
 }
 
-void cudaRead(struct CudaBuffer* buffer) {
-    buffer->kernelParams = (void **)calloc(buffer->inputLen + buffer->outputLen, sizeof(void *));
-    int paramCount = 0;
-    for (int i = 0; i < buffer->inputLen; i++) {
-        CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr));
-        // memory allocate
-        checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(buffer->inputData[i])));
-        // Synchronous data transfer(host to device)
-        checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->inputData[i], GET_SIZE(buffer->inputData[i])));
-        buffer->kernelParams[paramCount++] = deviceptr;
-    }
-
-    for (int i = 0; i < buffer->outputLen; i++) {
-        CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr));
-        // memory allocate
-        checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(buffer->outputData[i])));
-        // Synchronous data transfer(host to device)
-        checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->outputData[i], GET_SIZE(buffer->outputData[i])));
-        buffer->kernelParams[paramCount++] = deviceptr;
-    }
-}
-
 void cudaLoadFunction(struct Context* context, char* filename, char* function) {
     checkCudaErrors(cuModuleLoad(&context->module, filename));
     checkCudaErrors(cuModuleGetFunction(&context->function, context->module, function));
 }
 
-void cudaExec2(struct Context* context, struct CudaBuffer* buffer) {
-    // Asynchronous launch kernel
-    context->num_exec = 1;
-    if (context->iterate) {
-        struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
-        checkCudaErrors(cuLaunchKernel(context->function,
-                    iterator->x/1024, iterator->y, iterator->z,
-                    1024, 1, 1,
-                    0, NULL, buffer->kernelParams, NULL));
-
-    } else {
-        checkCudaErrors(cuLaunchKernel(context->function,
-                    1, 1, 1,
-                    1, 1, 1,
-                    0, NULL, buffer->kernelParams, NULL));
-    }
-}
-
-void cudaWrite(struct CudaBuffer* buffer) {
-    //結果を取ってくるコマンドを入力する
-    //コマンドの終了待ちを行う   
-    int paramCount = 0;
-    for (int i = 0; i < buffer->inputLen; i++) {
-        CUdeviceptr* deviceptr =  buffer->kernelParams[paramCount++];
-        checkCudaErrors(cuMemcpyDtoH(buffer->inputData[i], *deviceptr, GET_SIZE(buffer->inputData[i])));
-        cuMemFree(*deviceptr);
-        free(deviceptr);
-    }
-
-    for (int i = 0; i < buffer->outputLen; i++) {
-        CUdeviceptr* deviceptr =  buffer->kernelParams[paramCount++];
-        checkCudaErrors(cuMemcpyDtoH(buffer->outputData[i], *deviceptr, GET_SIZE(buffer->outputData[i])));
-        cuMemFree(*deviceptr);
-        free(deviceptr);
-    }
-    free(buffer->kernelParams);
-    // wait for stream
-    checkCudaErrors(cuCtxSynchronize());
-}
-
-void cudaExec(struct Context* context, struct CudaBuffer* buffer, char* filename, char* function) {
-    // カーネルが定義されてなければそれをロードする
-    cudaLoadFunction(context, filename, function);
-    cudaRead(buffer);
-    cudaExec2(context, buffer);
-    cudaWrite(buffer);
-}
-
-void cudaShutdown( struct CUDAWorker *worker) {
+void cudaShutdown(struct CUDAWorker *worker) {
     //    for (int i=0;i<worker->num_stream;i++)
     //        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
     checkCudaErrors(cuCtxDestroy(worker->cuCtx));
--- a/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc	Sat Nov 04 04:14:36 2017 +0900
+++ b/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc	Sat Nov 04 06:52:32 2017 +0900
@@ -1,6 +1,10 @@
 #include "../../../context.h"
 #include <stdio.h>
 
+#ifdef USE_CUDAWorker
+extern void cudaLoadFunction(struct Context* context, char* filename, char* function);
+#endif
+
 __code bitonicSwap(struct SortArray* inputArray, struct MultiDim* multiDim, __code next(struct SortArray* output, ...), struct LoopCounter* loopCounter) {
     struct SortArray* output = *O_output;
     int block = inputArray->block;
@@ -28,19 +32,24 @@
 __code bitonicSwap_stub(struct Context* context) {
 #ifdef USE_CUDAWorker
     if (context->gpu) {
-        struct SortArray* inputSortArray  = &context->data[context->idg]->SortArray;
-        struct SortArray* outputSortArray = &context->data[context->odg]->SortArray;
-        struct CudaBuffer* buffer = new CudaBuffer();
+        SortArray* inputSortArray  = &context->data[context->idg]->SortArray;
+        SortArray* outputSortArray = &context->data[context->odg]->SortArray;
+        CUDABuffer* buffer = &ALLOCATE(context, CUDABuffer)->CUDABuffer;
         buffer->inputData = (union Data**)ALLOCATE_PTR_ARRAY(context, SortArray, 2);
         buffer->inputData[0] = (union Data*)inputSortArray->array;
         buffer->inputData[1] = (union Data*)inputSortArray;
         buffer->outputData = NULL;
         buffer->inputLen = 2;
         buffer->outputLen = 0;
-        cudaExec(context, buffer, "c/examples/bitonicSort/CUDAbitonicSwap.ptx", "bitonicSwap");
         //continuationにそってGPUworkerに戻る
         outputSortArray->array = inputSortArray->array;
-        goto meta(context, context->next);
+        Executor* executor = context->worker->worker->CUDAWorker.executor;
+        executor->executor->CUDAExecutor.buffer = buffer;
+        cudaLoadFunction(context, "c/examples/bitonicSort/CUDAbitonicSwap.ptx", "bitonicSwap");
+        Gearef(context, Executor)->executor = (union Data*)executor;
+        Gearef(context, Executor)->task = context;
+        Gearef(context, Executor)->next = context->next;
+        goto meta(context, executor->read);
     }
 #endif
     SortArray** O_output = (struct SortArray **)&context->data[context->odg];
--- a/src/parallel_execution/generate_context.pl	Sat Nov 04 04:14:36 2017 +0900
+++ b/src/parallel_execution/generate_context.pl	Sat Nov 04 06:52:32 2017 +0900
@@ -106,11 +106,11 @@
         }
         last if (/union Data end/);
         if (/struct (\w+) \{/) {
-            $dataGear{$1} = $1;
+            $dataGear{$1} = 'struct';
         } elsif (/^\s{4}(\w+) (\w+);/) { # primitive type
-            $dataGear{$1} = $1;
+            $dataGear{$1} = 'primitive';
         }
-        $dataGear{"Context"} = "Context";
+        $dataGear{"Context"} = "struct";
     }
 }
 
@@ -211,7 +211,9 @@
 
 open my $fd,">","$ddir/typedefData.h" or die("can't open $ddir/typedefData.h $!");
 for my $data ( sort keys %dataGear ) {
-    print $fd "typedef struct ${data} ${data};\n";
+    if ($dataGear{$data} eq 'struct') {
+        print $fd "typedef struct ${data} ${data};\n";
+    }
 }
 
 open my $fd,">","$ddir/dataGearInit.c" or die("can't open $ddir/dataGearInit.c $!");