Mercurial > hg > Gears > GearsAgda
changeset 436:08a93fc2f0d3
Fix CUDAExecutor, but it does not work yet
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sat, 04 Nov 2017 06:52:32 +0900 |
parents | af0ec811b20e |
children | 2c1b1d56bf1e |
files | src/parallel_execution/CMakeLists.txt src/parallel_execution/CUDAExecutor.cbc src/parallel_execution/CUDAWorker.cbc src/parallel_execution/context.h src/parallel_execution/cuda.c src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc src/parallel_execution/generate_context.pl |
diffstat | 7 files changed, 57 insertions(+), 126 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt Sat Nov 04 04:14:36 2017 +0900 +++ b/src/parallel_execution/CMakeLists.txt Sat Nov 04 06:52:32 2017 +0900 @@ -91,7 +91,7 @@ TARGET CUDAbitonicSort SOURCES - examples/bitonicSort/bitonicSort.cbc examples/bitonicSort/bitonicSwap.cbc examples/bitonicSort/CUDAbitonicSwap.cu examples/bitonicSort/makeArray.cbc examples/bitonicSort/printArray.cbc CPUWorker.cbc CUDAWorker.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc cuda.c MultiDimIterator.cbc TimeImpl.cbc + examples/bitonicSort/bitonicSort.cbc examples/bitonicSort/bitonicSwap.cbc examples/bitonicSort/CUDAbitonicSwap.cu examples/bitonicSort/makeArray.cbc examples/bitonicSort/printArray.cbc CPUWorker.cbc CUDAWorker.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc cuda.c MultiDimIterator.cbc TimeImpl.cbc CudaExecutor.cbc ) set_target_properties(CUDAbitonicSort PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1") endif()
--- a/src/parallel_execution/CUDAExecutor.cbc Sat Nov 04 04:14:36 2017 +0900 +++ b/src/parallel_execution/CUDAExecutor.cbc Sat Nov 04 06:52:32 2017 +0900 @@ -5,7 +5,8 @@ #include <driver_types.h> #include <cuda_runtime.h> #include <cuda.h> -#include "helper_cuda.h" +#include "../helper_cuda.h" +#include "pthread.h" Executor* createCUDAExecutor(struct Context* context) { struct Executor* executor = new Executor(); @@ -17,57 +18,57 @@ return executor; } -__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) { +__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) { + struct CUDABuffer* buffer = executor->buffer; int paramLen = buffer->inputLen + buffer->outputLen; - struct CUDABuffer buffer = executor->buffer; - buffer->kernelParams = ALLOCATE_PTR_ARRAY(context, CudevicePtr, paramLen); - struct CUDABuffer buffer = executor->buffer; - CUdeviceptr* deviceptrs = ALLOCATE_ARRAY(context, CudevicePtr, paramLen); + executor->kernelParams = (CUdeviceptr**)ALLOCATE_PTR_ARRAY(context, CUdeviceptr, paramLen); + CUdeviceptr* deviceptrs = (CUdeviceptr*)ALLOCATE_ARRAY(context, CUdeviceptr, paramLen); for (int i = 0; i < paramLen; i++) { CUdeviceptr deviceptr = deviceptrs[i]; // memory allocate - union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen]; - checkCUDAErrors(cuMemAlloc(deviceptr, GET_SIZE(data))); - checkCUDAErrors(cuMemcpyHtoD(deviceptr, data, GET_SIZE(data))); + union Data* data = i < buffer->inputLen? 
buffer->inputData[i] : buffer->outputData[i-buffer->inputLen]; + checkCudaErrors(cuMemAlloc(&deviceptr, GET_SIZE(data))); + checkCudaErrors(cuMemcpyHtoD(deviceptr, data, GET_SIZE(data))); // Synchronous data transfer(host to device) - buffer->kernelParams[paramCount++] = &deviceptr; + executor->kernelParams[i] = &deviceptr; } + // TODO: Implements pipeline + // goto next(...); + goto meta(context, C_execCUDAExecutor); } -void cudaLoadFunction(struct Context* context, char* filename, char* function) { - checkCUDAErrors(cuModuleLoad(&context->module, filename)); - checkCUDAErrors(cuModuleGetFunction(&context->function, context->module, function)); -} - -__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) { +__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) { // Asynchronous launch kernel task->num_exec = 1; - struct CUDABuffer buffer = executor->buffer; if (task->iterate) { struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator; - checkCUDAErrors(cuLaunchKernel(task->function, + checkCudaErrors(cuLaunchKernel(task->function, iterator->x, iterator->y, iterator->z, 1, 1, 1, - 0, NULL, (void**)buffer->kernelParams, NULL)); + 0, NULL, (void**)executor->kernelParams, NULL)); } else { - checkCUDAErrors(cuLaunchKernel(task->function, + checkCudaErrors(cuLaunchKernel(task->function, 1, 1, 1, 1, 1, 1, - 0, NULL, (void**)buffer->kernelParams, NULL)); + 0, NULL, (void**)executor->kernelParams, NULL)); } + // TODO: Implements pipeline + // goto next(...); + goto meta(context, C_writeCUDAExecutor); } -__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) { +__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) { //結果を取ってくるコマンドを入力する //コマンドの終了待ちを行う + struct CUDABuffer* buffer = executor->buffer; int paramLen = buffer->inputLen + buffer->outputLen; - struct CUDABuffer buffer = executor->buffer; for (int i = 0; i < paramLen; 
i++) { - CUdeviceptr* deviceptr = buffer->kernelParams[i]; - union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen]; - checkCUDAErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data))); + CUdeviceptr* deviceptr = executor->kernelParams[i]; + union Data* data = i < buffer->inputLen? buffer->inputData[i] : buffer->outputData[i-buffer->inputLen]; + checkCudaErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data))); cuMemFree(*deviceptr); } // wait for stream - checkCUDAErrors(cuCtxSynchronize()); + checkCudaErrors(cuCtxSynchronize()); + goto next(...); }
--- a/src/parallel_execution/CUDAWorker.cbc Sat Nov 04 04:14:36 2017 +0900 +++ b/src/parallel_execution/CUDAWorker.cbc Sat Nov 04 06:52:32 2017 +0900 @@ -4,11 +4,6 @@ static void startCUDAWorker(Worker* worker); -#ifndef USE_CUDA_MAIN_THREAD -volatile -#endif -int cuda_initialized = 0; - Worker* createCUDAWorker(struct Context* context, int id, Queue* queue, TaskManagerImpl *im) { struct Worker* worker = new Worker(); struct CUDAWorker* cudaWorker = new CUDAWorker(); @@ -17,23 +12,16 @@ cudaWorker->id = id; worker->taskReceive = C_taskReceiveCUDAWorker; worker->shutdown = C_shutdownCUDAWorker; -#ifndef USE_CUDA_MAIN_THREAD pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&startCUDAWorker, worker); -#else - if (im) { - im->workers[0] = worker; - } - cuda_initialized = 1; - startCUDAWorker(worker); -#endif return worker; } static void startCUDAWorker(Worker* worker) { struct CUDAWorker* cudaWorker = &worker->worker->CUDAWorker; cudaInit(cudaWorker, 0); - cudaWorker->context = NEW(struct Context); + cudaWorker->context = NEW(struct Context); initContext(cudaWorker->context); + cudaWorker->executor = createCUDAExecutor(cudaWorker->context); Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker; goto meta(cudaWorker->context, worker->taskReceive); }
--- a/src/parallel_execution/context.h Sat Nov 04 04:14:36 2017 +0900 +++ b/src/parallel_execution/context.h Sat Nov 04 06:52:32 2017 +0900 @@ -198,6 +198,7 @@ int runFlag; enum Code next; int num_stream; + struct Executor* executor; CUstream *stream; } CUDAWorker; #else @@ -346,7 +347,7 @@ int z; } MultiDim; struct Executor { - struct Executor* executor; + union Data* executor; struct Context* task; enum Code read; enum Code exec; @@ -355,10 +356,10 @@ } Executor; #ifdef USE_CUDAWorker struct CUDAExecutor { - void** kernelParams; - CUDABuffer* buffer; + CUdeviceptr** kernelParams; + struct CUDABuffer* buffer; } CUDAExecutor; - CudevicePtr CudevicePtr; + CUdeviceptr CUdeviceptr; #else struct CUDAExecutor { } CUDAExecutor;
--- a/src/parallel_execution/cuda.c Sat Nov 04 04:14:36 2017 +0900 +++ b/src/parallel_execution/cuda.c Sat Nov 04 06:52:32 2017 +0900 @@ -84,82 +84,12 @@ printf("cuda Init: Done\n"); } -void cudaRead(struct CudaBuffer* buffer) { - buffer->kernelParams = (void **)calloc(buffer->inputLen + buffer->outputLen, sizeof(void *)); - int paramCount = 0; - for (int i = 0; i < buffer->inputLen; i++) { - CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr)); - // memory allocate - checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(buffer->inputData[i]))); - // Synchronous data transfer(host to device) - checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->inputData[i], GET_SIZE(buffer->inputData[i]))); - buffer->kernelParams[paramCount++] = deviceptr; - } - - for (int i = 0; i < buffer->outputLen; i++) { - CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr)); - // memory allocate - checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(buffer->outputData[i]))); - // Synchronous data transfer(host to device) - checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->outputData[i], GET_SIZE(buffer->outputData[i]))); - buffer->kernelParams[paramCount++] = deviceptr; - } -} - void cudaLoadFunction(struct Context* context, char* filename, char* function) { checkCudaErrors(cuModuleLoad(&context->module, filename)); checkCudaErrors(cuModuleGetFunction(&context->function, context->module, function)); } -void cudaExec2(struct Context* context, struct CudaBuffer* buffer) { - // Asynchronous launch kernel - context->num_exec = 1; - if (context->iterate) { - struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator; - checkCudaErrors(cuLaunchKernel(context->function, - iterator->x/1024, iterator->y, iterator->z, - 1024, 1, 1, - 0, NULL, buffer->kernelParams, NULL)); - - } else { - checkCudaErrors(cuLaunchKernel(context->function, - 1, 1, 1, - 1, 1, 1, - 0, NULL, buffer->kernelParams, NULL)); - } -} - -void cudaWrite(struct CudaBuffer* buffer) { - 
//結果を取ってくるコマンドを入力する - //コマンドの終了待ちを行う - int paramCount = 0; - for (int i = 0; i < buffer->inputLen; i++) { - CUdeviceptr* deviceptr = buffer->kernelParams[paramCount++]; - checkCudaErrors(cuMemcpyDtoH(buffer->inputData[i], *deviceptr, GET_SIZE(buffer->inputData[i]))); - cuMemFree(*deviceptr); - free(deviceptr); - } - - for (int i = 0; i < buffer->outputLen; i++) { - CUdeviceptr* deviceptr = buffer->kernelParams[paramCount++]; - checkCudaErrors(cuMemcpyDtoH(buffer->outputData[i], *deviceptr, GET_SIZE(buffer->outputData[i]))); - cuMemFree(*deviceptr); - free(deviceptr); - } - free(buffer->kernelParams); - // wait for stream - checkCudaErrors(cuCtxSynchronize()); -} - -void cudaExec(struct Context* context, struct CudaBuffer* buffer, char* filename, char* function) { - // カーネルが定義されてなければそれをロードする - cudaLoadFunction(context, filename, function); - cudaRead(buffer); - cudaExec2(context, buffer); - cudaWrite(buffer); -} - -void cudaShutdown( struct CUDAWorker *worker) { +void cudaShutdown(struct CUDAWorker *worker) { // for (int i=0;i<worker->num_stream;i++) // checkCudaErrors(cuStreamDestroy(worker->stream[i])); checkCudaErrors(cuCtxDestroy(worker->cuCtx));
--- a/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc Sat Nov 04 04:14:36 2017 +0900 +++ b/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc Sat Nov 04 06:52:32 2017 +0900 @@ -1,6 +1,10 @@ #include "../../../context.h" #include <stdio.h> +#ifdef USE_CUDAWorker +extern void cudaLoadFunction(struct Context* context, char* filename, char* function); +#endif + __code bitonicSwap(struct SortArray* inputArray, struct MultiDim* multiDim, __code next(struct SortArray* output, ...), struct LoopCounter* loopCounter) { struct SortArray* output = *O_output; int block = inputArray->block; @@ -28,19 +32,24 @@ __code bitonicSwap_stub(struct Context* context) { #ifdef USE_CUDAWorker if (context->gpu) { - struct SortArray* inputSortArray = &context->data[context->idg]->SortArray; - struct SortArray* outputSortArray = &context->data[context->odg]->SortArray; - struct CudaBuffer* buffer = new CudaBuffer(); + SortArray* inputSortArray = &context->data[context->idg]->SortArray; + SortArray* outputSortArray = &context->data[context->odg]->SortArray; + CUDABuffer* buffer = &ALLOCATE(context, CUDABuffer)->CUDABuffer; buffer->inputData = (union Data**)ALLOCATE_PTR_ARRAY(context, SortArray, 2); buffer->inputData[0] = (union Data*)inputSortArray->array; buffer->inputData[1] = (union Data*)inputSortArray; buffer->outputData = NULL; buffer->inputLen = 2; buffer->outputLen = 0; - cudaExec(context, buffer, "c/examples/bitonicSort/CUDAbitonicSwap.ptx", "bitonicSwap"); //continuationにそってGPUworkerに戻る outputSortArray->array = inputSortArray->array; - goto meta(context, context->next); + Executor* executor = context->worker->worker->CUDAWorker.executor; + executor->executor->CUDAExecutor.buffer = buffer; + cudaLoadFunction(context, "c/examples/bitonicSort/CUDAbitonicSwap.ptx", "bitonicSwap"); + Gearef(context, Executor)->executor = (union Data*)executor; + Gearef(context, Executor)->task = context; + Gearef(context, Executor)->next = context->next; + goto meta(context, 
executor->read); } #endif SortArray** O_output = (struct SortArray **)&context->data[context->odg];
--- a/src/parallel_execution/generate_context.pl Sat Nov 04 04:14:36 2017 +0900 +++ b/src/parallel_execution/generate_context.pl Sat Nov 04 06:52:32 2017 +0900 @@ -106,11 +106,11 @@ } last if (/union Data end/); if (/struct (\w+) \{/) { - $dataGear{$1} = $1; + $dataGear{$1} = 'struct'; } elsif (/^\s{4}(\w+) (\w+);/) { # primitive type - $dataGear{$1} = $1; + $dataGear{$1} = 'primitive'; } - $dataGear{"Context"} = "Context"; + $dataGear{"Context"} = "struct"; } } @@ -211,7 +211,9 @@ open my $fd,">","$ddir/typedefData.h" or die("can't open $ddir/typedefData.h $!"); for my $data ( sort keys %dataGear ) { - print $fd "typedef struct ${data} ${data};\n"; + if ($dataGear{$data} eq 'struct') { + print $fd "typedef struct ${data} ${data};\n"; + } } open my $fd,">","$ddir/dataGearInit.c" or die("can't open $ddir/dataGearInit.c $!");