Mercurial > hg > Gears > GearsAgda
changeset 435:af0ec811b20e
Add CUDAExecutor
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Sat, 04 Nov 2017 04:14:36 +0900 |
parents | b75badf42701 |
children | 08a93fc2f0d3 |
files | src/parallel_execution/CUDAExecutor.cbc src/parallel_execution/Executor.cbc src/parallel_execution/context.h src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc src/parallel_execution/generate_context.pl |
diffstat | 5 files changed, 91 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/CUDAExecutor.cbc Sat Nov 04 04:14:36 2017 +0900
@@ -0,0 +1,102 @@
+#include "../context.h"
+#include <stdio.h>
+
+// includes, project
+#include <driver_types.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include "helper_cuda.h"
+
+// NOTE(review): NVIDIA's stock helper headers spell the error-check macro
+// checkCudaErrors; confirm that the local helper_cuda.h defines checkCUDAErrors.
+
+/*
+ * Build an Executor whose read/exec/write code gears are the CUDA
+ * implementations in this file, backed by a freshly allocated CUDAExecutor.
+ */
+Executor* createCUDAExecutor(struct Context* context) {
+    struct Executor* executor = new Executor();
+    struct CUDAExecutor* cudaExecutor = new CUDAExecutor();
+    executor->executor = (union Data*)cudaExecutor;
+    executor->read  = C_readCUDAExecutor;
+    executor->exec  = C_execCUDAExecutor;
+    executor->write = C_writeCUDAExecutor;
+    return executor;
+}
+
+/*
+ * Allocate a device buffer for every input and output datum of
+ * executor->buffer, copy the host contents into it, and store the address
+ * of each device pointer in executor->kernelParams (inputs first, then outputs).
+ */
+__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
+    struct CUDABuffer* buffer = executor->buffer;
+    int inputLen = buffer->inputLen;
+    int paramLen = inputLen + buffer->outputLen;
+    // The device pointers live in a context-allocated array: kernelParams keeps
+    // their addresses for the later launch, so they must not be loop locals.
+    CUdeviceptr* deviceptrs = (CUdeviceptr*)ALLOCATE_ARRAY(context, CudevicePtr, paramLen);
+    executor->kernelParams = (void**)ALLOCATE_PTR_ARRAY(context, CudevicePtr, paramLen);
+    for (int i = 0; i < paramLen; i++) {
+        union Data* data = i < inputLen ? buffer->inputData[i] : buffer->outputData[i-inputLen];
+        // memory allocate
+        checkCUDAErrors(cuMemAlloc(&deviceptrs[i], GET_SIZE(data)));
+        // Synchronous data transfer(host to device)
+        checkCUDAErrors(cuMemcpyHtoD(deviceptrs[i], data, GET_SIZE(data)));
+        executor->kernelParams[i] = &deviceptrs[i];
+    }
+    goto next(...);
+}
+
+/*
+ * Load the module in `filename` and look up `function` in it, storing both
+ * handles in the context.
+ */
+void cudaLoadFunction(struct Context* context, char* filename, char* function) {
+    checkCUDAErrors(cuModuleLoad(&context->module, filename));
+    checkCUDAErrors(cuModuleGetFunction(&context->function, context->module, function));
+}
+
+/*
+ * Launch task->function with executor->kernelParams. An iterated task uses
+ * its MultiDimIterator x/y/z as the grid dimensions, any other task a 1x1x1
+ * grid; each block has a single thread.
+ */
+__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
+    // Asynchronous launch kernel
+    task->num_exec = 1;
+    if (task->iterate) {
+        struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
+        checkCUDAErrors(cuLaunchKernel(task->function,
+                    iterator->x, iterator->y, iterator->z,
+                    1, 1, 1,
+                    0, NULL, executor->kernelParams, NULL));
+    } else {
+        checkCUDAErrors(cuLaunchKernel(task->function,
+                    1, 1, 1,
+                    1, 1, 1,
+                    0, NULL, executor->kernelParams, NULL));
+    }
+    goto next(...);
+}
+
+/*
+ * Wait for the launched work, copy every device buffer back into its host
+ * datum, and free the device memory.
+ */
+__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
+    struct CUDABuffer* buffer = executor->buffer;
+    int inputLen = buffer->inputLen;
+    int paramLen = inputLen + buffer->outputLen;
+    // Wait for the command to finish first, so a failed kernel is reported
+    // by the synchronize call rather than by the first copy below.
+    checkCUDAErrors(cuCtxSynchronize());
+    // Issue the commands that fetch the results
+    for (int i = 0; i < paramLen; i++) {
+        CUdeviceptr* deviceptr = (CUdeviceptr*)executor->kernelParams[i];
+        union Data* data = i < inputLen ? buffer->inputData[i] : buffer->outputData[i-inputLen];
+        checkCUDAErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data)));
+        checkCUDAErrors(cuMemFree(*deviceptr));
+    }
+    goto next(...);
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/Executor.cbc Sat Nov 04 04:14:36 2017 +0900
@@ -0,0 +1,12 @@
+/*
+ * Executor interface: read, exec and write code gears that each take the
+ * implementation, a task context and the continuation to run next.
+ */
+typedef struct Executor<Impl>{
+    union Data* executor;
+    struct Context* task;
+    __code read(Impl* executor, struct Context* task, __code next(...));
+    __code exec(Impl* executor, struct Context* task, __code next(...));
+    __code write(Impl* executor, struct Context* task, __code next(...));
+    __code next(...);
+} Executor;
--- a/src/parallel_execution/context.h Tue Oct 31 17:55:50 2017 +0900 +++ b/src/parallel_execution/context.h Sat Nov 04 04:14:36 2017 +0900 @@ -348,27 +348,27 @@ struct Executor { struct Executor* executor; struct Context* task; - struct Buffer* buffer; enum Code read; enum Code exec; enum Code write; enum Code next; } Executor; #ifdef USE_CUDAWorker - struct CudaExecutor { + struct CUDAExecutor { void** kernelParams; - } CudaExecutor; - CudaDevicePtr CudaDevicePtr; + CUDABuffer* buffer; + } CUDAExecutor; + CudevicePtr CudevicePtr; #else - struct CudaExecutor { - } CudaExecutor; + struct CUDAExecutor { + } CUDAExecutor; #endif - struct Buffer { + struct CUDABuffer { int inputLen; int outputLen; union Data** inputData; union Data** outputData; - } Buffer; + } CUDABuffer; }; // union Data end this is necessary for context generator typedef union Data Data;
--- a/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc Tue Oct 31 17:55:50 2017 +0900 +++ b/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc Sat Nov 04 04:14:36 2017 +0900 @@ -30,7 +30,7 @@ if (context->gpu) { struct SortArray* inputSortArray = &context->data[context->idg]->SortArray; struct SortArray* outputSortArray = &context->data[context->odg]->SortArray; - struct Buffer* buffer = &ALLOCATE(context, Buffer)->Buffer; + struct CudaBuffer* buffer = new CudaBuffer(); buffer->inputData = (union Data**)ALLOCATE_PTR_ARRAY(context, SortArray, 2); buffer->inputData[0] = (union Data*)inputSortArray->array; buffer->inputData[1] = (union Data*)inputSortArray;
--- a/src/parallel_execution/generate_context.pl Tue Oct 31 17:55:50 2017 +0900 +++ b/src/parallel_execution/generate_context.pl Sat Nov 04 04:14:36 2017 +0900 @@ -107,6 +107,8 @@ last if (/union Data end/); if (/struct (\w+) \{/) { $dataGear{$1} = $1; + } elsif (/^\s{4}(\w+) (\w+);/) { # primitive type + $dataGear{$1} = $1; } $dataGear{"Context"} = "Context"; }