diff src/parallel_execution/CUDAExecutor.cbc @ 435:af0ec811b20e
Add CUDAExecutor
| author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
|---|---|
| date | Sat, 04 Nov 2017 04:14:36 +0900 |
| parents | |
| children | 08a93fc2f0d3 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/CUDAExecutor.cbc	Sat Nov 04 04:14:36 2017 +0900
@@ -0,0 +1,73 @@
+#include "../context.h"
+#include <stdio.h>
+
+// includes, project
+#include <driver_types.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include "helper_cuda.h"
+
+Executor* createCUDAExecutor(struct Context* context) {
+    struct Executor* executor = new Executor();
+    struct CUDAExecutor* cudaExecutor = new CUDAExecutor();
+    executor->executor = (union Data*)cudaExecutor;
+    executor->read  = C_readCUDAExecutor;
+    executor->exec  = C_execCUDAExecutor;
+    executor->write = C_writeCUDAExecutor;
+    return executor;
+}
+
+__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+    struct CUDABuffer* buffer = executor->buffer;
+    int paramLen = buffer->inputLen + buffer->outputLen;
+    buffer->kernelParams = (CUdeviceptr**)ALLOCATE_PTR_ARRAY(context, CUdeviceptr, paramLen);
+    CUdeviceptr* deviceptrs = (CUdeviceptr*)ALLOCATE_ARRAY(context, CUdeviceptr, paramLen);
+    for (int i = 0; i < paramLen; i++) {
+        CUdeviceptr* deviceptr = &deviceptrs[i];
+        // allocate device memory for the i-th kernel parameter
+        union Data* data = i < buffer->inputLen ? buffer->inputData[i] : buffer->outputData[i - buffer->inputLen];
+        checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(data)));
+        // synchronous data transfer (host to device)
+        checkCudaErrors(cuMemcpyHtoD(*deviceptr, data, GET_SIZE(data)));
+        buffer->kernelParams[i] = deviceptr;
+    }
+}
+
+void cudaLoadFunction(struct Context* context, char* filename, char* function) {
+    checkCudaErrors(cuModuleLoad(&context->module, filename));
+    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, function));
+}
+
+__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+    // asynchronous kernel launch
+    task->num_exec = 1;
+    struct CUDABuffer* buffer = executor->buffer;
+    if (task->iterate) {
+        struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
+        checkCudaErrors(cuLaunchKernel(task->function,
+                                       iterator->x, iterator->y, iterator->z,
+                                       1, 1, 1,
+                                       0, NULL, (void**)buffer->kernelParams, NULL));
+    } else {
+        checkCudaErrors(cuLaunchKernel(task->function,
+                                       1, 1, 1,
+                                       1, 1, 1,
+                                       0, NULL, (void**)buffer->kernelParams, NULL));
+    }
+}
+
+__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+    // issue the commands that fetch the results back from the device
+    // and wait for those commands to finish
+    struct CUDABuffer* buffer = executor->buffer;
+    int paramLen = buffer->inputLen + buffer->outputLen;
+    for (int i = 0; i < paramLen; i++) {
+        CUdeviceptr* deviceptr = buffer->kernelParams[i];
+        union Data* data = i < buffer->inputLen ? buffer->inputData[i] : buffer->outputData[i - buffer->inputLen];
+        checkCudaErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data)));
+        cuMemFree(*deviceptr);
+    }
+    // wait for the stream to finish
+    checkCudaErrors(cuCtxSynchronize());
+}
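For readers outside GearsOS, the read → exec → write sequence above corresponds to a plain CUDA Driver API program: allocate device memory and copy the kernel parameters over, launch the kernel, then copy the results back and synchronize. The following standalone sketch mirrors that sequence without the GearsOS context, buffers, or allocation macros; the module name `kernel.ptx`, the kernel name `increment`, and the fixed 1024-element buffer are illustrative assumptions, not part of this changeset.

```c
// Minimal sketch of the CUDAExecutor's read -> exec -> write flow using the
// CUDA Driver API directly. Assumes kernel.ptx contains a kernel named
// "increment" that takes a single device-pointer argument.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// local stand-in for helper_cuda.h's checkCudaErrors
#define checkCudaErrors(err) do {                               \
    if ((err) != CUDA_SUCCESS) {                                \
        fprintf(stderr, "CUDA error %d at %s:%d\n",             \
                (int)(err), __FILE__, __LINE__);                \
        exit(EXIT_FAILURE);                                     \
    }                                                           \
} while (0)

int main(void) {
    CUdevice   device;
    CUcontext  context;
    CUmodule   module;
    CUfunction function;

    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGet(&device, 0));
    checkCudaErrors(cuCtxCreate(&context, 0, device));

    // cudaLoadFunction: load a PTX module and look up the kernel by name
    checkCudaErrors(cuModuleLoad(&module, "kernel.ptx"));
    checkCudaErrors(cuModuleGetFunction(&function, module, "increment"));

    // readCUDAExecutor: allocate device memory and copy the input (host to device)
    int host[1024];
    for (int i = 0; i < 1024; i++) host[i] = i;
    CUdeviceptr deviceptr;
    checkCudaErrors(cuMemAlloc(&deviceptr, sizeof(host)));
    checkCudaErrors(cuMemcpyHtoD(deviceptr, host, sizeof(host)));

    // execCUDAExecutor: launch the kernel with the device pointer as its only
    // parameter (grid 1024x1x1, block 1x1x1, like the iterator case above)
    void* kernelParams[] = { &deviceptr };
    checkCudaErrors(cuLaunchKernel(function,
                                   1024, 1, 1,
                                   1, 1, 1,
                                   0, NULL, kernelParams, NULL));

    // writeCUDAExecutor: copy the result back, free device memory, synchronize
    checkCudaErrors(cuMemcpyDtoH(host, deviceptr, sizeof(host)));
    checkCudaErrors(cuMemFree(deviceptr));
    checkCudaErrors(cuCtxSynchronize());

    checkCudaErrors(cuCtxDestroy(context));
    printf("host[1] = %d\n", host[1]);
    return 0;
}
```

The sketch links only against the driver library (for example `cc example.c -lcuda`) and assumes the PTX file sits next to the executable; in the executor above, the same calls are split across separate code gears so that data transfer, launch, and result collection can be driven by the GearsOS task scheduler.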