view src/parallel_execution/CUDAExecutor.cbc @ 435:af0ec811b20e
Add CUDAExecutor
author   | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date     | Sat, 04 Nov 2017 04:14:36 +0900
parents  |
children | 08a93fc2f0d3
line source
#include "../context.h" #include <stdio.h> // includes, project #include <driver_types.h> #include <cuda_runtime.h> #include <cuda.h> #include "helper_cuda.h" Executor* createCUDAExecutor(struct Context* context) { struct Executor* executor = new Executor(); struct CUDAExecutor* cudaExecutor = new CUDAExecutor(); executor->executor = (union Data*)cudaExecutor; executor->read = C_readCUDAExecutor; executor->exec = C_execCUDAExecutor; executor->write = C_writeCUDAExecutor; return executor; } __code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) { int paramLen = buffer->inputLen + buffer->outputLen; struct CUDABuffer buffer = executor->buffer; buffer->kernelParams = ALLOCATE_PTR_ARRAY(context, CudevicePtr, paramLen); struct CUDABuffer buffer = executor->buffer; CUdeviceptr* deviceptrs = ALLOCATE_ARRAY(context, CudevicePtr, paramLen); for (int i = 0; i < paramLen; i++) { CUdeviceptr deviceptr = deviceptrs[i]; // memory allocate union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen]; checkCUDAErrors(cuMemAlloc(deviceptr, GET_SIZE(data))); checkCUDAErrors(cuMemcpyHtoD(deviceptr, data, GET_SIZE(data))); // Synchronous data transfer(host to device) buffer->kernelParams[paramCount++] = &deviceptr; } } void cudaLoadFunction(struct Context* context, char* filename, char* function) { checkCUDAErrors(cuModuleLoad(&context->module, filename)); checkCUDAErrors(cuModuleGetFunction(&context->function, context->module, function)); } __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) { // Asynchronous launch kernel task->num_exec = 1; struct CUDABuffer buffer = executor->buffer; if (task->iterate) { struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator; checkCUDAErrors(cuLaunchKernel(task->function, iterator->x, iterator->y, iterator->z, 1, 1, 1, 0, NULL, (void**)buffer->kernelParams, NULL)); } else { checkCUDAErrors(cuLaunchKernel(task->function, 1, 1, 1, 1, 1, 1, 0, NULL, (void**)buffer->kernelParams, NULL)); } } __code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) { //結果を取ってくるコマンドを入力する //コマンドの終了待ちを行う int paramLen = buffer->inputLen + buffer->outputLen; struct CUDABuffer buffer = executor->buffer; for (int i = 0; i < paramLen; i++) { CUdeviceptr* deviceptr = buffer->kernelParams[i]; union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen]; checkCUDAErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data))); cuMemFree(*deviceptr); } // wait for stream checkCUDAErrors(cuCtxSynchronize()); }