Mercurial > hg > Game > Cerium
changeset 1924:14f9fc88872c draft
fix
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 30 Jan 2014 16:22:51 +0900 |
parents | e801016bd47c |
children | cd5bbd8ec5d6 |
files | TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h example/Cuda/main.cc |
diffstat | 3 files changed, 57 insertions(+), 58 deletions(-) [+] |
line wrap: on
line diff
--- a/TaskManager/Cuda/CudaScheduler.cc Wed Jan 29 20:32:24 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Thu Jan 30 16:22:51 2014 +0900 @@ -32,53 +32,57 @@ if (ret!=0) { error(convert_error_status(ret)); } - cuStreamCreate(stream, 0); } CudaScheduler::~CudaScheduler() { - cuStreamDestroy(stream); cuCtxDestroy(context); } void CudaScheduler::initCudaBuffer(CudaBufferPtr m) { - m->size = 0; m->allcate_size = 64; - m->buf = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); + m->in_size = 0; + m->out_size = 0; + m->memin = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); + m->memout = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->event = (CUevent*)malloc(m->allcate_size*sizeof(CUevent*)); - m->stream = (CUStream*)malloc(m->allcate_size*sizeof(CUStream*)); + error = cuStreamCreate(&m->stream, 0); + if (error!=0) + error(convert_error_status(error)); } void CudaScheduler::destroyCudaBuffer(CudaBufferPtr m) { - free(m->buf); + free(m->memin); + free(m->memout); free(m->event); - free(m->stream); m->size = 0; m->allcate_size = 0; m->buf = 0; m->event = 0; + cuStreamDestroy(m->stream); } CUdeviceptr -CudaScheduler::createBuffer(CudaBufferPtr m,int i, CUcontext context, size_t size, int* error) { +CudaScheduler::createBuffer(CUdeviceptr* mem, int i, size_t size, int* error) { if (i > m->allcate_size) { m->allcate_size *= 2; - m->buf = (CUdeviceptr*)realloc(m->buf, m->allcate_size*sizeof(CUdeviceptr*)); - m->event = (CUevent*)remalloc(m->allcate_size*sizeof(CUevent*)); - m->stream = (CUStream*)remalloc(m->allcate_size*sizeof(CUStream*)); + m->memin = (CUdeviceptr*)realloc(m->memin, m->allcate_size*sizeof(CUdeviceptr*)); + m->memout = (CUdeviceptr*)realloc(m->memout, m->allcate_size*sizeof(CUdeviceptr*)); + m->event = (CUevent*)remalloc(m->event, m->allcate_size*sizeof(CUevent*)); } - error = cuMemAlloc(&m->buf[i], size); - return m->buf[i]; + error = cuMemAlloc(mem[i], size); + + return mem[i]; } #define NOP_REPLY NULL static void 
release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) { - for (int i=0; i<mem[1-cur].size; i++) { + for (int i=0; i<mem[1-cur].in_size+mem[1-cur].out_size; i++) { if (mem[1-cur].event[i] != 0) cuEventDestroy(mem[1-cur].event[i]); mem[1-cur].event[i] = 0; @@ -143,19 +147,25 @@ void CudaScheduler::run() { int cur = 0; + int stage = 8; TaskListPtr tasklist = NULL; reply = 0; - initCudaBuffer(&memin[0]);initCudaBuffer(&memin[1]); - initCudaBuffer(&memout[0]);initCudaBuffer(&memout[1]); + cudabuffer = (CudaBuffer*)malloc(sizeof(CudaBuffer*)*stage); + + for (int i = 0; i<stage; i++) { + initCudaBuffer(&cudabuffer[i]); + } + memset(&flag, 0, sizeof(HTask::htask_flag)*2); for (;;) { memaddr param_addr = connector->task_list_mail_read(); if ((memaddr)param_addr == (memaddr)MY_SPE_COMMAND_EXIT) { - cuStreamDestroy(stream); - destroyCudaBuffer(&memin[0]);destroyCudaBuffer(&memin[1]); - destroyCudaBuffer(&memout[0]);destroyCudaBuffer(&memout[1]); + for (int i = 0; i<stage; i++) { + destroyCudaBuffer(&cudabuffer[i]); + } + free(cudabuffer); return; } @@ -192,15 +202,11 @@ int param = 0; // set arg count - CUdeviceptr memparam = createBuffer(&memin[cur], 0, context, - sizeof(memaddr)*nextTask->param_count, &ret); + CUdeviceptr memparam = createBuffer(cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count, &ret); if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } // parameter is passed as first kernel arg - ret = cuMemcpyHtoDAsync(memparam, nextTask->param(0), sizeof(memaddr)*nextTask->param_count, stream); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } - - ret = cuParamSetv(kernel[cur], 0, memin[cur].buf[0], sizeof(memaddr)); + ret = cuMemcpyHtoDAsync(memparam, nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream); if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } param++; @@ -208,48 +214,45 @@ for(int i=0;i<nextTask->inData_count;i++) { ListElement *input_buf = nextTask->inData(i); if (input_buf->size==0) break; 
- createBuffer(&memin[cur], param, context, input_buf->size, &ret); + createBuffer(cudabuffer[cur].memin, param, input_buf->size, &ret); if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } - ret = cuMemcpyHtoDAsync(memin[cur].buf[param], input_buf->addr, input_buf->size, stream); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } - ret = cuParamSetv(kernel[cur], 0, memin[cur].buf[param], sizeof(memaddr)); + ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream); if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } param++; } - memin[cur].size = param; // +1 means param + cudabuffer[cur].in_size = param; // +1 means param for(int i = 0; i<nextTask->outData_count;i++) { // set output data ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; if (!flag[cur].flip) { // flip use memin for output - createBuffer(&memout[cur], i, context, output_buf->size, &ret); + createBuffer(cudabuffer[cur].memout, i, output_buf->size, &ret); if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } - ret = cuParamSetv(kernel[cur], 0, memout[cur].buf[i], sizeof(memout)); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue;} // enqueue later } param++; } - memout[cur].size = param - memin[cur].size; // no buffer on flip, but flip use memout event + cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event void* kernelParams; if (!flag[cur].flip) { kernelParams = malloc(sizeof(void*)*param); - for (int i = 0; i<memin[cur].size; i++) { - kernelParams[i] = memin[cur].buf[i]; + kernelParams[0] = memparam; + for (int i = 1; i<cudabuffer[cur].in_size; i++) { + kernelParams[i] = cudabuffer[cur].memin[i-1]; } - for (int i = 0; i<memout[cur].size; i++) { - kernelParams[i+memin[cur].size] = memout[cur][i]; + for (int i = 0; i<cudabuffer[cur].out_size; i++) { + kernelParams[i+cudabuffer[cur].in_size] = cudabuffer[cur].memout[i]; } } else 
{ - kernelParams = malloc(sizeof(void*)*memin[cur].size); - for (int i = 0; i<memin[cur].size; i++) { - kernelParams[i] = memin[cur].buf[i]; + kernelParams = malloc(sizeof(void*)*cudabuffer[cur].in_size); + kernelParams[0] = memparam; + for (int i = 1; i<cudabuffer[cur].in_size; i++) { + kernelParams[i] = memin[cur].buf[i-1]; } } - if (tasklist->dim > 0) { ret = cuLaunchKernel(kernel[cur],
--- a/TaskManager/Cuda/CudaScheduler.h Wed Jan 29 20:32:24 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.h Thu Jan 30 16:22:51 2014 +0900 @@ -16,23 +16,22 @@ public: typedef struct cudabuffer { int allcate_size; - int size; - CUdeviceptr* buf; + int in_size; + int out_size; + CUdeviceptr* memin; + CUdeviceptr* memout; CUevent* event; - CUStream* stream; - } CudaBuffer; - cudabuffer* CudaBufferPtr; + CUstream stream; + } CudaBuffer, *CudaBufferPtr; CudaScheduler(); virtual ~CudaScheduler(); void init_gpu(); void wait_for_event(CUevent* event, CudaBufferPtr m, TaskListPtr taskList, int cur); void run(); - + // platform platform; // platform は OpenCL が複数のメーカーの GPU に対応してるから必要 // Cuda の場合、NVIDIA だけなので必要ない? - // Cuda で CPU 使うとき要るんじゃね? - // そもそも CPU 使えたっけ? CUdevice device; unsigned int ret_num_platforms; // たぶん要らない unsigned int ret_num_devices; @@ -40,7 +39,6 @@ // command_queue command_queue; // Cuda には command_queue に相当するものはない // Closest approximation would be the CUDA Stream mechanism. らしい... - CUstream stream; int ret; memaddr reply; // cl_kernel に相当 @@ -48,8 +46,7 @@ // とりあえず、kernel で CUfunction kernel[2]; CUevent kernel_event[2]; - CudaBuffer memin[2]; - CudaBuffer memout[2]; + CudaBuffer* cudabuffer; HTask::htask_flag[2]; private:
--- a/example/Cuda/main.cc Wed Jan 29 20:32:24 2014 +0900 +++ b/example/Cuda/main.cc Thu Jan 30 16:22:51 2014 +0900 @@ -26,7 +26,7 @@ CUcontext context; CUmodule module; CUfunction function; - CUStream stream; + CUstream stream; cuInit(0); cuDeviceGet(&device, 0); @@ -34,7 +34,7 @@ cuModuleLoad(&module, "multiply.ptx"); cuModuleGetFunction(&function, module, "multiply"); - cuStramCreate(&steam,0); + cuStreamCreate(&stream,0); float* A = new float[LENGTH]; @@ -60,11 +60,10 @@ cuLaunchKernel(function, LENGTH, 1, 1, - 1, 1, 1, + 2, 1, 1, 0, stream, args, NULL); - cuMemcpyDtoH(C, devC, LENGTH*sizeof(float)); - cuStreamWaitEvent(stream, ,0); + cuMemcpyDtoHAsync(C, devC, LENGTH*sizeof(float), stream); // print_result(C); check_data(A, B, C);