Mercurial > hg > Game > Cerium
changeset 1925:cd5bbd8ec5d6 draft
fix CudaScheduler
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 31 Jan 2014 05:56:23 +0900 |
parents | 14f9fc88872c |
children | 777bdbf6c072 |
files | TaskManager/Cell/spe/SpeTaskManagerImpl.h TaskManager/Cuda/CudaError.h TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h TaskManager/Cuda/CudaThreads.cc TaskManager/Cuda/CudaThreads.h TaskManager/Makefile.cuda TaskManager/kernel/schedule/Scheduler.h example/Cuda/main.cc |
diffstat | 9 files changed, 146 insertions(+), 97 deletions(-) [+] |
line wrap: on
line diff
--- a/TaskManager/Cell/spe/SpeTaskManagerImpl.h Thu Jan 30 16:22:51 2014 +0900 +++ b/TaskManager/Cell/spe/SpeTaskManagerImpl.h Fri Jan 31 05:56:23 2014 +0900 @@ -40,6 +40,14 @@ void append_waitTask(HTask* p); #endif + +#ifdef __CERIUM_CUDA__ + + SpeTaskManagerImpl(int i); + void append_activeTask(HTask* p); + void append_waitTask(HTask* p); + +#endif } ;
--- a/TaskManager/Cuda/CudaError.h Thu Jan 30 16:22:51 2014 +0900 +++ b/TaskManager/Cuda/CudaError.h Fri Jan 31 05:56:23 2014 +0900 @@ -37,7 +37,7 @@ const char* message; for(int i=0; Error_Status[i].status_string != NULL; i++) { - if (Error_Status[i].status = status) { + if (Error_Status[i].status == status) { message = Error_Status[i].status_string; } }
--- a/TaskManager/Cuda/CudaScheduler.cc Thu Jan 30 16:22:51 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Fri Jan 31 05:56:23 2014 +0900 @@ -1,6 +1,6 @@ #include "TaskManager.h" #include "CudaScheduler.h" -#include "ReferenceDmaManager.h" +#include "ReferencedDmaManager.h" #include "PreRefDmaManager.h" #include "SchedTask.h" #include "CudaError.h" @@ -25,9 +25,9 @@ cuInit(0); cuDeviceGetCount(&ret_num_devices); if (ret_num_devices == 0) { - exit(EXIT_FILURE); + exit(EXIT_FAILURE); } - cuDeviceGet(&context, 0); + cuDeviceGet(&device, 0); ret = cuCtxCreate(&context, 0, device); if (ret!=0) { error(convert_error_status(ret)); @@ -47,9 +47,9 @@ m->memin = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->memout = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->event = (CUevent*)malloc(m->allcate_size*sizeof(CUevent*)); - error = cuStreamCreate(&m->stream, 0); - if (error!=0) - error(convert_error_status(error)); + ret = cuStreamCreate(&m->stream, 0); + if (ret!=0) + error(convert_error_status(ret)); } void @@ -57,23 +57,28 @@ free(m->memin); free(m->memout); free(m->event); - m->size = 0; + ret = cuStreamDestroy(m->stream); + if (ret!=0) + error(convert_error_status(ret)); + m->memin = 0; + m->memout = 0; + m->in_size = 0; + m->out_size = 0; m->allcate_size = 0; - m->buf = 0; m->event = 0; - cuStreamDestroy(m->stream); + m->stream = 0; } CUdeviceptr -CudaScheduler::createBuffer(CUdeviceptr* mem, int i, size_t size, int* error) { - if (i > m->allcate_size) { - m->allcate_size *= 2; - m->memin = (CUdeviceptr*)realloc(m->memin, m->allcate_size*sizeof(CUdeviceptr*)); - m->memout = (CUdeviceptr*)realloc(m->memout, m->allcate_size*sizeof(CUdeviceptr*)); - m->event = (CUevent*)remalloc(m->event, m->allcate_size*sizeof(CUevent*)); +CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size, int* error) { + if (i > cudabuffer->allcate_size) { + cudabuffer->allcate_size *= 2; + cudabuffer->memin = (CUdeviceptr*)realloc(cudabuffer->memin, cudabuffer->allcate_size*sizeof(CUdeviceptr*)); + cudabuffer->memout = (CUdeviceptr*)realloc(cudabuffer->memout, cudabuffer->allcate_size*sizeof(CUdeviceptr*)); + cudabuffer->event = (CUevent*)realloc(cudabuffer->event, cudabuffer->allcate_size*sizeof(CUevent*)); } - error = cuMemAlloc(mem[i], size); + error = (int*)cuMemAlloc(&mem[i], size); return mem[i]; } @@ -82,23 +87,29 @@ static void release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) { - for (int i=0; i<mem[1-cur].in_size+mem[]; i++) { - if (mem[1-cur].event[i] != 0) - cuEventDestroy(mem[1-cur].event[i]); - mem[1-cur].event[i] = 0; - if (mem[1-cur].buf[i]) - cuMemFree(mem[1-cur].buf[i]); - mem[1-cur].buf[i] = 0; + for (int i=0; i<mem[cur-1].in_size; i++) { + if (mem[cur-1].memin[i]) + cuMemFree(mem[cur-1].memin[i]); + mem[cur-1].memin[i] = 0; } - mem[1-cur].size = 0; + for (int i=0; i<mem[cur-1].out_size; i++) { + if (mem[cur-1].event[i] != 0) + cuEventDestroy(mem[cur-1].event[i]); + mem[cur-1].event[i] = 0; + if (mem[1-cur].memout[i]) + cuMemFree(mem[cur-1].memout[i]); + mem[cur-1].memout[i] = 0; + } + mem[cur-1].in_size = 0; + mem[cur-1].out_size = 0; } void -CudaBufferPtr::wait_for_event(CUevent* kernel_event, CudaBufferPtr memout, CudaBufferPtr memin, TaskListPtr taskList, int cur) { - if (kernel_event[1-cur] == NOP_REPLY) { - - } else { - ret = cuEventSynchronize(kernel_event[1-cur]); +CudaScheduler::wait_for_event(CUevent* kernel_event, CudaBufferPtr cudabuffer, TaskListPtr taskList, int cur) { + if (kernel_event[cur-1] == NOP_REPLY) { + + } else if (kernel_event[cur-1] != NULL){ + ret = cuEventSynchronize(kernel_event[cur-1]); if (ret!=0) { error(convert_error_status(ret)); @@ -108,18 +119,19 @@ unsigned long end = 0; // timestamp 取る方法がない? } - cuEventDestroy(kernel_event[1-cur]); - kernel_event[1-cur] = 0; - } - - if (memout[1-cur].size > 0) { - ret = cuEventSynchronize(memout[1-cur].event); - if (ret!=0) error(convert_error_status(ret)); - release_buf_event(cur, memout); - } - - if (memin[1-cur].size > 0) { - release_buf_event(cur, memin); + ret = cuEventDestroy(kernel_event[cur-1]); + if (ret!=0) { + error(convert_error_status(ret)); + } + kernel_event[cur-1] = 0; + + if (cudabuffer[cur-1].out_size > 0) { + for (int i = 0; i<cudabuffer[cur-1].out_size; i++) { + ret = cuEventSynchronize(cudabuffer[cur-1].event[i]); + if (ret!=0) error(convert_error_status(ret)); + } + } + release_buf_event(cur, cudabuffer); } if(reply) { @@ -130,18 +142,15 @@ } void -CudaScheduler::CudaTaskError(int cur, TaskListPtr taskList, int ret) { +CudaScheduler::CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret) { error(convert_error_status(ret)); if (kernel_event[cur] != 0) cuEventDestroy(kernel_event[cur]); kernel_event[cur] = NOP_REPLY; - // if (kernel[cur] != 0) - // kerneldestroy(); kernel[cur] = 0; - release_buf_event(1-cur, memout); - release_buf_event(1-cur, memin); + release_buf_event(cur+1, cudabuffer); - wait_for_event(kernel_event, memout, taskList, cur); + wait_for_event(kernel_event, cudabuffer, taskList, cur); } void @@ -161,8 +170,11 @@ for (;;) { memaddr param_addr = connector->task_list_mail_read(); - if ((memaddr)param_addr === (memaddr)MY_SPE_COMMAND_EXIT) { + if ((memaddr)param_addr == (memaddr)MY_SPE_COMMAND_EXIT) { for (int i = 0; i<stage; i++) { + ret = cuStreamSynchronize(cudabuffer[i].stream); + if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } + destroyCudaBuffer(&cudabuffer[i]); } free(cudabuffer); @@ -170,9 +182,9 @@ } (*connector->start_dmawait_profile)(&(connector->start_time)); - while (params_addr) { + while (param_addr) { // since we are on the same memory space, we don't has to use dma_load here - tasklist = (TaskListPtr)connector->dma_load(this, params_addr, + tasklist = (TaskListPtr)connector->dma_load(this, param_addr, sizeof(TaskList), DMA_READ_TASKLIST); // tasklist[cur]->task_start_time = gettime(); tasklist->task_start_time = 0; @@ -192,32 +204,32 @@ if(nextTask->command==StartProfile) { connector->start_profile(); continue; } - if (load_kernel(nextTask->command) == 0) { cudaTaskError(cur,tasklist,ret); continue; } + if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } CUmodule& module = *cuda_task_list[nextTask->command].cudatask->module; const char *funcname = cuda_task_list[nextTask->command].name; - ret = cuModuleGetFunction(kernel[cur], module, funcname); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } + ret = cuModuleGetFunction(&kernel[cur], module, funcname); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } int param = 0; // set arg count - CUdeviceptr memparam = createBuffer(cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count, &ret); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } + CUdeviceptr memparam = createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count, &ret); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } // parameter is passed as first kernel arg ret = cuMemcpyHtoDAsync(memparam, nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } param++; for(int i=0;i<nextTask->inData_count;i++) { ListElement *input_buf = nextTask->inData(i); if (input_buf->size==0) break; - createBuffer(cudabuffer[cur].memin, param, input_buf->size, &ret); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } + createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size, &ret); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } param++; } @@ -227,67 +239,86 @@ ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; if (!flag[cur].flip) { // flip use memin for output - createBuffer(cudabuffer[cur].memout, i, output_buf->size, &ret); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } + createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size, &ret); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } // enqueue later } param++; } cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event - void* kernelParams; + void** kernelParams; - if (!flag[cur.flip]) { - kernelParams = malloc(sizeof(void*)*param); - kernelParams[0] = memparam; + if (!flag[cur].flip) { + kernelParams = (void**)malloc(sizeof(void*)*param); + kernelParams[0] = &memparam; for (int i = 1; i<cudabuffer[cur].in_size; i++) { - kernelParams[i] = cudabuffer[cur].memin[i-1]; + kernelParams[i] = &cudabuffer[cur].memin[i-1]; } for (int i = 0; i<cudabuffer[cur].out_size; i++) { - kernelParams[i+cudabuffer[cur].in_size] = cudabuffer[cur].memout[i]; + kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i]; } } else { - kernelParams = malloc(sizeof(void*)*cudabuffer[cur].in_size); - kernelParams[0] = memparam; + kernelParams = (void**)malloc(sizeof(void*)*cudabuffer[cur].in_size); + kernelParams[0] = &memparam; for (int i = 1; i<cudabuffer[cur].in_size; i++) { - kernelParams[i] = memin[cur].buf[i-1]; + kernelParams[i] = &cudabuffer[cur].memin[i-1]; } } - + + ret = cuEventCreate(&kernel_event[cur], 0); + if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } + if (tasklist->dim > 0) { ret = cuLaunchKernel(kernel[cur], tasklist->x*tasklist->y*tasklist->z, 1, 1, 1, 1, 1, - stream, kernelParams, NULL); + 0, cudabuffer[cur].stream, kernelParams, NULL); } else { ret = cuLaunchKernel(kernel[cur], 1, 1, 1, 1, 1, 1, - stream, kernelParams, NULL); + 0, cudabuffer[cur].stream, kernelParams, NULL); } - if (ret!=0) { cudaTaskError(cur, tasklist, ret); continue; } + if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } + ret = cuEventRecord(kernel_event[cur], cudabuffer[cur].stream); + if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } + for(int i=0;i<nextTask->outData_count;i++) { // read output data ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; - GpuBufferPtr mem = flag[cur].flip ? memin : memout ; + CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ; int i0 = flag[cur].flip ? i+1 : i ; // flip use memin buffer and memout event - ret = cuMemcpyDtoHAsync(mem[cur].buf[i0], output_buf->addr, output_buf->size, stream); - if (ret!=0) { cudaTaskError(cur,tasklist,ret); continue; } + ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + + ret = cuEventCreate(&cudabuffer[cur].event[i], 0); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + + ret = cuEventRecord(cudabuffer[cur].event[i], cudabuffer[cur].stream); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } } // wait kernel[1-cur] and write[1-cur] // pipeline : cur - // to stop pipeline set 1-cur - wait_for_event(kernel_event, memout, tasklist, cur); - cur = 1 - cur; + // to stop pipeline set cur+1 + if (cur == 0) { + wait_for_event(kernel_event, cudabuffer, tasklist, stage); // to stop pipeline comment out this line + } else { + wait_for_event(kernel_event, cudabuffer, tasklist, cur); + } + cur += 1; + if (stage <= cur) + cur = 0; free(kernelParams); + cuModuleUnload(module); } reply = (memaddr)tasklist->waiter; - params_addr = (memaddr)tasklist->next; + param_addr = (memaddr)tasklist->next; } - wait_for_event(kernel_event, memout, tasklist, cur); - + wait_for_event(kernel_event, cudabuffer, tasklist, cur); + unsigned long long wait = 0; (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time)); connector->mail_write((memaddr)MY_SPE_STATUS_READY);
--- a/TaskManager/Cuda/CudaScheduler.h Thu Jan 30 16:22:51 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.h Fri Jan 31 05:56:23 2014 +0900 @@ -7,8 +7,6 @@ #include "HTask.h" #include "TaskManager.h" #include <cuda.h> -#include <cuda_runtime.h> - extern TaskObject cuda_task_list[MAX_TASK_OBJECT]; @@ -34,7 +32,7 @@ // Cuda の場合、NVIDIA だけなので必要ない? CUdevice device; unsigned int ret_num_platforms; // たぶん要らない - unsigned int ret_num_devices; + int ret_num_devices; CUcontext context; // command_queue command_queue; // Cuda には command_queue に相当するものはない @@ -47,14 +45,14 @@ CUfunction kernel[2]; CUevent kernel_event[2]; CudaBuffer* cudabuffer; - HTask::htask_flag[2]; + HTask::htask_flag flag[2]; - privete: + private: int load_kernel(int cmd); - CUdeviceptr createBuffer(CudaBufferPtr m, int i, CUcontext context, /* mem_flag mem_flag, */size_t size, int* error); + CUdeviceptr createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size, int* error); void initCudaBuffer(CudaBufferPtr m); void destroyCudaBuffer(CudaBufferPtr m); - void CudaTaskError(int cur, TaskListPtr taskList, int ret); + void CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret); }; #define CudaSchedRegister(str, filename, functionname) \
--- a/TaskManager/Cuda/CudaThreads.cc Thu Jan 30 16:22:51 2014 +0900 +++ b/TaskManager/Cuda/CudaThreads.cc Fri Jan 31 05:56:23 2014 +0900 @@ -11,7 +11,7 @@ args = new cuda_thread_arg_t; } -CuduThreads::~CudaThreads() +CudaThreads::~CudaThreads() { memaddr mail = (memaddr)MY_SPE_COMMAND_EXIT; send_mail(0,1,&mail);
--- a/TaskManager/Cuda/CudaThreads.h Thu Jan 30 16:22:51 2014 +0900 +++ b/TaskManager/Cuda/CudaThreads.h Fri Jan 31 05:56:23 2014 +0900 @@ -3,7 +3,6 @@ #include <pthread.h> #include <cuda.h> -#include <cuda_runtime.h> #include "Threads.h" #include "CudaScheduler.h" #include "Sem.h"
--- a/TaskManager/Makefile.cuda Thu Jan 30 16:22:51 2014 +0900 +++ b/TaskManager/Makefile.cuda Fri Jan 31 05:56:23 2014 +0900 @@ -1,7 +1,7 @@ include ./Makefile.def TARGET = libCudaManager.a CFLAGS += -DHAS_POSIX_MEMALIGN -CUDA_PATH = /Developer/NVIDIA/CUDA-5.5/include +INCLUDE += -I/Developer/NVIDIA/CUDA-5.5/include VPATH = CUDA_PATH
--- a/TaskManager/kernel/schedule/Scheduler.h Thu Jan 30 16:22:51 2014 +0900 +++ b/TaskManager/kernel/schedule/Scheduler.h Fri Jan 31 05:56:23 2014 +0900 @@ -20,6 +20,10 @@ #endif #endif +#ifdef __CERIUM_CUDA__ +#include <cuda.h> +#endif + #define MAX_USER_TASK 100 #define MAX_SYSTEM_TASK 2 #define MAX_TASK_OBJECT MAX_USER_TASK + MAX_SYSTEM_TASK @@ -40,6 +44,11 @@ #endif } GpuTaskObject; +typedef struct cuda_task_object { +#ifdef __CERIUM_CUDA__ + CUmodule* module; +#endif +} CudaTaskObject; // Task Object Table // this is named TaskObjectRun but it is not an object. // It is a pointer to an object creation function @@ -57,7 +66,7 @@ void (*wait)(Scheduler *,int); GpuTaskObject *gputask; - + CudaTaskObject* cudatask; } __attribute__ ((aligned (DEFAULT_ALIGNMENT))) //sizeはどれくらい? TaskObject, *TaskObjectPtr;
--- a/example/Cuda/main.cc Thu Jan 30 16:22:51 2014 +0900 +++ b/example/Cuda/main.cc Fri Jan 31 05:56:23 2014 +0900 @@ -56,11 +56,15 @@ cuMemcpyHtoDAsync(devB, B, LENGTH*sizeof(float), stream); cuMemcpyHtoDAsync(devC, C, LENGTH*sizeof(float), stream); - void* args[] = {&devA, &devB, &devC}; + // void* args[] = {&devA, &devB, &devC}; + void** args=(void**)malloc(sizeof(void*)*3); + args[0] = &devA; + args[1] = &devB; + args[2] = &devC; cuLaunchKernel(function, LENGTH, 1, 1, - 2, 1, 1, + 1, 1, 1, 0, stream, args, NULL); cuMemcpyDtoHAsync(C, devC, LENGTH*sizeof(float), stream);