Mercurial > hg > Game > Cerium
changeset 1963:6988e5478a8c draft
fix CudaScheduler
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 12 Feb 2014 17:56:40 +0900 |
parents | fdffcf8feeab |
children | 33d07fd99291 |
files | TaskManager/Cell/spe/SpeTaskManagerImpl.h TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h TaskManager/kernel/ppe/CpuThreads.cc TaskManager/kernel/ppe/CpuThreads.h example/word_count/main.cc |
diffstat | 6 files changed, 42 insertions(+), 98 deletions(-) [+] |
line wrap: on
line diff
--- a/TaskManager/Cell/spe/SpeTaskManagerImpl.h Tue Feb 11 19:58:38 2014 +0900 +++ b/TaskManager/Cell/spe/SpeTaskManagerImpl.h Wed Feb 12 17:56:40 2014 +0900 @@ -33,19 +33,10 @@ void free_htask(HTaskPtr htask) {} void print_arch(); -#ifdef __CERIUM_GPU__ - +#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__ SpeTaskManagerImpl(int i); void append_activeTask(HTask* p); void append_waitTask(HTask* p); - -#endif -#ifdef __CERIUM_CUDA__ - - SpeTaskManagerImpl(int i); - void append_activeTask(HTask* p); - void append_waitTask(HTask* p); - #endif } ;
--- a/TaskManager/Cuda/CudaScheduler.cc Tue Feb 11 19:58:38 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Wed Feb 12 17:56:40 2014 +0900 @@ -46,17 +46,17 @@ m->out_size = 0; m->memin = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->memout = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); - m->event = (CUevent*)malloc(m->allcate_size*sizeof(CUevent*)); ret = cuStreamCreate(&(m->stream), 0); if (ret!=0) error(convert_error_status(ret)); + m->kernelParams = (void**)malloc(m->allcate_size*2*sizeof(void*)); } void CudaScheduler::destroyCudaBuffer(CudaBufferPtr m) { free(m->memin); free(m->memout); - free(m->event); + free(m->kernelParams); ret = cuStreamDestroy(m->stream); if (ret!=0) error(convert_error_status(ret)); @@ -65,20 +65,20 @@ m->in_size = 0; m->out_size = 0; m->allcate_size = 0; - m->event = 0; m->stream = 0; + m->kernelParams = 0; } void -CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size) { - if (i > cudabuffer->allcate_size) { +CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int param, size_t size) { + if (param > cudabuffer->allcate_size) { cudabuffer->allcate_size *= 2; cudabuffer->memin = (CUdeviceptr*)realloc(cudabuffer->memin, cudabuffer->allcate_size*sizeof(CUdeviceptr*)); cudabuffer->memout = (CUdeviceptr*)realloc(cudabuffer->memout, cudabuffer->allcate_size*sizeof(CUdeviceptr*)); - cudabuffer->event = (CUevent*)realloc(cudabuffer->event, cudabuffer->allcate_size*sizeof(CUevent*)); + cudabuffer->kernelParams = (void**)realloc(cudabuffer->kernelParams, cudabuffer->allcate_size*2*sizeof(void*)); } - ret = cuMemAlloc(&mem[i], size); + ret = cuMemAlloc(&mem[param], size); } #define NOP_REPLY NULL @@ -91,9 +91,6 @@ mem[cur].memin[i] = 0; } for (int i=0; i<mem[cur].out_size; i++) { - if (mem[cur].event[i] != 0) - cuEventDestroy(mem[cur].event[i]); - mem[cur].event[i] = 0; if (mem[cur].memout[i]) cuMemFree(mem[cur].memout[i]); mem[cur].memout[i] = 0; @@ 
-104,10 +101,10 @@ void CudaScheduler::wait_for_event(CUevent* kernel_event, CudaBufferPtr cudabuffer, TaskListPtr taskList, int cur) { - if (kernel_event[cur-1] == NOP_REPLY) { + if (cuEventQuery(kernel_event[cur]) == CUDA_SUCCESS) { - } else if (kernel_event[cur-1] != NULL){ - ret = cuEventSynchronize(kernel_event[cur-1]); + } else if (cuEventQuery(kernel_event[cur]) == CUDA_ERROR_NOT_READY){ + ret = cuEventSynchronize(kernel_event[cur]); if (ret!=0) { error(convert_error_status(ret)); @@ -117,19 +114,15 @@ unsigned long end = 0; // timestamp 取る方法がない? } - ret = cuEventDestroy(kernel_event[cur-1]); + ret = cuEventDestroy(kernel_event[cur]); if (ret!=0) { error(convert_error_status(ret)); } - kernel_event[cur-1] = 0; - - if (cudabuffer[cur-1].out_size > 0) { - for (int i = 0; i<cudabuffer[cur-1].out_size; i++) { - ret = cuEventSynchronize(cudabuffer[cur-1].event[i]); - if (ret!=0) error(convert_error_status(ret)); - } - } - release_buf_event(cur-1, cudabuffer); + ret = cuEventCreate(&kernel_event[cur], CU_EVENT_DISABLE_TIMING); + if (ret!=0) { + error(convert_error_status(ret)); + } + release_buf_event(cur, cudabuffer); } if(reply) { @@ -142,9 +135,10 @@ void CudaScheduler::CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret) { error(convert_error_status(ret)); - if (kernel_event[cur] != 0) + if (cuEventQuery(kernel_event[cur]) == CUDA_ERROR_NOT_READY) { cuEventDestroy(kernel_event[cur]); - kernel_event[cur] = NOP_REPLY; + cuEventCreate(&kernel_event[cur], CU_EVENT_DISABLE_TIMING); + } kernel[cur] = 0; release_buf_event(cur, cudabuffer); @@ -160,7 +154,7 @@ for (int i = 0; i<STAGE; i++) { initCudaBuffer(&cudabuffer[i]); - kernel_event[i]=0; + cuEventCreate(&kernel_event[i], CU_EVENT_DISABLE_TIMING); } memset(&flag, 0, sizeof(HTask::htask_flag)*STAGE); @@ -198,10 +192,8 @@ connector->start_profile(); continue; } if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - CUmodule& module = 
*cuda_task_list[nextTask->command].cudatask->module; - const char *funcname = cuda_task_list[nextTask->command].name; // move to load_kernel. - ret = cuModuleGetFunction(&kernel[cur], module, funcname); + ret = cuModuleGetFunction(&kernel[cur], *cuda_task_list[nextTask->command].cudatask->module, cuda_task_list[nextTask->command].name); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } int param = 0; @@ -240,40 +232,34 @@ } cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event - void** kernelParams; // move to cudabuffer. - if (!flag[cur].flip) { - kernelParams = (void**)malloc(sizeof(void*)*param); for (int i = 0; i<cudabuffer[cur].in_size; i++) { - kernelParams[i] = &cudabuffer[cur].memin[i]; + cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i]; } for (int i = 0; i<cudabuffer[cur].out_size; i++) { - kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i]; + cudabuffer[cur].kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i]; } } else { - kernelParams = (void**)malloc(sizeof(void*)*cudabuffer[cur].in_size); for (int i = 0; i<cudabuffer[cur].in_size; i++) { - kernelParams[i] = &cudabuffer[cur].memin[i]; + cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i]; } } - ret = cuEventCreate(&kernel_event[cur], 0); if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } if (tasklist->dim > 0) { ret = cuLaunchKernel(kernel[cur], - tasklist->x*tasklist->y*tasklist->z, 1, 1, + tasklist->x, tasklist->y, tasklist->z, 1, 1, 1, - 0, cudabuffer[cur].stream, kernelParams, NULL); + 0, cudabuffer[cur].stream, cudabuffer[cur].kernelParams, NULL); } else { ret = cuLaunchKernel(kernel[cur], 1, 1, 1, 1, 1, 1, - 0, cudabuffer[cur].stream, kernelParams, NULL); + 0, cudabuffer[cur].stream, cudabuffer[cur].kernelParams, NULL); } if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } - ret = cuEventRecord(kernel_event[cur], 
cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } for(int i=0;i<nextTask->outData_count;i++) { // read output data @@ -283,35 +269,22 @@ int i0 = flag[cur].flip ? i+1 : i ; // flip use memin buffer and memout event ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - ret = cuEventCreate(&cudabuffer[cur].event[i], 0); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - - ret = cuEventRecord(cudabuffer[cur].event[i], cudabuffer[cur].stream); + ret = cuEventRecord(kernel_event[cur], cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } } - // wait kernel[cur] and write[cur] - // pipeline : cur - // to stop pipeline set cur+1 - if (cur == 0) { - wait_for_event(kernel_event, cudabuffer, tasklist, STAGE); - } else { - wait_for_event(kernel_event, cudabuffer, tasklist, cur); - } cur++; if (STAGE <= cur) cur = 0; - free(kernelParams); + // wait kernel[cur] and write[cur] + // pipeline : cur + // to stop pipeline set cur-1 + wait_for_event(kernel_event, cudabuffer, tasklist, cur); } reply = (memaddr)tasklist->waiter; param_addr = (memaddr)tasklist->next; } - if (cur == 0) { - wait_for_event(kernel_event, cudabuffer, tasklist, STAGE); - } else { - wait_for_event(kernel_event, cudabuffer, tasklist, cur); - } + wait_for_event(kernel_event, cudabuffer, tasklist, cur-1); for (int i = 0; i<STAGE; i++) { ret = cuStreamSynchronize(cudabuffer[i].stream); if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } @@ -348,7 +321,7 @@ CUmodule* module = new CUmodule; ret = cuModuleLoad(module, cuda_task_list[cmd].cudatask->filename); - + if(ret!=0) { error(convert_error_status(ret)); }
--- a/TaskManager/Cuda/CudaScheduler.h Tue Feb 11 19:58:38 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.h Wed Feb 12 17:56:40 2014 +0900 @@ -20,8 +20,8 @@ int out_size; CUdeviceptr* memin; CUdeviceptr* memout; - CUevent* event; CUstream stream; + void** kernelParams; } CudaBuffer, *CudaBufferPtr; CudaScheduler(); virtual ~CudaScheduler();
--- a/TaskManager/kernel/ppe/CpuThreads.cc Tue Feb 11 19:58:38 2014 +0900 +++ b/TaskManager/kernel/ppe/CpuThreads.cc Wed Feb 12 17:56:40 2014 +0900 @@ -53,10 +53,7 @@ delete [] threads; delete [] args; -#ifdef __CERIUM_GPU__ - delete gpu; -#endif -#ifdef __CERIUM_CUDA__ +#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__ delete gpu; #endif } @@ -126,10 +123,7 @@ for (int i = 0; i < cpu_num+io_num; i++) { args[i].scheduler->connector->set_mail_waiter(w); } -#ifdef __CERIUM_GPU__ - gpu->set_mail_waiter(w); -#endif -#ifdef __CERIUM_CUDA__ +#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__ gpu->set_mail_waiter(w); #endif } @@ -158,10 +152,7 @@ int CpuThreads::get_mail(int cpuid, int count, memaddr *ret) { -#ifdef __CERIUM_GPU__ - if (is_gpu(cpuid)) return gpu->get_mail(cpuid, count, ret); -#endif -#ifdef __CERIUM_CUDA__ +#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__ if (is_gpu(cpuid)) return gpu->get_mail(cpuid, count, ret); #endif *ret = args[cpuid-id_offset].scheduler->mail_read_from_host(); @@ -171,10 +162,7 @@ int CpuThreads::has_mail(int cpuid, int count, memaddr *ret) { -#ifdef __CERIUM_GPU__ - if (is_gpu(cpuid)) return gpu->has_mail(cpuid, count, ret); -#endif -#ifdef __CERIUM_CUDA__ +#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__ if (is_gpu(cpuid)) return gpu->has_mail(cpuid, count, ret); #endif if (args[cpuid-id_offset].scheduler->has_mail_from_host() != 0) { @@ -201,13 +189,7 @@ void CpuThreads::send_mail(int cpuid, int num, memaddr *data) { -#ifdef __CERIUM_GPU__ - if (is_gpu(cpuid)){ - gpu->send_mail(cpuid, num, data); - return; - } -#endif -#ifdef __CERIUM_CUDA__ +#if defined __CERIUM_GPU__ || defined __CERIUM_CUDA__ if (is_gpu(cpuid)){ gpu->send_mail(cpuid, num, data); return;
--- a/TaskManager/kernel/ppe/CpuThreads.h Tue Feb 11 19:58:38 2014 +0900 +++ b/TaskManager/kernel/ppe/CpuThreads.h Wed Feb 12 17:56:40 2014 +0900 @@ -4,9 +4,7 @@ #include <pthread.h> #include "Threads.h" #include "GpuThreads.h" -#ifdef __CERIUM_CUDA__ #include "CudaThreads.h" -#endif #include "TaskManagerImpl.h" #include "MainScheduler.h" #include "Sem.h"
--- a/example/word_count/main.cc Tue Feb 11 19:58:38 2014 +0900 +++ b/example/word_count/main.cc Wed Feb 12 17:56:40 2014 +0900 @@ -26,7 +26,7 @@ int use_compat = 0; int use_iterate = 0; int use_iterate_all = 0; -int array_task_num = 16; +int array_task_num = 11; int spe_num = 1; CPU_TYPE spe_cpu = SPE_ANY; const char *usr_help_str = "Usage: ./word_count [-a -c -s] [-cpu spe_num] [-g] [-file filename]\n";