changeset 1983:c3b4083c4467 draft
fix CudaScheduler
author      Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date        Sat, 15 Mar 2014 17:46:13 +0900
parents     455e620ad2b2
children    7bea670cdba0
files       TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h example/Cuda/main.cc
diffstat    3 files changed, 150 insertions(+), 121 deletions(-)
--- a/TaskManager/Cuda/CudaScheduler.cc	Sat Mar 15 16:06:03 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Sat Mar 15 17:46:13 2014 +0900
@@ -89,6 +89,123 @@
 #define NOP_REPLY NULL
 
+int
+CudaScheduler::read(TaskPtr nextTask, TaskListPtr tasklist) {
+    int cur = 0;
+    for (; nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) {
+        if (STAGE <= cur) return cur;
+
+        /*
+         * get flip flag
+         * flip : when calculating on the input data, treat it as the output data
+         */
+        if (tasklist->self) {
+            flag[cur] = tasklist->self->flag;
+        } else {
+            memset(&flag[cur], 0, sizeof(HTask::htask_flag)); // unnecessary ?
+        }
+
+        if (nextTask->command==ShowTime) {
+            connector->show_profile(); continue;
+        }
+        if (nextTask->command==StartProfile) {
+            connector->start_profile(); continue;
+        }
+        if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+
+        ret = cuModuleGetFunction(&kernel[cur], *cuda_task_list[nextTask->command].cudatask->module, cuda_task_list[nextTask->command].name);
+        if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+
+        int param = 0;
+
+        // set arg count
+        createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count);
+        if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+
+        // parameter is passed as first kernel arg
+        ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream);
+        if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+
+        param++;
+
+        for (int i = 0; i < nextTask->inData_count; i++) {
+            ListElement *input_buf = nextTask->inData(i);
+            if (input_buf->size==0) break;
+            createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size);
+            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+            ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
+            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+
+            param++;
+        }
+        cudabuffer[cur].in_size = param; // +1 means param
+
+        for (int i = 0; i < nextTask->outData_count; i++) { // set output data
+            ListElement *output_buf = nextTask->outData(i);
+            if (output_buf->size==0) break;
+            if (!flag[cur].flip) { // flip use memin for output
+                createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+                // enqueue later
+            }
+            param++;
+        }
+        cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event
+
+        if (!flag[cur].flip) {
+            for (int i = 0; i < cudabuffer[cur].in_size; i++) {
+                cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
+            }
+            for (int i = 0; i < cudabuffer[cur].out_size; i++) {
+                cudabuffer[cur].kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i];
+            }
+        } else {
+            for (int i = 0; i < cudabuffer[cur].in_size; i++) {
+                cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
+            }
+        }
+
+        if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+    }
+    return cur;
+}
+
+void
+CudaScheduler::exec(TaskListPtr tasklist, int cur) {
+    for (int i = 0; i < cur; i++) {
+        if (tasklist->dim > 0) {
+            ret = cuLaunchKernel(kernel[i],
+                                 tasklist->x, tasklist->y, tasklist->z,
+                                 1, 1, 1,
+                                 0, cudabuffer[i].stream, cudabuffer[i].kernelParams, NULL);
+        } else {
+            ret = cuLaunchKernel(kernel[i],
+                                 1, 1, 1,
+                                 1, 1, 1,
+                                 0, cudabuffer[i].stream, cudabuffer[i].kernelParams, NULL);
+        }
+        if (ret!=0) { CudaTaskError(cudabuffer, i, tasklist, ret); continue; }
+    }
+}
+
+TaskPtr
+CudaScheduler::write(TaskPtr nextTask, TaskListPtr tasklist) {
+    int cur = 0;
+    for (; nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) {
+        if (STAGE <= cur) break;
+        for (int i = 0; i < nextTask->outData_count; i++) { // read output data
+            ListElement *output_buf = nextTask->outData(i);
+            if (output_buf->size==0) break;
+            CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout;
+            int i0 = flag[cur].flip ? i+1 : i;
+            // flip use memin buffer and memout event
+            ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream);
+            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+        }
+    }
+    return nextTask;
+}
+
 static void
 release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) {
     for (int i=0; i<mem[cur].in_size; i++) {
@@ -107,15 +224,15 @@
 
 void
 CudaScheduler::wait_for_event(CudaBufferPtr cudabuffer, TaskListPtr taskList, int cur) {
-
-    if (cuStreamQuery(cudabuffer[cur].stream) == CUDA_SUCCESS) {
-
+    for (int i = 0; i < cur; i++) {
+        if (cuStreamQuery(cudabuffer[i].stream) == CUDA_SUCCESS) continue;
         // all operation is not executed in the stream
-    } else if (cuStreamQuery(cudabuffer[cur].stream) == CUDA_ERROR_NOT_READY){
-        // wait for finish
-        ret = cuStreamSynchronize(cudabuffer[cur].stream);
-        if (ret!=0) {
-            error(convert_error_status(ret));
+        else if (cuStreamQuery(cudabuffer[i].stream) == CUDA_ERROR_NOT_READY) {
+            // wait for finish
+            ret = cuStreamSynchronize(cudabuffer[i].stream);
+            if (ret!=0) {
+                error(convert_error_status(ret));
+            }
         }
     }
@@ -125,8 +242,10 @@
         // no way to take a timestamp?
     }
 
-    if (cudabuffer[cur].in_size > 0 || cudabuffer[cur].out_size > 0)
-        release_buf_event(cur, cudabuffer);
+    for (int i = 0; i < cur; i++) {
+        if (cudabuffer[i].in_size > 0 || cudabuffer[i].out_size > 0)
+            release_buf_event(i, cudabuffer);
+    }
 
     if (reply) {
         connector->mail_write(reply);
@@ -172,109 +291,16 @@
         tasklist = (TaskListPtr)connector->dma_load(this, param_addr, sizeof(TaskList), DMA_READ_TASKLIST);
         tasklist->task_start_time = 0;
 
-        /*
-         * get flip flag
-         * flip : when calculating on the input data, treat it as the output data
-         */
-        if (tasklist->self) {
-            flag[cur] = tasklist->self->flag;
-        } else {
-            memset(&flag[cur], 0, sizeof(HTask::htask_flag)); // unnecessary ?
-        }
-
-        for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) {
-            if (nextTask->command==ShowTime) {
-                connector->show_profile(); continue;
-            }
-            if (nextTask->command==StartProfile) {
-                connector->start_profile(); continue;
-            }
-            if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-
-            ret = cuModuleGetFunction(&kernel[cur], *cuda_task_list[nextTask->command].cudatask->module, cuda_task_list[nextTask->command].name);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-
-            int param = 0;
-
-            // set arg count
-            createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-
-            // parameter is passed as first kernel arg
-            ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-
-            param++;
-
-            for (int i = 0; i < nextTask->inData_count; i++) {
-                ListElement *input_buf = nextTask->inData(i);
-                if (input_buf->size==0) break;
-                createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size);
-                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-                ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
-                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-
-                param++;
-            }
-            cudabuffer[cur].in_size = param; // +1 means param
-
-            for (int i = 0; i < nextTask->outData_count; i++) { // set output data
-                ListElement *output_buf = nextTask->outData(i);
-                if (output_buf->size==0) break;
-                if (!flag[cur].flip) { // flip use memin for output
-                    createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size);
-                    if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-                    // enqueue later
-                }
-                param++;
-            }
-            cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event
-
-            if (!flag[cur].flip) {
-                for (int i = 0; i < cudabuffer[cur].in_size; i++) {
-                    cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
-                }
-                for (int i = 0; i < cudabuffer[cur].out_size; i++) {
-                    cudabuffer[cur].kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i];
-                }
-            } else {
-                for (int i = 0; i < cudabuffer[cur].in_size; i++) {
-                    cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
-                }
-            }
-
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-
-            if (tasklist->dim > 0) {
-                ret = cuLaunchKernel(kernel[cur],
-                                     tasklist->x, tasklist->y, tasklist->z,
-                                     1, 1, 1,
-                                     0, cudabuffer[cur].stream, cudabuffer[cur].kernelParams, NULL);
-            } else {
-                ret = cuLaunchKernel(kernel[cur],
-                                     1, 1, 1,
-                                     1, 1, 1,
-                                     0, cudabuffer[cur].stream, cudabuffer[cur].kernelParams, NULL);
-            }
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-
-            for (int i = 0; i < nextTask->outData_count; i++) { // read output data
-                ListElement *output_buf = nextTask->outData(i);
-                if (output_buf->size==0) break;
-                CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout;
-                int i0 = flag[cur].flip ? i+1 : i;
-                // flip use memin buffer and memout event
-                ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream);
-                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-            }
+        for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last();) {
+            cur = read(nextTask, tasklist);
+            exec(tasklist, cur);
+            nextTask = write(nextTask, tasklist);
             wait_for_event(cudabuffer, tasklist, cur);
-            cur++; // wait write[cur+1]
-            if (STAGE <= cur) // to stop pipeline move to after wait_for_event
-                cur = 0; //
         }
         reply = (memaddr)tasklist->waiter;
         param_addr = (memaddr)tasklist->next;
     }
-    wait_for_event(cudabuffer, tasklist, cur);
+    wait_for_event(cudabuffer, tasklist, 0);
 
     unsigned long long wait = 0;
     (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
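Note on the refactoring above: the old single per-task loop is split into four phases that are each batched across up to STAGE CUDA streams before a single wait. The following is a minimal standalone sketch of that structure, not Cerium's code: the names Buf, read_phase, exec_phase, write_phase, wait_phase and the task counts are hypothetical stand-ins, and the kernel launch is elided so the program runs without a .ptx module.

// pipeline_sketch.cc : hypothetical stand-alone illustration, not Cerium code.
// Build (assumption): g++ pipeline_sketch.cc -lcuda -o pipeline_sketch
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define STAGE 8          // number of pipeline stages / streams, as in the scheduler

#define CHECK(call) do { CUresult r = (call); \
    if (r != CUDA_SUCCESS) { fprintf(stderr, "CUDA error %d at line %d\n", (int)r, __LINE__); exit(1); } } while (0)

struct Buf {             // stands in for CudaScheduler::CudaBuffer
    CUstream    stream;
    CUdeviceptr dev;
    float       host_in, host_out;
};

// "read" phase: stage the inputs of up to STAGE tasks, one stream per task.
static int read_phase(Buf* buf, int first, int total) {
    int cur = 0;
    for (int t = first; t < total && cur < STAGE; t++, cur++) {
        buf[cur].host_in = (float)t;
        CHECK(cuMemcpyHtoDAsync(buf[cur].dev, &buf[cur].host_in, sizeof(float), buf[cur].stream));
    }
    return cur;          // how many streams were actually filled
}

// "exec" phase: one kernel launch per filled stream would go here (cuLaunchKernel).
static void exec_phase(Buf*, int) {}

// "write" phase: queue the result copies on the same streams.
static void write_phase(Buf* buf, int cur) {
    for (int i = 0; i < cur; i++)
        CHECK(cuMemcpyDtoHAsync(&buf[i].host_out, buf[i].dev, sizeof(float), buf[i].stream));
}

// "wait" phase: drain every stream that was used in this batch.
static void wait_phase(Buf* buf, int cur) {
    for (int i = 0; i < cur; i++) {
        if (cuStreamQuery(buf[i].stream) == CUDA_SUCCESS) continue;  // already finished
        CHECK(cuStreamSynchronize(buf[i].stream));                   // otherwise block
    }
}

int main() {
    CUdevice dev; CUcontext ctx;
    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&dev, 0));
    CHECK(cuCtxCreate(&ctx, 0, dev));

    Buf buf[STAGE];
    for (int i = 0; i < STAGE; i++) {
        CHECK(cuStreamCreate(&buf[i].stream, 0));
        CHECK(cuMemAlloc(&buf[i].dev, sizeof(float)));
    }

    const int total = 20;                          // more tasks than pipeline stages
    for (int first = 0; first < total; ) {
        int cur = read_phase(buf, first, total);   // fill at most STAGE stages
        exec_phase(buf, cur);
        write_phase(buf, cur);
        wait_phase(buf, cur);                      // one wait per batch, not per task
        first += cur;
    }
    printf("first result of last batch: %f\n", buf[0].host_out);

    for (int i = 0; i < STAGE; i++) { cuMemFree(buf[i].dev); cuStreamDestroy(buf[i].stream); }
    cuCtxDestroy(ctx);
    return 0;
}

Waiting once per batch instead of once per task is what lets copies and kernels on different streams overlap, which matches the change to wait_for_event() above.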
--- a/TaskManager/Cuda/CudaScheduler.h	Sat Mar 15 16:06:03 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.h	Sat Mar 15 17:46:13 2014 +0900
@@ -55,6 +55,9 @@
     void initCudaBuffer(CudaBufferPtr m);
     void destroyCudaBuffer(CudaBufferPtr m);
     void CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret);
+    int read(TaskPtr nextTask, TaskListPtr tasklist);
+    void exec(TaskListPtr tasklist, int cur);
+    TaskPtr write(TaskPtr nextTask, TaskListPtr tasklist);
 };
 
 #define CudaSchedRegister(str, filename, functionname) \
--- a/example/Cuda/main.cc	Sat Mar 15 16:06:03 2014 +0900
+++ b/example/Cuda/main.cc	Sat Mar 15 17:46:13 2014 +0900
@@ -85,12 +85,12 @@
 
     // Asynchronous data transfer(host to device)
     int cur = 0;
-    // for (int i=0;i<num_exec;i++,cur++) {
-    //     if (num_stream <= cur)
-    //         cur = 0;
-    //     B[i] = (float)(i+1);
-    //     cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
-    // }
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        B[i] = (float)(i+1);
+        cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
+    }
 
     cur = 0;
 
@@ -99,24 +99,24 @@
         if (num_stream <= cur)
            cur=0;
         B[i] = (float)(i+1);
-        cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
+        //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
         void* args[] = {&devA, &devB[i], &devOut[i]};
         cuLaunchKernel(function,
                        LENGTH, 1, 1,
                        THREAD, 1, 1,
                        0, stream[cur], args, NULL);
-        cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
+        //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
     }
 
     cur = 0;
 
     // Asynchronous data transfer(device to host)
-    // for (int i=0;i<num_exec;i++,cur++) {
-    //     if (num_stream <= cur)
-    //         cur = 0;
-    //     cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
-    // }
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
+    }
 
     // wait for stream
     for (int i=0;i<num_stream;i++)
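The example's change re-enables separate host-to-device and device-to-host loops so that copies in one stream can overlap work in another. Below is a minimal self-contained sketch of that pattern, not the example itself: NUM_STREAM, NUM_EXEC, dev_buf and the pinned in/out arrays are hypothetical names, and the kernel launches are elided so the program needs no .ptx module.

// multistream_copy_sketch.cc : hypothetical illustration of the restored pattern.
// Build (assumption): g++ multistream_copy_sketch.cc -lcuda -o multistream_copy_sketch
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define NUM_STREAM 4
#define NUM_EXEC   16

#define CHECK(call) do { CUresult r = (call); \
    if (r != CUDA_SUCCESS) { fprintf(stderr, "CUDA error %d at line %d\n", (int)r, __LINE__); exit(1); } } while (0)

int main() {
    CUdevice dev; CUcontext ctx;
    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&dev, 0));
    CHECK(cuCtxCreate(&ctx, 0, dev));

    CUstream stream[NUM_STREAM];
    for (int i = 0; i < NUM_STREAM; i++)
        CHECK(cuStreamCreate(&stream[i], 0));

    // Pinned host memory so the async copies can actually overlap.
    float *in, *out;
    CHECK(cuMemAllocHost((void**)&in,  NUM_EXEC * sizeof(float)));
    CHECK(cuMemAllocHost((void**)&out, NUM_EXEC * sizeof(float)));

    CUdeviceptr dev_buf[NUM_EXEC];
    for (int i = 0; i < NUM_EXEC; i++)
        CHECK(cuMemAlloc(&dev_buf[i], sizeof(float)));

    // 1) All host-to-device copies, round-robin over the streams.
    for (int i = 0, cur = 0; i < NUM_EXEC; i++, cur = (cur + 1) % NUM_STREAM) {
        in[i] = (float)(i + 1);
        CHECK(cuMemcpyHtoDAsync(dev_buf[i], &in[i], sizeof(float), stream[cur]));
    }

    // 2) Kernel launches on the same streams would go here (cuLaunchKernel).

    // 3) All device-to-host copies, again round-robin over the streams.
    for (int i = 0, cur = 0; i < NUM_EXEC; i++, cur = (cur + 1) % NUM_STREAM)
        CHECK(cuMemcpyDtoHAsync(&out[i], dev_buf[i], sizeof(float), stream[cur]));

    // 4) Only now wait: every stream must finish before the results are read.
    for (int i = 0; i < NUM_STREAM; i++)
        CHECK(cuStreamSynchronize(stream[i]));

    printf("out[0] = %f, out[%d] = %f\n", out[0], NUM_EXEC - 1, out[NUM_EXEC - 1]);

    for (int i = 0; i < NUM_EXEC; i++) cuMemFree(dev_buf[i]);
    cuMemFreeHost(in); cuMemFreeHost(out);
    for (int i = 0; i < NUM_STREAM; i++) cuStreamDestroy(stream[i]);
    cuCtxDestroy(ctx);
    return 0;
}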