# HG changeset patch # User Shohei KOKUBO # Date 1394873173 -32400 # Node ID c3b4083c4467cc7eebac876e10a5af8a7f14aa6e # Parent 455e620ad2b2ef4af028b817331145b6c7b4dbc2 fix CudaScheduler diff -r 455e620ad2b2 -r c3b4083c4467 TaskManager/Cuda/CudaScheduler.cc --- a/TaskManager/Cuda/CudaScheduler.cc Sat Mar 15 16:06:03 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Sat Mar 15 17:46:13 2014 +0900 @@ -89,6 +89,123 @@ #define NOP_REPLY NULL +int +CudaScheduler::read(TaskPtr nextTask, TaskListPtr tasklist) { + int cur = 0; + for (;nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) { + if (STAGE <= cur) return cur; + + /* + * get flip flag + * flip : When caluculate on input data, to treat this as a output data + */ + if (tasklist->self) { + flag[cur] = tasklist->self->flag; + } else { + memset(&flag[cur], 0, sizeof(HTask::htask_flag)); // unnecessary ? + } + + if(nextTask->command==ShowTime) { + connector->show_profile(); continue; + } + if(nextTask->command==StartProfile) { + connector->start_profile(); continue; + } + if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + + ret = cuModuleGetFunction(&kernel[cur], *cuda_task_list[nextTask->command].cudatask->module, cuda_task_list[nextTask->command].name); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + + int param = 0; + + // set arg count + createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + + // parameter is passed as first kernel arg + ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + + param++; + + for(int i=0;iinData_count;i++) { + ListElement *input_buf = nextTask->inData(i); + if (input_buf->size==0) break; + createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + + param++; + } + cudabuffer[cur].in_size = param; // +1 means param + + for(int i = 0; ioutData_count;i++) { // set output data + ListElement *output_buf = nextTask->outData(i); + if (output_buf->size==0) break; + if (!flag[cur].flip) { // flip use memin for output + createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + // enqueue later + } + param++; + } + cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event + + if (!flag[cur].flip) { + for (int i = 0; idim > 0) { + ret = cuLaunchKernel(kernel[i], + tasklist->x, tasklist->y, tasklist->z, + 1, 1, 1, + 0, cudabuffer[i].stream, cudabuffer[i].kernelParams, NULL); + } else { + ret = cuLaunchKernel(kernel[i], + 1, 1, 1, + 1, 1, 1, + 0, cudabuffer[i].stream, cudabuffer[i].kernelParams, NULL); + } + if (ret!=0) { CudaTaskError(cudabuffer , i, tasklist, ret); continue; } + } +} + +TaskPtr +CudaScheduler::write(TaskPtr nextTask, TaskListPtr tasklist) { + int cur = 0; + for (;nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) { + if (STAGE <= cur) break; + for(int i=0;ioutData_count;i++) { // read output data + ListElement *output_buf = nextTask->outData(i); + if (output_buf->size==0) break; + CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ; + int i0 = flag[cur].flip ? i+1 : i ; + // flip use memin buffer and memout event + ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + } + } + return nextTask; +} + static void release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) { for (int i=0; i 0 || cudabuffer[cur].out_size > 0) - release_buf_event(cur, cudabuffer); + for (int i=0;i 0 || cudabuffer[i].out_size > 0) + release_buf_event(i, cudabuffer); + } if(reply) { connector->mail_write(reply); @@ -172,109 +291,16 @@ tasklist = (TaskListPtr)connector->dma_load(this, param_addr, sizeof(TaskList), DMA_READ_TASKLIST); tasklist->task_start_time = 0; - /* - * get flip flag - * flip : When caluculate on input data, to treat this as a output data - */ - if (tasklist->self) { - flag[cur] = tasklist->self->flag; - } else { - memset(&flag[cur], 0, sizeof(HTask::htask_flag)); // unnecessary ? - } - for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) { - if(nextTask->command==ShowTime) { - connector->show_profile(); continue; - } - if(nextTask->command==StartProfile) { - connector->start_profile(); continue; - } - if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - - ret = cuModuleGetFunction(&kernel[cur], *cuda_task_list[nextTask->command].cudatask->module, cuda_task_list[nextTask->command].name); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - - int param = 0; - - // set arg count - createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - - // parameter is passed as first kernel arg - ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - - param++; - - for(int i=0;iinData_count;i++) { - ListElement *input_buf = nextTask->inData(i); - if (input_buf->size==0) break; - createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - - param++; - } - cudabuffer[cur].in_size = param; // +1 means param - - for(int i = 0; ioutData_count;i++) { // set output data - ListElement *output_buf = nextTask->outData(i); - if (output_buf->size==0) break; - if (!flag[cur].flip) { // flip use memin for output - createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - // enqueue later - } - param++; - } - cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event - - if (!flag[cur].flip) { - for (int i = 0; idim > 0) { - ret = cuLaunchKernel(kernel[cur], - tasklist->x, tasklist->y, tasklist->z, - 1, 1, 1, - 0, cudabuffer[cur].stream, cudabuffer[cur].kernelParams, NULL); - } else { - ret = cuLaunchKernel(kernel[cur], - 1, 1, 1, - 1, 1, 1, - 0, cudabuffer[cur].stream, cudabuffer[cur].kernelParams, NULL); - } - if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } - - for(int i=0;ioutData_count;i++) { // read output data - ListElement *output_buf = nextTask->outData(i); - if (output_buf->size==0) break; - CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ; - int i0 = flag[cur].flip ? i+1 : i ; - // flip use memin buffer and memout event - ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - } + for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last();) { + cur = read(nextTask, tasklist); + exec(tasklist, cur); + nextTask = write(nextTask, tasklist); wait_for_event(cudabuffer, tasklist, cur); - cur++; // wait write[cur+1] - if (STAGE <= cur) // to stop pipeline move to after wait_for_event - cur = 0; // } reply = (memaddr)tasklist->waiter; param_addr = (memaddr)tasklist->next; } - wait_for_event(cudabuffer, tasklist, cur); + wait_for_event(cudabuffer, tasklist, 0); unsigned long long wait = 0; (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time)); diff -r 455e620ad2b2 -r c3b4083c4467 TaskManager/Cuda/CudaScheduler.h --- a/TaskManager/Cuda/CudaScheduler.h Sat Mar 15 16:06:03 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.h Sat Mar 15 17:46:13 2014 +0900 @@ -55,6 +55,9 @@ void initCudaBuffer(CudaBufferPtr m); void destroyCudaBuffer(CudaBufferPtr m); void CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret); + int read(TaskPtr nextTask, TaskListPtr tasklist); + void exec(TaskListPtr tasklist, int cur); + TaskPtr write(TaskPtr nextTask, TaskListPtr tasklist); }; #define CudaSchedRegister(str, filename, functionname) \ diff -r 455e620ad2b2 -r c3b4083c4467 example/Cuda/main.cc --- a/example/Cuda/main.cc Sat Mar 15 16:06:03 2014 +0900 +++ b/example/Cuda/main.cc Sat Mar 15 17:46:13 2014 +0900 @@ -85,12 +85,12 @@ // Asynchronous data transfer(host to device) int cur = 0; - // for (int i=0;i