Mercurial > hg > Game > Cerium
view TaskManager/Cuda/CudaScheduler.cc @ 1941:f19885ea776d draft
add wordcount for cuda. fix CudaScheduler. add makefile
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 04 Feb 2014 02:18:07 +0900 |
parents | 67e50779feb4 |
children | da22fc4db5b2 |
line wrap: on
line source
#include "TaskManager.h" #include "CudaScheduler.h" #include "ReferencedDmaManager.h" #include "PreRefDmaManager.h" #include "SchedTask.h" #include "CudaError.h" #include "ListData.h" #include "SysFunc.h" #include "gettime.h" #include "error.h" #include <stdio.h> #include <fcntl.h> #include <sys/stat.h> #include <string.h> #include <cuda.h> TaskObject cuda_task_list[MAX_TASK_OBJECT]; CudaScheduler::CudaScheduler() { } void CudaScheduler::init_gpu() { cuInit(0); cuDeviceGetCount(&ret_num_devices); if (ret_num_devices == 0) { exit(EXIT_FAILURE); } cuDeviceGet(&device, 0); ret = cuCtxCreate(&context, 0, device); if (ret!=0) { error(convert_error_status(ret)); } } CudaScheduler::~CudaScheduler() { cuCtxDestroy(context); } void CudaScheduler::initCudaBuffer(CudaBufferPtr m) { m->allcate_size = 64; m->in_size = 0; m->out_size = 0; m->memin = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->memout = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->event = (CUevent*)malloc(m->allcate_size*sizeof(CUevent*)); ret = cuStreamCreate(&(m->stream), 0); if (ret!=0) error(convert_error_status(ret)); } void CudaScheduler::destroyCudaBuffer(CudaBufferPtr m) { free(m->memin); free(m->memout); free(m->event); ret = cuStreamDestroy(m->stream); if (ret!=0) error(convert_error_status(ret)); m->memin = 0; m->memout = 0; m->in_size = 0; m->out_size = 0; m->allcate_size = 0; m->event = 0; m->stream = 0; } void CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size) { if (i > cudabuffer->allcate_size) { cudabuffer->allcate_size *= 2; cudabuffer->memin = (CUdeviceptr*)realloc(cudabuffer->memin, cudabuffer->allcate_size*sizeof(CUdeviceptr*)); cudabuffer->memout = (CUdeviceptr*)realloc(cudabuffer->memout, cudabuffer->allcate_size*sizeof(CUdeviceptr*)); cudabuffer->event = (CUevent*)realloc(cudabuffer->event, cudabuffer->allcate_size*sizeof(CUevent*)); } ret = cuMemAlloc(&mem[i], size); } #define NOP_REPLY NULL static void release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) { for (int i=0; i<mem[cur-1].in_size; i++) { if (mem[cur-1].memin[i]) cuMemFree(mem[cur-1].memin[i]); mem[cur-1].memin[i] = 0; } for (int i=0; i<mem[cur-1].out_size; i++) { if (mem[cur-1].event[i] != 0) cuEventDestroy(mem[cur-1].event[i]); mem[cur-1].event[i] = 0; if (mem[cur-1].memout[i]) cuMemFree(mem[cur-1].memout[i]); mem[cur-1].memout[i] = 0; } mem[cur-1].in_size = 0; mem[cur-1].out_size = 0; } void CudaScheduler::wait_for_event(CUevent* kernel_event, CudaBufferPtr cudabuffer, TaskListPtr taskList, int cur) { if (kernel_event[cur-1] == NOP_REPLY) { } else if (kernel_event[cur-1] != NULL){ ret = cuEventSynchronize(kernel_event[cur-1]); if (ret!=0) { error(convert_error_status(ret)); } if (taskList!=NULL) { unsigned long start = 0; unsigned long end = 0; // timestamp 取る方法がない? } ret = cuEventDestroy(kernel_event[cur-1]); if (ret!=0) { error(convert_error_status(ret)); } kernel_event[cur-1] = 0; if (cudabuffer[cur-1].out_size > 0) { for (int i = 0; i<cudabuffer[cur-1].out_size; i++) { ret = cuEventSynchronize(cudabuffer[cur-1].event[i]); if (ret!=0) error(convert_error_status(ret)); } } release_buf_event(cur, cudabuffer); } if(reply) { connector->mail_write(reply); __debug(this, "CUDA %d %s\t%lld\n", taskList->cpu_type, (char*)(cuda_task_list[taskList->tasks[0].command].name), taskList->task_end_time-taskList->task_start_time); reply = 0; } } void CudaScheduler::CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret) { error(convert_error_status(ret)); if (kernel_event[cur] != 0) cuEventDestroy(kernel_event[cur]); kernel_event[cur] = NOP_REPLY; kernel[cur] = 0; release_buf_event(cur+1, cudabuffer); wait_for_event(kernel_event, cudabuffer, taskList, cur); } void CudaScheduler::run() { init_gpu(); int cur = 0; TaskListPtr tasklist = NULL; reply = 0; for (int i = 0; i<STAGE; i++) { initCudaBuffer(&cudabuffer[i]); kernel_event[i]=0; } memset(&flag, 0, sizeof(HTask::htask_flag)*STAGE); for (;;) { memaddr param_addr = connector->task_list_mail_read(); if ((memaddr)param_addr == (memaddr)MY_SPE_COMMAND_EXIT) { for (int i = 0; i<STAGE; i++) { destroyCudaBuffer(&cudabuffer[i]); } return; } (*connector->start_dmawait_profile)(&(connector->start_time)); while (param_addr) { // since we are on the same memory space, we don't has to use dma_load here tasklist = (TaskListPtr)connector->dma_load(this, param_addr, sizeof(TaskList), DMA_READ_TASKLIST); // tasklist[cur]->task_start_time = gettime(); tasklist->task_start_time = 0; /* * get flip flag * flip : When caluculate on input data, to treat this as a output data */ if (tasklist->self) { flag[cur] = tasklist->self->flag; } else { memset(&flag[cur], 0, sizeof(HTask::htask_flag)); } for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) { if(nextTask->command==ShowTime) { connector->show_profile(); continue; } if(nextTask->command==StartProfile) { connector->start_profile(); continue; } if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } CUmodule& module = *cuda_task_list[nextTask->command].cudatask->module; const char *funcname = cuda_task_list[nextTask->command].name; ret = cuModuleGetFunction(&kernel[cur], module, funcname); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } int param = 0; // set arg count createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } // parameter is passed as first kernel arg ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } param++; for(int i=0;i<nextTask->inData_count;i++) { ListElement *input_buf = nextTask->inData(i); if (input_buf->size==0) break; createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } param++; } cudabuffer[cur].in_size = param; // +1 means param for(int i = 0; i<nextTask->outData_count;i++) { // set output data ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; if (!flag[cur].flip) { // flip use memin for output createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } // enqueue later } param++; } cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event void** kernelParams; if (!flag[cur].flip) { kernelParams = (void**)malloc(sizeof(void*)*param); for (int i = 0; i<cudabuffer[cur].in_size; i++) { kernelParams[i] = &cudabuffer[cur].memin[i]; } for (int i = 0; i<cudabuffer[cur].out_size; i++) { kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i]; } } else { kernelParams = (void**)malloc(sizeof(void*)*cudabuffer[cur].in_size); for (int i = 0; i<cudabuffer[cur].in_size-1; i++) { kernelParams[i] = &cudabuffer[cur].memin[i]; } } ret = cuEventCreate(&kernel_event[cur], 0); if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } if (tasklist->dim > 0) { ret = cuLaunchKernel(kernel[cur], tasklist->x*tasklist->y*tasklist->z, 1, 1, 1, 1, 1, 0, cudabuffer[cur].stream, kernelParams, NULL); } else { ret = cuLaunchKernel(kernel[cur], 1, 1, 1, 1, 1, 1, 0, cudabuffer[cur].stream, kernelParams, NULL); } if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } ret = cuEventRecord(kernel_event[cur], cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } for(int i=0;i<nextTask->outData_count;i++) { // read output data ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ; int i0 = flag[cur].flip ? i+1 : i ; // flip use memin buffer and memout event ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } ret = cuEventCreate(&cudabuffer[cur].event[i], 0); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } ret = cuEventRecord(cudabuffer[cur].event[i], cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } } // wait kernel[1-cur] and write[1-cur] // pipeline : cur // to stop pipeline set cur+1 if (cur == 0) { wait_for_event(kernel_event, cudabuffer, tasklist, STAGE); // to stop pipeline comment out this line } else { wait_for_event(kernel_event, cudabuffer, tasklist, cur); } cur++; if (STAGE <= cur) cur = 0; free(kernelParams); } reply = (memaddr)tasklist->waiter; param_addr = (memaddr)tasklist->next; } if (cur == 0) { wait_for_event(kernel_event, cudabuffer, tasklist, STAGE); } else { wait_for_event(kernel_event, cudabuffer, tasklist, cur); } for (int i = 0; i<STAGE; i++) { ret = cuStreamSynchronize(cudabuffer[i].stream); if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } } unsigned long long wait = 0; (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time)); connector->mail_write((memaddr)MY_SPE_STATUS_READY); } /* NOT REACHED */ } int not_ready(SchedTask* smanager, void* r, void *w) { smanager->printf("GPU task not ready %d\n", smanager->atask->command); return 0; } /* * kernel file open and build program */ int CudaScheduler::load_kernel(int cmd) { if (cuda_task_list[cmd].run == null_run) { return 1; } if (cuda_task_list[cmd].cudatask == 0 || cuda_task_list[cmd].cudatask->filename == 0) { fprintf(stderr, "CUDA module %d not defined.\n",cmd); return 0; } CUmodule* module = new CUmodule; ret = cuModuleLoad(module, cuda_task_list[cmd].cudatask->filename); if(ret!=0) { error(convert_error_status(ret)); } cuda_task_list[cmd].cudatask->module = module; cuda_task_list[cmd].run = null_run; // kernel is ready return 1; } // regist kernel file name void cuda_register_task(int cmd, const char* filename, const char* functionname) { cuda_task_list[cmd].run = not_ready; // not yet ready cuda_task_list[cmd].load = null_loader; cuda_task_list[cmd].wait = null_loader; cuda_task_list[cmd].name = functionname; cuda_task_list[cmd].cudatask->filename = (const char*)filename; } /* end */