Mercurial > hg > Game > Cerium
view TaskManager/Cuda/CudaScheduler.cc @ 2022:fac44ad2867d draft
make a sound
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 16 Jul 2014 02:50:32 +0900 |
parents | 1d7d1e398833 |
children |
line wrap: on
line source
#include "TaskManager.h"
#include "CudaScheduler.h"
#include "ReferencedDmaManager.h"
#include "PreRefDmaManager.h"
#include "SchedTask.h"
#include "CudaError.h"
#include "ListData.h"
#include "SysFunc.h"
#include "gettime.h"
#include "error.h"
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <string.h>
#include <cuda.h>
#include <map>
using namespace std;

TaskObject cuda_task_list[MAX_TASK_OBJECT];

// Number of device-pointer slots a CudaBuffer starts with.
static const int INITIAL_BUFFER_SLOTS = 64;

CudaScheduler::CudaScheduler() {
    // The context is only created in init_gpu() (called from run()); start
    // from a known value so the destructor can tell whether it exists.
    context = 0;
}

/*
 * Initialize the CUDA driver API, select device 0 and create a context on it.
 * Exits the process when no CUDA device is available.
 */
void
CudaScheduler::init_gpu() {
    ret = cuInit(0);
    if (ret != 0) {
        error(convert_error_status(ret));
    }
    // cuDeviceGetCount leaves the counter untouched on failure.
    ret_num_devices = 0;
    ret = cuDeviceGetCount(&ret_num_devices);
    if (ret != 0) {
        error(convert_error_status(ret));
    }
    if (ret_num_devices == 0) {
        error("no cuda device.");
        exit(EXIT_FAILURE);
    }
    ret = cuDeviceGet(&device, 0);
    if (ret != 0) {
        error(convert_error_status(ret));
    }
    /*
     * context flags:
     *   CU_CTX_SCHED_AUTO
     *   CU_CTX_SCHED_SPIN
     *   CU_CTX_SCHED_YIELD
     */
    ret = cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);
    if (ret != 0) {
        error(convert_error_status(ret));
    }
}

CudaScheduler::~CudaScheduler() {
    // Only destroy a context that init_gpu() actually created.
    if (context != 0) {
        cuCtxDestroy(context);
    }
}

/*
 * Prepare one pipeline stage buffer: allocate the host-side arrays that hold
 * device pointers and kernel parameters, and create the stage's stream.
 */
void
CudaScheduler::initCudaBuffer(CudaBufferPtr m) {
    m->allcate_size = INITIAL_BUFFER_SLOTS;
    m->in_size  = 0;
    m->out_size = 0;
    // The arrays hold CUdeviceptr values, so size them by the element type.
    m->memin  = (CUdeviceptr*)malloc(m->allcate_size * sizeof(CUdeviceptr));
    m->memout = (CUdeviceptr*)malloc(m->allcate_size * sizeof(CUdeviceptr));
    ret = cuStreamCreate(&(m->stream), 0);
    if (ret != 0)
        error(convert_error_status(ret));
    // input and output parameters share one array, hence twice the slots
    m->kernelParams = (void**)malloc(m->allcate_size * 2 * sizeof(void*));
}

/*
 * Release everything initCudaBuffer() set up and reset the fields.
 */
void
CudaScheduler::destroyCudaBuffer(CudaBufferPtr m) {
    free(m->memin);
    free(m->memout);
    free(m->kernelParams);
    ret = cuStreamDestroy(m->stream);
    if (ret != 0)
        error(convert_error_status(ret));
    m->memin = 0;
    m->memout = 0;
    m->in_size = 0;
    m->out_size = 0;
    m->allcate_size = 0;
    m->stream = 0;
    m->kernelParams = 0;
}

/*
 * Allocate `size` bytes of device memory and store the handle in mem[param].
 *
 * cudabuffer : stage buffer that owns `mem`
 * mem        : cudabuffer->memin or cudabuffer->memout
 * param      : slot index inside `mem`
 * size       : number of bytes to allocate on the device
 *
 * The result of the allocation is left in `ret` for the caller to check.
 * When `param` does not fit, the host-side arrays are grown first.
 *
 * NOTE(review): growing moves memin/memout/kernelParams. Addresses of their
 * elements that were stored earlier (kernelParams entries, `transmitted`,
 * `reverse_map`) are not rebased here and dangle after a growth.
 */
void
CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int param, size_t size) {
    // `>=` : index allcate_size itself is already out of bounds
    if (param >= cudabuffer->allcate_size) {
        // remember which array the caller meant; realloc may move it
        const bool wants_in  = (mem == cudabuffer->memin);
        const bool wants_out = (mem == cudabuffer->memout);

        int slots = cudabuffer->allcate_size > 0 ? (int)cudabuffer->allcate_size
                                                 : INITIAL_BUFFER_SLOTS;
        while (param >= slots) {
            slots *= 2;
        }

        // Keep the old block when realloc fails instead of overwriting the
        // only pointer to it with NULL.
        CUdeviceptr* grown_in = (CUdeviceptr*)realloc(cudabuffer->memin, slots * sizeof(CUdeviceptr));
        if (grown_in) cudabuffer->memin = grown_in;
        CUdeviceptr* grown_out = (CUdeviceptr*)realloc(cudabuffer->memout, slots * sizeof(CUdeviceptr));
        if (grown_out) cudabuffer->memout = grown_out;
        void** grown_params = (void**)realloc(cudabuffer->kernelParams, slots * 2 * sizeof(void*));
        if (grown_params) cudabuffer->kernelParams = grown_params;

        if (!grown_in || !grown_out || !grown_params) {
            // allcate_size is left unchanged, so the slot is still out of range
            ret = CUDA_ERROR_OUT_OF_MEMORY;
            return;
        }
        cudabuffer->allcate_size = slots;

        // re-derive the target array, the caller's pointer may be stale now
        if (wants_in) {
            mem = cudabuffer->memin;
        } else if (wants_out) {
            mem = cudabuffer->memout;
        }
    }

    ret = cuMemAlloc(&mem[param], size);
}

#define NOP_REPLY NULL

/*
 * Stage 1 of the pipeline: for up to STAGE tasks starting at nextTask, load
 * the kernel, allocate device buffers, start the host-to-device copies and
 * fill in the kernel parameter array.
 *
 * Returns the number of pipeline slots that were consumed.
 *
 * NOTE(review): a `continue` inside the input/output loops only skips that
 * one buffer, the task itself is not abandoned. CudaTaskError() has cleared
 * kernel[cur] by then, so exec() will not launch it.
 */
int
CudaScheduler::read(TaskPtr nextTask, TaskListPtr tasklist) {
    int cur = 0;
    for (; nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) {
        if (STAGE <= cur) return cur;

        /*
         * get flip flag
         * flip : the task computes on its input data and that data is
         *        treated as the output data
         */
        if (tasklist->self) {
            flag[cur] = tasklist->self->flag;
        } else {
            memset(&flag[cur], 0, sizeof(HTask::htask_flag)); // unnecessary ?
        }

        // Profiling commands still occupy a slot (cur is incremented), so
        // clear the kernel handle; otherwise exec() would launch whatever a
        // previous round left in this slot.
        if (nextTask->command == ShowTime) {
            kernel[cur] = 0;
            connector->show_profile();
            continue;
        }
        if (nextTask->command == StartProfile) {
            kernel[cur] = 0;
            connector->start_profile();
            continue;
        }

        if (load_kernel(nextTask->command) == 0) {
            CudaTaskError(cudabuffer, cur, tasklist, ret);
            continue;
        }

        ret = cuModuleGetFunction(&kernel[cur],
                                  *cuda_task_list[nextTask->command].cudatask->module,
                                  cuda_task_list[nextTask->command].name);
        if (ret != 0) {
            CudaTaskError(cudabuffer, cur, tasklist, ret);
            continue;
        }

        int param = 0;

        // set arg count
        createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param,
                     sizeof(memaddr) * nextTask->param_count);
        if (ret != 0) {
            CudaTaskError(cudabuffer, cur, tasklist, ret);
            continue;
        }

        // parameter is passed as first kernel arg
        ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0),
                                sizeof(memaddr) * nextTask->param_count,
                                cudabuffer[cur].stream);
        if (ret != 0) {
            CudaTaskError(cudabuffer, cur, tasklist, ret);
            continue;
        }

        cudabuffer[cur].kernelParams[param] = &cudabuffer[cur].memin[param];
        param++;

        for (int i = 0; i < nextTask->inData_count; i++) {
            ListElement *input_buf = nextTask->inData(i);
            if (input_buf->size == 0) break;
            // only allocate and copy host buffers that are not on the device yet
            if (!transmitted.count(input_buf->addr)) {
                createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size);
                if (ret != 0) {
                    CudaTaskError(cudabuffer, cur, tasklist, ret);
                    continue;
                }
                ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr,
                                        input_buf->size, cudabuffer[cur].stream);
                if (ret != 0) {
                    CudaTaskError(cudabuffer, cur, tasklist, ret);
                    continue;
                }
                transmitted.insert(make_pair(input_buf->addr, &cudabuffer[cur].memin[param]));
                reverse_map.insert(make_pair(&cudabuffer[cur].memin[param], input_buf->addr));
            }
            cudabuffer[cur].kernelParams[param] = transmitted[input_buf->addr];
            param++;
        }
        cudabuffer[cur].in_size = param; // +1 means param

        for (int i = 0; i < nextTask->outData_count; i++) { // set output data
            ListElement *output_buf = nextTask->outData(i);
            if (output_buf->size == 0) break;
            if (!transmitted.count(output_buf->addr)) {
                createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size);
                if (ret != 0) {
                    CudaTaskError(cudabuffer, cur, tasklist, ret);
                    continue;
                }
                transmitted.insert(make_pair(output_buf->addr, &cudabuffer[cur].memout[i]));
                reverse_map.insert(make_pair(&cudabuffer[cur].memout[i], output_buf->addr));
                cudabuffer[cur].kernelParams[param] = transmitted[output_buf->addr];
                param++;
            }
        }
        cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event
    }
    return cur;
}

/*
 * Stage 2 of the pipeline: launch the kernels prepared by read() for slots
 * 0 .. cur-1, each on its own stream.
 */
void
CudaScheduler::exec(TaskListPtr tasklist, int cur) {
    for (int i = 0; i < cur; i++) {
        // slot holds no kernel: profiling command or a task that failed in read()
        if (kernel[i] == 0) continue;

        if (tasklist->dim > 0) {
            ret = cuLaunchKernel(kernel[i],
                                 tasklist->x, tasklist->y, tasklist->z,
                                 1, 1, 1,
                                 0, cudabuffer[i].stream, cudabuffer[i].kernelParams, NULL);
        } else {
            ret = cuLaunchKernel(kernel[i],
                                 1, 1, 1,
                                 1, 1, 1,
                                 0, cudabuffer[i].stream, cudabuffer[i].kernelParams, NULL);
        }
        if (ret != 0) {
            CudaTaskError(cudabuffer, i, tasklist, ret);
            continue;
        }
    }
}

/*
 * Stage 3 of the pipeline: start the device-to-host copies of the output
 * data for up to STAGE tasks and drop those buffers from the transfer maps.
 *
 * Returns the first task that was not processed.
 */
TaskPtr
CudaScheduler::write(TaskPtr nextTask, TaskListPtr tasklist) {
    int cur = 0;
    for (; nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) {
        if (STAGE <= cur) break;
        // flip enabled : no data transfer from device to host
        if (flag[cur].flip) continue;
        for (int i = 0; i < nextTask->outData_count; i++) { // read output data
            ListElement *output_buf = nextTask->outData(i);
            if (output_buf->size == 0) break;
            if (transmitted.count(output_buf->addr)) {
                ret = cuMemcpyDtoHAsync(output_buf->addr, *transmitted[output_buf->addr],
                                        output_buf->size, cudabuffer[cur].stream);
                if (ret != 0) {
                    CudaTaskError(cudabuffer, cur, tasklist, ret);
                    continue;
                }
                reverse_map.erase(transmitted[output_buf->addr]);
                transmitted.erase(output_buf->addr);
            }
        }
    }
    return nextTask;
}

/*
 * Free the device memory of slot `cur` that is no longer listed in `held`.
 *
 * cur  : pipeline slot index
 * mem  : array of stage buffers
 * held : device-pointer slots that must stay allocated; taken by const
 *        reference so the map is not copied on every call
 */
static void
release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem, const map<CUdeviceptr*, memaddr>& held) {
    for (int i = 0; i < mem[cur].in_size; i++) {
        // entries already released were reset to 0 below
        if (mem[cur].memin[i] != 0 && !held.count(&mem[cur].memin[i])) {
            cuMemFree(mem[cur].memin[i]);
            mem[cur].memin[i] = 0;
        }
    }
    for (int i = 0; i < mem[cur].out_size; i++) {
        if (mem[cur].memout[i] != 0 && !held.count(&mem[cur].memout[i])) {
            cuMemFree(mem[cur].memout[i]);
            mem[cur].memout[i] = 0;
        }
    }
}

/*
 * Wait until the streams of slots 0 .. cur-1 are idle, release their device
 * buffers and, when a reply is pending, send it through the connector.
 */
void
CudaScheduler::wait_for_event(CudaBufferPtr cudabuffer, TaskListPtr taskList, int cur) {
    for (int i = 0; i < cur; i++) {
        // Query once: a second query could return a different answer.
        CUresult status = cuStreamQuery(cudabuffer[i].stream);
        if (status == CUDA_SUCCESS) {
            // every operation in the stream has completed
            continue;
        }
        if (status == CUDA_ERROR_NOT_READY) {
            // wait for finish
            ret = cuStreamSynchronize(cudabuffer[i].stream);
            if (ret != 0) {
                error(convert_error_status(ret));
            }
        } else {
            // neither done nor pending: report instead of dropping it
            error(convert_error_status(status));
        }
    }

    if (taskList != NULL) {
        // unsigned long start = 0;
        // unsigned long end = 0;
        // Is there no way to get a timestamp?
    }

    for (int i = 0; i < cur; i++) {
        if (cudabuffer[i].in_size > 0 || cudabuffer[i].out_size > 0)
            release_buf_event(i, cudabuffer, reverse_map);
    }

    if (reply) {
        connector->mail_write(reply);
        // self may be null (read() checks for that as well)
        if (taskList && taskList->self) {
            __debug(this, "CUDA %d %s\t%lld\n", taskList->self->cpu_type, (char*)(cuda_task_list[taskList->tasks[0].command].name), taskList->task_end_time-taskList->task_start_time);
        }
        reply = 0;
    }
}

/*
 * Report a CUDA error for slot `cur`, clear its kernel handle and wait for
 * the slots before it.
 *
 * NOTE(review): wait_for_event() also releases the buffers of the earlier
 * slots; when this is called from read(), their kernels have not been
 * launched yet.
 */
void
CudaScheduler::CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret) {
    error(convert_error_status(ret));
    kernel[cur] = 0;
    wait_for_event(cudabuffer, taskList, cur);
}

/*
 * Scheduler main loop: initialize the GPU and the stage buffers, then keep
 * reading task list mails and run each list through read / exec / write
 * until MY_SPE_COMMAND_EXIT arrives.
 */
void
CudaScheduler::run() {
    init_gpu();
    int cur = 0; // current pipeline index
    TaskListPtr tasklist = NULL;
    reply = 0;

    for (int i = 0; i < STAGE; i++) {
        initCudaBuffer(&cudabuffer[i]);
    }

    memset(&flag, 0, sizeof(HTask::htask_flag)*STAGE);

    for (;;) {
        memaddr param_addr = connector->task_list_mail_read();

        if ((memaddr)param_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
            for (int i = 0; i < STAGE; i++) {
                destroyCudaBuffer(&cudabuffer[i]);
            }
            return;
        }

        (*connector->start_dmawait_profile)(&(connector->start_time));
        while (param_addr) {
            // since we are in the same memory space, we don't have to use dma_load here
            tasklist = (TaskListPtr)connector->dma_load(this, param_addr,
                                                        sizeof(TaskList), DMA_READ_TASKLIST);
            tasklist->task_start_time = 0;

            for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last();) {
                cur = read(nextTask, tasklist);
                exec(tasklist, cur);
                nextTask = write(nextTask, tasklist);
                wait_for_event(cudabuffer, tasklist, cur);
            }
            // the reply for this list is sent by the next wait_for_event()
            reply = (memaddr)tasklist->waiter;
            param_addr = (memaddr)tasklist->next;
        }
        wait_for_event(cudabuffer, tasklist, 0);

        unsigned long long wait = 0;
        (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
        connector->mail_write((memaddr)MY_SPE_STATUS_READY);
    }
    /* NOT REACHED */
}

/*
 * Placeholder run function of a registered task whose kernel has not been
 * loaded yet.
 */
int
not_ready(SchedTask* smanager, void* r, void *w) {
    smanager->printf("GPU task not ready %d\n", smanager->atask->command);
    return 0;
}

/*
 * Load the CUDA module of task `cmd` unless it is already loaded.
 *
 * Returns 1 when the kernel is ready, 0 when the module is not defined or
 * could not be loaded. On failure `ret` holds an error status for the caller.
 */
int
CudaScheduler::load_kernel(int cmd) {
    if (cuda_task_list[cmd].run == null_run) {
        // already loaded
        return 1;
    }

    if (cuda_task_list[cmd].cudatask == 0 || cuda_task_list[cmd].cudatask->filename == 0) {
        fprintf(stderr, "CUDA module %d not defined.\n",cmd);
        // give the caller a real error status instead of a stale one
        ret = CUDA_ERROR_INVALID_VALUE;
        return 0;
    }

    CUmodule* module = new CUmodule;
    ret = cuModuleLoad(module, cuda_task_list[cmd].cudatask->filename);
    if (ret != 0) {
        error(convert_error_status(ret));
        // do not publish a module that failed to load
        delete module;
        return 0;
    }

    cuda_task_list[cmd].cudatask->module = module;
    cuda_task_list[cmd].run = null_run; // kernel is ready
    return 1;
}

/*
 * Register the kernel file name and function name of task `cmd`.
 * The module itself is loaded later by CudaScheduler::load_kernel().
 */
void
cuda_register_task(int cmd, const char* filename, const char* functionname) {
    if (cuda_task_list[cmd].cudatask == 0) {
        // load_kernel() treats a missing cudatask as "not defined" too
        fprintf(stderr, "CUDA task %d has no cudatask object.\n", cmd);
        return;
    }
    cuda_task_list[cmd].run = not_ready; // not yet ready
    cuda_task_list[cmd].load = null_loader;
    cuda_task_list[cmd].wait = null_loader;
    cuda_task_list[cmd].name = functionname;
    cuda_task_list[cmd].cudatask->filename = (const char*)filename;
}

/* end */