Mercurial > hg > Game > Cerium
view TaskManager/Gpu/GpuScheduler.cc @ 1886:c3573a5ac6a1 draft
GPU also waits
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 03 Jan 2014 16:57:34 +0900 |
parents | 44fa0f1320a9 |
children | 5238ca826d6e |
line wrap: on
line source
#include "TaskManager.h" #include "GpuScheduler.h" #include "ReferencedDmaManager.h" #include "PreRefDmaManager.h" #include "SchedTask.h" #include "stdio.h" #include "GpuError.h" #include "ListData.h" #include "SysFunc.h" #include "gettime.h" #include <fcntl.h> #include <sys/stat.h> #include <string.h> TaskObject gpu_task_list[MAX_TASK_OBJECT]; GpuScheduler::GpuScheduler() { init_gpu(); } void GpuScheduler::init_impl(int useRefDma) { if (useRefDma & 0x10) { fifoDmaManager = new PreRefDmaManager(); // Prefetch command and no copy } else if (useRefDma & 0x01) { fifoDmaManager = new FifoDmaManager(); // memcpy } else { fifoDmaManager = new ReferencedDmaManager(); // no copy } connector = fifoDmaManager; } /* * Prepare OpenCL: * get OpenCL information * create command queue */ void GpuScheduler::init_gpu() { clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); // clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); // unavailable GPU if(ret_num_devices == 0) { exit(EXIT_FAILURE); } context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } } GpuScheduler::~GpuScheduler() { clReleaseCommandQueue(command_queue); clReleaseContext(context); } void GpuScheduler::initGpuBuffer(GpuBufferPtr m) { m->size = 0; m->allocate_size = 64; m->buf = (cl_mem*)malloc(m->allocate_size*sizeof(cl_mem*)); m->event = (cl_event*)malloc(m->allocate_size*sizeof(cl_event*)); } void GpuScheduler::destroyGpuBuffer(GpuBufferPtr m) { free(m->buf); free(m->event); m->size = 0; m->allocate_size = 0; m->buf = 0; m->event = 0; } cl_mem GpuScheduler::createBuffer(GpuBufferPtr m, int i, cl_context context, cl_mem_flags flags, size_t size, cl_int *error) { if (i > m->allocate_size) { // reallocate buffer size m->allocate_size *= 2; m->buf = (cl_mem*)realloc(m->buf, m->allocate_size*sizeof(cl_mem*)); m->event = (cl_event*)realloc(m->event, m->allocate_size*sizeof(cl_event*)); } m->buf[i] = clCreateBuffer(context, flags, size, 0, error); return m->buf[i]; } #define NOP_REPLY NULL static void release_buf_event(int cur, GpuScheduler::GpuBufferPtr memout) { for (int i=0; i < memout[1-cur].size; i++) { if (memout[1-cur].event[i] != 0) clReleaseEvent(memout[1-cur].event[i]); memout[1-cur].event[i] = 0; if (memout[1-cur].buf[i] != 0) clReleaseMemObject(memout[1-cur].buf[i]); memout[1-cur].buf[i] = 0; } memout[1-cur].size = 0; } /** * wait for previous pipeline termination * kernel_event, memout_event */ void GpuScheduler::wait_for_event(cl_event* kernel_event, GpuBufferPtr memout, TaskListPtr taskList, int cur) { if (kernel_event[1-cur] == NOP_REPLY) { } else if (kernel_event[1-cur] != NULL) { int ret=clWaitForEvents(1,&kernel_event[1-cur]); if (ret<0) { error(convert_error_status(ret)); } if (taskList!=NULL){ cl_ulong start = 0; cl_ulong end = 0; clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); if (taskList->task_start_time == 0) taskList->task_start_time = start; taskList->task_end_time = end; } clReleaseEvent(kernel_event[1-cur]); kernel_event[1-cur] = 0; } if (memout[1-cur].size > 0) { int ret=clWaitForEvents(memout[1-cur].size, memout[1-cur].event); if (ret<0) error(convert_error_status(ret)); release_buf_event(cur,memout); } if (memin[1-cur].size > 0) { release_buf_event(cur,memin); } if(reply) { connector->mail_write(reply); printf("GPU %d %s\t%lld\n",taskList->self->cpu_type,(char*)(gpu_task_list[taskList->tasks[0].command].name),taskList->task_end_time-taskList->task_start_time); reply = 0; } } void GpuScheduler::gpuTaskError(int cur, TaskListPtr tasklist, int ret) { error(convert_error_status(ret)); if (kernel_event[cur] != 0) clReleaseEvent(kernel_event[cur]); kernel_event[cur] = NOP_REPLY; if (kernel[cur] != 0) clReleaseKernel(kernel[cur]); kernel[cur] = 0; release_buf_event(1-cur,memout); release_buf_event(1-cur,memin); // wait kernel[1-cur] and write[1-cur] wait_for_event(kernel_event, memout, tasklist, cur); } /* * run GPU task * Get input and output data from tasklist. * Enqueue OpenCL command and clflush. * Enqueue and clflush are pipelined structure. * * flip means that kernel modify input buffer only, copy GPU input buffer to outData. * */ void GpuScheduler::run() { int cur = 0; TaskListPtr tasklist = NULL; reply = 0; initGpuBuffer(&memin[0]);initGpuBuffer(&memin[1]); initGpuBuffer(&memout[0]);initGpuBuffer(&memout[1]); memset(&flag, 0, sizeof(HTask::htask_flag)*2); for (;;) { memaddr params_addr = connector->task_list_mail_read(); // read task list mail from DmaManager if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) { // wait_for_envet was called, so all kernel,buf,event have been released. clFinish(command_queue); destroyGpuBuffer(&memout[1-cur]); destroyGpuBuffer(&memout[cur]); destroyGpuBuffer(&memin[cur]); destroyGpuBuffer(&memin[1-cur]); return ; } (*connector->start_dmawait_profile)(&(connector->start_time)); while (params_addr) { // since we are on the same memory space, we don't has to use dma_load here tasklist = (TaskListPtr)connector->dma_load(this, params_addr, sizeof(TaskList), DMA_READ_TASKLIST); // tasklist[cur]->task_start_time = gettime(); tasklist->task_start_time = 0; /* * get flip flag * flip : When caluculate on input data, to treat this as a output data */ if (tasklist->self) { flag[cur] = tasklist->self->flag; } else { memset(&flag[cur], 0, sizeof(HTask::htask_flag)); } for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) { if(nextTask->command==ShowTime) { connector->show_profile(); continue; } if(nextTask->command==StartProfile) { connector->start_profile(); continue; } if (load_kernel(nextTask->command) == 0) { gpuTaskError(cur,tasklist,ret); continue; } cl_program& program = *gpu_task_list[nextTask->command].gputask->program; const char *function = gpu_task_list[nextTask->command].name; if (kernel[cur]) clReleaseKernel(kernel[cur]); kernel[cur] = clCreateKernel(program, function, &ret); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } int param = 0; // set arg count cl_mem memparam = createBuffer(&memin[cur], 0, context, CL_MEM_READ_ONLY, sizeof(memaddr)*nextTask->param_count, &ret); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } // parameter is passed as first kernel arg ret = clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count, nextTask->param(0), 0, NULL, &memin[cur].event[0]); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } ret = clSetKernelArg(kernel[cur], 0, sizeof(memaddr),(void *)&memin[cur].buf[0]); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } param++; cl_mem_flags mem_flag = flag[cur].flip ? CL_MEM_READ_WRITE : CL_MEM_READ_ONLY; for(int i=0;i<nextTask->inData_count;i++) { ListElement *input_buf = nextTask->inData(i); if (input_buf->size==0) break; createBuffer(&memin[cur], param, context, mem_flag, input_buf->size, &ret); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } ret = clEnqueueWriteBuffer(command_queue, memin[cur].buf[param], CL_TRUE, 0, input_buf->size, input_buf->addr, 0, NULL, &memin[cur].event[param]); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr), (void *)&memin[cur].buf[param]); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } param++; } memin[cur].size = param; // +1 means param for(int i = 0; i<nextTask->outData_count;i++) { // set output data ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; if (!flag[cur].flip) { // flip use memin for output createBuffer(&memout[cur], i, context, CL_MEM_WRITE_ONLY, output_buf->size, &ret); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr), (void *)&memout[cur].buf[i]); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue;} // enqueue later } param++; } memout[cur].size = param - memin[cur].size; // no buffer on flip, but flip use memout event if (tasklist->dim > 0) { ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist->dim, NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]); } else { ret = clEnqueueTask(command_queue, kernel[cur], memin[cur].size, memin[cur].event, &kernel_event[cur]); } if (ret<0) { gpuTaskError(cur, tasklist, ret); continue; } for(int i=0;i<nextTask->outData_count;i++) { // read output data ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; GpuBufferPtr mem = flag[cur].flip ? memin : memout ; int i0 = flag[cur].flip ? i+1 : i ; // flip use memin buffer and memout event ret = clEnqueueReadBuffer(command_queue, mem[cur].buf[i0], CL_FALSE, 0, output_buf->size, output_buf->addr, 1, &kernel_event[cur], &memout[cur].event[i]); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } } // wait kernel[1-cur] and write[1-cur] // pipeline : cur // to stop pipeline set 1-cur wait_for_event(kernel_event, memout, tasklist, cur); cur = 1 - cur; } reply = (memaddr)tasklist->waiter; params_addr = (memaddr)tasklist->next; } wait_for_event(kernel_event, memout, tasklist, cur); unsigned long long wait = 0; (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time)); connector->mail_write((memaddr)MY_SPE_STATUS_READY); } /* NOT REACHED */ } int not_ready(SchedTask* smanager, void* r, void *w) { smanager->printf("GPU task not ready %d\n", smanager->atask->command); return 0; } /* * kernel file open and build program */ int GpuScheduler::load_kernel(int cmd) { if (gpu_task_list[cmd].run == null_run) { return 1; } if (gpu_task_list[cmd].gputask == 0 || gpu_task_list[cmd].gputask->program == 0) { fprintf(stderr, "GPU kernel %d not defined.\n",cmd); return 0; } const char *filename = (const char *)gpu_task_list[cmd].gputask->program; int fd; char *source_str; size_t source_size; fd = open(filename, O_RDONLY); if (fd<0) { fprintf(stderr, "Failed to open kernel %s.\n",filename); exit(1); } struct stat stats; fstat(fd,&stats); off_t size = stats.st_size; if (size<=0) { fprintf(stderr, "Failed to read kernel.\n"); exit(1); } source_str = (char*)alloca(size+1); source_size = read(fd, source_str, size); close(fd); source_str[size] = 0; cl_program *program = new cl_program; *program = clCreateProgramWithSource(context, 1, (const char **)&source_str, 0, &ret); ret = clBuildProgram(*program, 1, &device_id, NULL, NULL, NULL); if(ret<0) { size_t size; clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); char *log = new char[size]; clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL); error(log); } gpu_task_list[cmd].gputask->program = program; gpu_task_list[cmd].run = null_run; // kernel is ready return 1; } // regist kernel file name void gpu_register_task(int cmd, const char* filename, const char* functionname) { gpu_task_list[cmd].run = not_ready; // not yet ready gpu_task_list[cmd].load = null_loader; gpu_task_list[cmd].wait = null_loader; gpu_task_list[cmd].name = functionname; gpu_task_list[cmd].gputask->program = (cl_program *) filename; } /* end */