Mercurial > hg > Game > Cerium
view TaskManager/Gpu/GpuScheduler.cc @ 1812:df5fc3a6d318 draft
Release mem object in GPU
author | YuhiTOMARI |
---|---|
date | Wed, 11 Dec 2013 20:41:24 +0900 |
parents | 8039c48763c4 |
children | d7973604e81f |
line wrap: on
line source
#include "TaskManager.h" #include "GpuScheduler.h" #include "ReferencedDmaManager.h" #include "PreRefDmaManager.h" #include "SchedTask.h" #include "stdio.h" #include "GpuError.h" #include "ListData.h" #include "SysFunc.h" #include "gettime.h" #include <fcntl.h> #include <sys/stat.h> #include <string.h> TaskObject gpu_task_list[MAX_TASK_OBJECT]; GpuScheduler::GpuScheduler() { init_impl(0); init_gpu(); } void GpuScheduler::init_impl(int useRefDma) { if (useRefDma & 0x10) { fifoDmaManager = new PreRefDmaManager(); // Prefetch command and no copy } else if (useRefDma & 0x01) { fifoDmaManager = new FifoDmaManager(); // memcpy } else { fifoDmaManager = new ReferencedDmaManager(); // no copy } connector = fifoDmaManager; } /* * Prepare OpenCL: * get OpenCL information * create command queue */ void GpuScheduler::init_gpu() { clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); // unavailable GPU if(ret_num_devices == 0) { exit(EXIT_FAILURE); } context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } } GpuScheduler::~GpuScheduler() { clReleaseCommandQueue(command_queue); clReleaseContext(context); } void GpuScheduler::initGpuBuffer(GpuBufferPtr m) { m->size = 0; m->allocate_size = 64; m->buf = (cl_mem*)malloc(m->allocate_size*sizeof(cl_mem*)); m->event = (cl_event*)malloc(m->allocate_size*sizeof(cl_event*)); } void GpuScheduler::destroyGpuBuffer(GpuBufferPtr m) { free(m->buf); free(m->event); m->size = 0; m->allocate_size = 0; m->buf = 0; m->event = 0; } cl_mem GpuScheduler::createBuffer(GpuBufferPtr m, int i, cl_context context, cl_mem_flags flags, size_t size, cl_int *error) { if (i > m->allocate_size) { // reallocate buffer size m->allocate_size *= 2; m->buf = (cl_mem*)realloc(m->buf, m->allocate_size*sizeof(cl_mem*)); m->event = (cl_event*)realloc(m->event, m->allocate_size*sizeof(cl_event*)); } m->buf[i] = clCreateBuffer(context, flags, size, 0, error); return m->buf[i]; } #define NOP_REPLY NULL /** * wait for previous pipeline termination * kernel_event, memout_event */ void GpuScheduler::wait_for_event(cl_event* kernel_event, GpuBufferPtr memout, memaddr* reply, TaskListPtr *taskList, int cur) { if (kernel_event[1-cur] == NOP_REPLY) { if(reply[1-cur]) { connector->mail_write(reply[1-cur]); reply[1-cur]=0; } } else if (kernel_event[1-cur] != NULL) { int ret=clWaitForEvents(1,&kernel_event[1-cur]); if (ret<0) { error(convert_error_status(ret)); } clReleaseEvent(kernel_event[1-cur]); kernel_event[1-cur] = 0; } if (memout[1-cur].size > 0) { for (int i=0; i < memout[1-cur].size; i++) { int ret=clWaitForEvents(memout[1-cur].size, &memout[1-cur].event[i]); if (ret<0) { error(convert_error_status(ret)); } } for (int i=0; i < memout[1-cur].size; i++) { clReleaseEvent(memout[1-cur].event[i]); memout[1-cur].event[i] = 0; clReleaseMemObject(memout[1-cur].buf[i]); memout[1-cur].buf[i] = 0; } } if (memin[1-cur].size > 0) { for (int i=0; i < memin[1-cur].size; i++) { clReleaseEvent(memin[1-cur].event[i]); memin[1-cur].event[i] = 0; clReleaseMemObject(memin[1-cur].buf[i]); memin[1-cur].buf[i] = 0; } } if(reply[1-cur]) { connector->mail_write(reply[1-cur]); reply[1-cur]=0; } if (taskList[1-cur]!=NULL){ cl_ulong start = 0; cl_ulong end = 0; clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); taskList[1-cur]->task_start_time = start; taskList[1-cur]->task_end_time = end; } } void GpuScheduler::gpuTaskError(int cur, TaskListPtr *tasklist, int ret) { error(convert_error_status(ret)); kernel_event[cur] = NOP_REPLY; kernel[cur] = 0; memout[cur].buf = 0; memin[cur].buf = 0; reply[cur] = (memaddr)tasklist[cur]->waiter; // wait kernel[1-cur] and write[1-cur] wait_for_event(kernel_event, memout, reply, tasklist, cur); } /* * run GPU task * Get input and output data from tasklist. * Enqueue OpenCL command and clflush. * Enqueue and clflush are pipelined structure. */ void GpuScheduler::run() { int cur = 0; TaskListPtr tasklist[2]; tasklist[0]=NULL;tasklist[1]=NULL; initGpuBuffer(&memin[0]);initGpuBuffer(&memin[1]); initGpuBuffer(&memout[0]);initGpuBuffer(&memout[1]); memset(&flag, 0, sizeof(HTask::htask_flag)); for (;;) { memaddr params_addr = connector->task_list_mail_read(); // read task list mail from DmaManager if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) { clFinish(command_queue); if (kernel[0]) clReleaseKernel(kernel[0]); if (kernel[1]) clReleaseKernel(kernel[1]); if (kernel_event[0] && kernel_event[0]!=NOP_REPLY) clReleaseEvent(kernel_event[0]); if (kernel_event[1] && kernel_event[1]!=NOP_REPLY) clReleaseEvent(kernel_event[1]); destroyGpuBuffer(&memout[cur-1]); destroyGpuBuffer(&memout[cur]); destroyGpuBuffer(&memin[cur]); destroyGpuBuffer(&memin[cur-1]); return ; } (*connector->start_dmawait_profile)(&(connector->start_time)); while (params_addr) { // since we are on the same memory space, we don't has to use dma_load here tasklist[cur] = (TaskListPtr)connector->dma_load(this, params_addr, sizeof(TaskList), DMA_READ_TASKLIST); // tasklist[cur]->task_start_time = gettime(); /* * get flip flag * flip : When caluculate on input data, to treat this as a output data */ if (tasklist[cur]->self) { flag = tasklist[cur]->self->flag; } for (TaskPtr nextTask = tasklist[cur]->tasks;nextTask < tasklist[cur]->last(); nextTask = nextTask->next(),params_addr = (memaddr)tasklist[cur]->next) { if(nextTask->command==ShowTime) { connector->show_profile(); gpuTaskError(cur,tasklist,ret); continue; } if(nextTask->command==StartProfile) { connector->start_profile(); gpuTaskError(cur,tasklist,ret); continue; } if (load_kernel(nextTask->command) == 0) { gpuTaskError(cur,tasklist,ret); continue; } cl_program& program = *gpu_task_list[nextTask->command].gputask->program; const char *function = gpu_task_list[nextTask->command].name; if (kernel[cur]) clReleaseKernel(kernel[cur]); kernel[cur] = clCreateKernel(program, function, &ret); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } int param = 0; // set arg count cl_mem memparam = createBuffer(&memin[cur], 0, context, CL_MEM_READ_ONLY, sizeof(memaddr)*nextTask->param_count, &ret); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } ret = clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count, nextTask->param(param), 0, NULL, &memin[cur].event[0]); // parameter is passed as first kernel arg param=0; if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr),(void *)&memin[cur].buf[0]); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } param++; cl_mem_flags mem_flag = CL_MEM_READ_ONLY; if (!flag.flip) { // set input data when not flip for(int i=0;i<nextTask->inData_count;i++) { ListElement *input_buf = nextTask->inData(i); if (input_buf->size==0) break; createBuffer(&memin[cur], i+1, context, mem_flag, input_buf->size, &ret); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } ret = clEnqueueWriteBuffer(command_queue, memin[cur].buf[i+1], CL_TRUE, 0, input_buf->size, input_buf->addr, 0, NULL, &memin[cur].event[i+1]); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr), (void *)&memin[cur].buf[i+1]); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } param++; } } cl_mem_flags out_mem_flag; if (flag.flip) { out_mem_flag = CL_MEM_READ_WRITE; } else { out_mem_flag = CL_MEM_WRITE_ONLY; } for(int i = 0; i<nextTask->outData_count;i++) { // set output data ListElement *output_buf = flag.flip? nextTask->inData(i) : nextTask->outData(i); if (output_buf->size==0) break; createBuffer(&memout[cur], i, context, out_mem_flag, output_buf->size, &ret); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } if (flag.flip) { // use output buffer as input buffer ListElement *input_buf = nextTask->inData(i); ret = clEnqueueWriteBuffer(command_queue, memout[cur].buf[i+1], CL_TRUE, 0, input_buf->size, input_buf->addr, 0, NULL, &memout[cur].event[i+1]); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } } ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr), (void *)&memout[cur].buf[i]); if (ret<0) { const char *msg=convert_error_status(ret); error(msg); } param++; } memin[cur].size = nextTask->inData_count+1; // +1 means param memout[cur].size = nextTask->outData_count; tasklist[cur]->task_start_time = gettime(); if (tasklist[cur]->dim > 0) { ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist[cur]->dim, NULL, &tasklist[cur]->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]); } else { ret = clEnqueueTask(command_queue, kernel[cur], memin[cur].size, memin[cur].event, &kernel_event[cur]); } if (ret<0) { gpuTaskError(cur, tasklist, ret); } for(int i=0;i<nextTask->outData_count;i++) { // read output data ListElement *output_buf = flag.flip? nextTask->inData(i) :nextTask->outData(i); if (output_buf->size==0) break; ret = clEnqueueReadBuffer(command_queue, memout[cur].buf[i], CL_FALSE, 0, output_buf->size, output_buf->addr, 1, &kernel_event[cur], &memout[cur].event[i]); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } } tasklist[cur]->task_end_time = gettime(); if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; } reply[cur] = (memaddr)tasklist[cur]->waiter; // wait kernel[1-cur] and write[1-cur] wait_for_event(kernel_event, memout, reply, tasklist, cur); } printf("GPU %d %s\t%lld\n",tasklist[cur]->self->cpu_type,(char*)(gpu_task_list[tasklist[cur]->tasks[0].command].name),tasklist[cur]->task_end_time-tasklist[cur]->task_start_time); // pipeline : 1-cur // no pipeline : cur cur = 1 - cur; } wait_for_event(kernel_event, memout, reply, tasklist, cur); unsigned long long wait = 0; (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time)); connector->mail_write((memaddr)MY_SPE_STATUS_READY); } /* NOT REACHED */ } int not_ready(SchedTask* smanager, void* r, void *w) { smanager->printf("GPU task not ready %d\n", smanager->atask->command); return 0; } /* * kernel file open and build program */ int GpuScheduler::load_kernel(int cmd) { if (gpu_task_list[cmd].run == null_run) { fprintf(stderr, "GPU kernel %d not defined.\n",cmd); return 0; } const char *filename = (const char *)gpu_task_list[cmd].gputask->program; int fd; char *source_str; size_t source_size; fd = open(filename, O_RDONLY); if (fd<0) { fprintf(stderr, "Failed to load kernel %s.\n",filename); exit(1); } struct stat stats; fstat(fd,&stats); off_t size = stats.st_size; if (size<=0) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)alloca(size); source_size = read(fd, source_str, size); close(fd); cl_program *program = new cl_program; *program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(*program, 1, &device_id, NULL, NULL, NULL); if(ret<0) { size_t size; clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); char *log = new char[size]; clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL); error(log); } gpu_task_list[cmd].gputask->program = program; gpu_task_list[cmd].run = null_run; // kernel is ready return 1; } // regist kernel file name void gpu_register_task(int cmd, const char* filename, const char* functionname) { gpu_task_list[cmd].run = not_ready; // not yet ready gpu_task_list[cmd].load = null_loader; gpu_task_list[cmd].wait = null_loader; gpu_task_list[cmd].name = functionname; gpu_task_list[cmd].gputask->program = (cl_program *) filename; } /* end */