Cerium changeset 1960:273de551f726 (draft)
use multiple command_queue
author      Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date        Tue, 11 Feb 2014 16:28:22 +0900
parents     6d343611bb03
children    7d1afa7aeccd
files       TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaTaskManagerFactory.cc TaskManager/Gpu/GpuScheduler.cc TaskManager/Gpu/GpuScheduler.h TaskManager/Makefile.def example/flip/Makefile example/flip/Makefile.def example/flip/main.cc example/flip/task_init.cc example/flip/twice.cl example/many_task/Makefile example/many_task/ppe/task_init.cc example/multiply/cuda/multiply.cu example/multiply/main.cc example/word_count/Makefile.cuda example/word_count/main.cc
diffstat    16 files changed, 148 insertions(+), 114 deletions(-)
--- a/TaskManager/Cuda/CudaScheduler.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -84,21 +84,21 @@
 static void
 release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem)
 {
-    for (int i=0; i<mem[cur-1].in_size; i++) {
-        if (mem[cur-1].memin[i])
-            cuMemFree(mem[cur-1].memin[i]);
-        mem[cur-1].memin[i] = 0;
+    for (int i=0; i<mem[cur].in_size; i++) {
+        if (mem[cur].memin[i])
+            cuMemFree(mem[cur].memin[i]);
+        mem[cur].memin[i] = 0;
     }
-    for (int i=0; i<mem[cur-1].out_size; i++) {
-        if (mem[cur-1].event[i] != 0)
-            cuEventDestroy(mem[cur-1].event[i]);
-        mem[cur-1].event[i] = 0;
-        if (mem[cur-1].memout[i])
-            cuMemFree(mem[cur-1].memout[i]);
-        mem[cur-1].memout[i] = 0;
+    for (int i=0; i<mem[cur].out_size; i++) {
+        if (mem[cur].event[i] != 0)
+            cuEventDestroy(mem[cur].event[i]);
+        mem[cur].event[i] = 0;
+        if (mem[cur].memout[i])
+            cuMemFree(mem[cur].memout[i]);
+        mem[cur].memout[i] = 0;
     }
-    mem[cur-1].in_size = 0;
-    mem[cur-1].out_size = 0;
+    mem[cur].in_size = 0;
+    mem[cur].out_size = 0;
 }
 
 void
@@ -128,12 +128,12 @@
                 if (ret!=0) error(convert_error_status(ret));
             }
         }
-        release_buf_event(cur, cudabuffer);
+        release_buf_event(cur-1, cudabuffer);
     }
 
     if(reply) {
         connector->mail_write(reply);
-        __debug(this, "CUDA %d %s\t%lld\n", taskList->cpu_type, (char*)(cuda_task_list[taskList->tasks[0].command].name), taskList->task_end_time-taskList->task_start_time);
+        __debug(this, "CUDA %d %s\t%lld\n", taskList->self->cpu_type, (char*)(cuda_task_list[taskList->tasks[0].command].name), taskList->task_end_time-taskList->task_start_time);
         reply = 0;
     }
 }
@@ -145,7 +145,7 @@
     cuEventDestroy(kernel_event[cur]);
     kernel_event[cur] = NOP_REPLY;
     kernel[cur] = 0;
-    release_buf_event(cur+1, cudabuffer);
+    release_buf_event(cur, cudabuffer);
 
     wait_for_event(kernel_event, cudabuffer, taskList, cur);
 }
@@ -291,7 +291,7 @@
             ret = cuEventRecord(cudabuffer[cur].event[i], cudabuffer[cur].stream);
            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
         }
-        // wait kernel[1-cur] and write[1-cur]
+        // wait kernel[cur] and write[cur]
         // pipeline    : cur
         // to stop pipeline set cur+1
         if (cur == 0) {
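The CudaScheduler half of the change settles on one indexing convention: release_buf_event() frees exactly the stage index it is given, and each caller passes the stage that has actually finished (cur-1 on the wait path, cur on the error path). A minimal sketch of that convention, not the project's code; the struct and field names below are assumptions based on the diff:

    // One set of device buffers and events per pipeline stage (sketch only).
    struct StageBuf {
        int in_size;
        int out_size;
        // CUdeviceptr memin[...]; CUdeviceptr memout[...]; CUevent event[...];
    };

    // Free everything owned by a single stage and mark it empty,
    // mirroring release_buf_event() after this change.
    static void release_stage(StageBuf *mem, int stage) {
        // cuMemFree()/cuEventDestroy() calls would go here
        mem[stage].in_size  = 0;
        mem[stage].out_size = 0;
    }

    // wait_for_event() is entered with the stage currently being filled,
    // so the stage whose transfers have completed is the previous one.
    static void after_wait(StageBuf *mem, int cur) {
        release_stage(mem, cur - 1);
    }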
--- a/TaskManager/Cuda/CudaTaskManagerFactory.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Cuda/CudaTaskManagerFactory.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -1,4 +1,3 @@
-#define DEBUG
 #include "CellTaskManagerImpl.h"
 #include "CudaThreads.h"
 #include "CpuThreads.h"
--- a/TaskManager/Gpu/GpuScheduler.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -36,17 +36,20 @@
         exit(EXIT_FAILURE);
     }
     context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
-    command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
-    if (ret<0) {
-        const char *msg=convert_error_status(ret);
-        error(msg);
+    for (int i=0;i<STAGE;i++) {
+        command_queue[i] = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
+        if (ret<0) {
+            const char *msg=convert_error_status(ret);
+            error(msg);
+        }
     }
 }
 
 GpuScheduler::~GpuScheduler()
 {
-    clReleaseCommandQueue(command_queue);
+    for (int i=0;i<STAGE;i++)
+        clReleaseCommandQueue(command_queue[i]);
     clReleaseContext(context);
 }
 
@@ -85,15 +88,15 @@
 static void
 release_buf_event(int cur, GpuScheduler::GpuBufferPtr memout)
 {
-    for (int i=0; i < memout[1-cur].size; i++) {
-        if (memout[1-cur].event[i] != 0)
-            clReleaseEvent(memout[1-cur].event[i]);
-        memout[1-cur].event[i] = 0;
-        if (memout[1-cur].buf[i] != 0)
-            clReleaseMemObject(memout[1-cur].buf[i]);
-        memout[1-cur].buf[i] = 0;
+    for (int i=0; i < memout[cur].size; i++) {
+        if (memout[cur].event[i] != 0)
+            clReleaseEvent(memout[cur].event[i]);
+        memout[cur].event[i] = 0;
+        if (memout[cur].buf[i] != 0)
+            clReleaseMemObject(memout[cur].buf[i]);
+        memout[cur].buf[i] = 0;
     }
-    memout[1-cur].size = 0;
+    memout[cur].size = 0;
 }
 
 /**
@@ -102,10 +105,10 @@
  */
 void
 GpuScheduler::wait_for_event(cl_event* kernel_event, GpuBufferPtr memout, TaskListPtr taskList, int cur)
 {
-    if (kernel_event[1-cur] == NOP_REPLY) {
+    if (kernel_event[cur-1] == NOP_REPLY) {
 
-    } else if (kernel_event[1-cur] != NULL) {
-        int ret=clWaitForEvents(1,&kernel_event[1-cur]);
+    } else if (kernel_event[cur-1] != NULL) {
+        int ret=clWaitForEvents(1,&kernel_event[cur-1]);
         if (ret<0) {
             error(convert_error_status(ret));
@@ -113,25 +116,25 @@
         if (taskList!=NULL){
             cl_ulong start = 0;
             cl_ulong end = 0;
-            clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
-            clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+            clGetEventProfilingInfo(kernel_event[cur-1],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
+            clGetEventProfilingInfo(kernel_event[cur-1],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
            if (taskList->task_start_time == 0)
                 taskList->task_start_time = start;
             taskList->task_end_time = end;
         }
-        clReleaseEvent(kernel_event[1-cur]);
-        kernel_event[1-cur] = 0;
+        clReleaseEvent(kernel_event[cur-1]);
+        kernel_event[cur-1] = 0;
     }
 
-    if (memout[1-cur].size > 0) {
-        int ret=clWaitForEvents(memout[1-cur].size, memout[1-cur].event);
+    if (memout[cur-1].size > 0) {
+        int ret=clWaitForEvents(memout[cur-1].size, memout[cur-1].event);
         if (ret<0) error(convert_error_status(ret));
-        release_buf_event(cur,memout);
+        release_buf_event(cur-1,memout);
     }
-    if (memin[1-cur].size > 0) {
-        release_buf_event(cur,memin);
+    if (memin[cur-1].size > 0) {
+        release_buf_event(cur-1,memin);
     }
 
     if(reply) {
         connector->mail_write(reply);
@@ -150,8 +153,8 @@
     if (kernel[cur] != 0)
         clReleaseKernel(kernel[cur]);
     kernel[cur] = 0;
-    release_buf_event(1-cur,memout);
-    release_buf_event(1-cur,memin);
+    release_buf_event(cur,memout);
+    release_buf_event(cur,memin);
     // wait kernel[1-cur] and write[1-cur]
     wait_for_event(kernel_event, memout, tasklist, cur);
 
@@ -172,9 +175,13 @@
     int cur = 0;
     TaskListPtr tasklist = NULL;
     reply = 0;
-    initGpuBuffer(&memin[0]);initGpuBuffer(&memin[1]);
-    initGpuBuffer(&memout[0]);initGpuBuffer(&memout[1]);
-    memset(&flag, 0, sizeof(HTask::htask_flag)*2);
+    for (int i=0;i<STAGE;i++) {
+        initGpuBuffer(&memin[i]);
+        initGpuBuffer(&memout[i]);
+        kernel[i]=0;
+        kernel_event[i]=0;
+    }
+    memset(&flag, 0, sizeof(HTask::htask_flag)*STAGE);
 
     for (;;) {
         memaddr params_addr = connector->task_list_mail_read();
@@ -182,11 +189,12 @@
 
         if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
             // wait_for_envet was called, so all kernel,buf,event have been released.
-            clFinish(command_queue);
-            destroyGpuBuffer(&memout[1-cur]);
-            destroyGpuBuffer(&memout[cur]);
-            destroyGpuBuffer(&memin[cur]);
-            destroyGpuBuffer(&memin[1-cur]);
+            for (int i=0;i<STAGE;i++)
+                clFinish(command_queue[i]);
+            for (int i=0;i<STAGE;i++) {
+                destroyGpuBuffer(&memin[i]);
+                destroyGpuBuffer(&memout[i]);
+            }
             return ;
         }
 
@@ -194,7 +202,7 @@
         while (params_addr) {
            // since we are on the same memory space, we don't has to use dma_load here
            tasklist = (TaskListPtr)connector->dma_load(this, params_addr,
-                                                       sizeof(TaskList), DMA_READ_TASKLIST);
+                                                        sizeof(TaskList), DMA_READ_TASKLIST);
            // tasklist[cur]->task_start_time = gettime();
            tasklist->task_start_time = 0;
            /*
@@ -204,7 +212,7 @@
            if (tasklist->self) {
                flag[cur] = tasklist->self->flag;
            } else {
-               memset(&flag[cur], 0, sizeof(HTask::htask_flag));
+                memset(&flag[cur], 0, sizeof(HTask::htask_flag));
            }
            for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) {
                if(nextTask->command==ShowTime) {
@@ -223,14 +231,14 @@
                if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
 
                int param = 0;
-
+
                // set arg count
                cl_mem memparam = createBuffer(&memin[cur], 0, context, CL_MEM_READ_ONLY, sizeof(memaddr)*nextTask->param_count, &ret);
                if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-
+
                // parameter is passed as first kernel arg
-               ret = clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
+               ret = clEnqueueWriteBuffer(command_queue[cur], memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
                                           nextTask->param(0), 0, NULL, &memin[cur].event[0]);
                if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
 
@@ -246,7 +254,7 @@
                    if (input_buf->size==0) break;
                    createBuffer(&memin[cur], param, context, mem_flag, input_buf->size, &ret);
                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-                   ret = clEnqueueWriteBuffer(command_queue, memin[cur].buf[param], CL_TRUE, 0,
+                   ret = clEnqueueWriteBuffer(command_queue[cur], memin[cur].buf[param], CL_TRUE, 0,
                                               input_buf->size, input_buf->addr, 0, NULL, &memin[cur].event[param]);
                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
 
@@ -272,10 +280,10 @@
                memout[cur].size = param - memin[cur].size; // no buffer on flip, but flip use memout event
 
                if (tasklist->dim > 0) {
-                   ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist->dim,
+                   ret = clEnqueueNDRangeKernel(command_queue[cur], kernel[cur], tasklist->dim,
                                                 NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
                } else {
-                   ret = clEnqueueTask(command_queue, kernel[cur], memin[cur].size,
+                   ret = clEnqueueTask(command_queue[cur], kernel[cur], memin[cur].size,
                                        memin[cur].event, &kernel_event[cur]);
                }
                if (ret<0) { gpuTaskError(cur, tasklist, ret); continue; }
@@ -286,21 +294,30 @@
                    GpuBufferPtr mem = flag[cur].flip ? memin : memout ;
                    int i0 = flag[cur].flip ? i+1 : i ; // flip use memin buffer and memout event
-                   ret = clEnqueueReadBuffer(command_queue, mem[cur].buf[i0], CL_FALSE, 0,
+                   ret = clEnqueueReadBuffer(command_queue[cur], mem[cur].buf[i0], CL_FALSE, 0,
                                              output_buf->size, output_buf->addr, 1, &kernel_event[cur], &memout[cur].event[i]);
                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                }
                // wait kernel[1-cur] and write[1-cur]
                // pipeline    : cur
                // to stop pipeline set 1-cur
-               wait_for_event(kernel_event, memout, tasklist, cur);
-               cur = 1 - cur;
+               if (cur == 0) {
+                   wait_for_event(kernel_event, memout, tasklist, STAGE);
+               } else {
+                   wait_for_event(kernel_event, memout, tasklist, cur);
+               }
+               cur++;
+               if (STAGE <= cur)
+                   cur = 0;
            }
            reply = (memaddr)tasklist->waiter;
            params_addr = (memaddr)tasklist->next;
        }
 
-       wait_for_event(kernel_event, memout, tasklist, cur);
-
+       if (cur == 0) {
+           wait_for_event(kernel_event, memout, tasklist, STAGE);
+       } else {
+           wait_for_event(kernel_event, memout, tasklist, cur);
+       }
        unsigned long long wait = 0;
        (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
        connector->mail_write((memaddr)MY_SPE_STATUS_READY);
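The bookkeeping at the end of run() is the core of the new pipeline: wait_for_event() always looks at stage cur-1, so when cur is 0 the caller passes STAGE so that cur-1 lands on the last stage, and cur itself now advances modulo STAGE instead of toggling between 0 and 1. A minimal sketch of that rotation under those assumptions (wait_stage() is a stand-in for wait_for_event(), not the scheduler's actual API):

    enum { STAGE = 8 };   // matches the definition added to GpuScheduler.h

    // Stand-in for wait_for_event(): waits on kernel_event[cur-1] and the
    // read/write events of stage cur-1, then releases that stage's buffers.
    static void wait_stage(int cur) { (void)cur; /* ... */ }

    static void pipeline_step(int *cur) {
        // stage *cur has just been enqueued; wait for the previous stage
        if (*cur == 0)
            wait_stage(STAGE);      // cur-1 then refers to the last stage
        else
            wait_stage(*cur);
        // advance to the next stage, wrapping around
        (*cur)++;
        if (STAGE <= *cur)
            *cur = 0;
    }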
--- a/TaskManager/Gpu/GpuScheduler.h	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.h	Tue Feb 11 16:28:22 2014 +0900
@@ -15,6 +15,8 @@
 
 extern TaskObject gpu_task_list[MAX_TASK_OBJECT];
 
+#define STAGE 8
+
 class GpuScheduler : public MainScheduler {
 public:
     typedef struct gpubuffer {
@@ -34,14 +36,14 @@
     cl_uint ret_num_platforms;
     cl_uint ret_num_devices;
     cl_context context;
-    cl_command_queue command_queue;
+    cl_command_queue command_queue[STAGE];
     cl_int ret;
     memaddr reply;
-    cl_kernel kernel[2];
-    cl_event kernel_event[2];
-    GpuBuffer memin[2];
-    GpuBuffer memout[2];
-    HTask::htask_flag flag[2];
+    cl_kernel kernel[STAGE];
+    cl_event kernel_event[STAGE];
+    GpuBuffer memin[STAGE];
+    GpuBuffer memout[STAGE];
+    HTask::htask_flag flag[STAGE];
 private:
     int load_kernel(int cmd);
     cl_mem createBuffer(GpuBufferPtr m, int i, cl_context context, cl_mem_flags flags, size_t size, cl_int *error);
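With STAGE defined in the header, every pipeline stage gets its own cl_command_queue, so transfers and kernels from different stages can overlap instead of serializing on a single queue. A minimal, self-contained sketch of the same create/release pattern using only the standard OpenCL 1.x API; STAGE and the profiling flag are taken from the diff, everything else is illustrative:

    #ifdef __APPLE__
    #include <OpenCL/opencl.h>
    #else
    #include <CL/cl.h>
    #endif

    #define STAGE 8

    static cl_command_queue queues[STAGE];

    // One in-order queue per stage, all with profiling enabled,
    // as GpuScheduler's constructor now does.
    static void create_queues(cl_context ctx, cl_device_id dev) {
        for (int i = 0; i < STAGE; i++) {
            cl_int err;
            queues[i] = clCreateCommandQueue(ctx, dev, CL_QUEUE_PROFILING_ENABLE, &err);
            // production code should check err here, as the scheduler does
        }
    }

    static void release_queues(void) {
        for (int i = 0; i < STAGE; i++)
            clReleaseCommandQueue(queues[i]);
    }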
--- a/TaskManager/Makefile.def	Sat Feb 08 14:19:41 2014 +0900
+++ b/TaskManager/Makefile.def	Tue Feb 11 16:28:22 2014 +0900
@@ -48,7 +48,7 @@
 CC      = clang++
 CXX     = clang++
 
-CFLAGS  = -Wall `sdl-config --cflags` -m$(ABIBIT) $(OPT)
+CFLAGS  = -Wall `sdl-config --cflags` -m$(ABIBIT) $(OPT) #-DDEBUG
 
 CXXFLAGS = $(CFLAGS)
 LIBS = -m$(ABIBIT)
--- a/example/flip/Makefile	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/Makefile	Tue Feb 11 16:28:22 2014 +0900
@@ -1,29 +1,15 @@
-include ./Makefile.def
+default: gpu
 
-SRCS_TMP = $(wildcard *.cc)
-SRCS_EXCLUDE =  # list files to exclude here
-SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
-OBJS = $(SRCS:.cc=.o)
-
-LIBS += -lGpuManager -framework opencl `sdl-config --libs`
-
-.SUFFIXES: .cc .o
+gpu: FORCE
+	@echo "Make for OpenCL"
+	@$(MAKE) -f Makefile.gpu
 
-.cc.o:
-	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
-
-all: $(TARGET)
-gpu: all
+cuda: FORCE
+	@echo "Make for CUDA"
+	@$(MAKE) -f Makefile.cuda
 
-$(TARGET): $(OBJS)
-	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
-
-link:
-	$(CC) -o $(TARGET) $(OBJS) $(LIBS)
-
-debug: $(TARGET)
-	sudo gdb ./$(TARGET)
+FORCE:
 
 clean:
-	rm -f $(TARGET) $(OBJS)
-	rm -f *~ \#*
+	@$(MAKE) -f Makefile.gpu clean
+	@$(MAKE) -f Makefile.cuda clean
--- a/example/flip/Makefile.def	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/Makefile.def	Tue Feb 11 16:28:22 2014 +0900
@@ -7,3 +7,6 @@
 INCLUDE = -I${CERIUM}/include/TaskManager -I. -I../..
 
 LIBS = -L${CERIUM}/TaskManager
+
+NVCC = nvcc
+NVCCFLAGS = -ptx -arch=sm_20
\ No newline at end of file
--- a/example/flip/main.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/main.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -11,8 +11,10 @@
 static long int length = DEFAULT;
 static int task = 1;
 int *indata;
+int *data;
 
 extern void task_init(void);
+void TMend(TaskManager*);
 
 char usr_help_str[] = "GpuRun [length]\n";
 
@@ -72,25 +74,28 @@
 void
 test(TaskManager *manager) {
     indata = new int[length];
-
+    data = new int;
+
     for (int c=0; c < length ;c++) {
         indata[c] = c;
     }
-
+    *data = 2;
     print_data(indata, length, "before");
-
+
     HTaskPtr twice = manager->create_task(Twice);
-
+
     twice->set_param(0, (memaddr)length);
     twice->set_inData(0, indata, sizeof (int)*length);
-    // twice->set_outData(0, indata, sizeof (int)*length);
+    twice->set_inData(1, data, sizeof(int));
+    twice->set_outData(0, indata, sizeof (int)*length);
+    twice->set_outData(1, data, sizeof(int));
     twice->set_cpu(GPU_0);
     twice->flip();
 
     /*
      * we would like to be able to pass a ppe task via set_post()
     */
-    twice->set_post(twice_result, (void*)indata, (void*)length);
+    //twice->set_post(twice_result, (void*)indata, (void*)length);
     twice->spawn();
 }
 
@@ -107,8 +112,15 @@
     for (int i = 0; i < task; ++i) {
         test(manager);
     }
-
+
+    manager->set_TMend(TMend);
     return 0;
 }
 
+void
+TMend(TaskManager* manager) {
+    print_data(indata, length, "after");
+    delete[] indata;
+    delete data;
+}
 /* end */
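The reworked flip example registers the same buffers as both input and output of one task, marks the task with flip() so the scheduler reuses the input buffers for the read-back, and prints the result from a TMend callback once the TaskManager has finished. A condensed usage sketch with error handling and the full main() omitted; Twice, GPU_0, set_TMend and print_data come from the example itself, the rest is illustrative:

    static int *indata;   // length ints, doubled in place by the Twice kernel
    static int *data;     // a single int passed through the same task
    static long length;

    static void run_twice(TaskManager *manager) {
        HTaskPtr twice = manager->create_task(Twice);
        twice->set_param(0, (memaddr)length);
        twice->set_inData(0, indata, sizeof(int) * length);
        twice->set_inData(1, data, sizeof(int));
        twice->set_outData(0, indata, sizeof(int) * length);  // same buffer: in-place
        twice->set_outData(1, data, sizeof(int));
        twice->set_cpu(GPU_0);
        twice->flip();       // reuse the input buffers for the output transfer
        twice->spawn();
    }

    void TMend(TaskManager *manager) {
        // called after all tasks finish; a safe point to read the results
        print_data(indata, length, "after");
    }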
--- a/example/flip/task_init.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/task_init.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -1,8 +1,14 @@
 #include "GpuFunc.h"
 #include "GpuScheduler.h"
+#include "CudaScheduler.h"
 
 void
 task_init(void)
 {
+#ifdef __CERIUM_GPU__
     GpuSchedRegister(Twice, "twice.cl", "twice");
+#endif
+#ifdef __CERIUM_CUDA__
+    CudaSchedRegister(Twice, "twice.ptx", "twice");
+#endif
 }
--- a/example/flip/twice.cl	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/flip/twice.cl	Tue Feb 11 16:28:22 2014 +0900
@@ -1,12 +1,9 @@
 __kernel void
 twice(__constant int *data_count,
       __global int *input_data)
-      // __global int *output_data)
 {
     long count = (long)data_count[0];
     for (int i = 0; i<count; i++) {
-        // output_data[i] = 2*input_data[i];
         input_data[i] *= 2;
     }
-
 }
--- a/example/many_task/Makefile	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/many_task/Makefile	Tue Feb 11 16:28:22 2014 +0900
@@ -20,6 +20,10 @@
 	@echo "Make for OpenCL"
 	@$(MAKE) -f Makefile.gpu
 
+cuda: FORCE
+	@echo "Make for Cuda"
+	@$(MAKE) -f Makefile.cuda
+
 gpu-test: FORCE
 	@echo "Make for OpenCL"
 	@$(MAKE) -f Makefile.gpu test
@@ -33,3 +37,5 @@
 	@$(MAKE) -f Makefile.macosx clean
 	@$(MAKE) -f Makefile.linux clean
 	@$(MAKE) -f Makefile.cell clean
+	@$(MAKE) -f Makefile.gpu clean
+	@$(MAKE) -f Makefile.cuda clean
--- a/example/many_task/ppe/task_init.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/many_task/ppe/task_init.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -1,5 +1,9 @@
 #include "Func.h"
 #include "Scheduler.h"
+#ifdef __CERIUM_CUDA__
+#include "CudaScheduler.h"
+#endif
+
 
 SchedExternTask(QuickSort);
 SchedExternTask(SortSimple);
@@ -13,4 +17,8 @@
     SchedRegister(SortSimple);
     SchedRegister(SortCompat);
     SchedRegister(SortTaskArray);
+
+#ifdef __CERIUM_CUDA__
+    CudaSchedRegister(QUICK_SORT, "cuda/QuickSort.ptx", "quick_sort");
+#endif
 }
--- a/example/multiply/cuda/multiply.cu	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/multiply/cuda/multiply.cu	Tue Feb 11 16:28:22 2014 +0900
@@ -2,7 +2,7 @@
     __global__ void multi(long* params, float* A, float* B, float* C) {
         int id = blockIdx.x * blockDim.x + threadIdx.x;
         long length = params[0];
-        for (int id = 0; id < length; id++)
-            C[id]=A[id]*B[id];
+        // for (int id = 0; id < length; id++)
+        C[id]=A[id]*B[id];
     }
 }
--- a/example/multiply/main.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/multiply/main.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -105,7 +105,7 @@
     multiply->set_outData(0,(memaddr)C, sizeof(float)*length);
     multiply->set_param(0,(long)length);
     // spawn length tasks, each with 0..length-1 set in param 0
-    //multiply->iterate(length);
+    multiply->iterate(length);
 
     // hoge = manager->create_task(MULTIPLY_TASK);
     // hoge->set_cpu(spe_cpu);
@@ -116,7 +116,7 @@
     // hoge->set_param(0,(long)0);
     // hoge->wait_for(multiply);
     // hoge->iterate(length);
-    multiply->spawn();
+    //multiply->spawn();
 
     // }
 }
--- a/example/word_count/Makefile.cuda	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/word_count/Makefile.cuda	Tue Feb 11 16:28:22 2014 +0900
@@ -25,7 +25,6 @@
 
 NVCC = /Developer/NVIDIA/CUDA-5.5/bin/nvcc
 NVCCFLAGS = -ptx -arch=sm_20
-INDEX = 0
 
 .SUFFIXES: .cc .o .cu .ptx
 
--- a/example/word_count/main.cc	Sat Feb 08 14:19:41 2014 +0900
+++ b/example/word_count/main.cc	Tue Feb 11 16:28:22 2014 +0900
@@ -231,7 +231,6 @@
     return 0;
 }
 
-
 static int blocks = 48;
 //static int blocks = 31 * 6 * 24;
 static int division = 16; // in Kbyte
@@ -254,7 +253,7 @@
     w->size = w->file_size = st_mmap.size;
     w->file_mmap = st_mmap.file_mmap;
     printf("w %lx\n",(long)w);
-
+
     /* data size per task (bytes) */
     if (w->size >= 1024*division) {
         w->division_size = 1024 * division;/*16kbyte*/