Mercurial > hg > Game > Cerium
changeset 1506:a7895ab4d0e3 draft
add flip flag and NDRange flag
author | Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 10 Sep 2012 15:04:39 +0900 |
parents | 0ad321ee074d |
children | 7abad41d12af |
files | TaskManager/ChangeLog TaskManager/Gpu/GpuScheduler.cc TaskManager/Gpu/GpuScheduler.h TaskManager/kernel/ppe/HTask.h TaskManager/kernel/ppe/TaskList.h TaskManager/test/GpuRegistTaskTest/GpuRegistTaskTest.cc TaskManager/test/GpuRegistTaskTest/Makefile TaskManager/test/GpuRegistTaskTest/Makefile.orig TaskManager/test/GpuRegistTaskTest/twice.cl TaskManager/test/GpuRunTest/GpuRunTest.cc example/OpenCL/twice.cl example/many_task/Makefile.macosx example/many_task/main.cc example/word_count/a.txt |
diffstat | 14 files changed, 152 insertions(+), 240 deletions(-) [+] |
line wrap: on
line diff
--- a/TaskManager/ChangeLog Fri Aug 24 18:03:12 2012 +0900 +++ b/TaskManager/ChangeLog Mon Sep 10 15:04:39 2012 +0900 @@ -1,3 +1,22 @@ +2012-9-5 Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> + + set_cpu(SPE_ANY) + CPUで実行するかGPUで実行するか選択可能にする + CPU_ANYとGPU_ANYを追加 + SPE_ANYを選択したときにGPUで実行できるかはコマンドラインで選択した方がよい + -cpuと-gpuで選択する + ベンチマークを取る事も考える + + sortではflipを使っている + flip:input data上で計算を行ったときにそのinput bufferをそのままoutput bufferにする機能 + これをGPUで実現するにはbufferをread writeにすればよい + そのためにはtask投入時にflipするかどうかを知っている必要がある + taskにbit fieldを使ったflagがあるので、そのAPIを足す。かなりの変更が必要 + schedTaskのflipはやめる + + 将来的にはCLのソースとCeriumのソースは同じにしたい + CLに合わせるか、CLを生成するかのどちらか + 2012-8-22 Shinji KONO <kono@ie.u-ryukyu.ac.jp> 今後の課題
--- a/TaskManager/Gpu/GpuScheduler.cc Fri Aug 24 18:03:12 2012 +0900 +++ b/TaskManager/Gpu/GpuScheduler.cc Mon Sep 10 15:04:39 2012 +0900 @@ -5,6 +5,7 @@ #include "stdio.h" #include <fcntl.h> #include <sys/stat.h> +#include <string.h> GpuScheduler::GpuScheduler() { @@ -16,11 +17,11 @@ GpuScheduler::init_impl(int useRefDma) { if (useRefDma & 0x10) { - fifoDmaManager = new PreRefDmaManager(); + fifoDmaManager = new PreRefDmaManager(); // Prefetch command and no copy } else if (useRefDma & 0x01) { - fifoDmaManager = new FifoDmaManager(); + fifoDmaManager = new FifoDmaManager(); // memcpy } else { - fifoDmaManager = new ReferencedDmaManager(); + fifoDmaManager = new ReferencedDmaManager(); // no copy } connector = fifoDmaManager; } @@ -28,7 +29,7 @@ void GpuScheduler::init_gpu() { - clGetPlatformIDs(1, &platform_id, &ret_num_platforms); + clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); // unavailable GPU if(ret_num_devices == 0) { @@ -38,26 +39,38 @@ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); } +GpuScheduler::~GpuScheduler() +{ + clReleaseCommandQueue(command_queue); + clReleaseContext(context); +} void GpuScheduler::run() { for (;;) { memaddr params_addr = connector->task_list_mail_read(); - // Get OpenCL infomation - + if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) { clFinish(command_queue); return ; } - + + HTask::htask_flag flag; + + memset(&flag, 0, sizeof(HTask::htask_flag)); + while (params_addr) { // since we are on the same memory space, we don't hae to use dma_load here - TaskListPtr tasklist = (TaskListPtr)connector->dma_load(this, params_addr, + TaskListPtr tasklist = (TaskListPtr)connector->dma_load(this, params_addr, sizeof(TaskList), DMA_READ_TASKLIST); - - for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) { + if (tasklist->self) { + flag = tasklist->self->flag; + } + + for (TaskPtr nextTask = tasklist->tasks; + nextTask < tasklist->last(); nextTask = nextTask->next()) { load_kernel(nextTask->command); @@ -65,40 +78,52 @@ int err = CL_SUCCESS; int param = 0; - cl_mem memin = clCreateBuffer(context, CL_MEM_READ_ONLY, + + + cl_mem memparam = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(memaddr)*nextTask->param_count, NULL, NULL); - err |= clEnqueueWriteBuffer(command_queue, memin, CL_TRUE, 0, sizeof(memaddr)*nextTask->param_count, + err |= clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0, sizeof(memaddr)*nextTask->param_count, nextTask->param(0), 0, NULL, NULL); - err |= clSetKernelArg(kernel, param, sizeof(memaddr),(void *)&memin); - + err |= clSetKernelArg(kernel, param, sizeof(memaddr),(void *)&memparam); + param++; - for(int i=0;i<nextTask->inData_count;i++) { - cl_mem memin = clCreateBuffer(context, CL_MEM_READ_ONLY, nextTask->inData(i)->size, NULL, NULL); - err |= clEnqueueWriteBuffer(command_queue, memin, CL_TRUE, 0, - nextTask->inData(i)->size, nextTask->inData(i)->addr, 0, NULL, NULL); - err |= clSetKernelArg(kernel, param, sizeof(memaddr), (void *)&memin); + + cl_mem_flags mem_flag = CL_MEM_READ_ONLY; + cl_mem *memin = new cl_mem[nextTask->inData_count]; + if (!flag.flip) { + for(int i=0;i<nextTask->inData_count;i++) { + memin[i] = clCreateBuffer(context, mem_flag, nextTask->inData(i)->size, NULL, NULL); + + err |= clEnqueueWriteBuffer(command_queue, memin[i], CL_TRUE, 0, + nextTask->inData(i)->size, nextTask->inData(i)->addr, 0, NULL, NULL); + err |= clSetKernelArg(kernel, param, sizeof(memaddr), (void *)&memin[i]); - param++; + param++; + } } - + cl_mem *memout = new cl_mem[nextTask->outData_count]; - + cl_mem_flags out_mem_flag = flag.flip? CL_MEM_READ_WRITE : CL_MEM_WRITE_ONLY; + for(int i=0;i<nextTask->outData_count;i++) { - memout[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, nextTask->outData(i)->size, NULL, NULL); + memout[i] = clCreateBuffer(context, out_mem_flag, nextTask->outData(i)->size, NULL, NULL); + if (flag.flip) { // use output buffer as input buffer + err |= clEnqueueWriteBuffer(command_queue, memout[i], CL_TRUE, 0, + nextTask->inData(i)->size, nextTask->inData(i)->addr, 0, NULL, NULL); } err |= clSetKernelArg(kernel, param, sizeof(memaddr), (void *)&memout[i]); param++; } - + cl_event ev = NULL; clEnqueueTask(command_queue, kernel, 0, NULL, &ev); - + for(int i=0;i<nextTask->outData_count;i++) { - err |= clEnqueueReadBuffer(command_queue, memout[i], CL_TRUE, 0, + err |= clEnqueueReadBuffer(command_queue, memout[i], CL_TRUE, 0, nextTask->outData(i)->size, nextTask->outData(i)->addr, 1, &ev, NULL); } - delete memout; } clFlush(command_queue); // waiting for queued task + connector->mail_write((memaddr)(tasklist->waiter)); params_addr = (memaddr)tasklist->next; } @@ -125,29 +150,31 @@ int fp; char *source_str; size_t source_size; - + fp = open(filename, O_RDONLY); - + if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } - + struct stat stats; fstat(fp,&stats); off_t size = stats.st_size; - + if (!size) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } - + source_str = (char*)alloca(size); source_size = read(fp, source_str, size); close(fp); - cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, - (const size_t *)&source_size, &ret); + cl_program program = + clCreateProgramWithSource(context, 1, + (const char **)&source_str, + (const size_t *)&source_size, &ret); clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); cl_kernel *kernel = new cl_kernel; @@ -159,7 +186,7 @@ void gpu_register_task(int cmd, const char* filename, const char* functionname) -{ +{ task_list[cmd].run = not_ready; // not yet ready task_list[cmd].load = null_loader; task_list[cmd].wait = null_loader;
--- a/TaskManager/Gpu/GpuScheduler.h Fri Aug 24 18:03:12 2012 +0900 +++ b/TaskManager/Gpu/GpuScheduler.h Mon Sep 10 15:04:39 2012 +0900 @@ -4,6 +4,7 @@ #include "Scheduler.h" #include "FifoDmaManager.h" #include "GpuThreads.h" +#include "HTask.h" #ifdef __APPLE__ #include <OpenCL/opencl.h> @@ -11,11 +12,10 @@ #include <CL/cl.h> #endif - class GpuScheduler : public Scheduler { public: GpuScheduler(); - + virtual ~GpuScheduler(); void init_impl(int useRefDma); void init_gpu(); void run();
--- a/TaskManager/kernel/ppe/HTask.h Fri Aug 24 18:03:12 2012 +0900 +++ b/TaskManager/kernel/ppe/HTask.h Mon Sep 10 15:04:39 2012 +0900 @@ -19,9 +19,9 @@ /*! @class - + @brief - + Cerium の Task で、spawn() でキューに格納されて順次実行される。 cpu の指定があれば並列に実行される。 特定の Task を待ち合わせる事が可能。 @@ -50,13 +50,15 @@ HTask *next; HTask *prev; - struct { - unsigned no_auto_free:1; // bit 0 auto free flag (0 .. auto, 1 manual) + struct htask_flag { + unsigned no_auto_free:1; // bit 0 auto free flag (0 .. auto, 1 manual) + unsigned flip:1; // use read write buffers for all + unsigned nd_range:1; // openCL nd_range } flag; void spawn(); void wait_for(HTask *); - void set_cpu(CPU_TYPE type); + void set_cpu(CPU_TYPE type); void set_post(PostFunction func, void *read, void *write); Task *create_task_array(int task_id, int num_task, int num_param, int num_inData, int num_outData); Task *next_task_array(int task_id, Task *t); @@ -64,10 +66,10 @@ void spawn_task_array(Task *t); HTask *init(int cmd, memaddr rbuf, int rs, memaddr wbuf, int ws) { - init(cmd); - set_input(rbuf, rs); - set_output(wbuf, ws); - return this; + init(cmd); + set_input(rbuf, rs); + set_output(wbuf, ws); + return this; } void initOnce() { @@ -87,26 +89,26 @@ void set_inData_t(int index, memaddr addr, int size) { #ifdef EARLY_TOUCH if ((unsigned long)addr&0xf) { - printf("inData is not aligned. command = %d, index = %d, addr = 0x%lx, size = %d\n", - command, index, (unsigned long)addr, size); - } + printf("inData is not aligned. command = %d, index = %d, addr = 0x%lx, size = %d\n", + command, index, (unsigned long)addr, size); + } char *p = (char *)addr; char b = *p; p = (char *)(addr+size-1); b += *p; #endif - Task *t = ((TaskList*)rbuf)->tasks; - t->set_inData_t(index, addr,size); + Task *t = ((TaskList*)rbuf)->tasks; + t->set_inData_t(index, addr,size); } - void set_outData_t(int index, memaddr addr, int size) { + void set_outData_t(int index, memaddr addr, int size) { #ifdef EARLY_TOUCH - if ((unsigned long)addr&0xf) { - printf("inData is not aligned. command = %d, index = %d, addr = 0x%lx, size = %d\n", - command, index, (unsigned long)addr, size); - } + if ((unsigned long)addr&0xf) { + printf("inData is not aligned. command = %d, index = %d, addr = 0x%lx, size = %d\n", + command, index, (unsigned long)addr, size); + } char *p = (char *)addr; char b = *p; p = (char *)(addr+size-1); b += *p; #endif - Task *t = ((TaskList*)rbuf)->tasks; - t->set_outData_t(index, addr,size); + Task *t = ((TaskList*)rbuf)->tasks; + t->set_outData_t(index, addr,size); } void set_param_t(int index, memaddr param) { Task *t = ((TaskList*)rbuf)->tasks; @@ -114,28 +116,41 @@ } void no_auto_free() { - flag.no_auto_free = 1; + flag.no_auto_free = 1; } void auto_free() { - flag.no_auto_free = 0; + flag.no_auto_free = 0; + } + + void flip() { + flag.flip = 1; + } + void no_flip() { + flag.flip = 0; + } + + htask_flag get_flag(){ + return flag; } void init() { - next = prev = NULL; - waiter = NULL; + next = prev = NULL; + waiter = NULL; } void init(int cmd) { - command = cmd; - flag.no_auto_free = 0; - self = (memaddr) this; + command = cmd; + flag.no_auto_free = 0; + flag.flip = 0; + flag.nd_range = 0; + self = (memaddr) this; - post_func = NULL; - mimpl = NULL; - cpu_type = CPU_PPE; + post_func = NULL; + mimpl = NULL; + cpu_type = CPU_PPE; - post_arg1 = NULL; - post_arg2 = NULL; + post_arg1 = NULL; + post_arg2 = NULL; } #define set_param(index,param) set_param_t(index, (memaddr) (param))
--- a/TaskManager/kernel/ppe/TaskList.h Fri Aug 24 18:03:12 2012 +0900 +++ b/TaskManager/kernel/ppe/TaskList.h Mon Sep 10 15:04:39 2012 +0900 @@ -19,6 +19,7 @@ HTask *self; // 4 byte long dummy[3]; // 16 byte Task tasks[TASK_MAX_SIZE]; // 32*TASK_MAX_SIZE + TaskPtr last() { return (TaskPtr)(((memaddr)tasks)+lastTask); } void set_last(Task *t) { lastTask = ((memaddr)t) - ((memaddr)tasks); }
--- a/TaskManager/test/GpuRegistTaskTest/GpuRegistTaskTest.cc Fri Aug 24 18:03:12 2012 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -#include <stdio.h> -#include <OpenCL/opencl.h> -#include "GpuThreads.h" -#include "GpuScheduler.h" -#include "CellTaskManagerImpl.h" - -#define DEFAULT 5 -extern void gpu_register_task(GpuThreads*, int, const char*, const char*); - -void -print_data(int *data, int size, const char *title){ - printf("%s ---\n", title); - - for ( int i = 0; i < size; i++) { - printf("%2d ", data[i]); - } - - printf("\n"); -} - -void -tester(int *indata,int *outdata, int num) -{ - - //check - int check = 0; - for (int c=0; c<num; c++){ - if(outdata[c] == indata[c]*2){ - check++; - } - } - printf("Computed '%d/%d' correct values\n",check,num); - -} - -void -test(int task_array_num) -{ - - GpuScheduler* g_scheduler = new GpuScheduler(); - - int *indata,*outdata; - int count; - - indata = new int(task_array_num); - outdata = new int(task_array_num); - - // prepare input data - for (count=0; count < task_array_num ;count++) { - indata[count] = count; - } - - g_scheduler->regist_task(1,"./twice.cl","twice"); - - - cl_int ret; - cl_context context = (cl_context)g_scheduler->command_queue; - - cl_mem memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*count, NULL, &ret); - cl_mem memobj_out = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*count, NULL, &ret); - cl_mem data_count = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*count, NULL, &ret); - - cl_command_queue command_queue = g_scheduler->command_queue; - ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, - sizeof(int)*count, indata, 0, NULL, NULL); - ret = clEnqueueWriteBuffer(command_queue, data_count, CL_TRUE, 0, - sizeof(count), &count, 0, NULL, NULL); - - print_data(indata, count, "before"); - - - cl_kernel *kernel = task_list[1].kernel; - clSetKernelArg(*kernel, 0, sizeof(cl_mem), (void *)&memobj_in); - clSetKernelArg(*kernel, 1, sizeof(cl_mem), (void *)&memobj_out); - clSetKernelArg(*kernel, 2, sizeof(cl_mem), (void *)&data_count); - - cl_event ev; - clEnqueueTask(command_queue, *kernel, 0, NULL, &ev); - - clEnqueueReadBuffer(command_queue, memobj_out, CL_TRUE, 0, - sizeof(int)*count, outdata, 1, &ev, NULL); - - - print_data(outdata, count, "after"); - tester(indata,outdata,count); - - delete [] indata; - delete [] outdata; - delete [] g_scheduler; - clReleaseCommandQueue(command_queue); - clReleaseContext(context); - -} - -int -main(int argc, char* argv[]) -{ - - int length = DEFAULT; - - if (argc > 1) { // if exist arg - if(atoi(argv[1])) {// if arg is number - length = atoi(argv[1]); - } - } - - test(length); - -}
--- a/TaskManager/test/GpuRegistTaskTest/Makefile Fri Aug 24 18:03:12 2012 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -include ../../Makefile.def - -CPPFLAGS += -g -Wall -I../../../include/TaskManager -m$(ABIBIT) - -TARGET=GpuRegistTaskTest - - - -LIBS += ../../libGpuManager.a -framework opencl - -GpuRegistTaskTest : GpuRegistTaskTest.o - $(CC) $(CFLAGS) -o $@ $? $(LIBS) - -clean: - rm -rf *.o $(TARGET)
--- a/TaskManager/test/GpuRegistTaskTest/Makefile.orig Fri Aug 24 18:03:12 2012 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -include ../../Makefile.def - -CPPFLAGS += -g -Wall -I../../../include/TaskManager -m$(ABIBIT) - -TARGET= GpuRegistTaskTest -framework opencl - -$(TARGET) : - -LIBS += ../../libGpuManager.a - -CpuRegistTaskTest : GpuRegistTaskTest.o - $(CC) $(CFLAGS) -o $@ $? $(LIBS) - -clean: - rm -rf *.o $(TARGET)
--- a/TaskManager/test/GpuRegistTaskTest/twice.cl Fri Aug 24 18:03:12 2012 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -__kernel void -twice(__global int *input_data, - __global int *output_data, - __global int *data_count) -{ - int count = *data_count; - for (int i = 0; i<count; i++) { - output_data[i] = input_data[i] * 2; - } - -} \ No newline at end of file
--- a/TaskManager/test/GpuRunTest/GpuRunTest.cc Fri Aug 24 18:03:12 2012 +0900 +++ b/TaskManager/test/GpuRunTest/GpuRunTest.cc Mon Sep 10 15:04:39 2012 +0900 @@ -54,7 +54,7 @@ void tester(int *indata, int *outdata, int num) { - + //チェック int check = 0; for (int c=0; c<num; c++) { @@ -62,30 +62,31 @@ check++; } } - + printf("Computed '%d/%d' correct values\n",check,num); - + } void test(TaskManager *manager) { - + int *indata = new int[length]; int *outdata = new int[length]; - + for (int c=0; c < length ;c++) { indata[c] = c; } print_data(indata, length, "before"); - + HTaskPtr twice = manager->create_task(Twice); twice->set_param(0, (memaddr)length); twice->set_inData(0, indata, sizeof (int)*length); twice->set_outData(0, outdata, sizeof (int)*length); twice->set_cpu(GPU_0); + twice->no_flip(); /* * set_post() で ppe task を渡せるようにしたい @@ -93,22 +94,22 @@ twice->set_post(twice_result, (void*)outdata, (void*)length); twice->spawn(); - + } - + int TMmain(TaskManager *manager, int argc, char* argv[]) { if (init(argc, argv) < 0) { return -1; } - + task_init(); for (int i = 0; i < task; ++i) { test(manager); } - + return 0; }
--- a/example/OpenCL/twice.cl Fri Aug 24 18:03:12 2012 +0900 +++ b/example/OpenCL/twice.cl Mon Sep 10 15:04:39 2012 +0900 @@ -1,11 +1,11 @@ __kernel void twice(__global int *input_data, __global int *output_data, - __global int *data_count,) + __global int *data_count) { + int count = *data_count; for (int i = 0; i<count; i++) { output_data[i] = input_data[i] * 2; - } - -} \ No newline at end of file + } +}
--- a/example/many_task/Makefile.macosx Fri Aug 24 18:03:12 2012 +0900 +++ b/example/many_task/Makefile.macosx Mon Sep 10 15:04:39 2012 +0900 @@ -1,3 +1,4 @@ + include ./Makefile.def @@ -40,3 +41,4 @@ rm -f *~ \#* rm -f ppe/*~ ppe/\#* rm -f spe/*~ spe/\#* + rm -f gpu/*~ gpu/\#*
--- a/example/many_task/main.cc Fri Aug 24 18:03:12 2012 +0900 +++ b/example/many_task/main.cc Mon Sep 10 15:04:39 2012 +0900 @@ -25,6 +25,7 @@ static double ed_time; static int length = 1200; +static int // prototype void TMend(TaskManager *); @@ -65,7 +66,7 @@ all = 1; } if (strcmp(argv[i], "-c") == 0 ) { - sort_task = SortCompat; + sort_task = SortCompat } if (strcmp(argv[i], "-s") == 0 ) { sort_task = SortSimple; @@ -113,6 +114,7 @@ HTaskPtr restart = manager->create_task(sort_task,0,0,0,0); restart->set_param(0,(memaddr)&sorter); + //set flip flag restart->spawn(); }