changeset 1915:effb5653fd5c draft
update cuda, not yet running
author       Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date         Mon, 20 Jan 2014 21:59:56 +0900
parents      08e9e416c2e0
children     64bd56aed386
files        TaskManager/ChangeLog TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h TaskManager/Cuda/CudaTaskManagerFactory.cc TaskManager/Cuda/CudaThreads.cc TaskManager/Cuda/CudaThreads.h TaskManager/Gpu/GpuScheduler.cc TaskManager/Makefile TaskManager/Makefile.cuda TaskManager/Makefile.def
diffstat     10 files changed, 423 insertions(+), 4 deletions(-)
--- a/TaskManager/ChangeLog Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/ChangeLog Mon Jan 20 21:59:56 2014 +0900
@@ -1,3 +1,21 @@
+2014-1-20 Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
+
+    In CUDA, the equivalent of clEnqueueNDRangeKernel is cuLaunchKernel.
+    In OpenCL, the number of kernel instances launched is determined by global_work_size(0)*...*global_work_size(work_dim-1).
+    In CUDA, it is determined by gridDim * blockDim.
+    However, gridDim and blockDim have upper limits: gridDim is 2^16, blockDim is 2^9.
+    The current iterate cannot handle CUDA as it is.
+
+    CUDA has nothing that corresponds to OpenCL's command_queue.
+    A stream is the closest thing to a command_queue.
+    Multiple streams can be run in parallel.
+    There is some documentation saying the execution order is controlled on the GPU side...
+    Are they executed out of order?
+    Can OpenCL also run multiple command_queues in parallel?
+    Instead of putting everything into one queue, would it be better to create multiple command_queues?
+    command_queues can be synchronized with each other, but can one wait on an event from another queue?
+    If the granularity of a command_queue is reduced, it seems feasible without using events.
+
 2014-1-4 Shinji kONO <kono@ie.u-ryukyu.ac.jp>
 
     MY_SPE_STATUS_READY is sent only after waiting for task completion, which is not really desirable.
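To make the launch-geometry point above concrete, here is a minimal sketch, not part of this changeset, of how a flat OpenCL-style global_work_size could be split into a gridDim/blockDim pair for cuLaunchKernel. The helper name decide_geometry and the 512-thread (2^9) block limit are illustrative assumptions taken from the limits quoted in the ChangeLog entry:

    #include <cuda.h>

    /* Hypothetical helper: split a flat work size into gridDim.x and
     * blockDim.x for cuLaunchKernel, respecting the per-launch limits
     * mentioned above. */
    static void decide_geometry(size_t global_work_size,
                                unsigned int *gridDimX, unsigned int *blockDimX)
    {
        const unsigned int max_block = 512;   /* 2^9, as noted above */
        if (global_work_size == 0) { *gridDimX = *blockDimX = 1; return; }
        *blockDimX = (global_work_size < max_block)
                   ? (unsigned int)global_work_size : max_block;
        /* round up so gridDimX * blockDimX covers the whole work size;
         * the kernel must bounds-check its global index */
        *gridDimX = (unsigned int)((global_work_size + *blockDimX - 1) / *blockDimX);
    }

With this split, the kernel computes blockIdx.x*blockDim.x + threadIdx.x as its global index and must ignore indices beyond the original work size; that bounds check is the extra step an OpenCL-style flat iterate does not need.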
--- a/TaskManager/Cuda/CudaScheduler.cc Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc Mon Jan 20 21:59:56 2014 +0900
@@ -12,6 +12,8 @@
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <string.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
 
 TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
@@ -32,10 +34,12 @@
         const char* msg = convert_error_status(ret);
         error(msg);
     }
+    cuStreamCreate(&stream, 0);
 }
 
 CudaScheduler::~CudaScheduler()
 {
+    cuStreamDestroy(stream);
     cuCtxDestroy(context);
 }
 
@@ -65,7 +69,7 @@
         m->event = (CUevent*)remalloc(m->allcate_size*sizeof(CUevent*));
     }
 
-    cuMemAlloc(&m->buf[i], size);
+    *error = cuMemAlloc(&m->buf[i], size);
     return m->buf[i];
 }
 
@@ -126,5 +130,191 @@
     if (kernel_event[cur] != 0)
         cuEventDestroy(kernel_event[cur]);
     kernel_event[cur] = NOP_REPLY;
-    if (kernel[cur] != 0)
-
+    // if (kernel[cur] != 0)
+    //     kerneldestroy();
+    kernel[cur] = 0;
+    release_buf_event(1-cur, memout);
+    release_buf_event(1-cur, memin);
+
+    wait_for_event(kernel_event, memout, taskList, cur);
+}
+
+void
+CudaScheduler::run() {
+    int cur = 0;
+    TaskListPtr tasklist = NULL;
+    reply = 0;
+    initCudaBuffer(&memin[0]); initCudaBuffer(&memin[1]);
+    initCudaBuffer(&memout[0]); initCudaBuffer(&memout[1]);
+    memset(&flag, 0, sizeof(HTask::htask_flag)*2);
+
+    for (;;) {
+        memaddr params_addr = connector->task_list_mail_read();
+
+        if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
+            cuStreamDestroy(stream);
+            destroyCudaBuffer(&memin[0]); destroyCudaBuffer(&memin[1]);
+            destroyCudaBuffer(&memout[0]); destroyCudaBuffer(&memout[1]);
+            return;
+        }
+
+        (*connector->start_dmawait_profile)(&(connector->start_time));
+        while (params_addr) {
+            // since we are on the same memory space, we don't have to use dma_load here
+            tasklist = (TaskListPtr)connector->dma_load(this, params_addr,
+                                                        sizeof(TaskList), DMA_READ_TASKLIST);
+            // tasklist[cur]->task_start_time = gettime();
+            tasklist->task_start_time = 0;
+            /*
+             * get flip flag
+             * flip : when calculating on input data, treat it as output data
+             */
+            if (tasklist->self) {
+                flag[cur] = tasklist->self->flag;
+            } else {
+                memset(&flag[cur], 0, sizeof(HTask::htask_flag));
+            }
+            for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) {
+                if (nextTask->command==ShowTime) {
+                    connector->show_profile(); continue;
+                }
+                if (nextTask->command==StartProfile) {
+                    connector->start_profile(); continue;
+                }
+                if (load_kernel(nextTask->command) == 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                CUmodule& module = *cuda_task_list[nextTask->command].cudatask->module;
+                const char *funcname = cuda_task_list[nextTask->command].name;
+
+                ret = cuModuleGetFunction(kernel[cur], module, funcname);
+                if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+
+                int param = 0;
+
+                // set arg count
+                CUdeviceptr memparam = createBuffer(&memin[cur], 0, context,
+                                                    sizeof(memaddr)*nextTask->param_count, &ret);
+                if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+
+                // parameter is passed as first kernel arg
+                ret = cuMemcpyHtoDAsync(memparam, nextTask->param(0), sizeof(memaddr)*nextTask->param_count, stream);
+                if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+
+                ret = cuParamSetv(kernel[cur], 0, memin[cur].buf[0], sizeof(memaddr));
+                if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+
+                param++;
+
+                for (int i=0;i<nextTask->inData_count;i++) {
+                    ListElement *input_buf = nextTask->inData(i);
+                    if (input_buf->size==0) break;
+                    createBuffer(&memin[cur], param, context, input_buf->size, &ret);
+                    if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+                    ret = cuMemcpyHtoDAsync(memin[cur].buf[param], input_buf->addr, input_buf->size, stream);
+                    if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+                    ret = cuParamSetv(kernel[cur], 0, memin[cur].buf[param], sizeof(memaddr));
+                    if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+
+                    param++;
+                }
+                memin[cur].size = param; // +1 means param
+
+                for (int i = 0; i<nextTask->outData_count; i++) { // set output data
+                    ListElement *output_buf = nextTask->outData(i);
+                    if (output_buf->size==0) break;
+                    if (!flag[cur].flip) { // flip use memin for output
+                        createBuffer(&memout[cur], i, context, output_buf->size, &ret);
+                        if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+                        ret = cuParamSetv(kernel[cur], 0, memout[cur].buf[i], sizeof(memaddr));
+                        if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+                        // enqueue later
+                    }
+                    param++;
+                }
+                memout[cur].size = param - memin[cur].size; // no buffer on flip, but flip use memout event
+
+
+                if (tasklist->dim > 0) {
+                    ret = cuLaunchKernel(kernel[cur],
+                                         tasklist->x, tasklist->y, tasklist->z,
+                                         1, 1, 1,
+                                         0, stream, NULL, NULL);
+                } else {
+                    ret = cuLaunchKernel(kernel[cur],
+                                         1, 1, 1,
+                                         1, 1, 1,
+                                         0, stream, NULL, NULL);
+                }
+                if (ret<0) { cudaTaskError(cur, tasklist, ret); continue; }
+
+                for (int i=0;i<nextTask->outData_count;i++) { // read output data
+                    ListElement *output_buf = nextTask->outData(i);
+                    if (output_buf->size==0) break;
+                    GpuBufferPtr mem = flag[cur].flip ? memin : memout ;
+                    int i0 = flag[cur].flip ? i+1 : i ;
+                    // flip use memin buffer and memout event
+                    ret = cuMemcpyDtoHAsync(output_buf->addr, mem[cur].buf[i0], output_buf->size, stream);
+                    if (ret<0) { cudaTaskError(cur,tasklist,ret); continue; }
+                }
+                // wait kernel[1-cur] and write[1-cur]
+                // pipeline : cur
+                // to stop pipeline set 1-cur
+                wait_for_event(kernel_event, memout, tasklist, cur);
+                cur = 1 - cur;
+            }
+            reply = (memaddr)tasklist->waiter;
+            params_addr = (memaddr)tasklist->next;
+        }
+        wait_for_event(kernel_event, memout, tasklist, cur);
+
+        unsigned long long wait = 0;
+        (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
+        connector->mail_write((memaddr)MY_SPE_STATUS_READY);
+    }
+    /* NOT REACHED */
+}
+
+int
+not_ready(SchedTask* smanager, void* r, void *w)
+{
+    smanager->printf("GPU task not ready %d\n", smanager->atask->command);
+    return 0;
+}
+
+/*
+ * kernel file open and build program
+ */
+int
+CudaScheduler::load_kernel(int cmd)
+{
+    if (cuda_task_list[cmd].run == null_run) {
+        return 1;
+    }
+
+    if (cuda_task_list[cmd].cudatask == 0 || cuda_task_list[cmd].cudatask->module == 0) {
+        fprintf(stderr, "CUDA module %d not defined.\n", cmd);
+        return 0;
+    }
+
+    CUmodule* module = new CUmodule;
+    ret = cuModuleLoad(module, (const char*)cuda_task_list[cmd].cudatask->module);
+
+    if (ret<0) {
+        error(convert_error_status(ret));
+    }
+    cuda_task_list[cmd].cudatask->module = module;
+    cuda_task_list[cmd].run = null_run; // kernel is ready
+    return 1;
+}
+
+// register kernel file name
+void
+cuda_register_task(int cmd, const char* filename, const char* functionname)
+{
+    cuda_task_list[cmd].run = not_ready;  // not yet ready
+    cuda_task_list[cmd].load = null_loader;
+    cuda_task_list[cmd].wait = null_loader;
+    cuda_task_list[cmd].name = functionname;
+    cuda_task_list[cmd].cudatask->module = (CUmodule*)filename;
+}
+
+/* end */
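A note on the API mix above: cuParamSetv belongs to the pre-CUDA-4.0 launch interface (cuParamSetv/cuLaunchGrid), whereas cuLaunchKernel receives its kernel arguments through the kernelParams array. A minimal sketch of the cuLaunchKernel convention follows; all names (launch_sketch, param_buf, in_buf, gridX, blockX) are illustrative, not taken from this scheduler:

    #include <cuda.h>

    /* Sketch: pass two device buffers to a kernel via cuLaunchKernel's
     * kernelParams array instead of cuParamSetv. */
    CUresult launch_sketch(CUfunction kernel,
                           CUdeviceptr param_buf, CUdeviceptr in_buf,
                           unsigned int gridX, unsigned int blockX,
                           CUstream stream)
    {
        void *kernel_params[] = { &param_buf, &in_buf };
        return cuLaunchKernel(kernel,
                              gridX, 1, 1,    /* grid dimensions  */
                              blockX, 1, 1,   /* block dimensions */
                              0,              /* shared memory bytes */
                              stream,
                              kernel_params,  /* kernel arguments */
                              NULL);          /* extra (unused) */
    }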
--- a/TaskManager/Cuda/CudaScheduler.h Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.h Mon Jan 20 21:59:56 2014 +0900
@@ -6,6 +6,9 @@
 #include "CudaThreads.h"
 #include "HTask.h"
 #include "TaskManager.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+
 extern TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
@@ -36,6 +39,7 @@
     // command_queue command_queue; // CUDA has nothing that corresponds to command_queue
     // Closest approximation would be the CUDA Stream mechanism. apparently...
+    CUstream stream;
     int ret;
     memaddr reply;
     // corresponds to cl_kernel
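For the stream-as-command_queue analogy noted in the header, here is a minimal standalone sketch (illustrative names, not scheduler code): operations issued to one CUstream execute in order, like an in-order command_queue, while separate streams may overlap.

    #include <cuda.h>

    /* Sketch of the stream-as-command_queue analogy. */
    void stream_demo(CUdeviceptr dev, void *host, size_t size)
    {
        CUstream stream;
        cuStreamCreate(&stream, 0);                  /* like clCreateCommandQueue */
        cuMemcpyHtoDAsync(dev, host, size, stream);  /* enqueued, runs in order   */
        /* a cuLaunchKernel(..., stream, ...) issued here runs after the copy */
        cuStreamSynchronize(stream);                 /* like clFinish on the queue */
        cuStreamDestroy(stream);
    }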
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/Cuda/CudaTaskManagerFactory.cc Mon Jan 20 21:59:56 2014 +0900
@@ -0,0 +1,13 @@
+#define DEBUG
+#include "CellTaskManagerImpl.h"
+#include "CudaThreads.h"
+#include "CpuThreads.h"
+
+TaskManagerImpl *create_impl(int num, int num_gpu, int useRefDma)
+{
+    int io_num = 2; // two threads for I/O
+    init_task_list(cuda_task_list);
+    Threads *cpus = new CpuThreads(num, io_num, useRefDma, num_gpu);
+    num += num_gpu; // for GPU
+    return new CellTaskManagerImpl(num, num_gpu, cpus);
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/Cuda/CudaThreads.cc Mon Jan 20 21:59:56 2014 +0900
@@ -0,0 +1,96 @@
+#include "CudaThreads.h"
+#include "CudaScheduler.h"
+#include "TaskManagerImpl.h"
+#include "SpeTaskManagerImpl.h"
+
+const int gpu_num = 1;
+
+CudaThreads::CudaThreads(int useRefDma) : use_refdma(useRefDma)
+{
+    threads = new pthread_t[gpu_num];
+    args = new cuda_thread_arg_t;
+}
+
+CudaThreads::~CudaThreads()
+{
+    memaddr mail = (memaddr)MY_SPE_COMMAND_EXIT;
+    send_mail(0, 1, &mail);
+    pthread_join(threads[0], NULL);
+
+    delete[] threads;
+    delete args;
+}
+
+void
+CudaThreads::set_wait(SemPtr wait)
+{
+    args->wait = wait;
+}
+
+void
+CudaThreads::init()
+{
+    args->scheduler = new CudaScheduler();
+    args->useRefDma = use_refdma;
+
+    pthread_create(&threads[0], NULL, &cuda_thread_run, args);
+}
+
+void
+CudaThreads::set_mail_waiter(SemPtr w)
+{
+    args->scheduler->connector->set_mail_waiter(w);
+}
+
+void *
+CudaThreads::cuda_thread_run(void *args)
+{
+    cuda_thread_arg_t *argt = (cuda_thread_arg_t *) args;
+    Scheduler *g_scheduler = argt->scheduler;
+
+    TaskManagerImpl *manager = new SpeTaskManagerImpl();
+    g_scheduler->init(manager, argt->useRefDma);
+
+    manager->set_scheduler(g_scheduler);
+
+    argt->wait->sem_v();
+
+    g_scheduler->run();
+    g_scheduler->finish();
+
+    return NULL;
+}
+
+int
+CudaThreads::spawn_task(int id, TaskListPtr p) {
+    send_mail(id, 1, (memaddr*)p);
+    return 0;
+}
+
+int
+CudaThreads::get_mail(int speid, int count, memaddr *ret)
+{
+    *ret = args->scheduler->mail_read_from_host();
+    return 1;
+}
+
+int
+CudaThreads::has_mail(int speid, int count, memaddr *ret)
+{
+    if (args->scheduler->has_mail_from_host() != 0) {
+        return get_mail(0, 0, ret);
+    } else {
+        return 0;
+    }
+}
+
+void
+CudaThreads::send_mail(int speid, int num, memaddr *data)
+{
+    args->scheduler->mail_write_from_host(*data);
+}
+
+void
+CudaThreads::add_output_tasklist(int command, memaddr buff, int alloc_size)
+{
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/Cuda/CudaThreads.h Mon Jan 20 21:59:56 2014 +0900
@@ -0,0 +1,51 @@
+#ifndef INCLUDED_CUDA_THREADS
+#define INCLUDED_CUDA_THREADS
+
+#include <pthread.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "Threads.h"
+#include "CudaScheduler.h"
+#include "Sem.h"
+
+
+class CudaScheduler;
+
+typedef struct cuda_arg {
+    int cpuid;
+    // should be synchronized
+    CudaScheduler *scheduler;
+    TaskManagerImpl *manager;
+    SemPtr wait;
+    int useRefDma;
+} cuda_thread_arg_t;
+
+class CudaThreads : public Threads {
+
+  public:
+    /*
+    static GpuThreads* getInstance() {
+        static GpuThreads singleton;
+        return &singleton;
+    }*/
+    CudaThreads(int useRefDma);
+    ~CudaThreads();
+
+    void init();
+    static void *cuda_thread_run(void *args);
+    virtual int spawn_task(int cpu_num, TaskListPtr p);
+    virtual void set_mail_waiter(SemPtr w);
+
+    int get_mail(int speid, int count, memaddr *ret);
+    int has_mail(int speid, int count, memaddr *ret);
+    void send_mail(int speid, int num, memaddr *data);
+    void add_output_tasklist(int command, memaddr buff, int alloc_size);
+    void set_wait(SemPtr);
+
+  private:
+    cuda_thread_arg_t *args;
+    pthread_t *threads;
+    int use_refdma;
+};
+
+#endif
--- a/TaskManager/Gpu/GpuScheduler.cc Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc Mon Jan 20 21:59:56 2014 +0900
@@ -273,7 +273,7 @@
         if (tasklist->dim > 0) {
             ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist->dim,
-                 NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
+                NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
         } else {
             ret = clEnqueueTask(command_queue, kernel[cur],
                 memin[cur].size, memin[cur].event, &kernel_event[cur]);
--- a/TaskManager/Makefile Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Makefile Mon Jan 20 21:59:56 2014 +0900
@@ -24,6 +24,9 @@
 gpu: FORCE
 	@$(MAKE) -f Makefile.gpu
 
+cuda: FORCE
+	@$(MAKE) -f Makefile.cuda
+
 FORCE:
 	-mkdir -p ../include/TaskManager
 	rsync `find . -name Test -prune -or -name '*.h' -print` ../include/TaskManager
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/Makefile.cuda Mon Jan 20 21:59:56 2014 +0900
@@ -0,0 +1,40 @@
+include ./Makefile.def
+TARGET = libCudaManager.a
+CFLAGS += -DHAS_POSIX_MEMALIGN
+CUDA_PATH = /Developer/NVIDIA/CUDA-5.5/include
+
+VPATH = $(CUDA_PATH)
+
+ifdef LINUX
+CFLAGS += -lrt
+endif
+
+.SUFFIXES: .cc .o
+
+EXTRA_CFLAGS = -D__CERIUM_CUDA__ -D__CERIUM_PARALLEL__
+
+.cc.o:
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(INCLUDE) -c $< -o $@
+
+
+all: default
+default: $(TARGET)
+
+ALL_OBJS = $(KERN_MAIN_OBJS) $(KERN_PPE_OBJS) $(KERN_SCHED_OBJS) \
+	$(KERN_SYSTASK_OBJS) $(IMPL_FIFO_OBJS) $(KERN_MEM_OBJS) \
+	$(IMPL_MANYCORE_OBJS) $(IMPL_CUDA_OBJS) Cell/spe/SpeTaskManagerImpl.o Cell/CellTaskManagerImpl.o Cuda/CudaTaskManagerFactory.o
+
+Makefile.dep:
+	make -f Makefile.cuda depend
+
+depend:
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(INCLUDE) $(ALL_OBJS:.o=.cc) -MM > Makefile.dep
+
+$(TARGET): $(ALL_OBJS)
+	ar crus $@ $(ALL_OBJS)
+
+cudadistclean: cudaclean
+	rm -f $(TARGET)
+
+cudaclean:
+	rm -f $(TARGET) $(ALL_OBJS)
--- a/TaskManager/Makefile.def Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Makefile.def Mon Jan 20 21:59:56 2014 +0900
@@ -31,6 +31,10 @@
 IMPL_GPU_SRCS = $(wildcard $(IMPL_GPU_DIR)/*.cc)
 IMPL_GPU_OBJS = $(filter-out $(IMPL_GPU_DIR)/GpuTaskManagerFactory.o,$(IMPL_GPU_SRCS:.cc=.o))
 
+IMPL_CUDA_DIR = Cuda
+IMPL_CUDA_SRCS = $(wildcard $(IMPL_CUDA_DIR)/*.cc)
+IMPL_CUDA_OBJS = $(filter-out $(IMPL_CUDA_DIR)/CudaTaskManagerFactory.o,$(IMPL_CUDA_SRCS:.cc=.o))
+
 IMPL_MANYCORE_DIR = ManyCore
 IMPL_MANYCORE_SRCS = $(wildcard $(IMPL_MANYCORE_DIR)/*.cc)
 IMPL_MANYCORE_OBJS = $(filter-out $(IMPL_MANYCORE_DIR)/ManyCoreTaskManagerFactory.o,$(IMPL_MANYCORE_SRCS:.cc=.o))