changeset 1915:effb5653fd5c draft

update cuda, not yet running
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Mon, 20 Jan 2014 21:59:56 +0900
parents 08e9e416c2e0
children 64bd56aed386
files TaskManager/ChangeLog TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h TaskManager/Cuda/CudaTaskManagerFactory.cc TaskManager/Cuda/CudaThreads.cc TaskManager/Cuda/CudaThreads.h TaskManager/Gpu/GpuScheduler.cc TaskManager/Makefile TaskManager/Makefile.cuda TaskManager/Makefile.def
diffstat 10 files changed, 423 insertions(+), 4 deletions(-)
--- a/TaskManager/ChangeLog	Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/ChangeLog	Mon Jan 20 21:59:56 2014 +0900
@@ -1,3 +1,21 @@
+2014-1-20 Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
+
+	In CUDA, the counterpart of clEnqueueNDRangeKernel is cuLaunchKernel.
+	In OpenCL, the number of kernel instances launched is given by global_work_size(0)*...*global_work_size(work_dim-1).
+	In CUDA it is given by gridDim * blockDim.
+	However, gridDim and blockDim have upper limits: gridDim is 2^16, blockDim is 2^9.
+	The current iterate interface cannot express this for CUDA; see the sketch at the end of this entry.
+
+	CUDA has no direct counterpart of OpenCL's command_queue.
+	A stream is the closest thing to a command_queue.
+	Multiple streams can be run in parallel.
+		Some documentation says the execution order is controlled on the GPU side...
+		Are they executed out of order?
+	Can OpenCL also run multiple command_queues in parallel?
+		Rather than putting everything into a single queue, would it be better to create several command_queues?
+		command_queues can synchronize with each other, but can one wait on an event belonging to another queue?
+			If we make the command_queue granularity fine enough, this might work without using events at all.
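+
+	A minimal sketch of the split we would need (hypothetical names
+	f, args, dev_in, dev_out, want; assuming the 2^16 / 2^9 limits above):
+
+	    size_t n = want;                          // total kernel instances needed
+	    unsigned block = 256;                     // <= 2^9 threads per block
+	    unsigned grid = (n + block - 1) / block;  // <= 2^16 blocks per grid dimension
+	    void* args[] = { &dev_in, &dev_out, &n }; // kernelParams for cuLaunchKernel
+	    cuLaunchKernel(f, grid, 1, 1, block, 1, 1, 0, stream, args, NULL);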
+
 2014-1-4 Shinji KONO <kono@ie.u-ryukyu.ac.jp>
 
 	MY_SPE_STATUS_READY is issued only after waiting for the task to finish, which is not really desirable.
--- a/TaskManager/Cuda/CudaScheduler.cc	Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Mon Jan 20 21:59:56 2014 +0900
@@ -12,6 +12,8 @@
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <string.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
 
 TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
@@ -32,10 +34,12 @@
         const char* msg = convert_error_status(ret);
         error(msg);
     }
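+    // a CUDA stream stands in for OpenCL's command_queue here (see ChangeLog)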
+    cuStreamCreate(&stream, 0);
 }
 
 CudaScheduler::~CudaScheduler()
 {
+    cuStreamDestroy(stream);
     cuCtxDestroy(context);
 }
 
@@ -65,7 +69,7 @@
         m->event = (CUevent*)remalloc(m->allcate_size*sizeof(CUevent*));
     }
 
-    cuMemAlloc(&m->buf[i], size);
+    *error = cuMemAlloc(&m->buf[i], size);
     return m->buf[i];
 }
 
@@ -126,5 +130,191 @@
     if (kernel_event[cur] != 0)
         cuEventDestroy(kernel_event[cur]);
     kernel_event[cur] = NOP_REPLY;
-    if (kernel[cur] != 0)
-        
+    //    if (kernel[cur] != 0)
+    // kerneldestroy();
+    kernel[cur] = 0;
+    release_buf_event(1-cur, memout);
+    release_buf_event(1-cur, memin);
+
+    wait_for_event(kernel_event, memout, taskList, cur);
+}
+
+void
+CudaScheduler::run() {
+    int cur = 0;
+    TaskListPtr tasklist = NULL;
+    reply = 0;
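+    // double buffering: memin/memout[0..1] let one task's transfers overlap the other's kernel execution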
+    initCudaBuffer(&memin[0]);initCudaBuffer(&memin[1]);
+    initCudaBuffer(&memout[0]);initCudaBuffer(&memout[1]);
+    memset(&flag, 0, sizeof(HTask::htask_flag)*2);
+
+    for (;;) {
+        memaddr params_addr = connector->task_list_mail_read();
+
+        if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
+            cuStreamDestroy(stream);
+            destroyCudaBuffer(&memin[0]);destroyCudaBuffer(&memin[1]);
+            destroyCudaBuffer(&memout[0]);destroyCudaBuffer(&memout[1]);
+            return;
+        }
+
+        (*connector->start_dmawait_profile)(&(connector->start_time));
+        while (params_addr) {
+            // since we are on the same memory space, we don't have to use dma_load here
+            tasklist = (TaskListPtr)connector->dma_load(this, params_addr,
+                                                        sizeof(TaskList), DMA_READ_TASKLIST);
+            //            tasklist[cur]->task_start_time = gettime();
+            tasklist->task_start_time = 0;
+            /*
+             * get flip flag
+             * flip: when a task computes on its input data, treat that input as the output
+             */
+            if (tasklist->self) {
+                flag[cur] = tasklist->self->flag;
+            } else {
+                memset(&flag[cur], 0, sizeof(HTask::htask_flag));
+            }
+            for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) {
+                if(nextTask->command==ShowTime) {
+                    connector->show_profile(); continue;
+                }
+                if(nextTask->command==StartProfile) {
+                    connector->start_profile(); continue;
+                }
+                if (load_kernel(nextTask->command) == 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                CUmodule& module = *cuda_task_list[nextTask->command].cudatask->module;
+                const char *funcname = cuda_task_list[nextTask->command].name;
+                
+                ret = cuModuleGetFunction(&kernel[cur], module, funcname);
+                if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+
+                int param = 0;
+
+                // set arg count
+                CUdeviceptr memparam = createBuffer(&memin[cur], 0, context,
+                                                    sizeof(memaddr)*nextTask->param_count, &ret);
+                if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+
+                // parameter is passed as first kernel arg 
+                ret = cuMemcpyHtoDAsync(memparam, nextTask->param(0), sizeof(memaddr)*nextTask->param_count, stream);
+                if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                
+                ret = cuParamSetv(kernel[cur], 0, &memin[cur].buf[0], sizeof(memaddr));
+                if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                
+                param++;
+                
+                for(int i=0;i<nextTask->inData_count;i++) {
+                    ListElement *input_buf = nextTask->inData(i);
+                    if (input_buf->size==0) break;
+                    createBuffer(&memin[cur], param, context, input_buf->size, &ret);
+                    if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                    ret = cuMemcpyHtoDAsync(memin[cur].buf[param], input_buf->addr, input_buf->size, stream);
+                    if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                    ret = cuParamSetv(kernel[cur], param*sizeof(memaddr), &memin[cur].buf[param], sizeof(memaddr));
+                    if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                    
+                    param++;
+                }
+                memin[cur].size  = param; // includes the leading param buffer (the +1)
+                
+                for(int i = 0; i<nextTask->outData_count;i++) { // set output data
+                    ListElement *output_buf = nextTask->outData(i);
+                    if (output_buf->size==0) break;
+                    if (!flag[cur].flip) { // flip uses memin for output
+                        createBuffer(&memout[cur], i, context, output_buf->size, &ret);
+                        if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                        ret = cuParamSetv(kernel[cur], param*sizeof(memaddr), &memout[cur].buf[i], sizeof(memaddr));
+                        if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                        // enqueue later
+                    }
+                    param++;
+                }
+                memout[cur].size = param - memin[cur].size;  // flip allocates no buffer here, but still uses memout's events
+
+                
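+                // tasklist->x/y/z map to gridDim, with blockDim fixed at 1,1,1,
+                // so at most 2^16 instances per grid dimension (see ChangeLog).
+                // Note: the arguments above were attached with cuParamSetv, which
+                // belongs to the deprecated cuLaunchGrid API; cuLaunchKernel normally
+                // receives arguments through its kernelParams parameter instead.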
+                if (tasklist->dim > 0) {
+                    ret = cuLaunchKernel(kernel[cur],
+                                         tasklist->x, tasklist->y, tasklist->z,
+                                         1, 1, 1,
+                                         0, stream, NULL, NULL);
+                } else {
+                    ret = cuLaunchKernel(kernel[cur],
+                                         1, 1, 1,
+                                         1, 1, 1,
+                                         0, stream, NULL, NULL);
+                }
+                if (ret != 0) { cudaTaskError(cur, tasklist, ret); continue; }
+
+                for(int i=0;i<nextTask->outData_count;i++) { // read output data
+                    ListElement *output_buf = nextTask->outData(i);
+                    if (output_buf->size==0) break;
+                    GpuBufferPtr mem = flag[cur].flip ? memin : memout ;
+                    int i0 = flag[cur].flip ? i+1 : i ;
+                    // flip use memin buffer and memout event
+                    ret = cuMemcpyDtoHAsync(output_buf->addr, mem[cur].buf[i0], output_buf->size, stream);
+                    if (ret != 0) { cudaTaskError(cur,tasklist,ret); continue; }
+                }
+                // wait for kernel[1-cur] and write[1-cur]
+                // pipeline    : cur
+                // to stop pipeline set 1-cur
+                wait_for_event(kernel_event, memout, tasklist, cur);
+                cur = 1 - cur;
+            }
+            reply = (memaddr)tasklist->waiter;
+            params_addr = (memaddr)tasklist->next;
+        }
+        wait_for_event(kernel_event, memout, tasklist, cur);
+
+        unsigned long long wait = 0;
+        (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
+        connector->mail_write((memaddr)MY_SPE_STATUS_READY);
+    }
+    /* NOT REACHED */
+}
+
+int
+not_ready(SchedTask* smanager, void* r, void *w)
+{
+    smanager->printf("GPU task not ready %d\n", smanager->atask->command);
+    return 0;
+}
+
+/*
+ * open the kernel file and load its module
+ */
+int
+CudaScheduler::load_kernel(int cmd)
+{
+    if (cuda_task_list[cmd].run == null_run) {
+        return 1;
+    }
+
+    if (cuda_task_list[cmd].cudatask == 0 || cuda_task_list[cmd].cudatask->module == 0) {
+        fprintf(stderr, "CUDA module %d not defined.\n",cmd);
+        return 0;
+    }
+
+    CUmodule* module = new CUmodule;
+    ret = cuModuleLoad(module, (const char*)cuda_task_list[cmd].cudatask->module);
+
+    if (ret != 0) {
+        error(convert_error_status(ret));
+    }
+    cuda_task_list[cmd].cudatask->module = module;
+    cuda_task_list[cmd].run = null_run; // kernel is ready
+    return 1;
+}
+
+// register kernel file name
+void
+cuda_register_task(int cmd, const char* filename, const char* functionname)
+{
+    cuda_task_list[cmd].run = not_ready;  // not yet ready
+    cuda_task_list[cmd].load = null_loader;
+    cuda_task_list[cmd].wait = null_loader;
+    cuda_task_list[cmd].name = functionname;
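+    // stash the file name in the module slot for now; load_kernel() replaces it with the loaded CUmodule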
+    cuda_task_list[cmd].cudatask->module = (CUmodule*)filename;
+}
+
+/* end */
--- a/TaskManager/Cuda/CudaScheduler.h	Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.h	Mon Jan 20 21:59:56 2014 +0900
@@ -6,6 +6,9 @@
 #include "CudaThreads.h"
 #include "HTask.h"
 #include "TaskManager.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+
 
 extern TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
@@ -36,6 +39,7 @@
     // command_queue command_queue;
     // CUDA has nothing that corresponds to command_queue
     // "Closest approximation would be the CUDA Stream mechanism", apparently...
+    CUstream stream;
     int ret;
     memaddr reply;
     // corresponds to cl_kernel
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/Cuda/CudaTaskManagerFactory.cc	Mon Jan 20 21:59:56 2014 +0900
@@ -0,0 +1,13 @@
+#define DEBUG
+#include "CellTaskManagerImpl.h"
+#include "CudaThreads.h"
+#include "CpuThreads.h"
+
+TaskManagerImpl *create_impl(int num, int num_gpu, int useRefDma)
+{
+    int io_num = 2; // two threads for I/O
+    init_task_list(cuda_task_list);
+    Threads *cpus = new CpuThreads(num, io_num, useRefDma,num_gpu);
+    num += num_gpu; // for GPU
+    return new CellTaskManagerImpl(num, num_gpu, cpus);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/Cuda/CudaThreads.cc	Mon Jan 20 21:59:56 2014 +0900
@@ -0,0 +1,96 @@
+#include "CudaThreads.h"
+#include "CudaScheduler.h"
+#include "TaskManagerImpl.h"
+#include "SpeTaskManagerImpl.h"
+
+const int gpu_num = 1;
+
+CudaThreads::CudaThreads(int useRefDma) : use_refdma(useRefDma)
+{
+    threads = new pthread_t[gpu_num];
+    args = new cuda_thread_arg_t;
+}
+
+CudaThreads::~CudaThreads()
+{
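+    // ask the scheduler thread to exit, then join it before freeing resources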
+    memaddr mail = (memaddr)MY_SPE_COMMAND_EXIT;
+    send_mail(0,1,&mail);
+    pthread_join(threads[0], NULL);
+
+    delete [] threads;
+    delete args;
+}
+
+void
+CudaThreads::set_wait(SemPtr wait)
+{
+    args->wait=wait;
+}
+
+void
+CudaThreads::init()
+{
+    args->scheduler = new CudaScheduler();
+    args->useRefDma = use_refdma;
+
+    pthread_create(&threads[0], NULL, &cuda_thread_run, args);
+}
+
+void
+CudaThreads::set_mail_waiter(SemPtr w)
+{
+    args->scheduler->connector->set_mail_waiter(w);
+}
+
+void *
+CudaThreads::cuda_thread_run(void *args)
+{
+    cuda_thread_arg_t *argt = (cuda_thread_arg_t *) args;
+    Scheduler *g_scheduler = argt->scheduler;
+
+    TaskManagerImpl *manager = new SpeTaskManagerImpl();
+    g_scheduler->init(manager, argt->useRefDma);
+
+    manager->set_scheduler(g_scheduler);
+
+    argt->wait->sem_v();
+
+    g_scheduler->run();
+    g_scheduler->finish();
+
+    return NULL;
+}
+
+int
+CudaThreads::spawn_task(int id, TaskListPtr p) {
+    send_mail(id, 1, (memaddr*)p);
+    return 0;
+}
+
+int
+CudaThreads::get_mail(int speid, int count, memaddr *ret)
+{
+    *ret = args->scheduler->mail_read_from_host();
+    return 1;
+}
+
+int
+CudaThreads::has_mail(int speid, int count, memaddr *ret)
+{
+    if (args->scheduler->has_mail_from_host() != 0) {
+        return get_mail(0, 0, ret);
+    } else {
+        return 0;
+    }
+}
+
+void
+CudaThreads::send_mail(int speid, int num, memaddr *data)
+{
+    args->scheduler->mail_write_from_host(*data);
+}
+
+void
+CudaThreads::add_output_tasklist(int command, memaddr buff, int alloc_size)
+{
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/Cuda/CudaThreads.h	Mon Jan 20 21:59:56 2014 +0900
@@ -0,0 +1,51 @@
+#ifndef INCLUDED_CUDA_THREADS
+#define INCLUDED_CUDA_THREADS
+
+#include <pthread.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "Threads.h"
+#include "CudaScheduler.h"
+#include "Sem.h"
+
+
+class CudaScheduler;
+
+typedef struct cuda_arg {
+    int cpuid;
+    // should be synchronized
+    CudaScheduler *scheduler;
+    TaskManagerImpl *manager;
+    SemPtr wait;
+    int useRefDma;
+} cuda_thread_arg_t;
+
+class CudaThreads : public Threads {
+
+ public:
+    /*
+      static GpuThreads* getInstance() {
+      static GpuThreads singleton;
+      return &singleton;
+      }*/
+    CudaThreads(int useRefDma);
+    ~CudaThreads();
+    
+    void init();
+    static void *cuda_thread_run(void *args);
+    virtual int spawn_task(int cpu_num, TaskListPtr p);
+    virtual void set_mail_waiter(SemPtr w);
+
+    int get_mail(int speid, int count, memaddr *ret);
+    int has_mail(int speid, int count, memaddr *ret);
+    void send_mail(int speid, int num, memaddr *data);
+    void add_output_tasklist(int command, memaddr buff, int alloc_size);
+    void set_wait(SemPtr);
+    
+  private:
+    cuda_thread_arg_t *args;
+    pthread_t *threads;
+    int use_refdma;
+};
+
+#endif
--- a/TaskManager/Gpu/GpuScheduler.cc	Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc	Mon Jan 20 21:59:56 2014 +0900
@@ -273,7 +273,7 @@
 
                 if (tasklist->dim > 0) {
                     ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist->dim,
-                                 NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
+                                                 NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
                 } else {
                     ret = clEnqueueTask(command_queue, kernel[cur], memin[cur].size,
                                         memin[cur].event, &kernel_event[cur]);
--- a/TaskManager/Makefile	Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Makefile	Mon Jan 20 21:59:56 2014 +0900
@@ -24,6 +24,9 @@
 gpu: FORCE
 	@$(MAKE) -f Makefile.gpu
 
+cuda: FORCE
+	@$(MAKE) -f Makefile.cuda
+
 FORCE:
 	-mkdir -p ../include/TaskManager
 	rsync `find . -name Test -prune -or -name '*.h' -print` ../include/TaskManager
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/Makefile.cuda	Mon Jan 20 21:59:56 2014 +0900
@@ -0,0 +1,40 @@
+include ./Makefile.def
+TARGET = libCudaManager.a
+CFLAGS += -DHAS_POSIX_MEMALIGN
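+# assumes the default CUDA 5.5 install location on Mac OS X; adjust for other setups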
+CUDA_PATH = /Developer/NVIDIA/CUDA-5.5/include
+
+VPATH = $(CUDA_PATH)
+
+ifdef LINUX
+CFLAGS += -lrt
+endif
+
+.SUFFIXES: .cc .o
+
+EXTRA_CFLAGS = -D__CERIUM_CUDA__ -D__CERIUM_PARALLEL__
+
+.cc.o:
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(INCLUDE) -c $< -o $@
+
+
+all: default
+default: $(TARGET)
+
+ALL_OBJS = $(KERN_MAIN_OBJS) $(KERN_PPE_OBJS) $(KERN_SCHED_OBJS) \
+	$(KERN_SYSTASK_OBJS) $(IMPL_FIFO_OBJS) $(KERN_MEM_OBJS) \
+	$(IMPL_MANYCORE_OBJS) $(IMPL_CUDA_OBJS) Cell/spe/SpeTaskManagerImpl.o Cell/CellTaskManagerImpl.o Cuda/CudaTaskManagerFactory.o
+
+Makefile.dep: 
+	make -f Makefile.cuda depend
+
+depend:
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(INCLUDE) $(ALL_OBJS:.o=.cc) -MM  > Makefile.dep
+
+$(TARGET): $(ALL_OBJS)
+	ar crus $@ $(ALL_OBJS)
+
+cudadistclean: cudaclean
+	rm -f $(TARGET)
+
+cudaclean:
+	rm -f $(TARGET) $(ALL_OBJS)
--- a/TaskManager/Makefile.def	Mon Jan 20 19:02:22 2014 +0900
+++ b/TaskManager/Makefile.def	Mon Jan 20 21:59:56 2014 +0900
@@ -31,6 +31,10 @@
 IMPL_GPU_SRCS = $(wildcard $(IMPL_GPU_DIR)/*.cc)
 IMPL_GPU_OBJS = $(filter-out $(IMPL_GPU_DIR)/GpuTaskManagerFactory.o,$(IMPL_GPU_SRCS:.cc=.o))
 
+IMPL_CUDA_DIR  = Cuda
+IMPL_CUDA_SRCS = $(wildcard $(IMPL_CUDA_DIR)/*.cc)
+IMPL_CUDA_OBJS = $(filter-out $(IMPL_CUDA_DIR)/CudaTaskManagerFactory.o,$(IMPL_CUDA_SRCS:.cc=.o))
+
 IMPL_MANYCORE_DIR  = ManyCore
 IMPL_MANYCORE_SRCS = $(wildcard $(IMPL_MANYCORE_DIR)/*.cc)
 IMPL_MANYCORE_OBJS = $(filter-out $(IMPL_MANYCORE_DIR)/ManyCoreTaskManagerFactory.o,$(IMPL_MANYCORE_SRCS:.cc=.o))