changeset 1804:1febe61a935a draft

create GpuBuffer
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Mon, 09 Dec 2013 20:41:11 +0900
parents fa15a19d27ef
children 8c79f9697179
files TaskManager/Gpu/GpuScheduler.cc TaskManager/Gpu/GpuScheduler.h
diffstat 2 files changed, 90 insertions(+), 54 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Gpu/GpuScheduler.cc	Mon Dec 09 18:43:56 2013 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc	Mon Dec 09 20:41:11 2013 +0900
@@ -62,26 +62,70 @@
     clReleaseContext(context);
 }
 
+void
+GpuScheduler::initGpuBuffer(GpuBufferPtr m) {
+    m->size = 0;
+    m->alllocate_size = 64;
+    m->buf  = (cl_mem*)malloc(m->alllocate_size*sizeof(cl_mem*));
+    m->event  = (cl_event*)malloc(m->alllocate_size*sizeof(cl_event*));
+}
+
+
+
+cl_mem
+GpuScheduler::createBuffer(GpuBufferPtr m, int i,  cl_context context, cl_mem_flags flags, size_t size, cl_int *error) {
+    if (i > m->alllocate_size) {
+        // reallocate buffer size 
+        m->allocate_size *= 2;
+        m->buf = (cl_mem*)realloc(m->buf, m->alllocate_size*sizeof(cl_mem*));
+        m->event = (cl_event*)realloc(m->event, m->alllocate_size*sizeof(cl_event*));
+    }
+
+    if (m->buf[i]) {
+        clReleaseMemObject(m->buf);
+    }
+
+    flags |= CL_MEM_USE_HOST_PTR;
+    void *buf = m->buf[i];
+    clCreateBuffer(context, flags, size, buf, error);
+}
+
 #define NOP_REPLY NULL
 
+/**
+ * wait for previous pipeline termination
+ * kernel_event, memout_event
+ */
 void
-GpuScheduler::wait_for_event(cl_event* event,memaddr* reply,TaskListPtr taskList, int cur) {
-    if (event[1-cur] == NOP_REPLY) {
+GpuScheduler::wait_for_event(cl_event* kernel_event, GpuBufferPtr memout, memaddr* reply,TaskListPtr taskList, int cur) {
+    if (kernel_event[1-cur] == NOP_REPLY) {
         if(reply[1-cur]) {
             connector->mail_write(reply[1-cur]);
             reply[1-cur]=0;
         }
-    }
-    if (event[1-cur] != NULL) {
-        int ret=clWaitForEvents(1,&event[1-cur]);
+    } else if (kernel_event[1-cur] != NULL) {
+        int ret=clWaitForEvents(1,&kernel_event[1-cur]);
         if (ret<0) {
             error(convert_error_status(ret));
         }
-        if(reply[1-cur]) {
-            connector->mail_write(reply[1-cur]);
-            reply[1-cur]=0;
+        clReleaseEvent(kernel_event[1-cur]);
+        kernel_evetn[1-cur] = 0;
+    }
+    if (memout[1-cur].size > 0) {
+        int ret=clWaitForEvents(memout[1-cur].size, &memout[1-cur].event);
+        if (ret<0) {
+            error(convert_error_status(ret));
+        }
+        for (int i=0; i < memout[1-cur].size; i++) {
+            clReleaseEvent(memout[1-cur].event[i]);
+            memout[1-cur].event[i] = 0;
         }
     }
+    if(reply[1-cur]) {
+        connector->mail_write(reply[1-cur]);
+        reply[1-cur]=0;
+    }
+    
     if (tasklist[1-cur]!=NULL) {
         cl_ulong start = 0;
         cl_ulong end   = 0;
@@ -115,12 +159,12 @@
             clFinish(command_queue);
             if (kernel[0]) clReleaseKernel(kernel[0]);
             if (kernel[1]) clReleaseKernel(kernel[1]);
-            if (event[0] && event[0]!=NOP_REPLY) clReleaseEvent(event[0]);
-            if (event[1] && event[1]!=NOP_REPLY) clReleaseEvent(event[1]);
-            if ( memout[cur-1] )  clReleaseMemObject(*memout[cur-1]);
-            if ( memout[cur] )  clReleaseMemObject(*memout[cur]);
-            if ( *memin[cur-1] )  clReleaseMemObject(*memin[cur-1]);
-            if ( *memin[cur] )  clReleaseMemObject(*memin[cur]);
+            if (kernel_event[0] && kernel_event[0]!=NOP_REPLY) clReleaseEvent(event[0]);
+            if (kernel_event[1] && kernel_event[1]!=NOP_REPLY) clReleaseEvent(event[1]);
+            destroyGpuBuffer(memout[cur-1]);
+            destroyGpuBuffer(memout[cur]);
+            destroyGpuBuffer(memin[cur]);
+            destroyGpuBuffer(memin[cur-1]);
             return ;
         }
 
@@ -165,15 +209,16 @@
                 int param = 0;
 
                 // set arg count
-                cl_mem memparam = clCreateBuffer(context, CL_MEM_READ_ONLY,
-                                                 sizeof(memaddr)*nextTask->param_count, NULL, &ret);
+                cl_mem memparam = createBuffer(memin[cur], 0, context, CL_MEM_READ_ONLY,
+                                               sizeof(memaddr)*nextTask->param_count, &ret);
                 if (ret<0) {
                     const char *msg=convert_error_status(ret);
                     error(msg);
                 }
 
                 ret = clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
-                                           nextTask->param(param), 0, NULL, &param_event[1-cur]); // set event flag
+                                           nextTask->param(param), 0, NULL, &memin[cur].event[0]);
+                // parameter is passed as first kernel arg 
                 param=0;
                 if (ret<0) {
                     const char *msg=convert_error_status(ret);
@@ -181,7 +226,7 @@
                         goto nop_reply;
                 }
 
-                ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr),(void *)&memparam);
+                ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr),(void *)&memin[cur].buf[0]);
                 if (ret<0) {
                     const char *msg=convert_error_status(ret);
                     error(msg);
@@ -191,26 +236,24 @@
                 param++;
 
                 cl_mem_flags mem_flag = CL_MEM_READ_ONLY;
-                if ( memout[cur] )  clReleaseMemObject(*memout[cur]); // using preallocated memorry
-                memin[cur] = new cl_mem[nextTask->inData_count];
 
                 if (!flag.flip) { // set input data when not flip
                     for(int i=0;i<nextTask->inData_count;i++) {
                         ListElement *input_buf = nextTask->inData(i);
                         if (input_buf->size==0) break;
-                        memin[cur][i] = clCreateBuffer(context, mem_flag, input_buf->size, NULL, &ret);
+                        createBuffer(memin[cur], i+1, context, mem_flag, input_buf->size, &ret);
                         if (ret<0) {
                             const char *msg=convert_error_status(ret);
                             error(msg);
                         }
-                        ret = clEnqueueWriteBuffer(command_queue, memin[cur][i], CL_TRUE, 0,
-                                                   input_buf->size, input_buf->addr, 1, 
-                                                   param_event[1-cur], memin_event[1-cur][i]);
+                        ret = clEnqueueWriteBuffer(command_queue, memin[cur][i+1], CL_TRUE, 0,
+                                                   input_buf->size, input_buf->addr, 0, 
+                                                   NULL, &memin[cur].event[i+1]);
                         if (ret<0) {
                             const char *msg=convert_error_status(ret);
                             error(msg);
                         }
-                        ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memin[cur][i]);
+                        ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memin[cur].buf[i+1]);
                         if (ret<0) {
                             const char *msg=convert_error_status(ret);
                             error(msg);
@@ -221,12 +264,8 @@
                 }
                 cl_mem_flags out_mem_flag;
                 if (flag.flip) {
-                    if ( memout[cur] )  clReleaseMemObject(*memout[cur]);
-                    memout[cur] = new cl_mem[nextTask->inData_count];
                     out_mem_flag = CL_MEM_READ_WRITE;
                 } else {
-                    if ( memout[cur] )  clReleaseMemObject(*memout[cur]);
-                    memout[cur] = new cl_mem[nextTask->outData_count];
                     out_mem_flag = CL_MEM_WRITE_ONLY;
                 }
 
@@ -234,7 +273,7 @@
                 for(int i = 0; i<nextTask->outData_count;i++) { // set output data
                     ListElement *output_buf = flag.flip? nextTask->inData(i) : nextTask->outData(i);
                     if (output_buf->size==0) break;
-                    memout[cur][i] = clCreateBuffer(context, out_mem_flag, output_buf->size, NULL, &ret);
+                    createBuffer(memout[cur], i, context, out_mem_flag, output_buf->size, &ret);
                     if (ret<0) {
                         const char *msg=convert_error_status(ret);
                         error(msg);
@@ -243,15 +282,15 @@
                     if (flag.flip) { // use output buffer as input buffer
                         ListElement *input_buf = nextTask->inData(i);
 
-                        ret = clEnqueueWriteBuffer(command_queue, memout[cur][i], CL_TRUE, 0,
+                        ret = clEnqueueWriteBuffer(command_queue, memout[cur].buf[i+1], CL_TRUE, 0,
                                                    input_buf->size, input_buf->addr, 
-                                                   memparam_size[1-cur],param_event[1-cur], memin_event[1-cur][i]);
+                                                   0, NULL, memout[cur].event[i+1]);
                         if (ret<0) {
                             const char *msg=convert_error_status(ret);
                             error(msg);
                         }
                     }
-                    ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memout[cur][i]);
+                    ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memout[cur].buf[i]);
                     if (ret<0) {
                         const char *msg=convert_error_status(ret);
                         error(msg);
@@ -259,13 +298,15 @@
                     param++;
                 }
                 
+                memin[cur].size  = taskList[cur]->inDataSize+1; // +1 means param
+                memout[cur].size = taskList[cur]->outDataSize;
                 tasklist[cur]->task_start_time = gettime();
                 if (tasklist[cur]->dim > 0) {
                     ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist[cur]->dim,
-                                 NULL, &tasklist[cur]->x, 0, memin_size[1-cur], memin_event[1-cur], &kernel_event[cur]);
+                                 NULL, &tasklist[cur]->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
                 } else {
-                    ret = clEnqueueTask(command_queue, kernel[cur], memin_size[1-cur],
-                                        memin_event[1-cur], &kernel_event[cur]);
+                    ret = clEnqueueTask(command_queue, kernel[cur], memin[cur].size,
+                                        memin[cur].event, &kernel_event[cur]);
                 }
                 if (ret<0) {
                     const char *msg=convert_error_status(ret);
@@ -276,8 +317,8 @@
                 for(int i=0;i<nextTask->outData_count;i++) { // read output data
                     ListElement *output_buf = flag.flip? nextTask->inData(i) :nextTask->outData(i);
                     if (output_buf->size==0) break;
-                    ret = clEnqueueReadBuffer(command_queue, memout[cur][i], CL_FALSE, 0,
-                                    output_buf->size, output_buf->addr, 1, kernel_event[1-cur], memout_event[1-cur][i]);
+                    ret = clEnqueueReadBuffer(command_queue, memout[cur].buf[i], CL_FALSE, 0,
+                                    output_buf->size, output_buf->addr, 1, kernel_event[cur], memout[cur].event[i]);
                     if (ret<0) {
                         const char *msg=convert_error_status(ret);
                         error(msg);
@@ -299,8 +340,7 @@
                 reply[cur] = (memaddr)tasklist[cur]->waiter;
                 
                 // wait kernel[1-cur] and write[1-cur]
-                wait_for_event(event,reply,tasklist,cur);
-                event[1-cur] = NULL;
+                wait_for_event(kernel_event, memout, reply, tasklist, cur);
                 // pipeline    : 1-cur
                 // no pipeline : cur
                 params_addr = (memaddr)tasklist[cur]->next;
@@ -308,7 +348,7 @@
             printf("GPU %d %s\t%lld\n",tasklist[cur]->self->cpu_type,(char*)(gpu_task_list[tasklist[cur]->tasks[0].command].name),tasklist[cur]->task_end_time-tasklist[cur]->task_start_time);
             cur = 1 - cur;
         }
-        wait_for_event(memout_event,reply,tasklist,cur);
+        wait_for_event(kernel_event, memout, reply, tasklist, cur);
 
         unsigned long long wait = 0;
         (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
--- a/TaskManager/Gpu/GpuScheduler.h	Mon Dec 09 18:43:56 2013 +0900
+++ b/TaskManager/Gpu/GpuScheduler.h	Mon Dec 09 20:41:11 2013 +0900
@@ -17,6 +17,12 @@
 
 class GpuScheduler : public Scheduler {
  public:
+    typedef struct gpubuffer {
+        cl_int allocate_size;
+        cl_int size;
+        cl_mem *buf; // clCreateBuffer
+        cl_event *event;
+    } GpuBuffer, *GpuBufferPtr;
     GpuScheduler();
     virtual ~GpuScheduler();
     void init_impl(int useRefDma);
@@ -46,23 +52,13 @@
     memaddr reply[2];
     cl_kernel kernel[2];
     cl_event kernel_event[2];
-    cl_mem memparam[2]; // clCreateBuffer
-    cl_int memparam_size[2];
-    cl_event param_event[2];
-    cl_event *param_wait[2]; // wait list for param input
-    cl_mem *memin[2]; // clCreateBuffer
-    cl_int *memin_size[2];
-    cl_event *memin_event[2];
-    cl_event *memin_wait[2]; // wait list for param input
-    cl_mem *memout[2]; // clCreateBuffer
-    cl_int *memout_size[2];
-    cl_event *memout_event[2];
-    cl_event *memout_wait[2]; // wait list for param input
+    GpuBuffer memin[2];
+    GpuBuffer memout[2];
     HTask::htask_flag flag;
 private:
     FifoDmaManager *fifoDmaManager;
     int load_kernel(int cmd);
-
+    cl_mem createBuffer(GpuBufferPtr m, int i, cl_context context, cl_mem_flags flags, size_t size, cl_int *error);
 };
 
 #define GpuSchedRegister(str, filename, functionname)   \