changeset 1846:a0cb52163e57 draft

simplify GPU flip. Do not use output argments in flip. flip requires different kernel.
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Sat, 21 Dec 2013 08:38:13 +0900
parents ad05aeed3a98
children 7db7242990f7
files TaskManager/Gpu/GpuScheduler.cc example/fft/main.cc
diffstat 2 files changed, 46 insertions(+), 65 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Gpu/GpuScheduler.cc	Fri Dec 20 22:25:38 2013 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc	Sat Dec 21 08:38:13 2013 +0900
@@ -138,13 +138,7 @@
 
     }
     
-    if (flag[1-cur].flip) {
-        if (memout[1-cur].size > 1) {
-            int ret=clWaitForEvents(memout[1-cur].size-1, &memout[1-cur].event[1]);
-            if (ret<0) error(convert_error_status(ret));
-            release_buf_event(cur,memout);
-        }
-    } else if (memout[1-cur].size > 0) {
+    if (memout[1-cur].size > 0) {
         int ret=clWaitForEvents(memout[1-cur].size, memout[1-cur].event);
         if (ret<0) error(convert_error_status(ret));
         release_buf_event(cur,memout);
@@ -182,6 +176,9 @@
  * Get input and output data from tasklist.
  * Enqueue OpenCL command and clflush.
  * Enqueue and clflush are pipelined structure.
+ *
+ * flip means that kernel modify input buffer only, copy GPU input buffer to outData.
+ *
  */
 void
 GpuScheduler::run()
@@ -242,85 +239,69 @@
                 int param = 0;
 
                 // set arg count
-                GpuBufferPtr gpumem = flag[cur].flip ? memout : memin;
-                cl_mem memparam = createBuffer(&gpumem[cur], 0, context, CL_MEM_READ_ONLY,
+                cl_mem memparam = createBuffer(&memin[cur], 0, context, CL_MEM_READ_ONLY,
                                                sizeof(memaddr)*nextTask->param_count, &ret);
                 if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
 
                 // parameter is passed as first kernel arg 
                 ret = clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
-                                           nextTask->param(0), 0, NULL, &gpumem[cur].event[0]);
+                                           nextTask->param(0), 0, NULL, &memin[cur].event[0]);
                 if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
 
-                ret = clSetKernelArg(kernel[cur], 0, sizeof(memaddr),(void *)&gpumem[cur].buf[0]);
+                ret = clSetKernelArg(kernel[cur], 0, sizeof(memaddr),(void *)&memin[cur].buf[0]);
                 if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
 
                 param++;
 
-                cl_mem_flags mem_flag = CL_MEM_READ_ONLY;
+                cl_mem_flags mem_flag = flag[cur].flip ? CL_MEM_READ_WRITE : CL_MEM_READ_ONLY;
                 
-                if (!flag[cur].flip) { // set input data when not flip
-                    for(int i=0;i<nextTask->inData_count;i++) {
-                        ListElement *input_buf = nextTask->inData(i);
-                        if (input_buf->size==0) break;
-                        createBuffer(&memin[cur], param, context, mem_flag, input_buf->size, &ret);
-                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-                        ret = clEnqueueWriteBuffer(command_queue, memin[cur].buf[param], CL_TRUE, 0,
-                                                   input_buf->size, input_buf->addr, 0, 
-                                                   NULL, &memin[cur].event[param]);
-                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-                        ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memin[cur].buf[param]);
-                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-                        
-                        param++;
-                    }
-                }
-                cl_mem_flags out_mem_flag;
-                if (flag[cur].flip) {
-                    out_mem_flag = CL_MEM_READ_WRITE;
-                } else {
-                    out_mem_flag = CL_MEM_WRITE_ONLY;
-                    memin[cur].size  = param; // +1 means param
-                }
-                
-                for(int i = 0; i<nextTask->outData_count;i++) { // set output data
-                    ListElement *output_buf = flag[cur].flip? nextTask->inData(i) : nextTask->outData(i);
-                    if (output_buf->size==0) break;
-                    int i0 = flag[cur].flip ? i+1 : i;
-                    createBuffer(&memout[cur], i0, context, out_mem_flag, output_buf->size, &ret);
+                for(int i=0;i<nextTask->inData_count;i++) {
+                    ListElement *input_buf = nextTask->inData(i);
+                    if (input_buf->size==0) break;
+                    createBuffer(&memin[cur], param, context, mem_flag, input_buf->size, &ret);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-
-                    if (flag[cur].flip) { // use output buffer as input buffer
-                        ListElement *input_buf = nextTask->inData(i);
-                        
-                        ret = clEnqueueWriteBuffer(command_queue, memout[cur].buf[i+1], CL_TRUE, 0,
-                                                   input_buf->size, input_buf->addr, 
-                                                   0, NULL, &memout[cur].event[i0]);
-                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
-                        ret = clSetKernelArg(kernel[cur],  param+1, sizeof(memaddr), (void *)&memout[cur].buf[i0]);
-                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue;}
-                    }
-                    ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memout[cur].buf[i0]);
-                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue;}
+                    ret = clEnqueueWriteBuffer(command_queue, memin[cur].buf[param], CL_TRUE, 0,
+                                               input_buf->size, input_buf->addr, 0, 
+                                               NULL, &memin[cur].event[param]);
+                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
+                    ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memin[cur].buf[param]);
+                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
+                    
                     param++;
                 }
-                
-                memout[cur].size = param - memin[cur].size;
+                memin[cur].size  = param; // +1 means param
+
+                for(int i = 0; i<nextTask->outData_count;i++) { // set output data
+                    ListElement *output_buf = nextTask->outData(i);
+                    if (output_buf->size==0) break;
+                    if (!flag[cur].flip) { // flip use memin for output 
+                        createBuffer(&memout[cur], i, context, CL_MEM_WRITE_ONLY, output_buf->size, &ret);
+                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
+                        ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memout[cur].buf[i]);
+                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue;}
+                        // enqueue later
+                    }
+                    param++;
+                }
+                memout[cur].size = param - memin[cur].size;  // no buffer on flip, but flip use memout event
+
                 if (tasklist->dim > 0) {
                     ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist->dim,
-                                 NULL, &tasklist->x, 0, gpumem[cur].size, gpumem[cur].event, &kernel_event[cur]);
+                                 NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
                 } else {
-                    ret = clEnqueueTask(command_queue, kernel[cur], gpumem[cur].size,
-                                        gpumem[cur].event, &kernel_event[cur]);
+                    ret = clEnqueueTask(command_queue, kernel[cur], memin[cur].size,
+                                        memin[cur].event, &kernel_event[cur]);
                 }
                 if (ret<0) { gpuTaskError(cur, tasklist, ret); continue; }
 
                 for(int i=0;i<nextTask->outData_count;i++) { // read output data
-                    ListElement *output_buf = flag[cur].flip? nextTask->inData(i) :nextTask->outData(i);
+                    ListElement *output_buf = nextTask->outData(i);
                     if (output_buf->size==0) break;
-                    int i0 = flag[cur].flip? i+1:i;
-                    ret = clEnqueueReadBuffer(command_queue, memout[cur].buf[i0], CL_FALSE, 0,
-                                              output_buf->size, output_buf->addr, 1, &kernel_event[cur], &memout[cur].event[i0]);
+                    GpuBufferPtr mem = flag[cur].flip ? memin : memout ;
+                    int i0 = flag[cur].flip ? i+1 : i ;
+                    // flip use memin buffer and memout event
+                    ret = clEnqueueReadBuffer(command_queue, mem[cur].buf[i0], CL_FALSE, 0,
+                                              output_buf->size, output_buf->addr, 1, &kernel_event[cur], &memout[cur].event[i]);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                 }
                 // wait kernel[1-cur] and write[1-cur]
--- a/example/fft/main.cc	Fri Dec 20 22:25:38 2013 +0900
+++ b/example/fft/main.cc	Sat Dec 21 08:38:13 2013 +0900
@@ -153,7 +153,7 @@
         norm->set_inData(0,dst,length_dst*sizeof(cl_float2));
         norm->set_outData(0, dst, length_dst*sizeof(cl_float2));
         norm->set_param(0,n);
-        norm->flip();
+        // norm->flip();
         norm->set_cpu(spe_cpu);
         norm->wait_for(waitTask);
         norm->iterate(gws[0],gws[1]);
@@ -253,7 +253,7 @@
     setWorkSize(gws,lws,n,n);
     hpfl->set_inData(0,rm,length_r*sizeof(cl_float2));
     hpfl->set_outData(0, rm, length_r*sizeof(cl_float2));
-    hpfl->flip();
+    // hpfl->flip();
     hpfl->set_param(0,n);
     hpfl->set_param(1,(long)radius);
     hpfl->set_cpu(spe_cpu);