changeset 2014:8c618e912c88 draft

Optimize data transfer. Known issue: produces wrong results.
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 01 Jul 2014 17:04:01 +0900
parents d43c2b7932ea
children 6bf6450bd45a
files TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h example/fft/main.cc
diffstat 3 files changed, 33 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Cuda/CudaScheduler.cc	Tue Jul 01 11:17:12 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Tue Jul 01 17:04:01 2014 +0900
@@ -13,6 +13,9 @@
 #include <sys/stat.h>
 #include <string.h>
 #include <cuda.h>
+#include <map>
+
+using namespace std;
 
 TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
@@ -137,6 +140,7 @@
                 ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
                 if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                 transmitted.insert(make_pair(input_buf, &cudabuffer[cur].memin[param]));
+                reverse_map.insert(make_pair(&cudabuffer[cur].memin[param], input_buf));
             }
             cudabuffer[cur].kernelParams[param] = transmitted[input_buf];
             param++;
@@ -151,6 +155,7 @@
                 createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size);
                 if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                 transmitted.insert(make_pair(output_buf, &cudabuffer[cur].memout[i]));
+                reverse_map.insert(make_pair(&cudabuffer[cur].memout[i], output_buf));
             }
             cudabuffer[cur].kernelParams[param] = transmitted[output_buf];
             param++;
@@ -183,34 +188,36 @@
     int cur = 0;
     for (;nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) {
         if (STAGE <= cur) break;
+        // flip enabled: skip the device-to-host transfer for this task
+        if (flag[cur].flip) continue;
         for(int i=0;i<nextTask->outData_count;i++) { // read output data
             ListElement *output_buf = nextTask->outData(i);
             if (output_buf->size==0) break;
-            CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ;
-            int i0 = flag[cur].flip ? i+1 : i ;
-            // flip use memin buffer and memout event
-            ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-            transmitted.erase(output_buf);
+            if (transmitted.count(output_buf)) {
+                ret = cuMemcpyDtoHAsync(output_buf->addr, *transmitted[output_buf], output_buf->size, cudabuffer[cur].stream);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+                reverse_map.erase(transmitted[output_buf]);
+                transmitted.erase(output_buf);
+            }
         }
     }
     return nextTask;
 }
 
 static void
-release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) {
+release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem, map<CUdeviceptr*, ListElement*> map) {
     for (int i=0; i<mem[cur].in_size; i++) {
-        if (mem[cur].memin[i])
+        if (!map.count(&mem[cur].memin[i])) {
             cuMemFree(mem[cur].memin[i]);
-        mem[cur].memin[i] = 0;
+            mem[cur].memin[i] = 0;
+        }
     }
     for (int i=0; i<mem[cur].out_size; i++) {
-        if (mem[cur].memout[i])
+        if (!map.count(&mem[cur].memout[i])) {
             cuMemFree(mem[cur].memout[i]);
-        mem[cur].memout[i] = 0;
+            mem[cur].memout[i] = 0;
+        }
     }
-    mem[cur].in_size = 0;
-    mem[cur].out_size = 0;
 }
 
 void
@@ -235,7 +242,7 @@
     
     for (int i=0;i<cur;i++) {
         if (cudabuffer[i].in_size > 0 || cudabuffer[i].out_size > 0)
-            release_buf_event(i, cudabuffer);
+            release_buf_event(i, cudabuffer, reverse_map);
     }
 
     if(reply) {
--- a/TaskManager/Cuda/CudaScheduler.h	Tue Jul 01 11:17:12 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.h	Tue Jul 01 17:04:01 2014 +0900
@@ -48,7 +48,8 @@
     CudaBuffer cudabuffer[STAGE];
     
     // record transmitted data.
-    map<ListElement*, void*> transmitted;
+    map<ListElement*, CUdeviceptr*> transmitted;
+    map<CUdeviceptr*, ListElement*> reverse_map;
 
     HTask::htask_flag flag[STAGE];
     
--- a/example/fft/main.cc	Tue Jul 01 11:17:12 2014 +0900
+++ b/example/fft/main.cc	Tue Jul 01 17:04:01 2014 +0900
@@ -126,6 +126,7 @@
     brev->set_inData(0, src, length_src*sizeof(cl_float2));
     brev->set_outData(0, dst, length_dst*sizeof(cl_float2));
     brev->set_cpu(spe_cpu);
+    brev->flip();
     brev->wait_for(waitTask);
     brev->iterate(gws[0],gws[1]);
 
@@ -141,6 +142,7 @@
         bfly->set_inData(1, spin, sizeof(cl_float2)*(n/2));
         bfly->set_outData(0,dst,length_dst*sizeof(cl_float2));
         bfly->set_cpu(spe_cpu);
+        bfly->flip();
         bfly->wait_for(waitTask);
         bfly->iterate(gws[0],gws[1]);
         waitTask = bfly;
@@ -163,9 +165,9 @@
 
 char *
 init(int argc, char**argv){
-
+    
     char *filename = 0;
-
+    
     //    printf("%s ",argv[4]);
     for (int i = 1; argv[i]; ++i) {
         if (strcmp(argv[i], "-file") == 0) {
@@ -191,11 +193,11 @@
     long m = (cl_int)(log((double)n)/log(2.0));
     size_t *gws = new size_t[2];
     size_t *lws = new size_t[2];
-
+    
     xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
     rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
     wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2));
-
+    
     HTask* waitTask;
     /*
      * [cl_float2]
@@ -226,6 +228,7 @@
     sfac->set_outData(0, wm, length_w*sizeof(cl_float2));
     sfac->set_param(0,n);
     sfac->set_cpu(spe_cpu);
+    sfac->flip();
     sfac->iterate(gws[0]);
 
     // Butterfly Operation
@@ -239,6 +242,7 @@
     first_trns->set_outData(0,xm,length_r*sizeof(cl_float2));
     first_trns->set_param(0,n);
     first_trns->set_cpu(spe_cpu);
+    first_trns->flip();
     first_trns->wait_for(waitTask);
     first_trns->iterate(gws[0],gws[1]);
 
@@ -254,6 +258,7 @@
     hpfl->set_param(0,n);
     hpfl->set_param(1,(long)radius);
     hpfl->set_cpu(spe_cpu);
+    hpfl->flip();
     hpfl->wait_for(waitTask);
     hpfl->iterate(gws[0],gws[1]);
 
@@ -269,6 +274,7 @@
     second_trns->set_outData(0,rm,length_r*sizeof(cl_float2));
     second_trns->set_param(0,n);
     second_trns->set_cpu(spe_cpu);
+    second_trns->flip();
     second_trns->wait_for(waitTask);
     second_trns->iterate(gws[0],gws[1]);