changeset 2012:9360e782a431 draft

Optimize data transfer. Not running yet.
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 01 Jul 2014 03:40:54 +0900
parents faaea4e1ce1c
children d43c2b7932ea
files TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h example/cuda_fft/Makefile.def example/fft/main.cc
diffstat 4 files changed, 23 insertions(+), 25 deletions(-)
--- a/TaskManager/Cuda/CudaScheduler.cc	Wed Jun 11 17:22:17 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Tue Jul 01 03:40:54 2014 +0900
@@ -125,47 +125,38 @@
         // parameter is passed as first kernel arg 
         ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream);
         if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+        cudabuffer[cur].kernelParams[param] = &cudabuffer[cur].memin[param];
         
         param++;
         
         for(int i=0;i<nextTask->inData_count;i++) {
             ListElement *input_buf = nextTask->inData(i);
             if (input_buf->size==0) break;
-            createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-            ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-            
+            if (!transmitted[input_buf]) {
+                createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+                ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+                transmitted.insert(make_pair(input_buf, cudabuffer[cur].memin[param]));
+            }
+            cudabuffer[cur].kernelParams[param] = &(transmitted[input_buf]);
             param++;
         }
+
         cudabuffer[cur].in_size = param; // +1 means param
         
         for(int i = 0; i<nextTask->outData_count;i++) { // set output data
             ListElement *output_buf = nextTask->outData(i);
             if (output_buf->size==0) break;
-            if (!flag[cur].flip) { // flip use memin for output 
+            if (!transmitted[output_buf]) {
                 createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size);
                 if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-                // enqueue later
+                transmitted.insert(make_pair(output_buf, cudabuffer[cur].memout[i]));
             }
+            cudabuffer[cur].kernelParams[param] = &(transmitted[output_buf]);
             param++;
         }
         cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event
-        
-        if (!flag[cur].flip) {
-            for (int i = 0; i<cudabuffer[cur].in_size; i++) {
-                cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
-            }
-            for (int i = 0; i<cudabuffer[cur].out_size; i++) {
-                cudabuffer[cur].kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i];
-            }
-        } else {
-            for (int i = 0; i<cudabuffer[cur].in_size; i++) {
-                cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
-            }
-        }
-        
-        if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
     }
     return cur;
 }
@@ -201,6 +192,7 @@
             // flip use memin buffer and memout event
             ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream);
             if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+            transmitted.erase(output_buf);
         }
     }
     return nextTask;
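
The CudaScheduler.cc changes above replace unconditional host-to-device copies with a lookup in the transmitted map: a buffer that is already on the GPU is not copied again, its cached CUdeviceptr is reused directly as the kernel argument, and the entry is dropped once the result has been read back. A minimal standalone sketch of the same caching pattern, assuming a valid CUDA context and CUstream already exist; the helper names uploadOnce/readBack and the void*-keyed map are illustrative only, not the scheduler's actual interface:

    #include <cuda.h>
    #include <map>

    // Host buffers that have already been copied to the device, keyed by
    // their host address (CudaScheduler keys its map by ListElement*).
    static std::map<const void*, CUdeviceptr> transmitted;

    // Copy a host buffer to the device only if it has not been sent yet.
    // Returns the address of the cached CUdeviceptr so it can be stored
    // directly in a cuLaunchKernel kernelParams slot; std::map never moves
    // its nodes, so the pointer stays valid across later insertions.
    static CUdeviceptr *uploadOnce(const void *host, size_t size, CUstream stream) {
        std::map<const void*, CUdeviceptr>::iterator it = transmitted.find(host);
        if (it == transmitted.end()) {
            CUdeviceptr dev = 0;
            if (cuMemAlloc(&dev, size) != CUDA_SUCCESS) return 0;
            if (cuMemcpyHtoDAsync(dev, host, size, stream) != CUDA_SUCCESS) return 0;
            it = transmitted.insert(std::make_pair(host, dev)).first;
        }
        return &it->second;
    }

    // Read a result back and drop it from the cache, mirroring the
    // transmitted.erase(output_buf) added after cuMemcpyDtoHAsync above.
    static void readBack(void *host, size_t size, CUstream stream) {
        std::map<const void*, CUdeviceptr>::iterator it = transmitted.find(host);
        if (it == transmitted.end()) return;
        cuMemcpyDtoHAsync(host, it->second, size, stream);
        transmitted.erase(it);
    }
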
--- a/TaskManager/Cuda/CudaScheduler.h	Wed Jun 11 17:22:17 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.h	Tue Jul 01 03:40:54 2014 +0900
@@ -7,9 +7,12 @@
 #include "HTask.h"
 #include "TaskManager.h"
 #include <cuda.h>
+#include <map>
 
 extern TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
+using namespace std;
+
 #define STAGE 8
 
 class CudaScheduler : public MainScheduler {
@@ -33,7 +36,6 @@
     // platform is needed because OpenCL supports GPUs from multiple vendors
     // in the case of CUDA it's NVIDIA only, so probably not needed?
     CUdevice device;
-    unsigned int ret_num_platforms; // probably not needed
     int ret_num_devices;
     CUcontext context;
     // command_queue command_queue;
@@ -43,10 +45,13 @@
     memaddr reply;
     // corresponds to cl_kernel
     // should the variable name be function or kernel?
-    // for now, go with kernel
+    // for now, go with kernel to match OpenCL
     CUfunction kernel[STAGE];
     CudaBuffer cudabuffer[STAGE];
     
+    // record data already transmitted to the device
+    map<ListElement*, CUdeviceptr> transmitted;
+
     HTask::htask_flag flag[STAGE];
     
  private:
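
With the transmitted map declared above, each kernelParams slot stores the address of a CUdeviceptr held inside the map (cudabuffer[cur].kernelParams[param] = &(transmitted[input_buf]) in CudaScheduler.cc). A hedged sketch of how such cached arguments feed a launch, assuming a CUfunction, CUstream and already-cached buffers prepared elsewhere; the function name and the 1x1x1 grid/block dimensions are placeholders:

    #include <cuda.h>
    #include <map>

    // kernelParams entries are pointers to the argument values; here the
    // CUdeviceptr values live inside the map, so taking their addresses
    // gives stable pointers for the launch.
    void launchWithCachedArgs(CUfunction fn, CUstream stream,
                              std::map<void*, CUdeviceptr> &transmitted,
                              void *inHost, void *outHost) {
        void *kernelParams[] = { &transmitted[inHost], &transmitted[outHost] };
        cuLaunchKernel(fn,
                       1, 1, 1,   // grid dimensions (placeholder)
                       1, 1, 1,   // block dimensions (placeholder)
                       0,         // shared memory bytes
                       stream,
                       kernelParams,
                       NULL);     // no extra launch options
    }
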
--- a/example/cuda_fft/Makefile.def	Wed Jun 11 17:22:17 2014 +0900
+++ b/example/cuda_fft/Makefile.def	Tue Jul 01 03:40:54 2014 +0900
@@ -5,4 +5,4 @@
 CC = clang++
 NVCC = nvcc
 CFLAGS = -Wall $(OPT)
-NVCCFLAGS = -ptx -arch=sm_20
\ No newline at end of file
+NVCCFLAGS = -ptx -arch=sm_20 #-g -G
\ No newline at end of file
--- a/example/fft/main.cc	Wed Jun 11 17:22:17 2014 +0900
+++ b/example/fft/main.cc	Tue Jul 01 03:40:54 2014 +0900
@@ -226,6 +226,7 @@
     sfac->set_outData(0, wm, length_w*sizeof(cl_float2));
     sfac->set_param(0,n);
     sfac->set_cpu(spe_cpu);
+    //    sfac->flip();
     sfac->iterate(gws[0]);
 
     // Butterfly Operation