diff src/parallel_execution/CUDAtwice.cbc @ 303:1dbaef86593b

CUDAtwice.cbc
author ikkun
date Mon, 13 Feb 2017 18:23:29 +0900
parents 8e7926f3e271
children ae4f6aa427f5
line wrap: on
line diff
--- a/src/parallel_execution/CUDAtwice.cbc	Mon Feb 13 17:58:04 2017 +0900
+++ b/src/parallel_execution/CUDAtwice.cbc	Mon Feb 13 18:23:29 2017 +0900
@@ -18,10 +18,72 @@
 
 __code twice_stub(struct Context* context) {
     struct Context* workerContext = context->worker->worker->CPUWorker.context;
+
+  // memory allocate
+    CUdeviceptr devA;
+    CUdeviceptr devB[num_exec];
+    CUdeviceptr devOut[num_exec];
+
+    checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
+        checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
+    }
+
+    //twiceカーネルが定義されてなければそれをロードする
+    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
+    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
+
     //入力のDataGearをGPUにbuffer経由で送る
-    //twiceカーネルが定義されてなければそれをロードする
+    // Synchronous data transfer(host to device)
+    checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));
+
+  // Asynchronous launch kernel
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur=0;
+        //B[i] = (float)(i+1);
+        //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
+        void* args[] = {&devA, &devB[i], &devOut[i]};
+        checkCudaErrors(cuLaunchKernel(function,
+                       LENGTH, 1, 1,
+                       THREAD, 1, 1,
+                                 0, num_stream ? stream[cur] : NULL , args, NULL));
+        //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
+    }
+
     //結果を取ってくるコマンドを入力する
     //コマンドの終了待ちを行う
+ // Asynchronous data transfer(device to host)
+     for (int i=0;i<num_exec;i++,cur++) {
+         if (num_stream <= cur)
+             cur = 0;
+         if (num_stream) {
+             checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
+         } else {
+             checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float)));
+         }
+     }
+    
+    // wait for stream
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamSynchronize(stream[i]));
+ // Asynchronous data transfer(device to host)
+     for (int i=0;i<num_exec;i++,cur++) {
+         if (num_stream <= cur)
+             cur = 0;
+         if (num_stream) {
+             checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
+         } else {
+             checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float)));
+         }
+     }
+    
+    // wait for stream
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamSynchronize(stream[i]));
+    
+
     //continuationにそってGPUworkerに戻る
     goto twice(context, Gearef(context, LoopCounter), 0, 0, NULL, workerContext);
 }