# HG changeset patch
# User Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
# Date 1509736476 -32400
# Node ID af0ec811b20ef71f2bfe1d0457142866b546061b
# Parent  b75badf42701e323601dc5e628d618e43b4a5cdc
Add CUDAExecutor

diff -r b75badf42701 -r af0ec811b20e src/parallel_execution/CUDAExecutor.cbc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/CUDAExecutor.cbc	Sat Nov 04 04:14:36 2017 +0900
@@ -0,0 +1,73 @@
+#include "../context.h"
+#include <stdio.h>
+
+// includes, project
+#include <driver_types.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include "helper_cuda.h"
+
+Executor* createCUDAExecutor(struct Context* context) {
+    struct Executor* gear = new Executor();
+    struct CUDAExecutor* impl = new CUDAExecutor();
+    gear->write = C_writeCUDAExecutor;    // code gear for the write step
+    gear->exec  = C_execCUDAExecutor;     // code gear for the exec step
+    gear->read  = C_readCUDAExecutor;     // code gear for the read step
+    gear->executor = (union Data*)impl;   // implementation object behind the Executor interface
+    return gear;
+}
+
+__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+    // Allocate a device buffer for every input/output Data Gear and upload its host contents.
+    struct CUDABuffer* buffer = executor->buffer;
+    int inputLen = buffer->inputLen;
+    int paramLen = inputLen + buffer->outputLen;
+    executor->kernelParams = (void**)ALLOCATE_PTR_ARRAY(context, CudevicePtr, paramLen);
+    CUdeviceptr* deviceptrs = (CUdeviceptr*)ALLOCATE_ARRAY(context, CudevicePtr, paramLen);
+    for (int i = 0; i < paramLen; i++) {
+        // use the array slot's address: a loop-local copy would dangle after this iteration
+        CUdeviceptr* deviceptr = &deviceptrs[i];
+        union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen];
+        checkCUDAErrors(cuMemAlloc(deviceptr, GET_SIZE(data)));
+        checkCUDAErrors(cuMemcpyHtoD(*deviceptr, data, GET_SIZE(data))); // synchronous host-to-device transfer
+        executor->kernelParams[i] = deviceptr;
+    }
+}
+
+void cudaLoadFunction(struct Context* context, char* filename, char* function) {
+    checkCUDAErrors(cuModuleLoad(&context->module, filename)); // load the module file `filename` into context->module
+    checkCUDAErrors(cuModuleGetFunction(&context->function, context->module, function)); // look up `function` in that module; handle goes to context->function
+}
+
+__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+    // Asynchronously launch task->function with the parameter array stored in the executor.
+    task->num_exec = 1;
+    void** kernelParams = (void**)executor->kernelParams; // member of CUDAExecutor, not of CUDABuffer
+    if (task->iterate) {
+        struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
+        checkCUDAErrors(cuLaunchKernel(task->function,
+                    iterator->x, iterator->y, iterator->z,  // grid dimensions
+                    1, 1, 1,                                // block dimensions
+                    0, NULL, kernelParams, NULL));
+    } else {
+        checkCUDAErrors(cuLaunchKernel(task->function,
+                    1, 1, 1,
+                    1, 1, 1,
+                    0, NULL, kernelParams, NULL));
+    }
+}
+
+__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
+    // Issue the commands that fetch the results back from the device,
+    // then wait for those commands to finish.
+    struct CUDABuffer* buffer = executor->buffer;
+    int inputLen = buffer->inputLen;
+    int paramLen = inputLen + buffer->outputLen;
+    for (int i = 0; i < paramLen; i++) {
+        CUdeviceptr* deviceptr = (CUdeviceptr*)executor->kernelParams[i];
+        union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen];
+        checkCUDAErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data)));
+        checkCUDAErrors(cuMemFree(*deviceptr)); // report a failed free instead of ignoring it
+    }
+    checkCUDAErrors(cuCtxSynchronize()); // wait for stream
+}
diff -r b75badf42701 -r af0ec811b20e src/parallel_execution/Executor.cbc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/Executor.cbc	Sat Nov 04 04:14:36 2017 +0900
@@ -0,0 +1,7 @@
+typedef struct Executor<Impl>{
+    union Data* executor;
+    struct Context* task;
+    __code read(Impl* executor, struct Context* task, __code next(...));
+    __code exec(Impl* executor, struct Context* task, __code next(...));
+    __code write(Impl* executor, struct Context* task, __code next(...));
+} Executor;
diff -r b75badf42701 -r af0ec811b20e src/parallel_execution/context.h
--- a/src/parallel_execution/context.h	Tue Oct 31 17:55:50 2017 +0900
+++ b/src/parallel_execution/context.h	Sat Nov 04 04:14:36 2017 +0900
@@ -348,27 +348,27 @@
     struct Executor {
         struct Executor* executor;
         struct Context* task;
-        struct Buffer* buffer;
         enum Code read;
         enum Code exec;
         enum Code write;
         enum Code next;
     } Executor;
 #ifdef USE_CUDAWorker
-    struct CudaExecutor {
+    struct CUDAExecutor {
         void** kernelParams;
-    } CudaExecutor;
-    CudaDevicePtr CudaDevicePtr;
+        struct CUDABuffer* buffer;
+    } CUDAExecutor;
+    CudevicePtr CudevicePtr;
 #else
-    struct CudaExecutor {
-    } CudaExecutor;
+    struct CUDAExecutor {
+    } CUDAExecutor;
 #endif
-    struct Buffer {
+    struct CUDABuffer {
         int inputLen;
         int outputLen;
         union Data** inputData;
         union Data** outputData;
-    } Buffer;
+    } CUDABuffer;
 }; // union Data end       this is necessary for context generator
 
 typedef union Data Data;
diff -r b75badf42701 -r af0ec811b20e src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc
--- a/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc	Tue Oct 31 17:55:50 2017 +0900
+++ b/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc	Sat Nov 04 04:14:36 2017 +0900
@@ -30,7 +30,7 @@
     if (context->gpu) {
         struct SortArray* inputSortArray  = &context->data[context->idg]->SortArray;
         struct SortArray* outputSortArray = &context->data[context->odg]->SortArray;
-        struct Buffer* buffer = &ALLOCATE(context, Buffer)->Buffer;
+        struct CUDABuffer* buffer = new CUDABuffer();
         buffer->inputData = (union Data**)ALLOCATE_PTR_ARRAY(context, SortArray, 2);
         buffer->inputData[0] = (union Data*)inputSortArray->array;
         buffer->inputData[1] = (union Data*)inputSortArray;
diff -r b75badf42701 -r af0ec811b20e src/parallel_execution/generate_context.pl
--- a/src/parallel_execution/generate_context.pl	Tue Oct 31 17:55:50 2017 +0900
+++ b/src/parallel_execution/generate_context.pl	Sat Nov 04 04:14:36 2017 +0900
@@ -107,6 +107,8 @@
         last if (/union Data end/);
         if (/struct (\w+) \{/) {
             $dataGear{$1} = $1;
+        } elsif (/^\s{4}(\w+) (\w+);/) { # non-struct member at 4-space indent ("Type name;"): register its type name as well
+            $dataGear{$1} = $1;
         }
         $dataGear{"Context"} = "Context";
     }