Mercurial > hg > Gears > GearsAgda

--- a/src/parallel_execution/CMakeLists.txt	Tue Feb 14 22:20:17 2017 +0900
+++ b/src/parallel_execution/CMakeLists.txt	Wed Feb 15 11:04:30 2017 +0900
@@ -6,16 +6,15 @@
 #  add_definitions("-Wall -g -O")

 set(CMAKE_C_COMPILER $ENV{CBC_COMPILER})
-include_directories("/usr/local/cuda/include")
+add_definitions("-Wall -g")
+

 if (${USE_CUDA})
+    include_directories("/usr/local/cuda/include")
     set(NVCCFLAG "-std=c++11" "-g" "-O0" )
     set(CUDA_LINK_FLAGS "-framework CUDA -lc++ -Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib")
     find_package(CUDA REQUIRED)
-    add_definitions("-Wall -g -DUSE_CUDAWorker=1")
     SET( CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${CUDA_LINK_FLAGS}" )
-else()
-    add_definitions("-Wall -g")
 endif()


@@ -70,6 +69,7 @@
       SOURCES
           main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc  CUDAWorker.cbc CUDAtwice.cbc CUDAtwice.cu
     )
+    set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1")
 endif()

 GearsCommand(
--- a/src/parallel_execution/CUDAWorker.cbc	Tue Feb 14 22:20:17 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc	Wed Feb 15 11:04:30 2017 +0900
@@ -20,6 +20,19 @@
     worker->worker = (union Data*)cudaWorker;
     worker->tasks = queue;
     cudaWorker->id = id;
+
+    // initialize and load kernel
+    cudaWorker->num_stream = 1; // number of stream
+    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+    checkCudaErrors(cuInit(0));
+    checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+    checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
+
+    if (cudaWorker->num_stream) {
+        for (int i=0;i<cudaWorker->num_stream;i++)
+            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
+    }
+
     worker->taskReceive = C_taskReceiveCUDAWorker;
     worker->shutdown = C_shutdownCUDAWorker;
     pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker);
@@ -31,17 +44,6 @@
     cudaWorker->context = NEW(struct Context);
     initContext(cudaWorker->context);
     Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker;
-    cudaWorker->num_stream = 1; // number of stream
-
-    // initialize and load kernel
-    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
-    checkCudaErrors(cuInit(0));
-    checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
-    checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
-    if (cudaWorker->num_stream) {
-        for (int i=0;i<cudaWorker->num_stream;i++)
-            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
-    }

     goto meta(cudaWorker->context, C_taskReceiveCUDAWorker);
 }
--- a/src/parallel_execution/CUDAtwice.cbc	Tue Feb 14 22:20:17 2017 +0900
+++ b/src/parallel_execution/CUDAtwice.cbc	Wed Feb 15 11:04:30 2017 +0900
@@ -6,6 +6,39 @@
 #include <cuda_runtime.h>
 #include "helper_cuda.h"

+static void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter) {
+    // Worker *worker = context->worker;
+    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
+  // memory allocate
+    CUdeviceptr devA;
+    CUdeviceptr devLoopCounter;
+
+    checkCudaErrors(cuMemAlloc(&devA, array->size));
+    checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
+
+    //twiceカーネルが定義されてなければそれをロードする
+    checkCudaErrors(cuModuleLoad(&context->module, "CUDAtwice.ptx"));
+    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
+
+    //入力のDataGearをGPUにbuffer経由で送る
+    // Synchronous data transfer(host to device)
+    checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter)));
+    checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));
+
+  // Asynchronous launch kernel
+     context->num_exec = 1;
+     void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA};
+     checkCudaErrors(cuLaunchKernel(context->function,
+                       1, 1, 1,
+                       1, 1, 1,
+                                 0, NULL , args, NULL));
+
+    //結果を取ってくるコマンドを入力する
+    //コマンドの終了待ちを行う
+    checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));
+    // wait for stream
+}
+
 __code CUDAtwice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) {
     int i = loopCounter->i;
     if (i < prefix) {
@@ -19,40 +52,10 @@
     goto meta(workerContext, workerContext->next);
 }

-static void CUDAExec(struct Context* context, struct Array* array) {
-    // Worker *worker = context->worker;
-    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
-  // memory allocate
-    CUdeviceptr devA;
-
-    checkCudaErrors(cuMemAlloc(&devA, array->size));
-
-    //twiceカーネルが定義されてなければそれをロードする
-    checkCudaErrors(cuModuleLoad(&context->module, "CUDAtwice.ptx"));
-    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
-
-    //入力のDataGearをGPUにbuffer経由で送る
-    // Synchronous data transfer(host to device)
-    checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));
-
-  // Asynchronous launch kernel
-     context->num_exec = 1;
-     void* args[] = {&devA};
-     checkCudaErrors(cuLaunchKernel(context->function,
-                       array->prefix, 1, 1,
-                       context->num_exec, 1, 1,
-                                 0, NULL , args, NULL));
-
-    //結果を取ってくるコマンドを入力する
-    //コマンドの終了待ちを行う
-    checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));
-    // wait for stream
-}
-
 __code CUDAtwice_stub(struct Context* context) {
-    // struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter;
+    struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter;
     struct Array* array = &context->data[context->dataNum+1]->Array;
-    CUDAExec(context,array);
+    CUDAExec(context,array,loopCounter);

     //continuationにそってGPUworkerに戻る
     goto meta(context, context->next);
--- a/src/parallel_execution/CUDAtwice.cu	Tue Feb 14 22:20:17 2017 +0900
+++ b/src/parallel_execution/CUDAtwice.cu	Wed Feb 15 11:04:30 2017 +0900
@@ -2,13 +2,32 @@

 #include <stdio.h>

-    __global__ void twice(struct LoopCounter* loopCounter, int prefix ,int* array) {
-         int index = blockIdx.x * blockDim.x + threadIdx.x;
-         printf("array %p, blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n");
-         int i = 0;
-         while (i < prefix) {
-              array[i+index*prefix] = array[i+index*prefix]*2;
-         }
+//    __global__ void twice(struct LoopCounter* loopCounter, int prefix ,int* array) {
+//         int index = blockIdx.x * blockDim.x + threadIdx.x;
+//         printf("array %p, blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n");
+//         int i = 0;
+//         while (i < prefix) {
+//              array[i+index*prefix] = array[i+index*prefix]*2;
+//         }
+//    }
+
+    struct LoopCounter {
+        int i;
+    } LoopCounter;
+
+    __global__ void twice(struct LoopCounter* loopCounter, int index, int prefix, int* array) {
+         printf("array %p, index = %d, prefix = %d loopCounter->i %d\n",array,index,prefix,loopCounter->i);
+C_twice:
+        int i = loopCounter->i;
+        if (i < prefix) {
+            array[i+index*prefix] = array[i+index*prefix]*2;
+            loopCounter->i++;
+
+            goto C_twice;
+        }
+
+        loopCounter->i = 0;
     }

+
 }
--- a/src/parallel_execution/TaskManagerImpl.cbc	Tue Feb 14 22:20:17 2017 +0900
+++ b/src/parallel_execution/TaskManagerImpl.cbc	Wed Feb 15 11:04:30 2017 +0900
@@ -33,7 +33,9 @@
         taskManagerImpl->workers[i] = (Worker*)createCPUWorker(context, i, queue);
     }
     for (;i<taskManager->cpu;i++) {
-#ifdef USE_CUDA
+#ifdef USE_CUDAWorker
+        Queue* queue = createSynchronizedQueue(context);
+        taskManagerImpl->workers[i] = (Worker*)createCUDAWorker(context, i, queue);
 #else
         Queue* queue = createSynchronizedQueue(context);
         taskManagerImpl->workers[i] = (Worker*)createCPUWorker(context, i, queue);
--- a/src/parallel_execution/generate_context.pl	Tue Feb 14 22:20:17 2017 +0900
+++ b/src/parallel_execution/generate_context.pl	Wed Feb 15 11:04:30 2017 +0900
@@ -85,9 +85,9 @@
     my ($filename) = @_;
     open my $fd,"<",$filename or die("can't open $filename $!");
     while (<$fd>) {
-        if (/^__code (\w+)_stub\(struct Context\* context\)/ or /^\s__code (\w+)_stub\(struct Context\* context\)/) {
+        if (/^__code (\w+)_stub\(struct  *Context *\* *context\)/) {
             $codeGear{$1} = $filename;
-        } elsif (/^(\w+)(\*)+ create(\w+)\(([^]]*)\)/) {
+        } elsif (/^(\w+)(\*)+  *create(\w+)\(([^]]*)\)/) {
             my $interface = $1;
             my $implementation = $3;
             my $constructorArgs = $4;
--- a/src/parallel_execution/main.cbc	Tue Feb 14 22:20:17 2017 +0900
+++ b/src/parallel_execution/main.cbc	Wed Feb 15 11:04:30 2017 +0900
@@ -8,6 +8,9 @@
 int length = 102400;
 int split = 8;
 int* array_ptr;
+int gpu_num = 0;
+int CPU_ANY = -1;
+int CPU_CUDA = -1;

 void print_queue(struct Element* element) {
     while (element) {
@@ -27,7 +30,7 @@
 __code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) {
     // loopCounter->tree = createRedBlackTree(context);
     loopCounter->i = 0;
-    taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, 0, 0);
+    taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, gpu_num, 0);
     goto meta(context, C_createTask1);
 }

@@ -37,6 +40,7 @@

 __code code1(struct Time* time) {
     printf("cpus:\t\t%d\n", cpu_num);
+    printf("gpus:\t\t%d\n", gpu_num);
     printf("length:\t\t%d\n", length);
     printf("length/task:\t%d\n", length/split);
     /* puts("queue"); */
@@ -89,9 +93,15 @@
     array->index = i;
     array->prefix = length/split;
     array->array = array_ptr;
+    array->size = length;
     loopCounter2->i = 0;
     task->idgCount = 0;
-    task->next = C_twice;
+    if (gpu_num) {
+        task->next = C_CUDAtwice;
+        task->workerId = CPU_CUDA;
+    } else {
+        task->next = C_twice;
+    }
     task->data[task->dataNum] = (union Data*)loopCounter2;
     task->data[task->dataNum+1] = (union Data*)array;
     task->odg = task->dataNum + 2;
@@ -120,7 +130,11 @@
             length = (int)atoi(argv[i+1]);
         else if (strcmp(argv[i], "-s") == 0)
             split = (int)atoi(argv[i+1]);
+        else if (strcmp(argv[i], "-cuda") == 0) {
+            gpu_num = 1;
+        }
     }
+    CPU_CUDA = cpu_num;
 }