# HG changeset patch
# User Shinji KONO <kono@ie.u-ryukyu.ac.jp>
# Date 1487129659 -32400
# Node ID 1839586f5b41d36a5f5af1e248153dbb931e8466
# Parent  4addbc7469ee32333686f7b46010fb5bfd863c65
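pthread CUDA test

Experiment with driving the CUDA driver API from a pthread-created thread.

* CUDAWorker.cbc: call cudaInit() from getTaskCUDA instead of
  taskReceiveCUDAWorker, set worker->taskReceive there as well, comment out
  stream creation/destruction, and add progress printf()s.
* CUDAtwice.cbc: add progress printf()s around CUDAExec and the stub.
* main.cbc: select C_CUDAtwice only when USE_CUDAWorker is defined;
  otherwise fall back to C_twice.
* test/twice.cc: move the CUDA body of main() into start_cuda() and run it
  on a thread created with pthread_create(); num_stream and num_exec become
  globals so that thread can read them.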

diff -r 4addbc7469ee -r 1839586f5b41 src/parallel_execution/CUDAWorker.cbc
--- a/src/parallel_execution/CUDAWorker.cbc	Wed Feb 15 11:36:10 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc	Wed Feb 15 12:34:19 2017 +0900
@@ -13,6 +13,7 @@
 #include "../context.h"
 
 static void start_CUDAworker(Worker* worker);
+static void cudaInit(struct CUDAWorker *cudaWorker) ;
 
 static int cuda_initialized = 0;
 
@@ -22,8 +23,6 @@
     worker->worker = (union Data*)cudaWorker;
     worker->tasks = queue;
     cudaWorker->id = id;
-
-    worker->taskReceive = C_taskReceiveCUDAWorker;
     worker->shutdown = C_shutdownCUDAWorker;
     pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker);
     return worker;
@@ -32,16 +31,19 @@
 static void cudaInit(struct CUDAWorker *cudaWorker) {
     // initialize and load kernel
     cudaWorker->num_stream = 1; // number of stream
-    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+//    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+printf("cudaInit 1\n");
     checkCudaErrors(cuInit(0));
     checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+printf("cudaInit 2\n");
     checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
-
-    if (cudaWorker->num_stream) {
-        for (int i=0;i<cudaWorker->num_stream;i++)
-            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
-    }
+printf("cudaInit 3\n");
+//    if (cudaWorker->num_stream) {
+//        for (int i=0;i<cudaWorker->num_stream;i++)
+//            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
+//    }
     cuda_initialized = 1;
+printf("cudaInit done\n");
 }
 
 static void start_CUDAworker(Worker* worker) {
@@ -54,10 +56,6 @@
 }
 
 __code taskReceiveCUDAWorker(struct Worker* worker,struct Queue* queue) {
-    if (cuda_initialized==0) {
-        CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
-        cudaInit(cudaWorker);
-    }
     queue->queue = (union Data*)worker->tasks;
     queue->next = C_getTaskCUDA;
     goto meta(context, worker->tasks->take);
@@ -70,6 +68,11 @@
 __code getTaskCUDA(struct Worker* worker, struct Context* task) {
     if (!task)
         return; // end thread
+    if (cuda_initialized==0) {
+        CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
+        cudaInit(cudaWorker);
+    }
+    worker->taskReceive = C_taskReceiveCUDAWorker;
     task->worker = worker;
     enum Code taskCg = task->next;
     task->next = C_odgCommitCUDA; // set CG after task exec
@@ -124,8 +127,8 @@
 
 
 __code shutdownCUDAWorker(struct Context* context, CUDAWorker* worker) {
-    for (int i=0;i<worker->num_stream;i++)
-        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
+//    for (int i=0;i<worker->num_stream;i++)
+//        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
     checkCudaErrors(cuCtxDestroy(worker->cuCtx));
 }
 
diff -r 4addbc7469ee -r 1839586f5b41 src/parallel_execution/CUDAtwice.cbc
--- a/src/parallel_execution/CUDAtwice.cbc	Wed Feb 15 11:36:10 2017 +0900
+++ b/src/parallel_execution/CUDAtwice.cbc	Wed Feb 15 12:34:19 2017 +0900
@@ -12,6 +12,7 @@
   // memory allocate
     CUdeviceptr devA;
     CUdeviceptr devLoopCounter;
+printf("CUdA Exe 1\n");
 
     checkCudaErrors(cuMemAlloc(&devA, array->size));
     checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
@@ -19,6 +20,7 @@
     //twiceカーネルが定義されてなければそれをロードする
     checkCudaErrors(cuModuleLoad(&context->module, "CUDAtwice.ptx"));
     checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
+printf("CUdA Exe 2\n");
 
     //入力のDataGearをGPUにbuffer経由で送る
     // Synchronous data transfer(host to device)
@@ -53,6 +55,7 @@
 }
 
 __code CUDAtwice_stub(struct Context* context) {
+printf("CUdAtwice stub\n");
     struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter;
     struct Array* array = &context->data[context->dataNum+1]->Array;
     CUDAExec(context,array,loopCounter);
diff -r 4addbc7469ee -r 1839586f5b41 src/parallel_execution/main.cbc
--- a/src/parallel_execution/main.cbc	Wed Feb 15 11:36:10 2017 +0900
+++ b/src/parallel_execution/main.cbc	Wed Feb 15 12:34:19 2017 +0900
@@ -97,8 +97,12 @@
     loopCounter2->i = 0;
     task->idgCount = 0;
     if (gpu_num) {
+#ifdef USE_CUDAWorker
         task->next = C_CUDAtwice;
         task->workerId = CPU_CUDA;
+#else
+        task->next = C_twice;
+#endif
     } else {
         task->next = C_twice;
     }
diff -r 4addbc7469ee -r 1839586f5b41 src/test/twice.cc
--- a/src/test/twice.cc	Wed Feb 15 11:36:10 2017 +0900
+++ b/src/test/twice.cc	Wed Feb 15 12:34:19 2017 +0900
@@ -2,6 +2,9 @@
 #include <sys/time.h>
 #include <string.h>
 #include <stdlib.h>
+extern "C" {
+#include <pthread.h>
+}
 
 #include <cuda.h>
 
@@ -35,9 +38,12 @@
     }
 }
 
+int num_stream = 1; // number of stream
+int num_exec = 16; // number of executed kernel
+
+static void *start_cuda(void *) ;
+
 int main(int args, char* argv[]) {
-    int num_stream = 1; // number of stream
-    int num_exec = 16; // number of executed kernel
     
     for (int i=1;argv[i];i++) {
         if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
@@ -47,7 +53,17 @@
             num_exec = atoi(argv[++i]);
         }
     }
+#if 0
+    start_cuda(NULL);
+#else
+    pthread_t thread;
+    pthread_create(&thread, NULL, start_cuda, NULL);
+    pthread_join(thread,NULL);
+#endif
+    return 0;
+}
 
+static void *start_cuda(void *args) {
     // initialize and load kernel
     CUdevice device;
     CUcontext context;
@@ -161,7 +177,6 @@
     for (int i=0;i<num_exec;i++)
         delete[] result[i];
     delete[] result;
-
     return 0;
 }