# HG changeset patch
# User Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
# Date 1512423220 -32400
# Node ID dcc42f3e7e97579b1573386da138a6d796119ff2
# Parent  d3d7a7d6a117f7f3373be683561a7c1393d8dc3c
Automatically choose blockDim for CUDA kernel launches

diff -r d3d7a7d6a117 -r dcc42f3e7e97 src/parallel_execution/CUDAExecutor.cbc
--- a/src/parallel_execution/CUDAExecutor.cbc	Mon Dec 04 04:24:30 2017 +0900
+++ b/src/parallel_execution/CUDAExecutor.cbc	Tue Dec 05 06:33:40 2017 +0900
@@ -7,9 +7,10 @@
 #include "../helper_cuda.h"
 #include "pthread.h"
 
-Executor* createCUDAExecutor(struct Context* context) {
+Executor* createCUDAExecutor(struct Context* context, CUdevice device) {
     struct Executor* executor = new Executor();
     struct CUDAExecutor* cudaExecutor = new CUDAExecutor();
+    checkCudaErrors(cuDeviceGetAttribute(&cudaExecutor->maxThreadPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device));
     executor->executor = (union Data*)cudaExecutor;
     executor->read  = C_readCUDAExecutor;
     executor->exec  = C_execCUDAExecutor;
@@ -35,14 +36,34 @@
     goto meta(context, C_execCUDAExecutor);
 }
 
+/*
+ * Pick the block size for one launch axis: the largest divisor of count that
+ * does not exceed maxThreadPerBlock, and never less than 1. Because the result
+ * divides count, count/blockDim blocks cover every iteration exactly; a plain
+ * min(count, max) silently dropped the remainder in the integer division.
+ */
+int computeblockDim(int count, int maxThreadPerBlock) {
+    int dim = count < maxThreadPerBlock ? count : maxThreadPerBlock;
+    while (dim > 1 && count % dim != 0) {
+        dim--;
+    }
+    return dim < 1 ? 1 : dim; // never 0: the launch below divides by this value
+}
+
 __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
     // Asynchronous launch kernel
     task->num_exec = 1;
     if (task->iterate) {
         struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
+        // The max-threads-per-block limit applies to x*y*z of a block, so each axis
+        // only receives the thread budget left over by the axes chosen before it.
+        // NOTE(review): per-axis caps (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X/Y/Z) are not checked here.
+        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock);
+        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock/blockDimX);
+        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock/(blockDimX*blockDimY));
         checkCudaErrors(cuLaunchKernel(task->function,
-                    iterator->x, iterator->y, iterator->z,
-                    1, 1, 1,
+                    iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ,
+                    blockDimX, blockDimY, blockDimZ,
                     0, NULL, (void**)executor->kernelParams, NULL));
     } else {
         checkCudaErrors(cuLaunchKernel(task->function,
diff -r d3d7a7d6a117 -r dcc42f3e7e97 src/parallel_execution/CUDAWorker.cbc
--- a/src/parallel_execution/CUDAWorker.cbc	Mon Dec 04 04:24:30 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc	Tue Dec 05 06:33:40 2017 +0900
@@ -18,10 +18,11 @@
 
 static void startCUDAWorker(Worker* worker) {
     struct CUDAWorker* cudaWorker = &worker->worker->CUDAWorker;
-    cudaInit(cudaWorker, 0);
+    int deviceNum = 0; // device ordinal that cudaInit hands to cuDeviceGet
+    cudaInit(cudaWorker, 0, deviceNum); // phase 0 runs cuInit, cuDeviceGet and cuCtxCreate; phase must not be dropped
     cudaWorker->context  = NEW(struct Context);
     initContext(cudaWorker->context);
-    cudaWorker->executor = createCUDAExecutor(cudaWorker->context);
+    cudaWorker->executor = createCUDAExecutor(cudaWorker->context, cudaWorker->device);
     Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker;
     goto meta(cudaWorker->context, worker->taskReceive);
 }
diff -r d3d7a7d6a117 -r dcc42f3e7e97 src/parallel_execution/context.h
--- a/src/parallel_execution/context.h	Mon Dec 04 04:24:30 2017 +0900
+++ b/src/parallel_execution/context.h	Tue Dec 05 06:33:40 2017 +0900
@@ -198,7 +198,7 @@
         struct Queue* tasks;
         int runFlag;
         enum Code next;
-        int num_stream;
+        int numStream;
         struct Executor* executor;
         CUstream *stream;
     } CUDAWorker;
@@ -370,6 +370,7 @@
     struct CUDAExecutor {
         CUdeviceptr** kernelParams;
         struct CUDABuffer* buffer;
+        int maxThreadPerBlock;
     } CUDAExecutor;
     struct CUDABuffer {
         int inputLen;
diff -r d3d7a7d6a117 -r dcc42f3e7e97 src/parallel_execution/cuda.c
--- a/src/parallel_execution/cuda.c	Mon Dec 04 04:24:30 2017 +0900
+++ b/src/parallel_execution/cuda.c	Tue Dec 05 06:33:40 2017 +0900
@@ -51,7 +51,7 @@
     struct Queue* tasks;
     int runFlag;
     int next;
-    int num_stream;
+    int numStream;
     CUstream *stream;
 } CUDAWorker;
 
@@ -67,14 +67,14 @@
 } Array;
 */
 
-void cudaInit(struct CUDAWorker *cudaWorker,int phase) {
+void cudaInit(struct CUDAWorker *cudaWorker,int phase, int deviceNum) {
     // initialize and load kernel
-    cudaWorker->num_stream = 1; // number of stream
-    //    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+    cudaWorker->numStream = 1; // number of stream
+    //    cudaWorker->stream = NEWN(cudaWorker->numStream, CUstream );
     if (phase==0)
         checkCudaErrors(cuInit(0));
     if (phase==0)
-        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+        checkCudaErrors(cuDeviceGet(&cudaWorker->device, deviceNum));
     if (phase==0)
         checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
     //    if (cudaWorker->num_stream) {