diff src/parallel_execution/CUDAExecutor.cbc @ 451:dcc42f3e7e97

Auto choice blockDim
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Tue, 05 Dec 2017 06:33:40 +0900
parents eab6f8cd2820
children 8d7e5d48cad3
line wrap: on
line diff
--- a/src/parallel_execution/CUDAExecutor.cbc	Mon Dec 04 04:24:30 2017 +0900
+++ b/src/parallel_execution/CUDAExecutor.cbc	Tue Dec 05 06:33:40 2017 +0900
@@ -7,9 +7,10 @@
 #include "../helper_cuda.h"
 #include "pthread.h"
 
-Executor* createCUDAExecutor(struct Context* context) {
+Executor* createCUDAExecutor(struct Context* context, CUdevice device) {
     struct Executor* executor = new Executor();
     struct CUDAExecutor* cudaExecutor = new CUDAExecutor();
+    checkCudaErrors(cuDeviceGetAttribute(&cudaExecutor->maxThreadPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device));
     executor->executor = (union Data*)cudaExecutor;
     executor->read  = C_readCUDAExecutor;
     executor->exec  = C_execCUDAExecutor;
@@ -35,14 +36,21 @@
     goto meta(context, C_execCUDAExecutor);
 }
 
+/*
+ * Pick the block dimension for one axis: the iteration count, capped at
+ * maxThreadPerBlock and never smaller than 1.
+ *
+ * The lower bound matters because the caller divides the iteration count by
+ * this value to obtain the grid dimension; returning 0 for an empty axis
+ * would make that an integer division by zero.
+ */
+int computeblockDim(int count, int maxThreadPerBlock) {
+    int dim = count < maxThreadPerBlock ? count : maxThreadPerBlock;
+    // Clamp non-positive results (count <= 0 or a bogus limit) to 1.
+    return dim < 1 ? 1 : dim;
+}
+
 __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
     // Asynchronous launch kernel
     task->num_exec = 1;
     if (task->iterate) {
         struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
+        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock);
+        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock);
+        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock);
         checkCudaErrors(cuLaunchKernel(task->function,
-                    iterator->x, iterator->y, iterator->z,
-                    1, 1, 1,
+                    iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ,
+                    blockDimX, blockDimY, blockDimZ,
                     0, NULL, (void**)executor->kernelParams, NULL));
     } else {
         checkCudaErrors(cuLaunchKernel(task->function,