Mercurial > hg > Gears > GearsAgda
diff src/parallel_execution/CUDAExecutor.cbc @ 451:dcc42f3e7e97
Auto choice blockDim
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 05 Dec 2017 06:33:40 +0900 |
parents | eab6f8cd2820 |
children | 8d7e5d48cad3 |
line wrap: on
line diff
--- a/src/parallel_execution/CUDAExecutor.cbc Mon Dec 04 04:24:30 2017 +0900
+++ b/src/parallel_execution/CUDAExecutor.cbc Tue Dec 05 06:33:40 2017 +0900
@@ -7,9 +7,10 @@
 #include "../helper_cuda.h"
 #include "pthread.h"
 
-Executor* createCUDAExecutor(struct Context* context) {
+Executor* createCUDAExecutor(struct Context* context, CUdevice device) {
     struct Executor* executor = new Executor();
     struct CUDAExecutor* cudaExecutor = new CUDAExecutor();
+    checkCudaErrors(cuDeviceGetAttribute(&cudaExecutor->maxThreadPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device));
     executor->executor = (union Data*)cudaExecutor;
     executor->read = C_readCUDAExecutor;
     executor->exec = C_execCUDAExecutor;
@@ -35,14 +36,21 @@
     goto meta(context, C_execCUDAExecutor);
 }
 
+int computeblockDim(int count, int maxThreadPerBlock) {
+    return count < maxThreadPerBlock ? count : maxThreadPerBlock;
+}
+
 __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
     // Asynchronous launch kernel
     task->num_exec = 1;
     if (task->iterate) {
         struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
+        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock);
+        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock);
+        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock);
         checkCudaErrors(cuLaunchKernel(task->function,
-                    iterator->x, iterator->y, iterator->z,
-                    1, 1, 1,
+                    iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ,
+                    blockDimX, blockDimY, blockDimZ,
                     0, NULL, (void**)executor->kernelParams, NULL));
     } else {
         checkCudaErrors(cuLaunchKernel(task->function,