# HG changeset patch # User Tatsuki IHA # Date 1512423220 -32400 # Node ID dcc42f3e7e97579b1573386da138a6d796119ff2 # Parent d3d7a7d6a117f7f3373be683561a7c1393d8dc3c Auto choice blockDim diff -r d3d7a7d6a117 -r dcc42f3e7e97 src/parallel_execution/CUDAExecutor.cbc --- a/src/parallel_execution/CUDAExecutor.cbc Mon Dec 04 04:24:30 2017 +0900 +++ b/src/parallel_execution/CUDAExecutor.cbc Tue Dec 05 06:33:40 2017 +0900 @@ -7,9 +7,10 @@ #include "../helper_cuda.h" #include "pthread.h" -Executor* createCUDAExecutor(struct Context* context) { +Executor* createCUDAExecutor(struct Context* context, CUdevice device) { struct Executor* executor = new Executor(); struct CUDAExecutor* cudaExecutor = new CUDAExecutor(); + checkCudaErrors(cuDeviceGetAttribute(&cudaExecutor->maxThreadPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device)); executor->executor = (union Data*)cudaExecutor; executor->read = C_readCUDAExecutor; executor->exec = C_execCUDAExecutor; @@ -35,14 +36,21 @@ goto meta(context, C_execCUDAExecutor); } +int computeblockDim(int count, int maxThreadPerBlock) { + return count < maxThreadPerBlock ? count : maxThreadPerBlock; +} + __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) { // Asynchronous launch kernel task->num_exec = 1; if (task->iterate) { struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator; + int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock); + int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock); + int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock); checkCudaErrors(cuLaunchKernel(task->function, - iterator->x, iterator->y, iterator->z, - 1, 1, 1, + iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ, + blockDimX, blockDimY, blockDimZ, 0, NULL, (void**)executor->kernelParams, NULL)); } else { checkCudaErrors(cuLaunchKernel(task->function, diff -r d3d7a7d6a117 -r dcc42f3e7e97 src/parallel_execution/CUDAWorker.cbc --- a/src/parallel_execution/CUDAWorker.cbc Mon Dec 04 04:24:30 2017 +0900 +++ b/src/parallel_execution/CUDAWorker.cbc Tue Dec 05 06:33:40 2017 +0900 @@ -18,10 +18,11 @@ static void startCUDAWorker(Worker* worker) { struct CUDAWorker* cudaWorker = &worker->worker->CUDAWorker; - cudaInit(cudaWorker, 0); + int deviceNum = 0; + cudaInit(cudaWorker, deviceNum); cudaWorker->context = NEW(struct Context); initContext(cudaWorker->context); - cudaWorker->executor = createCUDAExecutor(cudaWorker->context); + cudaWorker->executor = createCUDAExecutor(cudaWorker->context, cudaWorker->device); Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker; goto meta(cudaWorker->context, worker->taskReceive); } diff -r d3d7a7d6a117 -r dcc42f3e7e97 src/parallel_execution/context.h --- a/src/parallel_execution/context.h Mon Dec 04 04:24:30 2017 +0900 +++ b/src/parallel_execution/context.h Tue Dec 05 06:33:40 2017 +0900 @@ -198,7 +198,7 @@ struct Queue* tasks; int runFlag; enum Code next; - int num_stream; + int numStream; struct Executor* executor; CUstream *stream; } CUDAWorker; @@ -370,6 +370,7 @@ struct CUDAExecutor { CUdeviceptr** kernelParams; struct CUDABuffer* buffer; + int maxThreadPerBlock; } CUDAExecutor; struct CUDABuffer { int inputLen; diff -r d3d7a7d6a117 -r dcc42f3e7e97 src/parallel_execution/cuda.c --- a/src/parallel_execution/cuda.c Mon Dec 04 04:24:30 2017 +0900 +++ b/src/parallel_execution/cuda.c Tue Dec 05 06:33:40 2017 +0900 @@ -51,7 +51,7 @@ struct Queue* tasks; int runFlag; int next; - int num_stream; + int numStream; CUstream *stream; } CUDAWorker; @@ -67,14 +67,14 @@ } Array; */ -void cudaInit(struct CUDAWorker *cudaWorker,int phase) { +void cudaInit(struct CUDAWorker *cudaWorker,int phase, int deviceNum) { // initialize and load kernel - cudaWorker->num_stream = 1; // number of stream - // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); + cudaWorker->numStream = 1; // number of stream + // cudaWorker->stream = NEWN(cudaWorker->numStream, CUstream ); if (phase==0) checkCudaErrors(cuInit(0)); if (phase==0) - checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); + checkCudaErrors(cuDeviceGet(&cudaWorker->device, deviceNum)); if (phase==0) checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); // if (cudaWorker->num_stream) {