Mercurial > hg > GearsTemplate
changeset 537:b78533641f9b
Add calcMaxThread
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 06 Feb 2018 05:14:55 +0900 |
parents | acc80b26156b |
children | c0b6ce2ed820 |
files | src/parallel_execution/CUDAExecutor.cbc src/parallel_execution/context.h |
diffstat | 2 files changed, 31 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parallel_execution/CUDAExecutor.cbc Tue Feb 06 02:04:02 2018 +0900
+++ b/src/parallel_execution/CUDAExecutor.cbc Tue Feb 06 05:14:55 2018 +0900
@@ -2,6 +2,7 @@
 #interface "Executor.h"
 #interface "Timer.h"
 #include <stdio.h>
+#include <math.h>
 
 Executor* createCUDAExecutor(struct Context* context, CUdevice device) {
     struct Executor* executor = new Executor();
@@ -38,14 +39,38 @@
     return count < maxThreadPerBlock ? count : maxThreadPerBlock;
 }
 
+void calcBlockMaxThread(struct MultiDimIterator* iterator, struct CUDAExecutor* executor) {
+    executor->maxThreadPerBlockX = 1;
+    executor->maxThreadPerBlockY = 1;
+    executor->maxThreadPerBlockZ = 1;
+    if (iterator->x > 1 && iterator->y == 1 && iterator->z == 1) {
+        executor->maxThreadPerBlockX = executor->maxThreadPerBlock;
+        executor->maxThreadPerBlockY = 1;
+        executor->maxThreadPerBlockZ = 1;
+    } else if (iterator->x > 1 && iterator->y > 1 && iterator->z == 1) {
+        int ln_2 = log2(executor->maxThreadPerBlock);
+        int maxThread = 1 << (ln_2/2);
+        executor->maxThreadPerBlockX = maxThread;
+        executor->maxThreadPerBlockY = maxThread;
+        executor->maxThreadPerBlockZ = 1;
+    } else {
+        int ln_2 = log2(executor->maxThreadPerBlock);
+        int maxThread = 1 << (ln_2/3);
+        executor->maxThreadPerBlockX = maxThread * (1 << (ln_2%3));
+        executor->maxThreadPerBlockY = maxThread;
+        executor->maxThreadPerBlockZ = maxThread;
+    }
+}
+
 __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
     // Asynchronous launch kernel
     task->num_exec = 1;
     if (task->iterate) {
         struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
-        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock);
-        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock);
-        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock);
+        calcBlockMaxThread(iterator, executor);
+        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlockX);
+        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlockY);
+        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlockZ);
         checkCudaErrors(cuLaunchKernel(task->function,
                     iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ,
                     blockDimX, blockDimY, blockDimZ,
--- a/src/parallel_execution/context.h Tue Feb 06 02:04:02 2018 +0900
+++ b/src/parallel_execution/context.h Tue Feb 06 05:14:55 2018 +0900
@@ -378,6 +378,9 @@
         CUdeviceptr** kernelParams;
         struct CUDABuffer* buffer;
         int maxThreadPerBlock;
+        int maxThreadPerBlockX;
+        int maxThreadPerBlockY;
+        int maxThreadPerBlockZ;
         struct Timer* timer;
     } CUDAExecutor;
     struct CUDABuffer {