0
|
/*
 * Pick the number of threads to use along one block dimension.
 *
 * count             - number of iterations requested along this dimension
 * maxThreadPerBlock - upper bound on the number of threads to use
 *
 * Returns the smaller of count and maxThreadPerBlock, but never less than 1.
 * The result is used by the caller both as a divisor and as a kernel block
 * dimension, so a zero or negative value (empty dimension, unset limit) would
 * cause a division by zero or an invalid launch configuration.
 */
int computeblockDim(int count, int maxThreadPerBlock) {
    int threads = count < maxThreadPerBlock ? count : maxThreadPerBlock;
    return threads < 1 ? 1 : threads;
}
|
|
4
|
|
5 __code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
|
|
6 // check data parallelism task
|
|
7 if (task->iterate) {
|
|
8 struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
|
|
9
|
|
10 // compute block thread size
|
|
11 int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock);
|
|
12 int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock);
|
|
13 int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock);
|
|
14
|
|
15 checkCudaErrors(cuLaunchKernel(task->function,
|
|
16 iterator->x/blockDimX, iterator->y/blockDimY, iterator->z/blockDimZ,
|
|
17 blockDimX, blockDimY, blockDimZ,
|
|
18 0, NULL, (void**)executor->kernelParams, NULL));
|
|
19 }
|
|
20 ...
|
|
21 }
|