Mercurial > hg > Members > Moririn
changeset 414:49159fbdd1fb
Work CUDAbitonicSort
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 15 Sep 2017 22:49:45 +0900 |
parents | 497b154141de |
children | eec6553a2aa6 764c92c3b181 |
files | src/parallel_execution/CMakeLists.txt src/parallel_execution/CUDAWorker.cbc src/parallel_execution/cuda.c src/parallel_execution/examples/bitonicSort/CUDAbitonicSwap.cu src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc src/parallel_execution/examples/twice/CUDAtwice.cu src/parallel_execution/examples/twice/twice.cbc |
diffstat | 7 files changed, 68 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt Thu Sep 14 22:28:52 2017 +0900 +++ b/src/parallel_execution/CMakeLists.txt Fri Sep 15 22:49:45 2017 +0900 @@ -86,6 +86,14 @@ examples/twice/main.cbc examples/twice/twice.cbc examples/twice/CUDAtwice.cu examples/twice/createArray.cbc CPUWorker.cbc TimeImpl.cbc examples/twice/twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc CUDAWorker.cbc cuda.c MultiDimIterator.cbc ) set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1") + + GearsCommand( + TARGET + CUDAbitonicSort + SOURCES + examples/bitonicSort/bitonicSort.cbc examples/bitonicSort/bitonicSwap.cbc examples/bitonicSort/CUDAbitonicSwap.cu examples/bitonicSort/makeArray.cbc examples/bitonicSort/printArray.cbc CPUWorker.cbc CUDAWorker.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc cuda.c MultiDimIterator.cbc TimeImpl.cbc + ) + set_target_properties(CUDAbitonicSort PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1") endif() GearsCommand(
--- a/src/parallel_execution/CUDAWorker.cbc Thu Sep 14 22:28:52 2017 +0900 +++ b/src/parallel_execution/CUDAWorker.cbc Fri Sep 15 22:49:45 2017 +0900 @@ -76,7 +76,7 @@ } __code iterateCommitCUDA1(struct Context* task) { - goto meta(context, C_taskReceiveWorker); + goto meta(context, C_taskReceiveCUDAWorker); } __code iterateCommitCUDA1_stub(struct Context* context) { @@ -91,7 +91,7 @@ } loopCounter->i = 0; taskManager->taskManager = (union Data*)task->taskManager; - taskManager->next = C_taskReceiveWorker; + taskManager->next = C_taskReceiveCUDAWorker; goto meta(context, task->taskManager->decrementTaskCount); }
--- a/src/parallel_execution/cuda.c Thu Sep 14 22:28:52 2017 +0900 +++ b/src/parallel_execution/cuda.c Fri Sep 15 22:49:45 2017 +0900 @@ -85,29 +85,35 @@ } -void CUDAExec(struct Context* context, struct Array* array) { - printf("cuda exec start\n"); +void CUDAExec(struct Context* context, struct SortArray* inputSortArray, struct SortArray* outputSortArray) { + //printf("cuda exec start\n"); // Worker *worker = context->worker; // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; // memory allocate CUdeviceptr devA; CUdeviceptr devB; + CUdeviceptr devC; + CUdeviceptr devD; - checkCudaErrors(cuMemAlloc(&devA, sizeof(int)*array->size)); + checkCudaErrors(cuMemAlloc(&devA, sizeof(struct Integer)*GET_SIZE(inputSortArray->array))); checkCudaErrors(cuMemAlloc(&devB, sizeof(int))); + checkCudaErrors(cuMemAlloc(&devC, sizeof(int))); + checkCudaErrors(cuMemAlloc(&devD, sizeof(int))); //twiceカーネルが定義されてなければそれをロードする - checkCudaErrors(cuModuleLoad(&context->module, "c/examples/twice/CUDAtwice.ptx")); - checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); + checkCudaErrors(cuModuleLoad(&context->module, "c/examples/bitonicSort/CUDAbitonicSwap.ptx")); + checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "bitonicSwap")); //入力のDataGearをGPUにbuffer経由で送る // Synchronous data transfer(host to device) - checkCudaErrors(cuMemcpyHtoD(devA, array->array, sizeof(int)*array->size)); - checkCudaErrors(cuMemcpyHtoD(devB, &array->prefix, sizeof(int))); + checkCudaErrors(cuMemcpyHtoD(devA, inputSortArray->array, sizeof(struct Integer)*GET_SIZE(inputSortArray->array))); + checkCudaErrors(cuMemcpyHtoD(devB, &inputSortArray->block, sizeof(int))); + checkCudaErrors(cuMemcpyHtoD(devC, &inputSortArray->first, sizeof(int))); + checkCudaErrors(cuMemcpyHtoD(devD, &inputSortArray->prefix, sizeof(int))); // Asynchronous launch kernel context->num_exec = 1; - void* args[] = {&devA, &devB}; + void* args[] = {&devA, &devB, &devC, &devD}; if (context->iterate) { struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator; checkCudaErrors(cuLaunchKernel(context->function, @@ -123,12 +129,14 @@ } //結果を取ってくるコマンドを入力する //コマンドの終了待ちを行う - checkCudaErrors(cuMemcpyDtoH(array->array, devA, sizeof(int)*array->size)); - + checkCudaErrors(cuMemcpyDtoH(inputSortArray->array, devA, sizeof(struct Integer)*GET_SIZE(inputSortArray->array))); + outputSortArray->array = inputSortArray->array; // wait for stream checkCudaErrors(cuCtxSynchronize()); cuMemFree(devA); cuMemFree(devB); + cuMemFree(devC); + cuMemFree(devD); } void cudaShutdown( struct CUDAWorker *worker) {
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/parallel_execution/examples/bitonicSort/CUDAbitonicSwap.cu Fri Sep 15 22:49:45 2017 +0900 @@ -0,0 +1,25 @@ +extern "C" { + struct Integer { + int value; + }; + __global__ void bitonicSwap(struct Integer* array, int* blockPtr, int* firstPtr, int* prefixPtr) { + int block = *blockPtr; + int first = *firstPtr; + int prefix = *prefixPtr; + int i = 0; +C_bitonicSwap: + if (i < prefix) { + int index = i + blockIdx.x * prefix; + int position = index/block; + int index1 = index+block*position; + int index2 = (first == 1)? ((block<<1)*(position+1))-(index1%block)-1 : index1+block; + if (array[index2].value < array[index1].value) { + struct Integer tmp = array[index1]; + array[index1] = array[index2]; + array[index2] = tmp; + } + i++; + goto C_bitonicSwap; + } + } +}
--- a/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc Thu Sep 14 22:28:52 2017 +0900 +++ b/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc Fri Sep 15 22:49:45 2017 +0900 @@ -1,6 +1,10 @@ #include "../../../context.h" #include <stdio.h> +#ifdef USE_CUDAWorker +extern void CUDAExec(struct Context* context, struct SortArray* inputSortArray, struct SortArray* outputSortArray); +#endif + __code bitonicSwap(struct SortArray* inputArray, struct MultiDim* multiDim, __code next(struct SortArray* output, ...), struct LoopCounter* loopCounter) { struct SortArray* output = *O_output; int block = inputArray->block; @@ -26,6 +30,15 @@ } __code bitonicSwap_stub(struct Context* context) { +#ifdef USE_CUDAWorker + struct SortArray* inputSortArray = &context->data[context->idg]->SortArray; + struct SortArray* outputSortArray = &context->data[context->odg]->SortArray; + if (context->gpu) { + CUDAExec(context, inputSortArray, outputSortArray); + //continuationにそってGPUworkerに戻る + goto meta(context, context->next); + } +#endif SortArray** O_output = (struct SortArray **)&context->data[context->odg]; goto bitonicSwap(context, &context->data[context->idg]->SortArray,
--- a/src/parallel_execution/examples/twice/CUDAtwice.cu Thu Sep 14 22:28:52 2017 +0900 +++ b/src/parallel_execution/examples/twice/CUDAtwice.cu Fri Sep 15 22:49:45 2017 +0900 @@ -1,6 +1,4 @@ extern "C" { - -#include <stdio.h> __global__ void twice(int* array, int* prefixPtr) { int i = 0; int prefix = *prefixPtr;
--- a/src/parallel_execution/examples/twice/twice.cbc Thu Sep 14 22:28:52 2017 +0900 +++ b/src/parallel_execution/examples/twice/twice.cbc Fri Sep 15 22:49:45 2017 +0900 @@ -25,9 +25,9 @@ struct Array* array = &context->data[context->idg]->Array; if (context->gpu) { CUDAExec(context, array); + //continuationにそってGPUworkerに戻る + goto meta(context, context->next); } - //continuationにそってGPUworkerに戻る - goto meta(context, context->next); #endif goto twice(context, &context->data[context->idg]->Array,