Mercurial > hg > Gears > GearsAgda
changeset 431:b3359544adbb
Edit cudaExec, but it does not work yet
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 17 Oct 2017 01:50:12 +0900 |
parents | 35b37fe8d3a7 |
children | 6bb391fc9e12 |
files | src/parallel_execution/CMakeLists.txt src/parallel_execution/context.h src/parallel_execution/cuda.c src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc src/parallel_execution/examples/twice/twice.cbc |
diffstat | 5 files changed, 65 insertions(+), 44 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt Mon Oct 09 17:46:42 2017 +0900 +++ b/src/parallel_execution/CMakeLists.txt Tue Oct 17 01:50:12 2017 +0900 @@ -13,7 +13,7 @@ if (${USE_CUDA}) include_directories("/usr/local/cuda/include") set(NVCCFLAG "-std=c++11" "-g" "-O0" ) - set(CUDA_LINK_FLAGS "-framework CUDA -lc++ -Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") + set(CUDA_LINK_FLAGS "-framework CUDA -lc++ -Wl,-search_paths_first -Wl,-headerpad_max_install_names /usr/local/cuda/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") find_package(CUDA REQUIRED) SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CUDA_LINK_FLAGS}" ) endif()
--- a/src/parallel_execution/context.h Mon Oct 09 17:46:42 2017 +0900 +++ b/src/parallel_execution/context.h Tue Oct 17 01:50:12 2017 +0900 @@ -345,6 +345,13 @@ int y; int z; } MultiDim; + struct CudaBuffer { + void** kernelParams; + int inputLen; + int outputLen; + union Data** inputData; + union Data** outputData; + } CudaBuffer; }; // union Data end this is necessary for context generator typedef union Data Data;
--- a/src/parallel_execution/cuda.c Mon Oct 09 17:46:42 2017 +0900 +++ b/src/parallel_execution/cuda.c Tue Oct 17 01:50:12 2017 +0900 @@ -85,58 +85,66 @@ } -void CUDAExec(struct Context* context, struct SortArray* inputSortArray, struct SortArray* outputSortArray) { - //printf("cuda exec start\n"); - // Worker *worker = context->worker; - // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; - // memory allocate - CUdeviceptr devA; - CUdeviceptr devB; - CUdeviceptr devC; - CUdeviceptr devD; +void cudaExec(struct Context* context, struct CudaBuffer* buffer, char* filename, char* function) { + buffer->kernelParams = (void **)calloc(buffer->inputLen + buffer->outputLen, sizeof(void *)); + int paramCount = 0; + for (int i = 0; i < buffer->inputLen; i++) { + CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr)); + // memory allocate + checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(buffer->inputData[i]))); + // Synchronous data transfer(host to device) + checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->inputData[i], GET_SIZE(buffer->inputData[i]))); + buffer->kernelParams[paramCount++] = deviceptr; + } - checkCudaErrors(cuMemAlloc(&devA, sizeof(struct Integer)*GET_LEN(inputSortArray->array))); - checkCudaErrors(cuMemAlloc(&devB, sizeof(int))); - checkCudaErrors(cuMemAlloc(&devC, sizeof(int))); - checkCudaErrors(cuMemAlloc(&devD, sizeof(int))); + for (int i = 0; i < buffer->outputLen; i++) { + CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr)); + // memory allocate + checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(buffer->outputData[i]))); + // Synchronous data transfer(host to device) + checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->outputData[i], GET_SIZE(buffer->outputData[i]))); + buffer->kernelParams[paramCount++] = deviceptr; + } - //twiceカーネルが定義されてなければそれをロードする - checkCudaErrors(cuModuleLoad(&context->module, "c/examples/bitonicSort/CUDAbitonicSwap.ptx")); - checkCudaErrors(cuModuleGetFunction(&context->function, 
context->module, "bitonicSwap")); - - //入力のDataGearをGPUにbuffer経由で送る - // Synchronous data transfer(host to device) - checkCudaErrors(cuMemcpyHtoD(devA, inputSortArray->array, sizeof(struct Integer)*GET_LEN(inputSortArray->array))); - checkCudaErrors(cuMemcpyHtoD(devB, &inputSortArray->block, sizeof(int))); - checkCudaErrors(cuMemcpyHtoD(devC, &inputSortArray->first, sizeof(int))); - checkCudaErrors(cuMemcpyHtoD(devD, &inputSortArray->prefix, sizeof(int))); + // カーネルが定義されてなければそれをロードする + checkCudaErrors(cuModuleLoad(&context->module, filename)); + checkCudaErrors(cuModuleGetFunction(&context->function, context->module, function)); // Asynchronous launch kernel context->num_exec = 1; - void* args[] = {&devA, &devB, &devC, &devD}; if (context->iterate) { struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator; checkCudaErrors(cuLaunchKernel(context->function, iterator->x, iterator->y, iterator->z, 1, 1, 1, - 0, NULL, args, NULL)); + 0, NULL, buffer->kernelParams, NULL)); } else { checkCudaErrors(cuLaunchKernel(context->function, 1, 1, 1, 1, 1, 1, - 0, NULL, args, NULL)); + 0, NULL, buffer->kernelParams, NULL)); } + //結果を取ってくるコマンドを入力する //コマンドの終了待ちを行う - checkCudaErrors(cuMemcpyDtoH(inputSortArray->array, devA, sizeof(struct Integer)*GET_LEN(inputSortArray->array))); - outputSortArray->array = inputSortArray->array; + paramCount = 0; + for (int i = 0; i < buffer->inputLen; i++) { + CUdeviceptr* deviceptr = buffer->kernelParams[paramCount++]; + checkCudaErrors(cuMemcpyDtoH(buffer->inputData[i], *deviceptr, GET_SIZE(buffer->inputData[i]))); + cuMemFree(*deviceptr); + free(deviceptr); + } + + for (int i = 0; i < buffer->outputLen; i++) { + CUdeviceptr* deviceptr = buffer->kernelParams[paramCount++]; + checkCudaErrors(cuMemcpyDtoH(buffer->outputData[i], *deviceptr, GET_SIZE(buffer->outputData[i]))); + cuMemFree(*deviceptr); + free(deviceptr); + } + free(buffer->kernelParams); // wait for stream checkCudaErrors(cuCtxSynchronize()); - cuMemFree(devA); 
- cuMemFree(devB); - cuMemFree(devC); - cuMemFree(devD); } void cudaShutdown( struct CUDAWorker *worker) {
--- a/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc Mon Oct 09 17:46:42 2017 +0900 +++ b/src/parallel_execution/examples/bitonicSort/bitonicSwap.cbc Tue Oct 17 01:50:12 2017 +0900 @@ -2,7 +2,7 @@ #include <stdio.h> #ifdef USE_CUDAWorker -extern void CUDAExec(struct Context* context, struct SortArray* inputSortArray, struct SortArray* outputSortArray); +extern void cudaExec(struct Context* context, struct CudaBuffer* buffer, char* filename, char* function); #endif __code bitonicSwap(struct SortArray* inputArray, struct MultiDim* multiDim, __code next(struct SortArray* output, ...), struct LoopCounter* loopCounter) { @@ -31,19 +31,25 @@ __code bitonicSwap_stub(struct Context* context) { #ifdef USE_CUDAWorker - struct SortArray* inputSortArray = &context->data[context->idg]->SortArray; - struct SortArray* outputSortArray = &context->data[context->odg]->SortArray; if (context->gpu) { - CUDAExec(context, inputSortArray, outputSortArray); + struct SortArray* inputSortArray = &context->data[context->idg]->SortArray; + struct SortArray* outputSortArray = &context->data[context->odg]->SortArray; + struct CudaBuffer* buffer = &ALLOCATE(context, CudaBuffer)->CudaBuffer; + buffer->inputData = (union Data**){inputSortArray->array, inputSortArray}; + buffer->outputData = NULL; + buffer->inputLen = 2; + buffer->outputLen = 0; + cudaExec(context, buffer, "c/examples/bitonicSort/CUDAbitonicSwap.ptx", "bitonicSwap"); //continuationにそってGPUworkerに戻る + outputSortArray->array = inputSortArray->array; goto meta(context, context->next); } #endif SortArray** O_output = (struct SortArray **)&context->data[context->odg]; goto bitonicSwap(context, - &context->data[context->idg]->SortArray, - &context->data[context->idg+1]->MultiDim, - context->next, - O_output, - Gearef(context, LoopCounter)); + &context->data[context->idg]->SortArray, + &context->data[context->idg+1]->MultiDim, + context->next, + O_output, + Gearef(context, LoopCounter)); }
--- a/src/parallel_execution/examples/twice/twice.cbc Mon Oct 09 17:46:42 2017 +0900 +++ b/src/parallel_execution/examples/twice/twice.cbc Tue Oct 17 01:50:12 2017 +0900 @@ -3,7 +3,7 @@ #include "../../../context.h" #ifdef USE_CUDAWorker -extern void CUDAExec(struct Context* context, Array* array); +extern void cudaExec(struct Context* context, Array* array); #endif __code twice(struct Array* array, struct MultiDim* multiDim, __code next(...), struct LoopCounter* loopCounter) { @@ -24,7 +24,7 @@ #ifdef USE_CUDAWorker struct Array* array = &context->data[context->idg]->Array; if (context->gpu) { - CUDAExec(context, array); + cudaExec(context, array); //continuationにそってGPUworkerに戻る goto meta(context, context->next); }