Mercurial > hg > Gears > GearsAgda
changeset 312:7dd5a7d52a67
USE_CUDAWorker flag only for CUDAtwice
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 15 Feb 2017 11:04:30 +0900 |
parents | 6fcbbe644b92 |
children | 4addbc7469ee |
files | src/parallel_execution/CMakeLists.txt src/parallel_execution/CUDAWorker.cbc src/parallel_execution/CUDAtwice.cbc src/parallel_execution/CUDAtwice.cu src/parallel_execution/TaskManagerImpl.cbc src/parallel_execution/generate_context.pl src/parallel_execution/main.cbc |
diffstat | 7 files changed, 99 insertions(+), 59 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt Tue Feb 14 22:20:17 2017 +0900 +++ b/src/parallel_execution/CMakeLists.txt Wed Feb 15 11:04:30 2017 +0900 @@ -6,16 +6,15 @@ # add_definitions("-Wall -g -O") set(CMAKE_C_COMPILER $ENV{CBC_COMPILER}) -include_directories("/usr/local/cuda/include") +add_definitions("-Wall -g") + if (${USE_CUDA}) + include_directories("/usr/local/cuda/include") set(NVCCFLAG "-std=c++11" "-g" "-O0" ) set(CUDA_LINK_FLAGS "-framework CUDA -lc++ -Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") find_package(CUDA REQUIRED) - add_definitions("-Wall -g -DUSE_CUDAWorker=1") SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CUDA_LINK_FLAGS}" ) -else() - add_definitions("-Wall -g") endif() @@ -70,6 +69,7 @@ SOURCES main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc CUDAWorker.cbc CUDAtwice.cbc CUDAtwice.cu ) + set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1") endif() GearsCommand(
--- a/src/parallel_execution/CUDAWorker.cbc Tue Feb 14 22:20:17 2017 +0900 +++ b/src/parallel_execution/CUDAWorker.cbc Wed Feb 15 11:04:30 2017 +0900 @@ -20,6 +20,19 @@ worker->worker = (union Data*)cudaWorker; worker->tasks = queue; cudaWorker->id = id; + + // initialize and load kernel + cudaWorker->num_stream = 1; // number of stream + cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); + checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); + + if (cudaWorker->num_stream) { + for (int i=0;i<cudaWorker->num_stream;i++) + checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); + } + worker->taskReceive = C_taskReceiveCUDAWorker; worker->shutdown = C_shutdownCUDAWorker; pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker); @@ -31,17 +44,6 @@ cudaWorker->context = NEW(struct Context); initContext(cudaWorker->context); Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker; - cudaWorker->num_stream = 1; // number of stream - - // initialize and load kernel - cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); - checkCudaErrors(cuInit(0)); - checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); - checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); - if (cudaWorker->num_stream) { - for (int i=0;i<cudaWorker->num_stream;i++) - checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); - } goto meta(cudaWorker->context, C_taskReceiveCUDAWorker); }
--- a/src/parallel_execution/CUDAtwice.cbc Tue Feb 14 22:20:17 2017 +0900 +++ b/src/parallel_execution/CUDAtwice.cbc Wed Feb 15 11:04:30 2017 +0900 @@ -6,6 +6,39 @@ #include <cuda_runtime.h> #include "helper_cuda.h" +static void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter) { + // Worker *worker = context->worker; + // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; + // memory allocate + CUdeviceptr devA; + CUdeviceptr devLoopCounter; + + checkCudaErrors(cuMemAlloc(&devA, array->size)); + checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter))); + + //twiceカーネルが定義されてなければそれをロードする + checkCudaErrors(cuModuleLoad(&context->module, "CUDAtwice.ptx")); + checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); + + //入力のDataGearをGPUにbuffer経由で送る + // Synchronous data transfer(host to device) + checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter))); + checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); + + // Asynchronous launch kernel + context->num_exec = 1; + void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA}; + checkCudaErrors(cuLaunchKernel(context->function, + 1, 1, 1, + 1, 1, 1, + 0, NULL , args, NULL)); + + //結果を取ってくるコマンドを入力する + //コマンドの終了待ちを行う + checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); + // wait for stream +} + __code CUDAtwice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) { int i = loopCounter->i; if (i < prefix) { @@ -19,40 +52,10 @@ goto meta(workerContext, workerContext->next); } -static void CUDAExec(struct Context* context, struct Array* array) { - // Worker *worker = context->worker; - // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; - // memory allocate - CUdeviceptr devA; - - checkCudaErrors(cuMemAlloc(&devA, array->size)); - - //twiceカーネルが定義されてなければそれをロードする - checkCudaErrors(cuModuleLoad(&context->module, "CUDAtwice.ptx")); - checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); - - //入力のDataGearをGPUにbuffer経由で送る - // Synchronous data transfer(host to device) - checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); - - // Asynchronous launch kernel - context->num_exec = 1; - void* args[] = {&devA}; - checkCudaErrors(cuLaunchKernel(context->function, - array->prefix, 1, 1, - context->num_exec, 1, 1, - 0, NULL , args, NULL)); - - //結果を取ってくるコマンドを入力する - //コマンドの終了待ちを行う - checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); - // wait for stream -} - __code CUDAtwice_stub(struct Context* context) { - // struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter; + struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter; struct Array* array = &context->data[context->dataNum+1]->Array; - CUDAExec(context,array); + CUDAExec(context,array,loopCounter); //continuationにそってGPUworkerに戻る goto meta(context, context->next);
--- a/src/parallel_execution/CUDAtwice.cu Tue Feb 14 22:20:17 2017 +0900 +++ b/src/parallel_execution/CUDAtwice.cu Wed Feb 15 11:04:30 2017 +0900 @@ -2,13 +2,32 @@ #include <stdio.h> - __global__ void twice(struct LoopCounter* loopCounter, int prefix ,int* array) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - printf("array %p, blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n"); - int i = 0; - while (i < prefix) { - array[i+index*prefix] = array[i+index*prefix]*2; - } +// __global__ void twice(struct LoopCounter* loopCounter, int prefix ,int* array) { +// int index = blockIdx.x * blockDim.x + threadIdx.x; +// printf("array %p, blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n"); +// int i = 0; +// while (i < prefix) { +// array[i+index*prefix] = array[i+index*prefix]*2; +// } +// } + + struct LoopCounter { + int i; + } LoopCounter; + + __global__ void twice(struct LoopCounter* loopCounter, int index, int prefix, int* array) { + printf("array %p, index = %d, prefix = %d loopCounter->i %d\n",array,index,prefix,loopCounter->i); +C_twice: + int i = loopCounter->i; + if (i < prefix) { + array[i+index*prefix] = array[i+index*prefix]*2; + loopCounter->i++; + + goto C_twice; + } + + loopCounter->i = 0; } + }
--- a/src/parallel_execution/TaskManagerImpl.cbc Tue Feb 14 22:20:17 2017 +0900 +++ b/src/parallel_execution/TaskManagerImpl.cbc Wed Feb 15 11:04:30 2017 +0900 @@ -33,7 +33,9 @@ taskManagerImpl->workers[i] = (Worker*)createCPUWorker(context, i, queue); } for (;i<taskManager->cpu;i++) { -#ifdef USE_CUDA +#ifdef USE_CUDAWorker + Queue* queue = createSynchronizedQueue(context); + taskManagerImpl->workers[i] = (Worker*)createCUDAWorker(context, i, queue); #else Queue* queue = createSynchronizedQueue(context); taskManagerImpl->workers[i] = (Worker*)createCPUWorker(context, i, queue);
--- a/src/parallel_execution/generate_context.pl Tue Feb 14 22:20:17 2017 +0900 +++ b/src/parallel_execution/generate_context.pl Wed Feb 15 11:04:30 2017 +0900 @@ -85,9 +85,9 @@ my ($filename) = @_; open my $fd,"<",$filename or die("can't open $filename $!"); while (<$fd>) { - if (/^__code (\w+)_stub\(struct Context\* context\)/ or /^\s__code (\w+)_stub\(struct Context\* context\)/) { + if (/^__code (\w+)_stub\(struct *Context *\* *context\)/) { $codeGear{$1} = $filename; - } elsif (/^(\w+)(\*)+ create(\w+)\(([^]]*)\)/) { + } elsif (/^(\w+)(\*)+ *create(\w+)\(([^]]*)\)/) { my $interface = $1; my $implementation = $3; my $constructorArgs = $4;
--- a/src/parallel_execution/main.cbc Tue Feb 14 22:20:17 2017 +0900 +++ b/src/parallel_execution/main.cbc Wed Feb 15 11:04:30 2017 +0900 @@ -8,6 +8,9 @@ int length = 102400; int split = 8; int* array_ptr; +int gpu_num = 0; +int CPU_ANY = -1; +int CPU_CUDA = -1; void print_queue(struct Element* element) { while (element) { @@ -27,7 +30,7 @@ __code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) { // loopCounter->tree = createRedBlackTree(context); loopCounter->i = 0; - taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, 0, 0); + taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, gpu_num, 0); goto meta(context, C_createTask1); } @@ -37,6 +40,7 @@ __code code1(struct Time* time) { printf("cpus:\t\t%d\n", cpu_num); + printf("gpus:\t\t%d\n", gpu_num); printf("length:\t\t%d\n", length); printf("length/task:\t%d\n", length/split); /* puts("queue"); */ @@ -89,9 +93,15 @@ array->index = i; array->prefix = length/split; array->array = array_ptr; + array->size = length; loopCounter2->i = 0; task->idgCount = 0; - task->next = C_twice; + if (gpu_num) { + task->next = C_CUDAtwice; + task->workerId = CPU_CUDA; + } else { + task->next = C_twice; + } task->data[task->dataNum] = (union Data*)loopCounter2; task->data[task->dataNum+1] = (union Data*)array; task->odg = task->dataNum + 2; @@ -120,7 +130,11 @@ length = (int)atoi(argv[i+1]); else if (strcmp(argv[i], "-s") == 0) split = (int)atoi(argv[i+1]); + else if (strcmp(argv[i], "-cuda") == 0) { + gpu_num = 1; + } } + CPU_CUDA = cpu_num; }