Mercurial > hg > Gears > GearsAgda
changeset 319:a15511b1a6e0
separate cuda.c, and USE_CUDA_MAIN_THREAD flag
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 15 Feb 2017 20:43:55 +0900 |
parents | 054c47e6ca20 |
children | f730761bb044 |
files | src/parallel_execution/CMakeLists.txt src/parallel_execution/CUDAWorker.cbc src/parallel_execution/CUDAtwice.cbc src/parallel_execution/TaskManagerImpl.cbc src/parallel_execution/cuda.c src/parallel_execution/main.cbc |
diffstat | 6 files changed, 140 insertions(+), 84 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt Wed Feb 15 16:45:10 2017 +0900 +++ b/src/parallel_execution/CMakeLists.txt Wed Feb 15 20:43:55 2017 +0900 @@ -67,9 +67,9 @@ TARGET CUDAtwice SOURCES - main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc CUDAWorker.cbc CUDAtwice.cbc CUDAtwice.cu + main.cbc RedBlackTree.cbc compare.c SingleLinkedStack.cbc CPUWorker.cbc time.cbc twice.cbc TaskManagerImpl.cbc SingleLinkedQueue.cbc SynchronizedQueue.cbc SemaphoreImpl.cbc CUDAWorker.cbc CUDAtwice.cbc CUDAtwice.cu cuda.c ) - set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1") + set_target_properties(CUDAtwice PROPERTIES COMPILE_FLAGS "-Wall -g -DUSE_CUDAWorker=1 -DUSE_CUDA_MAIN_THREAD") endif() GearsCommand(
--- a/src/parallel_execution/CUDAWorker.cbc Wed Feb 15 16:45:10 2017 +0900 +++ b/src/parallel_execution/CUDAWorker.cbc Wed Feb 15 20:43:55 2017 +0900 @@ -4,16 +4,11 @@ #include <stdlib.h> #include <libkern/OSAtomic.h> -// includes, project -#include <driver_types.h> -#include <cuda_runtime.h> -#include <cuda.h> -#include "helper_cuda.h" - #include "../context.h" +extern void cudaInit(struct CUDAWorker *cudaWorker,int phase) ; + static void start_CUDAworker(Worker* worker); -static void cudaInit(struct CUDAWorker *cudaWorker,int phase) ; volatile int cuda_initialized = 0; @@ -27,33 +22,18 @@ worker->tasks = queue; cudaWorker->id = id; worker->shutdown = C_shutdownCUDAWorker; - // pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker); +#ifndef USE_CUDA_MAIN_THREAD + pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker); +#else if (im) { im->workers[0] = worker; } cuda_initialized = 1; start_CUDAworker(worker); +#endif return worker; } -static void cudaInit(struct CUDAWorker *cudaWorker,int phase) { - // initialize and load kernel - cudaWorker->num_stream = 1; // number of stream -// cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); - if (phase==0) - checkCudaErrors(cuInit(0)); - if (phase==0) - checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); - if (phase==0) - checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); -// if (cudaWorker->num_stream) { -// for (int i=0;i<cudaWorker->num_stream;i++) -// checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); -// } - CUdeviceptr devA; - checkCudaErrors(cuMemAlloc(&devA, 16)); - -} static void start_CUDAworker(Worker* worker) { CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; @@ -77,10 +57,6 @@ __code getTaskCUDA(struct Worker* worker, struct Context* task) { if (!task) return; // end thread -// if (cuda_initialized==0 || 1) { -// CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; -// cudaInit(cudaWorker,1); -// } worker->taskReceive = C_taskReceiveCUDAWorker; task->worker = worker; enum Code taskCg = task->next; @@ -134,11 +110,10 @@ } +extern void cudaShutdown( CUDAWorker *cudaWorker) ; __code shutdownCUDAWorker(struct Context* context, CUDAWorker* worker) { -// for (int i=0;i<worker->num_stream;i++) -// checkCudaErrors(cuStreamDestroy(worker->stream[i])); - checkCudaErrors(cuCtxDestroy(worker->cuCtx)); + cudaShutdown( worker) ; } __code shutdownCUDAWorker_stub(struct Context* context) {
--- a/src/parallel_execution/CUDAtwice.cbc Wed Feb 15 16:45:10 2017 +0900 +++ b/src/parallel_execution/CUDAtwice.cbc Wed Feb 15 20:43:55 2017 +0900 @@ -1,49 +1,8 @@ #include <stdio.h> #include "../context.h" -#include <cuda.h> -#include <cuda_runtime.h> -#include "helper_cuda.h" - -static void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter) { - // Worker *worker = context->worker; - // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; - // memory allocate - CUdeviceptr devA; - CUdeviceptr devLoopCounter; -printf("CUdA Exe 1\n"); - - checkCudaErrors(cuMemAlloc(&devA, array->size)); - checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter))); - - //twiceカーネルが定義されてなければそれをロードする -printf("CUdA Exe 2\n"); - checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx")); -printf("CUdA Exe 3\n"); - checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); -printf("CUdA Exe 4\n"); - - //入力のDataGearをGPUにbuffer経由で送る - // Synchronous data transfer(host to device) - checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter))); - checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); - - // Asynchronous launch kernel - context->num_exec = 1; - void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA}; - checkCudaErrors(cuLaunchKernel(context->function, - 1, 1, 1, - 1, 1, 1, - 0, NULL , args, NULL)); - - //結果を取ってくるコマンドを入力する - //コマンドの終了待ちを行う - checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); - - // wait for stream - checkCudaErrors(cuCtxSynchronize()); -} +extern void CUDAExec(struct Context* context, Array* array, LoopCounter *loopCounter) ; __code CUDAtwice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) { int i = loopCounter->i;
--- a/src/parallel_execution/TaskManagerImpl.cbc Wed Feb 15 16:45:10 2017 +0900 +++ b/src/parallel_execution/TaskManagerImpl.cbc Wed Feb 15 20:43:55 2017 +0900 @@ -35,8 +35,11 @@ for (;i<taskManager->cpu;i++) { #ifdef USE_CUDAWorker Queue* queue = createSynchronizedQueue(context); - // taskManagerImpl->workers[i] = (Worker*)createCUDAWorker(context, i, queue,0); +#ifndef USE_CUDA_MAIN_THREAD + taskManagerImpl->workers[i] = (Worker*)createCUDAWorker(context, i, queue,0); +#else taskManagerImpl->workers[i] = (Worker*)queue; +#endif #else Queue* queue = createSynchronizedQueue(context); taskManagerImpl->workers[i] = (Worker*)createCPUWorker(context, i, queue); @@ -49,13 +52,9 @@ } __code createTask(struct TaskManager* taskManager) { - TaskManager *t = (TaskManager *)taskManager->taskManager; - TaskManagerImpl *im = (TaskManagerImpl *)t->taskManager; - taskManager->context = NEW(struct Context); initContext(taskManager->context); taskManager->context->taskManager = taskManager; - struct Queue* tasks = im->workers[0]->tasks; goto meta(context, C_setWorker); }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/parallel_execution/cuda.c Wed Feb 15 20:43:55 2017 +0900 @@ -0,0 +1,119 @@ +#include <stdio.h> +#include <sys/time.h> +#include <string.h> +#include <stdlib.h> +#include <libkern/OSAtomic.h> + +// includes, project +#include <driver_types.h> +#include <cuda_runtime.h> +#include <cuda.h> +#include "helper_cuda.h" +#include "pthread.h" + +// #include "context.h" + +struct Context { + int next; + struct Worker* worker; + struct TaskManager* taskManager; + int codeNum; + void (**code) (struct Context*); + void* heapStart; + void* heap; + long heapLimit; + int dataNum; + int idgCount; //number of waiting dataGear + int odg; + int maxOdg; + int workerId; + int num_exec; + CUmodule module; + CUfunction function; + union Data **data; +}; + + struct CUDAWorker { + CUdevice device; + CUcontext cuCtx; + pthread_t thread; + struct Context* context; + int id; + struct Queue* tasks; + int runFlag; + int next; + int num_stream; + CUstream *stream; + } CUDAWorker; + + struct LoopCounter { + int i; + } LoopCounter; + + struct Array { + int size; + int index; + int prefix; + int* array; + } Array; + + + +void cudaInit(struct CUDAWorker *cudaWorker,int phase) { + // initialize and load kernel + cudaWorker->num_stream = 1; // number of stream +// cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); + if (phase==0) + checkCudaErrors(cuInit(0)); + if (phase==0) + checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); + if (phase==0) + checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); +// if (cudaWorker->num_stream) { +// for (int i=0;i<cudaWorker->num_stream;i++) +// checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); +// } +} + + +void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) { + // Worker *worker = context->worker; + // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; + // memory allocate + CUdeviceptr devA; + CUdeviceptr devLoopCounter; + + checkCudaErrors(cuMemAlloc(&devA, array->size)); + checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter))); + + //twiceカーネルが定義されてなければそれをロードする + checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx")); + checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); + + //入力のDataGearをGPUにbuffer経由で送る + // Synchronous data transfer(host to device) + checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter))); + checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); + + // Asynchronous launch kernel + context->num_exec = 1; + void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA}; + checkCudaErrors(cuLaunchKernel(context->function, + 1, 1, 1, + 1, 1, 1, + 0, NULL , args, NULL)); + + //結果を取ってくるコマンドを入力する + //コマンドの終了待ちを行う + checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); + + // wait for stream + checkCudaErrors(cuCtxSynchronize()); +} + +void cudaShutdown( struct CUDAWorker *worker) { +// for (int i=0;i<worker->num_stream;i++) +// checkCudaErrors(cuStreamDestroy(worker->stream[i])); + checkCudaErrors(cuCtxDestroy(worker->cuCtx)); +} +
--- a/src/parallel_execution/main.cbc Wed Feb 15 16:45:10 2017 +0900 +++ b/src/parallel_execution/main.cbc Wed Feb 15 20:43:55 2017 +0900 @@ -34,23 +34,27 @@ } #ifdef USE_CUDAWorker +#ifdef USE_CUDA_MAIN_THREAD extern volatile int cuda_initialized; #endif +#endif __code initDataGears(struct LoopCounter* loopCounter, struct TaskManager* taskManager) { // loopCounter->tree = createRedBlackTree(context); loopCounter->i = 0; taskManager->taskManager = (union Data*)createTaskManagerImpl(context, cpu_num, gpu_num, 0); #ifdef USE_CUDAWorker +#ifdef USE_CUDA_MAIN_THREAD while(! cuda_initialized) {}; #endif +#endif goto meta(context, C_createTask1); } __code initDataGears_stub(struct Context* context) { struct TaskManager* taskManager = Gearef(context, TaskManager); taskManager->taskManager = 0; -#ifndef USE_CUDAWorker +#if (! defined(USE_CUDAWorker) || ! defined(USE_CUDA_MAIN_THREAD)) struct LoopCounter* loopCounter = Gearef(context, LoopCounter); goto initDataGears(context, loopCounter, taskManager); #else @@ -114,7 +118,7 @@ loopCounter->i = 0; taskManager->next = C_code1; -#ifdef USE_CUDAWorker +#if ( defined(USE_CUDAWorker) && defined(USE_CUDA_MAIN_THREAD)) sleep(5); #endif goto meta(context, taskManager->taskManager->TaskManager.shutdown);