Mercurial > hg > Gears > GearsAgda
changeset 314:1839586f5b41
pthread CUDA test
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 15 Feb 2017 12:34:19 +0900 |
parents | 4addbc7469ee |
children | faa746c449c6 54d203daf06b |
files | src/parallel_execution/CUDAWorker.cbc src/parallel_execution/CUDAtwice.cbc src/parallel_execution/main.cbc src/test/twice.cc |
diffstat | 4 files changed, 42 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parallel_execution/CUDAWorker.cbc Wed Feb 15 11:36:10 2017 +0900
+++ b/src/parallel_execution/CUDAWorker.cbc Wed Feb 15 12:34:19 2017 +0900
@@ -13,6 +13,7 @@
 #include "../context.h"
 static void start_CUDAworker(Worker* worker);
+static void cudaInit(struct CUDAWorker *cudaWorker) ;
 static int cuda_initialized = 0;
@@ -22,8 +23,6 @@
 worker->worker = (union Data*)cudaWorker;
 worker->tasks = queue;
 cudaWorker->id = id;
-
- worker->taskReceive = C_taskReceiveCUDAWorker;
 worker->shutdown = C_shutdownCUDAWorker;
 pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker);
 return worker;
@@ -32,16 +31,19 @@
 static void cudaInit(struct CUDAWorker *cudaWorker) {
 // initialize and load kernel
 cudaWorker->num_stream = 1; // number of stream
- cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+// cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
+printf("cudaInit 1\n");
 checkCudaErrors(cuInit(0));
 checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+printf("cudaInit 2\n");
 checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
-
- if (cudaWorker->num_stream) {
- for (int i=0;i<cudaWorker->num_stream;i++)
- checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
- }
+printf("cudaInit 3\n");
+// if (cudaWorker->num_stream) {
+// for (int i=0;i<cudaWorker->num_stream;i++)
+// checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
+// }
 cuda_initialized = 1;
+printf("cudaInit done\n");
 }
 static void start_CUDAworker(Worker* worker) {
@@ -54,10 +56,6 @@
 }
 __code taskReceiveCUDAWorker(struct Worker* worker,struct Queue* queue) {
- if (cuda_initialized==0) {
- CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
- cudaInit(cudaWorker);
- }
 queue->queue = (union Data*)worker->tasks;
 queue->next = C_getTaskCUDA;
 goto meta(context, worker->tasks->take);
@@ -70,6 +68,11 @@
 __code getTaskCUDA(struct Worker* worker, struct Context* task) {
 if (!task) return; // end thread
+ if (cuda_initialized==0) {
+ CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
+ cudaInit(cudaWorker);
+ }
+ worker->taskReceive = C_taskReceiveCUDAWorker;
 task->worker = worker;
 enum Code taskCg = task->next;
 task->next = C_odgCommitCUDA; // set CG after task exec
@@ -124,8 +127,8 @@
 __code shutdownCUDAWorker(struct Context* context, CUDAWorker* worker) {
- for (int i=0;i<worker->num_stream;i++)
- checkCudaErrors(cuStreamDestroy(worker->stream[i]));
+// for (int i=0;i<worker->num_stream;i++)
+// checkCudaErrors(cuStreamDestroy(worker->stream[i]));
 checkCudaErrors(cuCtxDestroy(worker->cuCtx));
 }
--- a/src/parallel_execution/CUDAtwice.cbc Wed Feb 15 11:36:10 2017 +0900
+++ b/src/parallel_execution/CUDAtwice.cbc Wed Feb 15 12:34:19 2017 +0900
@@ -12,6 +12,7 @@
 // memory allocate
 CUdeviceptr devA;
 CUdeviceptr devLoopCounter;
+printf("CUdA Exe 1\n");
 checkCudaErrors(cuMemAlloc(&devA, array->size));
 checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
@@ -19,6 +20,7 @@
 //twiceカーネルが定義されてなければそれをロードする
 checkCudaErrors(cuModuleLoad(&context->module, "CUDAtwice.ptx"));
 checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
+printf("CUdA Exe 2\n");
 //入力のDataGearをGPUにbuffer経由で送る
 // Synchronous data transfer(host to device)
@@ -53,6 +55,7 @@
 }
 __code CUDAtwice_stub(struct Context* context) {
+printf("CUdAtwice stub\n");
 struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter;
 struct Array* array = &context->data[context->dataNum+1]->Array;
 CUDAExec(context,array,loopCounter);
--- a/src/parallel_execution/main.cbc Wed Feb 15 11:36:10 2017 +0900
+++ b/src/parallel_execution/main.cbc Wed Feb 15 12:34:19 2017 +0900
@@ -97,8 +97,12 @@
 loopCounter2->i = 0;
 task->idgCount = 0;
 if (gpu_num) {
+#ifdef USE_CUDAWorker
 task->next = C_CUDAtwice;
 task->workerId = CPU_CUDA;
+#else
+ task->next = C_twice;
+#endif
 } else {
 task->next = C_twice;
 }
--- a/src/test/twice.cc Wed Feb 15 11:36:10 2017 +0900
+++ b/src/test/twice.cc Wed Feb 15 12:34:19 2017 +0900
@@ -2,6 +2,9 @@
 #include <sys/time.h>
 #include <string.h>
 #include <stdlib.h>
+extern "C" {
+#include <pthread.h>
+}
 #include <cuda.h>
@@ -35,9 +38,12 @@
 }
 }
+int num_stream = 1; // number of stream
+int num_exec = 16; // number of executed kernel
+
+static void *start_cuda(void *) ;
+
 int main(int args, char* argv[]) {
- int num_stream = 1; // number of stream
- int num_exec = 16; // number of executed kernel
 for (int i=1;argv[i];i++) {
 if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
@@ -47,7 +53,17 @@
 num_exec = atoi(argv[++i]);
 }
 }
+#if 0
+ start_cuda(NULL);
+#else
+ pthread_t thread;
+ pthread_create(&thread, NULL, start_cuda, NULL);
+ pthread_join(thread,NULL);
+#endif
+ return 0;
+}
+static void *start_cuda(void *args) {
 // initialize and load kernel
 CUdevice device;
 CUcontext context;
@@ -161,7 +177,6 @@
 for (int i=0;i<num_exec;i++) delete[] result[i];
 delete[] result;
- return 0;
 }