diff src/parallel_execution/cuda.c @ 319:a15511b1a6e0
separate cuda.c, and USE_CUDA_MAIN_THREAD flag
| author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
|---|---|
| date | Wed, 15 Feb 2017 20:43:55 +0900 |
| parents | |
| children | 408b4aab7610 |
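The commit message mentions a USE_CUDA_MAIN_THREAD flag, but the file added below never tests it. The sketch that follows is only an assumed usage pattern, not part of this changeset: the flag would choose between running the CUDA worker loop on the calling (main) thread and spawning it on a dedicated pthread. `cudaWorkerRun` is a hypothetical worker loop (it would call `cudaInit`, `CUDAExec` per task, and finally `cudaShutdown`), and `startCUDAWorker` is likewise illustrative.

```c
// Assumed (not from this changeset) usage of USE_CUDA_MAIN_THREAD.
// Relies on struct CUDAWorker from the cuda.c added below and on a
// hypothetical cudaWorkerRun() worker loop.
#include <pthread.h>

void* cudaWorkerRun(void* arg);   // hypothetical worker loop

void startCUDAWorker(struct CUDAWorker* cudaWorker) {
#ifdef USE_CUDA_MAIN_THREAD
    // Run CUDA work directly on the calling (main) thread.
    cudaWorkerRun(cudaWorker);
#else
    // Run CUDA work on a dedicated worker thread.
    pthread_create(&cudaWorker->thread, NULL, cudaWorkerRun, cudaWorker);
#endif
}
```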
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parallel_execution/cuda.c	Wed Feb 15 20:43:55 2017 +0900
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include <string.h>
+#include <stdlib.h>
+#include <libkern/OSAtomic.h>
+
+// includes, project
+#include <driver_types.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include "helper_cuda.h"
+#include "pthread.h"
+
+// #include "context.h"
+
+struct Context {
+    int next;
+    struct Worker* worker;
+    struct TaskManager* taskManager;
+    int codeNum;
+    void (**code) (struct Context*);
+    void* heapStart;
+    void* heap;
+    long heapLimit;
+    int dataNum;
+    int idgCount; // number of waiting dataGear
+    int odg;
+    int maxOdg;
+    int workerId;
+    int num_exec;
+    CUmodule module;
+    CUfunction function;
+    union Data **data;
+};
+
+struct CUDAWorker {
+    CUdevice device;
+    CUcontext cuCtx;
+    pthread_t thread;
+    struct Context* context;
+    int id;
+    struct Queue* tasks;
+    int runFlag;
+    int next;
+    int num_stream;
+    CUstream *stream;
+} CUDAWorker;
+
+struct LoopCounter {
+    int i;
+} LoopCounter;
+
+struct Array {
+    int size;
+    int index;
+    int prefix;
+    int* array;
+} Array;
+
+void cudaInit(struct CUDAWorker *cudaWorker, int phase) {
+    // initialize and load kernel
+    cudaWorker->num_stream = 1; // number of streams
+    // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream);
+    if (phase == 0)
+        checkCudaErrors(cuInit(0));
+    if (phase == 0)
+        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
+    if (phase == 0)
+        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
+    // if (cudaWorker->num_stream) {
+    //     for (int i = 0; i < cudaWorker->num_stream; i++)
+    //         checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i], 0));
+    // }
+}
+
+void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) {
+    // Worker *worker = context->worker;
+    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
+    // memory allocate
+    CUdeviceptr devA;
+    CUdeviceptr devLoopCounter;
+
+    checkCudaErrors(cuMemAlloc(&devA, array->size));
+    checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
+
+    // load the twice kernel if it has not been loaded yet
+    checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx"));
+    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
+
+    // send the input DataGear to the GPU through a buffer
+    // Synchronous data transfer (host to device)
+    checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter)));
+    checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));
+
+    // Asynchronous launch kernel
+    context->num_exec = 1;
+    void* args[] = {&devLoopCounter, &array->index, &array->prefix, &devA};
+    checkCudaErrors(cuLaunchKernel(context->function,
+                                   1, 1, 1,
+                                   1, 1, 1,
+                                   0, NULL, args, NULL));
+
+    // enqueue the command that fetches the result
+    // and wait for it to finish
+    checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));
+
+    // wait for stream
+    checkCudaErrors(cuCtxSynchronize());
+}
+
+void cudaShutdown(struct CUDAWorker *worker) {
+    // for (int i = 0; i < worker->num_stream; i++)
+    //     checkCudaErrors(cuStreamDestroy(worker->stream[i]));
+    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
+}
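For reference, a minimal sketch of how the three entry points added here (cudaInit, CUDAExec, cudaShutdown) might be driven. It assumes the sketch is compiled together with cuda.c so its structs and functions are visible; the `index`/`prefix` values and the expectation that the `twice` kernel doubles each element are assumptions, since the kernel itself is not part of this changeset.

```c
// Hypothetical driver for the entry points in cuda.c above; assumes it is
// compiled together with that file so struct CUDAWorker, struct Context,
// struct Array, struct LoopCounter and the three functions are visible.
#include <stdio.h>

int main(void) {
    struct CUDAWorker cudaWorker = {0};
    cudaInit(&cudaWorker, 0);             // phase 0: cuInit, cuDeviceGet, cuCtxCreate

    int data[16];
    for (int i = 0; i < 16; i++) data[i] = i;

    // CUDAExec passes array->size directly to cuMemAlloc/cuMemcpy, i.e. it is
    // treated as a byte count, so sizeof(data) is used here.
    struct Array input = { sizeof(data), 0 /* index */, 16 /* prefix */, data };
    struct LoopCounter counter = { 0 };
    struct Context context = {0};          // module and function are filled in by CUDAExec

    CUDAExec(&context, &input, &counter);  // load twice kernel, copy in, launch, copy back

    for (int i = 0; i < 16; i++)
        printf("%d ", data[i]);            // expected: each element doubled by the twice kernel
    printf("\n");

    cudaShutdown(&cudaWorker);
    return 0;
}
```

Note that CUDAExec loads `c/CUDAtwice.ptx` and launches `twice` with a 1x1x1 grid and block, so the whole array is handled by a single GPU thread in this version.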