Mercurial > hg > Gears > GearsAgda
view src/parallel_execution/cuda.c @ 319:a15511b1a6e0
separate cuda.c, and USE_CUDA_MAIN_THREAD flag
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 15 Feb 2017 20:43:55 +0900 |
parents | |
children | 408b4aab7610 |
line wrap: on
line source
#include <stdio.h> #include <sys/time.h> #include <string.h> #include <stdlib.h> #include <libkern/OSAtomic.h> // includes, project #include <driver_types.h> #include <cuda_runtime.h> #include <cuda.h> #include "helper_cuda.h" #include "pthread.h" // #include "context.h" struct Context { int next; struct Worker* worker; struct TaskManager* taskManager; int codeNum; void (**code) (struct Context*); void* heapStart; void* heap; long heapLimit; int dataNum; int idgCount; //number of waiting dataGear int odg; int maxOdg; int workerId; int num_exec; CUmodule module; CUfunction function; union Data **data; }; struct CUDAWorker { CUdevice device; CUcontext cuCtx; pthread_t thread; struct Context* context; int id; struct Queue* tasks; int runFlag; int next; int num_stream; CUstream *stream; } CUDAWorker; struct LoopCounter { int i; } LoopCounter; struct Array { int size; int index; int prefix; int* array; } Array; void cudaInit(struct CUDAWorker *cudaWorker,int phase) { // initialize and load kernel cudaWorker->num_stream = 1; // number of stream // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); if (phase==0) checkCudaErrors(cuInit(0)); if (phase==0) checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); if (phase==0) checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); // if (cudaWorker->num_stream) { // for (int i=0;i<cudaWorker->num_stream;i++) // checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); // } } void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) { // Worker *worker = context->worker; // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; // memory allocate CUdeviceptr devA; CUdeviceptr devLoopCounter; checkCudaErrors(cuMemAlloc(&devA, array->size)); checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter))); //twiceカーネルが定義されてなければそれをロードする checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx")); checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); //入力のDataGearをGPUにbuffer経由で送る // Synchronous data transfer(host to device) checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter))); checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); // Asynchronous launch kernel context->num_exec = 1; void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA}; checkCudaErrors(cuLaunchKernel(context->function, 1, 1, 1, 1, 1, 1, 0, NULL , args, NULL)); //結果を取ってくるコマンドを入力する //コマンドの終了待ちを行う checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); // wait for stream checkCudaErrors(cuCtxSynchronize()); } void cudaShutdown( struct CUDAWorker *worker) { // for (int i=0;i<worker->num_stream;i++) // checkCudaErrors(cuStreamDestroy(worker->stream[i])); checkCudaErrors(cuCtxDestroy(worker->cuCtx)); }