view src/parallel_execution/cuda.c @ 433:d920f3a3f037
Refactoring cuda.c
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
---|---
date | Tue, 17 Oct 2017 15:47:33 +0900
parents | b3359544adbb
children | 08a93fc2f0d3
#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <libkern/OSAtomic.h>

// includes, project
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "helper_cuda.h"
#include "pthread.h"

#include "context.h"

/*
struct Context {
    int next;
    struct Worker* worker;
    struct TaskManager* taskManager;
    int codeNum;
    void (**code) (struct Context*);
    void* heapStart;
    void* heap;
    long heapLimit;
    int dataNum;
    int idgCount; // number of waiting dataGears
    int idg;
    int maxIdg;
    int odg;
    int maxOdg;
    int workerId;
    struct Context* task;
    struct Queue* tasks;
    int num_exec;
    CUmodule module;
    CUfunction function;
    union Data **data;

    // multi-dimension parameter
    int iterate;
    struct Iterator* iterator;
};

struct CUDAWorker {
    CUdevice device;
    CUcontext cuCtx;
    pthread_t thread;
    struct Context* context;
    int id;
    struct Queue* tasks;
    int runFlag;
    int next;
    int num_stream;
    CUstream *stream;
} CUDAWorker;

struct LoopCounter {
    int i;
} LoopCounter;

struct Array {
    int size;
    int index;
    int prefix;
    int* array;
} Array;
*/

void cudaInit(struct CUDAWorker *cudaWorker, int phase) {
    // initialize the driver API and create a context
    cudaWorker->num_stream = 1; // number of streams
    // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream);
    if (phase == 0) {
        checkCudaErrors(cuInit(0));
        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
    }
    // if (cudaWorker->num_stream) {
    //     for (int i = 0; i < cudaWorker->num_stream; i++)
    //         checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i], 0));
    // }
    printf("cuda Init: Done\n");
}

void cudaRead(struct CudaBuffer* buffer) {
    // allocate device memory for every input/output data gear and copy the
    // host data over; the resulting CUdeviceptr*s become the kernel parameters
    buffer->kernelParams = (void **)calloc(buffer->inputLen + buffer->outputLen, sizeof(void *));
    int paramCount = 0;
    for (int i = 0; i < buffer->inputLen; i++) {
        CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr));
        // allocate device memory
        checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(buffer->inputData[i])));
        // synchronous data transfer (host to device)
        checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->inputData[i], GET_SIZE(buffer->inputData[i])));
        buffer->kernelParams[paramCount++] = deviceptr;
    }

    for (int i = 0; i < buffer->outputLen; i++) {
        CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr));
        // allocate device memory
        checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(buffer->outputData[i])));
        // synchronous data transfer (host to device)
        checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->outputData[i], GET_SIZE(buffer->outputData[i])));
        buffer->kernelParams[paramCount++] = deviceptr;
    }
}

void cudaLoadFunction(struct Context* context, char* filename, char* function) {
    // load the compiled kernel module and look up its entry function
    checkCudaErrors(cuModuleLoad(&context->module, filename));
    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, function));
}

void cudaExec2(struct Context* context, struct CudaBuffer* buffer) {
    // asynchronously launch the kernel
    context->num_exec = 1;
    if (context->iterate) {
        // iterated task: map the iterator's x range onto blocks of 1024 threads
        struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
        checkCudaErrors(cuLaunchKernel(context->function,
                                       iterator->x/1024, iterator->y, iterator->z,
                                       1024, 1, 1,
                                       0, NULL, buffer->kernelParams, NULL));
    } else {
        // non-iterated task: launch a single thread
        checkCudaErrors(cuLaunchKernel(context->function,
                                       1, 1, 1,
                                       1, 1, 1,
                                       0, NULL, buffer->kernelParams, NULL));
    }
}

void cudaWrite(struct CudaBuffer* buffer) {
    // issue the commands that fetch the results back from the device,
    // then wait for them to finish
    int paramCount = 0;
    for (int i = 0; i < buffer->inputLen; i++) {
        CUdeviceptr* deviceptr = (CUdeviceptr *)buffer->kernelParams[paramCount++];
        checkCudaErrors(cuMemcpyDtoH(buffer->inputData[i], *deviceptr, GET_SIZE(buffer->inputData[i])));
        cuMemFree(*deviceptr);
        free(deviceptr);
    }

    for (int i = 0; i < buffer->outputLen; i++) {
        CUdeviceptr* deviceptr = (CUdeviceptr *)buffer->kernelParams[paramCount++];
        checkCudaErrors(cuMemcpyDtoH(buffer->outputData[i], *deviceptr, GET_SIZE(buffer->outputData[i])));
        cuMemFree(*deviceptr);
        free(deviceptr);
    }
    free(buffer->kernelParams);

    // wait for the stream to finish
    checkCudaErrors(cuCtxSynchronize());
}

void cudaExec(struct Context* context, struct CudaBuffer* buffer, char* filename, char* function) {
    // load the kernel if it has not been defined yet
    cudaLoadFunction(context, filename, function);
    cudaRead(buffer);
    cudaExec2(context, buffer);
    cudaWrite(buffer);
}

void cudaShutdown(struct CUDAWorker *worker) {
    // for (int i = 0; i < worker->num_stream; i++)
    //     checkCudaErrors(cuStreamDestroy(worker->stream[i]));
    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
}
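For orientation, a minimal usage sketch of how these helpers fit together; it is not part of the repository file. It only calls the functions defined above and assumes a CudaBuffer that already carries inputData/outputData pointer arrays with their lengths (the real definition lives in context.h, which is not shown here); the module path and kernel name are hypothetical placeholders.

/* Usage sketch (assumption, not from the repository):
 * run one task end-to-end with the helpers defined above. */
void exampleRun(struct Context* context, struct CUDAWorker* worker, struct CudaBuffer* buffer) {
    cudaInit(worker, 0);              // cuInit, cuDeviceGet, cuCtxCreate
    context->iterate = 0;             // take the single-thread launch path in cudaExec2
    // cudaExec = cudaLoadFunction + cudaRead (HtoD copies) + cudaExec2 (launch)
    //            + cudaWrite (DtoH copies, free, synchronize)
    cudaExec(context, buffer, "examples/kernel.ptx", "exampleKernel"); // hypothetical module/kernel names
    cudaShutdown(worker);             // destroy the CUDA context
}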