Mercurial > hg > Gears > GearsAgda
view src/parallel_execution/cuda.c @ 414:49159fbdd1fb
Work CUDAbitonicSort
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 15 Sep 2017 22:49:45 +0900 |
parents | 409e6b5fb775 |
children | 35b37fe8d3a7 |
line wrap: on
line source
#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <libkern/OSAtomic.h>

// includes, project
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "helper_cuda.h"
#include "pthread.h"

#include "context.h"

/* Reference copies of the project structs used below (declared in context.h):
struct Context {
    int next;
    struct Worker* worker;
    struct TaskManager* taskManager;
    int codeNum;
    void (**code) (struct Context*);
    void* heapStart;
    void* heap;
    long heapLimit;
    int dataNum;
    int idgCount; // number of waiting dataGear
    int idg;
    int maxIdg;
    int odg;
    int maxOdg;
    int workerId;
    struct Context* task;
    struct Queue* tasks;
    int num_exec;
    CUmodule module;
    CUfunction function;
    union Data **data;
    // multi dimension parameter
    int iterate;
    struct Iterator* iterator;
};

struct CUDAWorker {
    CUdevice device;
    CUcontext cuCtx;
    pthread_t thread;
    struct Context* context;
    int id;
    struct Queue* tasks;
    int runFlag;
    int next;
    int num_stream;
    CUstream *stream;
} CUDAWorker;

struct LoopCounter {
    int i;
} LoopCounter;

struct Array {
    int size;
    int index;
    int prefix;
    int* array;
} Array;
*/

/*
 * Initialize the CUDA driver API for this worker.
 *
 * phase == 0 performs the one-time setup: driver init, device 0 lookup,
 * and context creation. Any other phase is a re-entry that skips setup.
 * Stream creation is currently disabled (single implicit stream is used).
 */
void cudaInit(struct CUDAWorker *cudaWorker, int phase) {
    cudaWorker->num_stream = 1; // number of streams
    // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream);
    if (phase == 0) {
        checkCudaErrors(cuInit(0));
        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
    }
    // if (cudaWorker->num_stream) {
    //     for (int i = 0; i < cudaWorker->num_stream; i++)
    //         checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i], 0));
    // }
    printf("cuda Init: Done\n");
}

/*
 * Run one bitonicSwap step on the GPU.
 *
 * Copies inputSortArray (array data plus the block/first/prefix scalars)
 * to the device, launches the "bitonicSwap" kernel from
 * CUDAbitonicSwap.ptx, waits for completion, and copies the array back
 * in place. outputSortArray->array is set to alias the (now updated)
 * input array. The grid dimensions come from context->iterator when
 * context->iterate is set; otherwise a single 1x1x1 launch is used.
 *
 * NOTE(review): the kernel module is (re)loaded on every call; it is now
 * unloaded before returning so repeated calls no longer leak modules.
 */
void CUDAExec(struct Context* context, struct SortArray* inputSortArray, struct SortArray* outputSortArray) {
    // printf("cuda exec start\n");
    // Worker *worker = context->worker;
    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;

    // Device buffers: the array payload plus three scalar kernel parameters.
    CUdeviceptr devA;
    CUdeviceptr devB;
    CUdeviceptr devC;
    CUdeviceptr devD;
    size_t arrayBytes = sizeof(struct Integer) * GET_SIZE(inputSortArray->array);
    checkCudaErrors(cuMemAlloc(&devA, arrayBytes));
    checkCudaErrors(cuMemAlloc(&devB, sizeof(int)));
    checkCudaErrors(cuMemAlloc(&devC, sizeof(int)));
    checkCudaErrors(cuMemAlloc(&devD, sizeof(int)));

    // Load the bitonicSwap kernel from its PTX module.
    checkCudaErrors(cuModuleLoad(&context->module, "c/examples/bitonicSort/CUDAbitonicSwap.ptx"));
    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "bitonicSwap"));

    // Send the input DataGear to the GPU.
    // Synchronous data transfer (host to device).
    checkCudaErrors(cuMemcpyHtoD(devA, inputSortArray->array, arrayBytes));
    checkCudaErrors(cuMemcpyHtoD(devB, &inputSortArray->block, sizeof(int)));
    checkCudaErrors(cuMemcpyHtoD(devC, &inputSortArray->first, sizeof(int)));
    checkCudaErrors(cuMemcpyHtoD(devD, &inputSortArray->prefix, sizeof(int)));

    // Asynchronous kernel launch.
    context->num_exec = 1;
    void* args[] = {&devA, &devB, &devC, &devD};
    if (context->iterate) {
        struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
        checkCudaErrors(cuLaunchKernel(context->function,
                                       iterator->x, iterator->y, iterator->z,
                                       1, 1, 1,
                                       0, NULL, args, NULL));
    } else {
        checkCudaErrors(cuLaunchKernel(context->function,
                                       1, 1, 1,
                                       1, 1, 1,
                                       0, NULL, args, NULL));
    }

    // Wait for the kernel to finish BEFORE reading back the result
    // (previously the sync came after the copy and the code relied on
    // cuMemcpyDtoH's implicit synchronization).
    checkCudaErrors(cuCtxSynchronize());

    // Fetch the result; the sort is in place, so the output aliases the input.
    checkCudaErrors(cuMemcpyDtoH(inputSortArray->array, devA, arrayBytes));
    outputSortArray->array = inputSortArray->array;

    // Release device buffers and the per-call module (error-checked now;
    // the module was previously leaked on every call).
    checkCudaErrors(cuMemFree(devA));
    checkCudaErrors(cuMemFree(devB));
    checkCudaErrors(cuMemFree(devC));
    checkCudaErrors(cuMemFree(devD));
    checkCudaErrors(cuModuleUnload(context->module));
}

/*
 * Tear down the worker's CUDA context. Stream destruction is disabled
 * to match the disabled stream creation in cudaInit.
 */
void cudaShutdown(struct CUDAWorker *worker) {
    // for (int i = 0; i < worker->num_stream; i++)
    //     checkCudaErrors(cuStreamDestroy(worker->stream[i]));
    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
}