Mercurial > hg > Gears > GearsAgda
changeset 305:ec0a5b4fba05
CUDAWorker
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 14 Feb 2017 12:15:58 +0900 |
parents | 9755206813cb |
children | ae4f6aa427f5 |
files | src/parallel_execution/CMakeLists.txt src/parallel_execution/CPUWorker.cbc src/parallel_execution/CUDAWorker.cbc src/parallel_execution/context.h src/parallel_execution/generate_stub.pl src/parallel_execution/helper_cuda.h |
diffstat | 6 files changed, 81 insertions(+), 270 deletions(-) [+] |
line wrap: on
line diff
--- a/src/parallel_execution/CMakeLists.txt Tue Feb 14 11:36:41 2017 +0900 +++ b/src/parallel_execution/CMakeLists.txt Tue Feb 14 12:15:58 2017 +0900 @@ -13,6 +13,7 @@ set(CUDA_LINK_FLAGS "-framework CUDA -lc++ -Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") find_package(CUDA REQUIRED) add_definitions("-Wall -g -DUSE_CUDAWorker=1") + SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CUDA_LINK_FLAGS}" ) else() add_definitions("-Wall -g") endif()
--- a/src/parallel_execution/CPUWorker.cbc Tue Feb 14 11:36:41 2017 +0900 +++ b/src/parallel_execution/CPUWorker.cbc Tue Feb 14 12:15:58 2017 +0900 @@ -89,17 +89,5 @@ } -#ifdef USE_CUDA -__code twiceGpu() { - cuMemcpyHtoDAsync(context,context,context,context->stream); - cuLaunchkanel(); - cuMemcpyDtoHAsync(); -} - -__code twiceGpu_stub() { -} - -#endif - __code shutdownWorker(struct CPUWorker* worker) { }
--- a/src/parallel_execution/CUDAWorker.cbc Tue Feb 14 11:36:41 2017 +0900 +++ b/src/parallel_execution/CUDAWorker.cbc Tue Feb 14 12:15:58 2017 +0900 @@ -3,8 +3,11 @@ #include <string.h> #include <stdlib.h> #include <libkern/OSAtomic.h> + +// includes, project +#include <driver_types.h> +#include <cuda_runtime.h> #include <cuda.h> -#include <cuda_runtime.h> #include "helper_cuda.h" #include "../context.h" @@ -13,10 +16,10 @@ Worker* createCUDAWorker(struct Context* context, int id, Queue* queue) { struct Worker* worker = ALLOC(context, Worker); - struct CUDAWorker* CUDAWorker = ALLOC(context, CUDAWorker); - worker->worker = (union Data*)CUDAWorker; + struct CUDAWorker* cudaWorker = new CUDAWorker(); + worker->worker = (union Data*)cudaWorker; worker->tasks = queue; - cpuWorker->id = id; + cudaWorker->id = id; worker->taskReceive = C_taskReceiveCUDAWorker; worker->shutdown = C_shutdownCUDAWorker; pthread_create(&worker->worker->CUDAWorker.thread, NULL, (void*)&start_CUDAworker, worker); @@ -24,75 +27,98 @@ } static void start_CUDAworker(Worker* worker) { - CUDAWorker* CUDAWorker = (CUDAWorker*)worker->worker; - CUDAWorker->context = NEW(struct Context); - initContext(CUDAWorker->context); - Gearef(CUDAWorker->context, Worker)->worker = (union Data*)worker; - int num_stream = 1; // number of stream - int num_exec = 16; // number of executed kernel + CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; + cudaWorker->context = NEW(struct Context); + initContext(cudaWorker->context); + Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker; + cudaWorker->num_stream = 1; // number of stream // initialize and load kernel - CUdevice device; - CUcontext context; - CUmodule module; - CUfunction function; - CUstream stream[num_stream]; - + cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); checkCudaErrors(cuInit(0)); - checkCudaErrors(cuDeviceGet(&device, 0)); - checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device)); - checkCudaErrors(cuModuleLoad(&module, "multiply.ptx")); - checkCudaErrors(cuModuleGetFunction(&function, module, "multiply")); - if (num_stream) { - for (int i=0;i<num_stream;i++) - checkCudaErrors(cuStreamCreate(&stream[i],0)); + checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); + checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); + if (cudaWorker->num_stream) { + for (int i=0;i<cudaWorker->num_stream;i++) + checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); } - goto meta(CUDAWorker->context, C_taskReceiveCUDAWorker); + goto meta(cudaWorker->context, C_taskReceiveCUDAWorker); } -__code taskReceiveCUDAWorker(struct Context* context, Worker* worker, Queue* queue) { +__code taskReceiveCUDAWorker(struct Worker* worker,struct Queue* queue) { queue->queue = (union Data*)worker->tasks; - queue->next = C_getTask; + queue->next = C_getTaskCUDA; goto meta(context, worker->tasks->take); } __code taskReceiveCUDAWorker_stub(struct Context* context) { - CUDAWorker* CUDAWorker = (CUDAWorker *)GearImpl(context, CUDAWorker, CUDAworker); - pthread_cond_wait(&CUDAWorker->cond, &CUDAWorker->mutex); goto taskReceiveCUDAWorker(context, &Gearef(context, Worker)->worker->Worker, Gearef(context, Queue)); } -__code getCUDATask(struct Context* context, Worker* worker, struct Context* task) { +__code getTaskCUDA(struct Worker* worker, struct Context* task) { if (!task) return; // end thread task->worker = worker; - context->next = C_taskReceiveCUDAWorker; // set CG after task exec - goto meta(task, task->next); + enum Code taskCg = task->next; + task->next = C_odgCommitCUDA; // set CG after task exec + goto meta(task, taskCg); } -__code getCUDATask_stub(struct Context* context) { +__code getTaskCUDA_stub(struct Context* context) { Worker* worker = &Gearef(context,Worker)->worker->Worker; struct Context* task = &Gearef(context, Queue)->data->Context; - goto getCUDATask(context, worker, task); + goto getTaskCUDA(context, worker, task); +} + +__code odgCommitCUDA(struct LoopCounter* loopCounter, struct Queue* queue, struct Context* task) { + int i = loopCounter->i ; + if(task->odg + i < task->maxOdg) { + queue->queue = (union Data*)GET_WAIT_LIST(task->data[task->odg+i]); + queue->next = C_odgCommitCUDA1; + goto meta(context, queue->queue->Queue.take); + } + loopCounter->i = 0; + goto meta(context, C_taskReceiveCUDAWorker); } -#ifdef USE_CUDA -__code twiceCUDA(struct Context* context) { - cuMemcpyHtoDAsync(context,context,context,context->stream); - cuLaunchkanel(); - cuMemcpyDtoHAsync(); +__code odgCommitCUDA_stub(struct Context* context) { + struct Context* workerContext = context->worker->worker->CUDAWorker.context; + goto odgCommitCUDA(workerContext, + Gearef(workerContext, LoopCounter), + Gearef(workerContext, Queue), + context); } -#endif + +__code odgCommitCUDA1(struct TaskManager* taskManager, struct Context* task) { + if(__sync_fetch_and_sub(&task->idgCount, 1)) { + if(task->idgCount == 0) { + taskManager->taskManager = (union Data*)task->taskManager; + taskManager->context = task; + taskManager->next = C_odgCommitCUDA; + goto meta(context, task->taskManager->spawn); + } + } else { + goto meta(context, C_odgCommitCUDA1); + } +} -__code shutdownCUDAWorker(struct Context* context, CPUWorker* worker) { - for (int i=0;i<num_stream;i++) - checkCudaErrors(cuStreamDestroy(stream[i])); - checkCudaErrors(cuModuleUnload(module)); - checkCudaErrors(cuCtxDestroy(context)); +__code odgCommitCUDA1_stub(struct Context* context) { + struct Context* task = &Gearef(context, Queue)->data->Context; + goto odgCommitCUDA1(context, + Gearef(context, TaskManager), + task); + +} + + +__code shutdownCUDAWorker(struct Context* context, CUDAWorker* worker) { + for (int i=0;i<worker->num_stream;i++) + checkCudaErrors(cuStreamDestroy(worker->stream[i])); + checkCudaErrors(cuCtxDestroy(worker->cuCtx)); } __code shutdownCUDAWorker_stub(struct Context* context) { - CPUWorker* worker = (CPUWorker *)GearImpl(context, Worker, worker); + CUDAWorker* worker = (CUDAWorker *)GearImpl(context, Worker, worker); goto shutdownCUDAWorker(context,worker); }
--- a/src/parallel_execution/context.h Tue Feb 14 11:36:41 2017 +0900 +++ b/src/parallel_execution/context.h Tue Feb 14 12:15:58 2017 +0900 @@ -145,13 +145,12 @@ enum Code next; CUdevice device; CUcontext cuCtx; - CUfunction code; - CUdeviceptr* deviceptr; - CUstream stream; - } CudaWorker; + int num_stream; + CUstream *stream; + } CUDAWorker; #else struct CUDAWorker { - } CudaWorker; + } CUDAWorker; #endif struct Main { enum Code code;
--- a/src/parallel_execution/generate_stub.pl Tue Feb 14 11:36:41 2017 +0900 +++ b/src/parallel_execution/generate_stub.pl Tue Feb 14 12:15:58 2017 +0900 @@ -195,6 +195,9 @@ $outputVar{$codeGearName} = ""; $outputArgs{$codeGearName} = {}; my $newArgs = "struct Context *context,"; + if ($args=~/^struct Context\s*\*\s*context/) { + $newArgs = ""; + } while($args) { if ($args =~ s/(^\s*,\s*)//) { $newArgs .= $1;
--- a/src/parallel_execution/helper_cuda.h Tue Feb 14 11:36:41 2017 +0900 +++ b/src/parallel_execution/helper_cuda.h Tue Feb 14 12:15:58 2017 +0900 @@ -32,7 +32,7 @@ // on which CUDA functions are used. // CUDA Runtime error messages -#ifdef __DRIVER_TYPES_H__ +#ifndef __DRIVER_TYPES_H__ static const char *_cudaGetErrorEnum(cudaError_t error) { switch (error) @@ -979,7 +979,7 @@ #endif #endif -#ifndef __DRIVER_TYPES_H__ +#ifdef __DRIVER_TYPES_H__ static inline void check(CUresult result, char const *const func, const char *const file, int const line) { if (result) @@ -1081,212 +1081,6 @@ } // end of GPU Architecture definitions -#ifdef __CUDA_RUNTIME_H__ -// General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) -{ - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) - { - fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0) - { - devID = 0; - } - - if (devID > device_count-1) - { - fprintf(stderr, "\n"); - fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count); - fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID); - fprintf(stderr, "\n"); - return -devID; - } - - struct cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - - if (deviceProp.computeMode == cudaComputeModeProhibited) - { - fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); - return -1; - } - - if (deviceProp.major < 1) - { - fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); - - return devID; -} - -// This function returns the best GPU (with maximum GFLOPS) -inline int gpuGetMaxGflopsDeviceId() -{ - int current_device = 0, sm_per_multiproc = 0; - int max_perf_device = 0; - int device_count = 0, best_SM_arch = 0; - int devices_prohibited = 0; - - unsigned long long max_compute_perf = 0; - struct cudaDeviceProp deviceProp; - cudaGetDeviceCount(&device_count); - - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) - { - fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the best major SM Architecture GPU device - while (current_device < device_count) - { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, then we can add it to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) - { - if (deviceProp.major > 0 && deviceProp.major < 9999) - { - best_SM_arch = MAX(best_SM_arch, deviceProp.major); - } - } - else - { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) - { - fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n"); - exit(EXIT_FAILURE); - } - - // Find the best CUDA capable GPU device - current_device = 0; - - while (current_device < device_count) - { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, then we can add it to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) - { - if (deviceProp.major == 9999 && deviceProp.minor == 9999) - { - sm_per_multiproc = 1; - } - else - { - sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); - } - - unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; - - if (compute_perf > max_compute_perf) - { - // If we find GPU with SM major > 2, search only these - if (best_SM_arch > 2) - { - // If our device==dest_SM_arch, choose this, or else pass - if (deviceProp.major == best_SM_arch) - { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } - else - { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } - } - - ++current_device; - } - - return max_perf_device; -} - - -// Initialization code to find the best CUDA Device -inline int findCudaDevice(int argc, const char **argv) -{ - struct cudaDeviceProp deviceProp; - int devID = 0; - - // If the command-line has a device number specified, use it - if (checkCmdLineFlag(argc, argv, "device")) - { - devID = getCmdLineArgumentInt(argc, argv, "device="); - - if (devID < 0) - { - printf("Invalid command line parameter\n "); - exit(EXIT_FAILURE); - } - else - { - devID = gpuDeviceInit(devID); - - if (devID < 0) - { - printf("exiting...\n"); - exit(EXIT_FAILURE); - } - } - } - else - { - // Otherwise pick the device with highest Gflops/s - devID = gpuGetMaxGflopsDeviceId(); - checkCudaErrors(cudaSetDevice(devID)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); - } - - return devID; -} - -// General check for CUDA GPU SM Capabilities -inline bool checkCudaCapabilities(int major_version, int minor_version) -{ - struct cudaDeviceProp deviceProp; - deviceProp.major = 0; - deviceProp.minor = 0; - int dev; - - checkCudaErrors(cudaGetDevice(&dev)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - - if ((deviceProp.major > major_version) || - (deviceProp.major == major_version && deviceProp.minor >= minor_version)) - { - printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor); - return true; - } - else - { - printf(" No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version); - return false; - } -} -#endif // end of CUDA Helper Functions