Mercurial > hg > Members > Moririn
comparison src/parallel_execution/cuda.c @ 319:a15511b1a6e0
separate cuda.c, and USE_CUDA_MAIN_THREAD flag
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 15 Feb 2017 20:43:55 +0900 |
parents | |
children | 408b4aab7610 |
comparison
equal
deleted
inserted
replaced
318:054c47e6ca20 | 319:a15511b1a6e0 |
---|---|
1 #include <stdio.h> | |
2 #include <sys/time.h> | |
3 #include <string.h> | |
4 #include <stdlib.h> | |
5 #include <libkern/OSAtomic.h> | |
6 | |
7 // includes, project | |
8 #include <driver_types.h> | |
9 #include <cuda_runtime.h> | |
10 #include <cuda.h> | |
11 #include "helper_cuda.h" | |
12 #include "pthread.h" | |
13 | |
14 // #include "context.h" | |
15 | |
16 struct Context { | |
17 int next; | |
18 struct Worker* worker; | |
19 struct TaskManager* taskManager; | |
20 int codeNum; | |
21 void (**code) (struct Context*); | |
22 void* heapStart; | |
23 void* heap; | |
24 long heapLimit; | |
25 int dataNum; | |
26 int idgCount; //number of waiting dataGear | |
27 int odg; | |
28 int maxOdg; | |
29 int workerId; | |
30 int num_exec; | |
31 CUmodule module; | |
32 CUfunction function; | |
33 union Data **data; | |
34 }; | |
35 | |
36 struct CUDAWorker { | |
37 CUdevice device; | |
38 CUcontext cuCtx; | |
39 pthread_t thread; | |
40 struct Context* context; | |
41 int id; | |
42 struct Queue* tasks; | |
43 int runFlag; | |
44 int next; | |
45 int num_stream; | |
46 CUstream *stream; | |
47 } CUDAWorker; | |
48 | |
/*
 * Loop counter copied to the device by CUDAExec.
 * The trailing identifier also declares a file-scope object named LoopCounter.
 */
struct LoopCounter {
    int i;
} LoopCounter;
52 | |
/*
 * Integer array descriptor handed to CUDAExec.
 * The trailing identifier also declares a file-scope object named Array.
 */
struct Array {
    int size;      /* passed to cuMemAlloc/cuMemcpy as the transfer size in CUDAExec */
    int index;     /* passed by address as a kernel argument */
    int prefix;    /* passed by address as a kernel argument */
    int* array;    /* host buffer copied to and from the device */
} Array;
59 | |
60 | |
61 | |
62 void cudaInit(struct CUDAWorker *cudaWorker,int phase) { | |
63 // initialize and load kernel | |
64 cudaWorker->num_stream = 1; // number of stream | |
65 // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); | |
66 if (phase==0) | |
67 checkCudaErrors(cuInit(0)); | |
68 if (phase==0) | |
69 checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); | |
70 if (phase==0) | |
71 checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); | |
72 // if (cudaWorker->num_stream) { | |
73 // for (int i=0;i<cudaWorker->num_stream;i++) | |
74 // checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); | |
75 // } | |
76 } | |
77 | |
78 | |
79 void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) { | |
80 // Worker *worker = context->worker; | |
81 // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; | |
82 // memory allocate | |
83 CUdeviceptr devA; | |
84 CUdeviceptr devLoopCounter; | |
85 | |
86 checkCudaErrors(cuMemAlloc(&devA, array->size)); | |
87 checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter))); | |
88 | |
89 //twiceカーネルが定義されてなければそれをロードする | |
90 checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx")); | |
91 checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); | |
92 | |
93 //入力のDataGearをGPUにbuffer経由で送る | |
94 // Synchronous data transfer(host to device) | |
95 checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter))); | |
96 checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); | |
97 | |
98 // Asynchronous launch kernel | |
99 context->num_exec = 1; | |
100 void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA}; | |
101 checkCudaErrors(cuLaunchKernel(context->function, | |
102 1, 1, 1, | |
103 1, 1, 1, | |
104 0, NULL , args, NULL)); | |
105 | |
106 //結果を取ってくるコマンドを入力する | |
107 //コマンドの終了待ちを行う | |
108 checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); | |
109 | |
110 // wait for stream | |
111 checkCudaErrors(cuCtxSynchronize()); | |
112 } | |
113 | |
114 void cudaShutdown( struct CUDAWorker *worker) { | |
115 // for (int i=0;i<worker->num_stream;i++) | |
116 // checkCudaErrors(cuStreamDestroy(worker->stream[i])); | |
117 checkCudaErrors(cuCtxDestroy(worker->cuCtx)); | |
118 } | |
119 |