Mercurial > hg > Members > Moririn
comparison src/parallel_execution/cuda.c @ 410:85b0ddbf458e
Fix CudaWorker
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 14 Sep 2017 02:35:20 +0900 |
parents | c5cd9888bf2a |
children | 0eba9a04633f |
comparison
Legend (row states in the side-by-side view below): equal | deleted | inserted | replaced
409:4d1e3697a6b8 | 410:85b0ddbf458e |
---|---|
9 #include <cuda_runtime.h> | 9 #include <cuda_runtime.h> |
10 #include <cuda.h> | 10 #include <cuda.h> |
11 #include "helper_cuda.h" | 11 #include "helper_cuda.h" |
12 #include "pthread.h" | 12 #include "pthread.h" |
13 | 13 |
14 // #include "context.h" | 14 #include "context.h" |
15 | 15 |
16 /* | |
16 struct Context { | 17 struct Context { |
17 int next; | 18 int next; |
18 struct Worker* worker; | 19 struct Worker* worker; |
19 struct TaskManager* taskManager; | 20 struct TaskManager* taskManager; |
20 int codeNum; | 21 int codeNum; |
22 void* heapStart; | 23 void* heapStart; |
23 void* heap; | 24 void* heap; |
24 long heapLimit; | 25 long heapLimit; |
25 int dataNum; | 26 int dataNum; |
26 int idgCount; //number of waiting dataGear | 27 int idgCount; //number of waiting dataGear |
28 int idg; | |
29 int maxIdg; | |
27 int odg; | 30 int odg; |
28 int maxOdg; | 31 int maxOdg; |
29 int workerId; | 32 int workerId; |
33 struct Context* task; | |
34 struct Queue* tasks; | |
30 int num_exec; | 35 int num_exec; |
31 CUmodule module; | 36 CUmodule module; |
32 CUfunction function; | 37 CUfunction function; |
33 union Data **data; | 38 union Data **data; |
39 | |
40 // multi dimension parameter | |
41 int iterate; | |
42 struct Iterator* iterator; | |
34 }; | 43 }; |
35 | 44 |
36 struct CUDAWorker { | 45 struct CUDAWorker { |
37 CUdevice device; | 46 CUdevice device; |
38 CUcontext cuCtx; | 47 CUcontext cuCtx; |
39 pthread_t thread; | 48 pthread_t thread; |
40 struct Context* context; | 49 struct Context* context; |
41 int id; | 50 int id; |
42 struct Queue* tasks; | 51 struct Queue* tasks; |
43 int runFlag; | 52 int runFlag; |
44 int next; | 53 int next; |
45 int num_stream; | 54 int num_stream; |
46 CUstream *stream; | 55 CUstream *stream; |
47 } CUDAWorker; | 56 } CUDAWorker; |
48 | 57 |
49 struct LoopCounter { | 58 struct LoopCounter { |
50 int i; | 59 int i; |
51 } LoopCounter; | 60 } LoopCounter; |
52 | 61 |
53 struct Array { | 62 struct Array { |
54 int size; | 63 int size; |
55 int index; | 64 int index; |
56 int prefix; | 65 int prefix; |
57 int* array; | 66 int* array; |
58 } Array; | 67 } Array; |
59 | 68 */ |
60 | |
61 | 69 |
62 void cudaInit(struct CUDAWorker *cudaWorker,int phase) { | 70 void cudaInit(struct CUDAWorker *cudaWorker,int phase) { |
63 // initialize and load kernel | 71 // initialize and load kernel |
64 cudaWorker->num_stream = 1; // number of stream | 72 cudaWorker->num_stream = 1; // number of stream |
65 // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); | 73 // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); |
66 if (phase==0) | 74 if (phase==0) |
67 checkCudaErrors(cuInit(0)); | 75 checkCudaErrors(cuInit(0)); |
68 if (phase==0) | 76 if (phase==0) |
69 checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); | 77 checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); |
70 if (phase==0) | 78 if (phase==0) |
71 checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); | 79 checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); |
72 // if (cudaWorker->num_stream) { | 80 // if (cudaWorker->num_stream) { |
73 // for (int i=0;i<cudaWorker->num_stream;i++) | 81 // for (int i=0;i<cudaWorker->num_stream;i++) |
74 // checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); | 82 // checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); |
75 // } | 83 // } |
84 printf("cuda Init: Done\n"); | |
76 } | 85 } |
77 | 86 |
78 | 87 |
79 void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) { | 88 void CUDAExec(struct Context* context, struct Array* array) { |
89 printf("cuda exec start\n"); | |
80 // Worker *worker = context->worker; | 90 // Worker *worker = context->worker; |
81 // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; | 91 // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; |
82 // memory allocate | 92 // memory allocate |
83 CUdeviceptr devA; | 93 CUdeviceptr devA; |
84 CUdeviceptr devLoopCounter; | |
85 | 94 |
86 checkCudaErrors(cuMemAlloc(&devA, array->size)); | 95 checkCudaErrors(cuMemAlloc(&devA, sizeof(int)*array->size)); |
87 checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter))); | |
88 | 96 |
89 //twiceカーネルが定義されてなければそれをロードする | 97 //twiceカーネルが定義されてなければそれをロードする |
90 checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx")); | 98 checkCudaErrors(cuModuleLoad(&context->module, "c/examples/twice/CUDAtwice.ptx")); |
91 checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); | 99 checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); |
92 | 100 |
93 //入力のDataGearをGPUにbuffer経由で送る | 101 //入力のDataGearをGPUにbuffer経由で送る |
94 // Synchronous data transfer(host to device) | 102 // Synchronous data transfer(host to device) |
95 checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter))); | |
96 checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); | 103 checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); |
97 | 104 |
98 // Asynchronous launch kernel | 105 // Asynchronous launch kernel |
99 context->num_exec = 1; | 106 context->num_exec = 1; |
100 void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA}; | 107 void* args[] = {&devA}; |
101 checkCudaErrors(cuLaunchKernel(context->function, | 108 if (context->iterate) { |
102 1, 1, 1, | 109 struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator; |
103 1, 1, 1, | 110 checkCudaErrors(cuLaunchKernel(context->function, |
104 0, NULL, args, NULL)); | 111 iterator->x, iterator->y, iterator->z, |
112 1, 1, 1, | |
113 0, NULL, args, NULL)); | |
105 | 114 |
115 } else { | |
116 checkCudaErrors(cuLaunchKernel(context->function, | |
117 1, 1, 1, | |
118 1, 1, 1, | |
119 0, NULL, args, NULL)); | |
120 } | |
106 //結果を取ってくるコマンドを入力する | 121 //結果を取ってくるコマンドを入力する |
107 //コマンドの終了待ちを行う | 122 //コマンドの終了待ちを行う |
108 checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); | 123 checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); |
109 | 124 |
110 // wait for stream | 125 // wait for stream |
111 checkCudaErrors(cuCtxSynchronize()); | 126 checkCudaErrors(cuCtxSynchronize()); |
112 } | 127 } |
113 | 128 |
114 void cudaShutdown( struct CUDAWorker *worker) { | 129 void cudaShutdown( struct CUDAWorker *worker) { |
115 // for (int i=0;i<worker->num_stream;i++) | 130 // for (int i=0;i<worker->num_stream;i++) |
116 // checkCudaErrors(cuStreamDestroy(worker->stream[i])); | 131 // checkCudaErrors(cuStreamDestroy(worker->stream[i])); |
117 checkCudaErrors(cuCtxDestroy(worker->cuCtx)); | 132 checkCudaErrors(cuCtxDestroy(worker->cuCtx)); |
118 } | 133 } |