comparison src/parallel_execution/cuda.c @ 410:85b0ddbf458e

Fix CudaWorker
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Thu, 14 Sep 2017 02:35:20 +0900
parents c5cd9888bf2a
children 0eba9a04633f
comparison
equal deleted inserted replaced
409:4d1e3697a6b8 410:85b0ddbf458e
9 #include <cuda_runtime.h> 9 #include <cuda_runtime.h>
10 #include <cuda.h> 10 #include <cuda.h>
11 #include "helper_cuda.h" 11 #include "helper_cuda.h"
12 #include "pthread.h" 12 #include "pthread.h"
13 13
14 // #include "context.h" 14 #include "context.h"
15 15
16 /*
16 struct Context { 17 struct Context {
17 int next; 18 int next;
18 struct Worker* worker; 19 struct Worker* worker;
19 struct TaskManager* taskManager; 20 struct TaskManager* taskManager;
20 int codeNum; 21 int codeNum;
22 void* heapStart; 23 void* heapStart;
23 void* heap; 24 void* heap;
24 long heapLimit; 25 long heapLimit;
25 int dataNum; 26 int dataNum;
26 int idgCount; //number of waiting dataGear 27 int idgCount; //number of waiting dataGear
28 int idg;
29 int maxIdg;
27 int odg; 30 int odg;
28 int maxOdg; 31 int maxOdg;
29 int workerId; 32 int workerId;
33 struct Context* task;
34 struct Queue* tasks;
30 int num_exec; 35 int num_exec;
31 CUmodule module; 36 CUmodule module;
32 CUfunction function; 37 CUfunction function;
33 union Data **data; 38 union Data **data;
39
40 // multi dimension parameter
41 int iterate;
42 struct Iterator* iterator;
34 }; 43 };
35 44
36 struct CUDAWorker { 45 struct CUDAWorker {
37 CUdevice device; 46 CUdevice device;
38 CUcontext cuCtx; 47 CUcontext cuCtx;
39 pthread_t thread; 48 pthread_t thread;
40 struct Context* context; 49 struct Context* context;
41 int id; 50 int id;
42 struct Queue* tasks; 51 struct Queue* tasks;
43 int runFlag; 52 int runFlag;
44 int next; 53 int next;
45 int num_stream; 54 int num_stream;
46 CUstream *stream; 55 CUstream *stream;
47 } CUDAWorker; 56 } CUDAWorker;
48 57
49 struct LoopCounter { 58 struct LoopCounter {
50 int i; 59 int i;
51 } LoopCounter; 60 } LoopCounter;
52 61
53 struct Array { 62 struct Array {
54 int size; 63 int size;
55 int index; 64 int index;
56 int prefix; 65 int prefix;
57 int* array; 66 int* array;
58 } Array; 67 } Array;
59 68 */
60
61 69
62 void cudaInit(struct CUDAWorker *cudaWorker,int phase) { 70 void cudaInit(struct CUDAWorker *cudaWorker,int phase) {
63 // initialize and load kernel 71 // initialize and load kernel
64 cudaWorker->num_stream = 1; // number of stream 72 cudaWorker->num_stream = 1; // number of stream
65 // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream ); 73 // cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
66 if (phase==0) 74 if (phase==0)
67 checkCudaErrors(cuInit(0)); 75 checkCudaErrors(cuInit(0));
68 if (phase==0) 76 if (phase==0)
69 checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0)); 77 checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
70 if (phase==0) 78 if (phase==0)
71 checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device)); 79 checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
72 // if (cudaWorker->num_stream) { 80 // if (cudaWorker->num_stream) {
73 // for (int i=0;i<cudaWorker->num_stream;i++) 81 // for (int i=0;i<cudaWorker->num_stream;i++)
74 // checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0)); 82 // checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
75 // } 83 // }
84 printf("cuda Init: Done\n");
76 } 85 }
77 86
78 87
79 void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) { 88 void CUDAExec(struct Context* context, struct Array* array) {
89 printf("cuda exec start\n");
80 // Worker *worker = context->worker; 90 // Worker *worker = context->worker;
81 // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; 91 // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
82 // memory allocate 92 // memory allocate
83 CUdeviceptr devA; 93 CUdeviceptr devA;
84 CUdeviceptr devLoopCounter;
85 94
86 checkCudaErrors(cuMemAlloc(&devA, array->size)); 95 checkCudaErrors(cuMemAlloc(&devA, sizeof(int)*array->size));
87 checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));
88 96
89 //twiceカーネルが定義されてなければそれをロードする 97 //twiceカーネルが定義されてなければそれをロードする
90 checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx")); 98 checkCudaErrors(cuModuleLoad(&context->module, "c/examples/twice/CUDAtwice.ptx"));
91 checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice")); 99 checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));
92 100
93 //入力のDataGearをGPUにbuffer経由で送る 101 //入力のDataGearをGPUにbuffer経由で送る
94 // Synchronous data transfer(host to device) 102 // Synchronous data transfer(host to device)
95 checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter)));
96 checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); 103 checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));
97 104
98 // Asynchronous launch kernel 105 // Asynchronous launch kernel
99 context->num_exec = 1; 106 context->num_exec = 1;
100 void* args[] = {&devLoopCounter,&array->index,&array->prefix,&devA}; 107 void* args[] = {&devA};
101 checkCudaErrors(cuLaunchKernel(context->function, 108 if (context->iterate) {
102 1, 1, 1, 109 struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
103 1, 1, 1, 110 checkCudaErrors(cuLaunchKernel(context->function,
104 0, NULL, args, NULL)); 111 iterator->x, iterator->y, iterator->z,
112 1, 1, 1,
113 0, NULL, args, NULL));
105 114
115 } else {
116 checkCudaErrors(cuLaunchKernel(context->function,
117 1, 1, 1,
118 1, 1, 1,
119 0, NULL, args, NULL));
120 }
106 //結果を取ってくるコマンドを入力する 121 //結果を取ってくるコマンドを入力する
107 //コマンドの終了待ちを行う 122 //コマンドの終了待ちを行う
108 checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); 123 checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));
109 124
110 // wait for stream 125 // wait for stream
111 checkCudaErrors(cuCtxSynchronize()); 126 checkCudaErrors(cuCtxSynchronize());
112 } 127 }
113 128
114 void cudaShutdown( struct CUDAWorker *worker) { 129 void cudaShutdown( struct CUDAWorker *worker) {
115 // for (int i=0;i<worker->num_stream;i++) 130 // for (int i=0;i<worker->num_stream;i++)
116 // checkCudaErrors(cuStreamDestroy(worker->stream[i])); 131 // checkCudaErrors(cuStreamDestroy(worker->stream[i]));
117 checkCudaErrors(cuCtxDestroy(worker->cuCtx)); 132 checkCudaErrors(cuCtxDestroy(worker->cuCtx));
118 } 133 }