Mercurial > hg > Gears > GearsAgda
annotate src/parallel_execution/CUDAtwice.cbc @ 303:1dbaef86593b
CUDAtwice.cbc
author | ikkun |
---|---|
date | Mon, 13 Feb 2017 18:23:29 +0900 |
parents | 8e7926f3e271 |
children | ae4f6aa427f5 |
rev | line source |
---|---|
92 | 1 #include <stdio.h> |
2 | |
3 #include "context.h" | |
4 #include "origin_cs.h" | |
5 | |
257 | 6 __code twice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) { |
95
3e28ee215c0e
modify twice, use OSAtomiceCompareAndSwap
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
92
diff
changeset
|
7 int i = loopCounter->i; |
3e28ee215c0e
modify twice, use OSAtomiceCompareAndSwap
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
92
diff
changeset
|
8 if (i < prefix) { |
3e28ee215c0e
modify twice, use OSAtomiceCompareAndSwap
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
92
diff
changeset
|
9 array[i+index*prefix] = array[i+index*prefix]*2; |
3e28ee215c0e
modify twice, use OSAtomiceCompareAndSwap
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
92
diff
changeset
|
10 loopCounter->i++; |
3e28ee215c0e
modify twice, use OSAtomiceCompareAndSwap
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
92
diff
changeset
|
11 |
193 | 12 goto meta(context, C_twice); |
95
3e28ee215c0e
modify twice, use OSAtomiceCompareAndSwap
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
92
diff
changeset
|
13 } |
3e28ee215c0e
modify twice, use OSAtomiceCompareAndSwap
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
92
diff
changeset
|
14 |
3e28ee215c0e
modify twice, use OSAtomiceCompareAndSwap
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
92
diff
changeset
|
15 loopCounter->i = 0; |
248
1ede5390cda2
Fix segmentation fault but not multi thread running
Tatsuki IHA <e125716@ie.u-ryukyu.ac.jp>
parents:
247
diff
changeset
|
16 goto meta(workerContext, workerContext->next); |
92 | 17 } |
18 | |
19 __code twice_stub(struct Context* context) { | |
257 | 20 struct Context* workerContext = context->worker->worker->CPUWorker.context; |
303 | 21 |
22 // memory allocate | |
23 CUdeviceptr devA; | |
24 CUdeviceptr devB[num_exec]; | |
25 CUdeviceptr devOut[num_exec]; | |
26 | |
27 checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float))); | |
28 for (int i=0;i<num_exec;i++) { | |
29 checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float))); | |
30 checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); | |
31 } | |
32 | |
33 //twiceカーネルが定義されてなければそれをロードする | |
34 checkCudaErrors(cuModuleLoad(&module, "multiply.ptx")); | |
35 checkCudaErrors(cuModuleGetFunction(&function, module, "multiply")); | |
36 | |
275 | 37 //入力のDataGearをGPUにbuffer経由で送る |
303 | 38 // Synchronous data transfer(host to device) |
39 checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); | |
40 | |
41 // Asynchronous launch kernel | |
42 for (int i=0;i<num_exec;i++,cur++) { | |
43 if (num_stream <= cur) | |
44 cur=0; | |
45 //B[i] = (float)(i+1); | |
46 //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); | |
47 void* args[] = {&devA, &devB[i], &devOut[i]}; | |
48 checkCudaErrors(cuLaunchKernel(function, | |
49 LENGTH, 1, 1, | |
50 THREAD, 1, 1, | |
51 0, num_stream ? stream[cur] : NULL , args, NULL)); | |
52 //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); | |
53 } | |
54 | |
275 | 55 //結果を取ってくるコマンドを入力する |
56 //コマンドの終了待ちを行う | |
303 | 57 // Asynchronous data transfer(device to host) |
58 for (int i=0;i<num_exec;i++,cur++) { | |
59 if (num_stream <= cur) | |
60 cur = 0; | |
61 if (num_stream) { | |
62 checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); | |
63 } else { | |
64 checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); | |
65 } | |
66 } | |
67 | |
68 // wait for stream | |
69 for (int i=0;i<num_stream;i++) | |
70 checkCudaErrors(cuStreamSynchronize(stream[i])); | |
71 // Asynchronous data transfer(device to host) | |
72 for (int i=0;i<num_exec;i++,cur++) { | |
73 if (num_stream <= cur) | |
74 cur = 0; | |
75 if (num_stream) { | |
76 checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); | |
77 } else { | |
78 checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); | |
79 } | |
80 } | |
81 | |
82 // wait for stream | |
83 for (int i=0;i<num_stream;i++) | |
84 checkCudaErrors(cuStreamSynchronize(stream[i])); | |
85 | |
86 | |
275 | 87 //continationにそってGPUworkerに戻る |
248
1ede5390cda2
Fix segmentation fault but not multi thread running
Tatsuki IHA <e125716@ie.u-ryukyu.ac.jp>
parents:
247
diff
changeset
|
88 goto twice(context, Gearef(context, LoopCounter), 0, 0, NULL, workerContext); |
92 | 89 } |