Mercurial > hg > Members > Moririn
annotate src/test/twice.cc @ 502:8d997f0c9b2c
stack.agda comment
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 01 Jan 2018 18:58:05 +0900 |
parents | 1839586f5b41 |
children |
rev | line source |
---|---|
290 | 1 #include <stdio.h> |
2 #include <sys/time.h> | |
3 #include <string.h> | |
4 #include <stdlib.h> | |
314 | 5 extern "C" { |
6 #include <pthread.h> | |
7 } | |
290 | 8 |
292 | 9 #include <cuda.h> |
10 | |
291 | 11 #include <cuda_runtime.h> |
292 | 12 #include "helper_cuda.h" |
290 | 13 |
// Problem size. The kernel is launched with a grid of LENGTH blocks and
// THREAD threads per block, and every vector buffer in this file (A, the
// per-launch outputs and their device counterparts) holds LENGTH*THREAD floats.
#define LENGTH (10)
#define THREAD (10)
16 | |
// Returns the current time of day in seconds, as reported by gettimeofday(),
// with the microsecond field folded in as the fractional part.
double
getTime() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + (double)tv.tv_usec*1e-6;
}
23 | |
24 void | |
25 check_data(float* A, float B, float* C) { | |
26 for (int i=0; i<LENGTH*THREAD; i++) { | |
27 if (A[i]*B!=C[i]) { | |
28 puts("multiply failure."); | |
29 return; | |
30 } | |
31 } | |
32 puts("success."); | |
33 } | |
34 | |
35 void print_result(float* C) { | |
36 for (int i=0; i<LENGTH*THREAD; i++) { | |
37 printf("%f\n",C[i]); | |
38 } | |
39 } | |
40 | |
// Number of CUDA streams the launches are distributed over (round-robin).
// 0 selects the stream-less path: synchronous copies and the default stream.
// Overridden by --stream / -s.
int num_stream = 1;
// Number of kernel launches to issue; each launch gets its own scalar and
// output buffer. Overridden by --numExec / -e.
int num_exec = 16;

// Runs the whole CUDA example; has the pthread start-routine signature so it
// can be used directly with pthread_create().
static void *start_cuda(void *) ;
45 | |
290 | 46 int main(int args, char* argv[]) { |
47 | |
48 for (int i=1;argv[i];i++) { | |
49 if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) { | |
50 num_stream = atoi(argv[++i]); | |
51 } | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
52 if (strcmp(argv[i], "--numExec") == 0 || strcmp(argv[i], "-e") == 0) { |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
53 num_exec = atoi(argv[++i]); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
54 } |
290 | 55 } |
314 | 56 #if 0 |
57 start_cuda(NULL); | |
58 #else | |
59 pthread_t thread; | |
60 pthread_create(&thread, NULL, start_cuda, NULL); | |
61 pthread_join(thread,NULL); | |
62 #endif | |
63 return 0; | |
64 } | |
290 | 65 |
314 | 66 static void *start_cuda(void *args) { |
290 | 67 // initialize and load kernel |
68 CUdevice device; | |
69 CUcontext context; | |
70 CUmodule module; | |
71 CUfunction function; | |
72 CUstream stream[num_stream]; | |
73 | |
292 | 74 checkCudaErrors(cuInit(0)); |
75 checkCudaErrors(cuDeviceGet(&device, 0)); | |
76 checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device)); | |
77 checkCudaErrors(cuModuleLoad(&module, "multiply.ptx")); | |
78 checkCudaErrors(cuModuleGetFunction(&function, module, "multiply")); | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
79 if (num_stream) { |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
80 for (int i=0;i<num_stream;i++) |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
81 checkCudaErrors(cuStreamCreate(&stream[i],0)); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
82 } |
290 | 83 |
84 // memory allocate | |
85 CUdeviceptr devA; | |
86 CUdeviceptr devB[num_exec]; | |
87 CUdeviceptr devOut[num_exec]; | |
88 | |
292 | 89 checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float))); |
290 | 90 for (int i=0;i<num_exec;i++) { |
292 | 91 checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float))); |
92 checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); | |
290 | 93 } |
94 | |
95 // input buffer | |
96 float* A = new float[LENGTH*THREAD]; | |
97 float* B = new float[num_exec]; | |
98 | |
99 for (int i=0; i<LENGTH*THREAD; i++) | |
100 A[i] = (float)(i+1000); | |
101 | |
102 // output buffer | |
103 float** result = new float* [num_exec]; | |
104 | |
105 for (int i=0;i<num_exec;i++) | |
106 result[i] = new float[LENGTH*THREAD]; | |
107 | |
108 // Synchronous data transfer(host to device) | |
292 | 109 checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); |
290 | 110 |
111 // Asynchronous data transfer(host to device) | |
112 int cur = 0; | |
113 | |
114 for (int i=0;i<num_exec;i++,cur++) { | |
115 if (num_stream <= cur) | |
116 cur = 0; | |
117 B[i] = (float)(i+1); | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
118 if (num_stream) { |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
119 checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur])); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
120 } else { |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
121 checkCudaErrors(cuMemcpyHtoD(devB[i], &B[i], sizeof(float))); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
122 } |
290 | 123 } |
124 | |
125 cur = 0; | |
126 | |
127 // Asynchronous launch kernel | |
128 for (int i=0;i<num_exec;i++,cur++) { | |
129 if (num_stream <= cur) | |
130 cur=0; | |
301
609bf62768b9
add -DUSE_CUDA=1 flag to cmake
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
300
diff
changeset
|
131 //B[i] = (float)(i+1); |
290 | 132 //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); |
133 void* args[] = {&devA, &devB[i], &devOut[i]}; | |
292 | 134 checkCudaErrors(cuLaunchKernel(function, |
290 | 135 LENGTH, 1, 1, |
136 THREAD, 1, 1, | |
301
609bf62768b9
add -DUSE_CUDA=1 flag to cmake
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
300
diff
changeset
|
137 0, num_stream ? stream[cur] : NULL , args, NULL)); |
290 | 138 //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); |
139 } | |
140 | |
141 cur = 0; | |
142 | |
143 | |
144 // Asynchronous data transfer(device to host) | |
145 for (int i=0;i<num_exec;i++,cur++) { | |
146 if (num_stream <= cur) | |
147 cur = 0; | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
148 if (num_stream) { |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
149 checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
150 } else { |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
151 checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
299
diff
changeset
|
152 } |
290 | 153 } |
154 | |
155 // wait for stream | |
156 for (int i=0;i<num_stream;i++) | |
292 | 157 checkCudaErrors(cuStreamSynchronize(stream[i])); |
290 | 158 |
159 //printf("%0.6f\n",getTime()-start); | |
160 | |
161 for (int i=0;i<num_exec;i++) | |
162 check_data(A,(float)(i+1),result[i]); | |
163 | |
164 // memory release | |
292 | 165 checkCudaErrors(cuMemFree(devA)); |
290 | 166 for (int i=0;i<num_exec;i++) { |
292 | 167 checkCudaErrors(cuMemFree(devB[i])); |
168 checkCudaErrors(cuMemFree(devOut[i])); | |
290 | 169 } |
170 for (int i=0;i<num_stream;i++) | |
292 | 171 checkCudaErrors(cuStreamDestroy(stream[i])); |
172 checkCudaErrors(cuModuleUnload(module)); | |
173 checkCudaErrors(cuCtxDestroy(context)); | |
290 | 174 |
175 delete[] A; | |
176 delete[] B; | |
177 for (int i=0;i<num_exec;i++) | |
178 delete[] result[i]; | |
179 delete[] result; | |
180 return 0; | |
181 } | |
299 | 182 |