Mercurial > hg > GearsTemplate
annotate src/test/vectorAddDrv.cc @ 405:8915fce522b3
Fix shutdown TaskManager
author | Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 05 Sep 2017 16:46:31 +0900 |
parents | 8bbc0012e1a4 |
children |
rev | line source |
---|---|
297 | 1 /* |
2 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. | |
3 * | |
4 * Please refer to the NVIDIA end user license agreement (EULA) associated | |
5 * with this source code for terms and conditions that govern your use of | |
6 * this software. Any use, reproduction, disclosure, or distribution of | |
7 * this software and related documentation outside the terms of the EULA | |
8 * is strictly prohibited. | |
9 * | |
10 */ | |
11 | |
12 /* Vector addition: C = A + B. | |
13 * | |
14 * This sample is a very basic sample that implements element by element | |
15 * vector addition. It is the same as the sample illustrating Chapter 3 | |
16 * of the programming guide with some additions like error checking. | |
17 * | |
18 */ | |
19 | |
20 // Includes | |
21 #include <stdio.h> | |
22 #include <string.h> | |
23 #include <iostream> | |
24 #include <cstring> | |
25 #include <math.h> | |
26 | |
27 // includes, project | |
28 #include <driver_types.h> | |
29 #include <cuda_runtime.h> | |
30 #include <cuda.h> | |
31 #include "helper_cuda.h" | |
32 | |
33 // includes, CUDA | |
34 #include <builtin_types.h> | |
35 | |
36 #define PTX_FILE "vectorAdd_kernel.ptx" | |
37 | |
38 | |
39 using namespace std; | |
40 | |
41 // Variables | |
42 CUdevice cuDevice; | |
43 CUcontext cuContext; | |
44 CUmodule cuModule; | |
45 CUfunction vecAdd_kernel; | |
46 float *h_A; | |
47 float *h_B; | |
48 float *h_C; | |
49 CUdeviceptr d_A; | |
50 CUdeviceptr d_B; | |
51 CUdeviceptr d_C; | |
52 bool noprompt = false; | |
53 | |
54 // Functions | |
55 void Cleanup(bool); | |
56 CUresult CleanupNoFailure(); | |
57 void RandomInit(float *, int); | |
58 bool findModulePath(const char *, string &, char **, string &); | |
59 void ParseArguments(int, char **); | |
60 | |
61 int *pArgc = NULL; | |
62 char **pArgv = NULL; | |
63 | |
64 | |
65 // Host code | |
66 int main(int argc, char **argv) | |
67 { | |
68 pArgc = &argc; | |
69 pArgv = argv; | |
70 | |
71 printf("Vector Addition (Driver API)\n"); | |
72 int N = 50000, devID = 0; | |
73 size_t size = N * sizeof(float); | |
74 | |
75 ParseArguments(argc, argv); | |
76 | |
77 // Initialize | |
78 checkCudaErrors(cuInit(0)); | |
79 | |
80 // This assumes that the user is attempting to specify a explicit device -device=n | |
81 if (argc > 1) | |
82 { | |
83 bool bFound = false; | |
84 | |
85 for (int param=0; param < argc; param++) | |
86 { | |
87 int string_start = 0; | |
88 | |
89 while (argv[param][string_start] == '-') | |
90 { | |
91 string_start++; | |
92 } | |
93 | |
94 char *string_argv = &argv[param][string_start]; | |
95 | |
96 if (!strncmp(string_argv, "device", 6)) | |
97 { | |
98 int len=(int)strlen(string_argv); | |
99 | |
100 while (string_argv[len] != '=') | |
101 { | |
102 len--; | |
103 } | |
104 | |
105 devID = atoi(&string_argv[++len]); | |
106 bFound = true; | |
107 } | |
108 | |
109 if (bFound) | |
110 { | |
111 break; | |
112 } | |
113 } | |
114 } | |
115 | |
116 // Get number of devices supporting CUDA | |
117 int deviceCount = 0; | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
118 checkCudaErrors(cuDeviceGetCount(&deviceCount)); |
297 | 119 if (deviceCount == 0) |
120 { | |
121 printf("There is no device supporting CUDA.\n"); | |
122 Cleanup(false); | |
123 } | |
124 | |
125 if (devID < 0) | |
126 { | |
127 devID = 0; | |
128 } | |
129 | |
130 if (devID > deviceCount-1) | |
131 { | |
132 fprintf(stderr, "(Device=%d) invalid GPU device. %d GPU device(s) detected.\nexiting...\n", devID, deviceCount); | |
133 CleanupNoFailure(); | |
134 exit(EXIT_SUCCESS); | |
135 } | |
136 else | |
137 { | |
138 int major, minor; | |
139 char deviceName[100]; | |
140 checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID)); | |
141 checkCudaErrors(cuDeviceGetName(deviceName, 256, devID)); | |
142 printf("> Using Device %d: \"%s\" with Compute %d.%d capability\n", devID, deviceName, major, minor); | |
143 } | |
144 | |
145 // pick up device with zero ordinal (default, or devID) | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
146 checkCudaErrors(cuDeviceGet(&cuDevice, devID)); |
297 | 147 // Create context |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
148 checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); |
297 | 149 // first search for the module path before we load the results |
150 string module_path, ptx_source; | |
151 | |
152 if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) | |
153 { | |
154 if (!findModulePath("vectorAdd_kernel.cubin", module_path, argv, ptx_source)) | |
155 { | |
156 printf("> findModulePath could not find <vectorAdd> ptx or cubin\n"); | |
157 Cleanup(false); | |
158 } | |
159 } | |
160 else | |
161 { | |
162 printf("> initCUDA loading module: <%s>\n", module_path.c_str()); | |
163 } | |
164 | |
165 // Create module from binary file (PTX or CUBIN) | |
166 if (module_path.rfind("ptx") != string::npos) | |
167 { | |
168 // in this branch we use compilation with parameters | |
169 const unsigned int jitNumOptions = 3; | |
170 CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; | |
171 void **jitOptVals = new void *[jitNumOptions]; | |
172 | |
173 // set up size of compilation log buffer | |
174 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; | |
175 int jitLogBufferSize = 1024; | |
176 jitOptVals[0] = (void *)(size_t)jitLogBufferSize; | |
177 | |
178 // set up pointer to the compilation log buffer | |
179 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; | |
180 char *jitLogBuffer = new char[jitLogBufferSize]; | |
181 jitOptVals[1] = jitLogBuffer; | |
182 | |
183 // set up pointer to set the Maximum # of registers for a particular kernel | |
184 jitOptions[2] = CU_JIT_MAX_REGISTERS; | |
185 int jitRegCount = 32; | |
186 jitOptVals[2] = (void *)(size_t)jitRegCount; | |
187 | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
188 checkCudaErrors(cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals)); |
297 | 189 |
190 printf("> PTX JIT log:\n%s\n", jitLogBuffer); | |
191 } | |
192 else | |
193 { | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
194 checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str())); |
297 | 195 } |
196 | |
197 // Get function handle from module | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
198 checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel")); |
297 | 199 |
200 // Allocate input vectors h_A and h_B in host memory | |
201 h_A = (float *)malloc(size); | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
202 if (h_A == 0) { Cleanup(false); } |
297 | 203 |
204 h_B = (float *)malloc(size); | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
205 if (h_B == 0) { Cleanup(false); } |
297 | 206 |
207 h_C = (float *)malloc(size); | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
208 if (h_C == 0) { Cleanup(false); } |
297 | 209 |
210 // Initialize input vectors | |
211 RandomInit(h_A, N); | |
212 RandomInit(h_B, N); | |
213 | |
214 // Allocate vectors in device memory | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
215 checkCudaErrors(cuMemAlloc(&d_A, size)); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
216 checkCudaErrors(cuMemAlloc(&d_B, size)); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
217 checkCudaErrors(cuMemAlloc(&d_C, size)); |
297 | 218 |
219 // Copy vectors from host memory to device memory | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
220 checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size)); |
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
221 checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size)); |
297 | 222 |
223 #if 1 | |
224 | |
225 if (1) | |
226 { | |
227 // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch (simpler method) | |
228 | |
229 // Grid/Block configuration | |
230 int threadsPerBlock = 256; | |
231 int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; | |
232 | |
233 void *args[] = { &d_A, &d_B, &d_C, &N }; | |
234 | |
235 // Launch the CUDA kernel | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
236 checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, |
297 | 237 threadsPerBlock, 1, 1, |
238 0, | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
239 NULL, args, NULL)); |
297 | 240 } |
241 else | |
242 { | |
243 // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch (advanced method) | |
244 int offset = 0; | |
245 void *argBuffer[16]; | |
246 *((CUdeviceptr *)&argBuffer[offset]) = d_A; | |
247 offset += sizeof(d_A); | |
248 *((CUdeviceptr *)&argBuffer[offset]) = d_B; | |
249 offset += sizeof(d_B); | |
250 *((CUdeviceptr *)&argBuffer[offset]) = d_C; | |
251 offset += sizeof(d_C); | |
252 *((int *)&argBuffer[offset]) = N; | |
253 offset += sizeof(N); | |
254 | |
255 // Grid/Block configuration | |
256 int threadsPerBlock = 256; | |
257 int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; | |
258 | |
259 // Launch the CUDA kernel | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
260 checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, |
297 | 261 threadsPerBlock, 1, 1, |
262 0, | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
263 NULL, NULL, argBuffer)); |
297 | 264 } |
265 | |
266 #else | |
267 { | |
268 char argBuffer[256]; | |
269 | |
270 // pass in launch parameters (not actually de-referencing CUdeviceptr). CUdeviceptr is | |
271 // storing the value of the parameters | |
272 *((CUdeviceptr *)&argBuffer[offset]) = d_A; | |
273 offset += sizeof(d_A); | |
274 *((CUdeviceptr *)&argBuffer[offset]) = d_B; | |
275 offset += sizeof(d_B); | |
276 *((CUdeviceptr *)&argBuffer[offset]) = d_C; | |
277 offset += sizeof(d_C); | |
278 *((int *)&argBuffer[offset]) = N; | |
279 offset += sizeof(N); | |
280 | |
281 void *kernel_launch_config[5] = | |
282 { | |
283 CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, | |
284 CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, | |
285 CU_LAUNCH_PARAM_END | |
286 }; | |
287 | |
288 // Grid/Block configuration | |
289 int threadsPerBlock = 256; | |
290 int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; | |
291 | |
292 // Launch the CUDA kernel | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
293 checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, |
297 | 294 threadsPerBlock, 1, 1, |
295 0, 0, | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
296 NULL, (void **)&kernel_launch_config)); |
297 | 297 } |
298 #endif | |
299 | |
300 #ifdef _DEBUG | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
301 checkCudaErrors(cuCtxSynchronize()); |
297 | 302 #endif |
303 | |
304 // Copy result from device memory to host memory | |
305 // h_C contains the result in host memory | |
300
8bbc0012e1a4
checkErrors on an example
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents:
297
diff
changeset
|
306 checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size)); |
297 | 307 |
308 // Verify result | |
309 int i; | |
310 | |
311 for (i = 0; i < N; ++i) | |
312 { | |
313 float sum = h_A[i] + h_B[i]; | |
314 | |
315 if (fabs(h_C[i] - sum) > 1e-7f) | |
316 { | |
317 break; | |
318 } | |
319 } | |
320 | |
321 printf("%s\n", (i==N) ? "Result = PASS" : "Result = FAIL"); | |
322 | |
323 exit((i==N) ? EXIT_SUCCESS : EXIT_FAILURE); | |
324 } | |
325 | |
326 CUresult CleanupNoFailure() | |
327 { | |
328 CUresult error; | |
329 | |
330 // Free device memory | |
331 if (d_A) | |
332 { | |
333 error = cuMemFree(d_A); | |
334 } | |
335 | |
336 if (d_B) | |
337 { | |
338 error = cuMemFree(d_B); | |
339 } | |
340 | |
341 if (d_C) | |
342 { | |
343 error = cuMemFree(d_C); | |
344 } | |
345 | |
346 // Free host memory | |
347 if (h_A) | |
348 { | |
349 free(h_A); | |
350 } | |
351 | |
352 if (h_B) | |
353 { | |
354 free(h_B); | |
355 } | |
356 | |
357 if (h_C) | |
358 { | |
359 free(h_C); | |
360 } | |
361 | |
362 error = cuCtxDestroy(cuContext); | |
363 | |
364 return error; | |
365 } | |
366 | |
367 void Cleanup(bool noError) | |
368 { | |
369 CUresult error; | |
370 error = CleanupNoFailure(); | |
371 | |
372 if (!noError || error != CUDA_SUCCESS) | |
373 { | |
374 printf("Function call failed\nFAILED\n"); | |
375 exit(EXIT_FAILURE); | |
376 } | |
377 | |
378 if (!noprompt) | |
379 { | |
380 printf("\nPress ENTER to exit...\n"); | |
381 fflush(stdout); | |
382 fflush(stderr); | |
383 getchar(); | |
384 } | |
385 } | |
386 | |
387 | |
388 // Allocates an array with random float entries. | |
// Fills the first n entries of data with rand() / RAND_MAX, i.e. values in
// the range [0, 1]. Entries beyond n are left untouched; n <= 0 writes nothing.
void RandomInit(float *data, int n)
{
    float *cursor = data;
    int remaining = n;

    while (remaining-- > 0)
    {
        *cursor++ = rand() / (float)RAND_MAX;
    }
}
396 | |
397 bool inline | |
398 findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source) | |
399 { | |
400 char *actual_path = sdkFindFilePath(module_file, argv[0]); | |
401 | |
402 if (actual_path) | |
403 { | |
404 module_path = actual_path; | |
405 } | |
406 else | |
407 { | |
408 printf("> findModulePath file not found: <%s> \n", module_file); | |
409 return false; | |
410 } | |
411 | |
412 if (module_path.empty()) | |
413 { | |
414 printf("> findModulePath could not find file: <%s> \n", module_file); | |
415 return false; | |
416 } | |
417 else | |
418 { | |
419 printf("> findModulePath found file at <%s>\n", module_path.c_str()); | |
420 | |
421 if (module_path.rfind(".ptx") != string::npos) | |
422 { | |
423 FILE *fp = fopen(module_path.c_str(), "rb"); | |
424 fseek(fp, 0, SEEK_END); | |
425 int file_size = ftell(fp); | |
426 char *buf = new char[file_size+1]; | |
427 fseek(fp, 0, SEEK_SET); | |
428 fread(buf, sizeof(char), file_size, fp); | |
429 fclose(fp); | |
430 buf[file_size] = '\0'; | |
431 ptx_source = buf; | |
432 delete[] buf; | |
433 } | |
434 | |
435 return true; | |
436 } | |
437 } | |
438 | |
439 // Parse program arguments | |
440 void ParseArguments(int argc, char **argv) | |
441 { | |
442 for (int i = 0; i < argc; ++i) | |
443 { | |
444 if (strcmp(argv[i], "--noprompt") == 0 || | |
445 strcmp(argv[i], "-noprompt") == 0) | |
446 { | |
447 noprompt = true; | |
448 break; | |
449 } | |
450 } | |
451 } |