Mercurial > hg > GearsTemplate
comparison src/test/vectorAddDrv.cc @ 297:b46398081fe4
add working example
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Sat, 11 Feb 2017 10:55:36 +0900 |
parents | |
children | 8bbc0012e1a4 |
comparison
equal
deleted
inserted
replaced
296:f16802b3b580 | 297:b46398081fe4 |
---|---|
1 /* | |
2 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. | |
3 * | |
4 * Please refer to the NVIDIA end user license agreement (EULA) associated | |
5 * with this source code for terms and conditions that govern your use of | |
6 * this software. Any use, reproduction, disclosure, or distribution of | |
7 * this software and related documentation outside the terms of the EULA | |
8 * is strictly prohibited. | |
9 * | |
10 */ | |
11 | |
12 /* Vector addition: C = A + B. | |
13 * | |
14 * This sample is a very basic sample that implements element by element | |
15 * vector addition. It is the same as the sample illustrating Chapter 3 | |
16 * of the programming guide with some additions like error checking. | |
17 * | |
18 */ | |
19 | |
20 // Includes | |
21 #include <stdio.h> | |
22 #include <string.h> | |
23 #include <iostream> | |
24 #include <cstring> | |
25 #include <math.h> | |
26 | |
27 // includes, project | |
28 #include <driver_types.h> | |
29 #include <cuda_runtime.h> | |
30 #include <cuda.h> | |
31 #include "helper_cuda.h" | |
32 | |
33 // includes, CUDA | |
34 #include <builtin_types.h> | |
35 | |
36 #define PTX_FILE "vectorAdd_kernel.ptx" | |
37 | |
38 | |
39 using namespace std; | |
40 | |
41 // Variables | |
42 CUdevice cuDevice; | |
43 CUcontext cuContext; | |
44 CUmodule cuModule; | |
45 CUfunction vecAdd_kernel; | |
46 float *h_A; | |
47 float *h_B; | |
48 float *h_C; | |
49 CUdeviceptr d_A; | |
50 CUdeviceptr d_B; | |
51 CUdeviceptr d_C; | |
52 bool noprompt = false; | |
53 | |
54 // Functions | |
55 void Cleanup(bool); | |
56 CUresult CleanupNoFailure(); | |
57 void RandomInit(float *, int); | |
58 bool findModulePath(const char *, string &, char **, string &); | |
59 void ParseArguments(int, char **); | |
60 | |
61 int *pArgc = NULL; | |
62 char **pArgv = NULL; | |
63 | |
64 | |
65 // Host code | |
66 int main(int argc, char **argv) | |
67 { | |
68 pArgc = &argc; | |
69 pArgv = argv; | |
70 | |
71 printf("Vector Addition (Driver API)\n"); | |
72 int N = 50000, devID = 0; | |
73 size_t size = N * sizeof(float); | |
74 | |
75 CUresult error; | |
76 ParseArguments(argc, argv); | |
77 | |
78 // Initialize | |
79 checkCudaErrors(cuInit(0)); | |
80 | |
81 // This assumes that the user is attempting to specify a explicit device -device=n | |
82 if (argc > 1) | |
83 { | |
84 bool bFound = false; | |
85 | |
86 for (int param=0; param < argc; param++) | |
87 { | |
88 int string_start = 0; | |
89 | |
90 while (argv[param][string_start] == '-') | |
91 { | |
92 string_start++; | |
93 } | |
94 | |
95 char *string_argv = &argv[param][string_start]; | |
96 | |
97 if (!strncmp(string_argv, "device", 6)) | |
98 { | |
99 int len=(int)strlen(string_argv); | |
100 | |
101 while (string_argv[len] != '=') | |
102 { | |
103 len--; | |
104 } | |
105 | |
106 devID = atoi(&string_argv[++len]); | |
107 bFound = true; | |
108 } | |
109 | |
110 if (bFound) | |
111 { | |
112 break; | |
113 } | |
114 } | |
115 } | |
116 | |
117 // Get number of devices supporting CUDA | |
118 int deviceCount = 0; | |
119 error = cuDeviceGetCount(&deviceCount); | |
120 | |
121 if (error != CUDA_SUCCESS) | |
122 { | |
123 Cleanup(false); | |
124 } | |
125 | |
126 if (deviceCount == 0) | |
127 { | |
128 printf("There is no device supporting CUDA.\n"); | |
129 Cleanup(false); | |
130 } | |
131 | |
132 if (devID < 0) | |
133 { | |
134 devID = 0; | |
135 } | |
136 | |
137 if (devID > deviceCount-1) | |
138 { | |
139 fprintf(stderr, "(Device=%d) invalid GPU device. %d GPU device(s) detected.\nexiting...\n", devID, deviceCount); | |
140 CleanupNoFailure(); | |
141 exit(EXIT_SUCCESS); | |
142 } | |
143 else | |
144 { | |
145 int major, minor; | |
146 char deviceName[100]; | |
147 checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID)); | |
148 checkCudaErrors(cuDeviceGetName(deviceName, 256, devID)); | |
149 printf("> Using Device %d: \"%s\" with Compute %d.%d capability\n", devID, deviceName, major, minor); | |
150 } | |
151 | |
152 // pick up device with zero ordinal (default, or devID) | |
153 error = cuDeviceGet(&cuDevice, devID); | |
154 | |
155 if (error != CUDA_SUCCESS) | |
156 { | |
157 Cleanup(false); | |
158 } | |
159 | |
160 // Create context | |
161 error = cuCtxCreate(&cuContext, 0, cuDevice); | |
162 | |
163 if (error != CUDA_SUCCESS) | |
164 { | |
165 Cleanup(false); | |
166 } | |
167 | |
168 // first search for the module path before we load the results | |
169 string module_path, ptx_source; | |
170 | |
171 if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) | |
172 { | |
173 if (!findModulePath("vectorAdd_kernel.cubin", module_path, argv, ptx_source)) | |
174 { | |
175 printf("> findModulePath could not find <vectorAdd> ptx or cubin\n"); | |
176 Cleanup(false); | |
177 } | |
178 } | |
179 else | |
180 { | |
181 printf("> initCUDA loading module: <%s>\n", module_path.c_str()); | |
182 } | |
183 | |
184 // Create module from binary file (PTX or CUBIN) | |
185 if (module_path.rfind("ptx") != string::npos) | |
186 { | |
187 // in this branch we use compilation with parameters | |
188 const unsigned int jitNumOptions = 3; | |
189 CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; | |
190 void **jitOptVals = new void *[jitNumOptions]; | |
191 | |
192 // set up size of compilation log buffer | |
193 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; | |
194 int jitLogBufferSize = 1024; | |
195 jitOptVals[0] = (void *)(size_t)jitLogBufferSize; | |
196 | |
197 // set up pointer to the compilation log buffer | |
198 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; | |
199 char *jitLogBuffer = new char[jitLogBufferSize]; | |
200 jitOptVals[1] = jitLogBuffer; | |
201 | |
202 // set up pointer to set the Maximum # of registers for a particular kernel | |
203 jitOptions[2] = CU_JIT_MAX_REGISTERS; | |
204 int jitRegCount = 32; | |
205 jitOptVals[2] = (void *)(size_t)jitRegCount; | |
206 | |
207 error = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals); | |
208 | |
209 printf("> PTX JIT log:\n%s\n", jitLogBuffer); | |
210 } | |
211 else | |
212 { | |
213 error = cuModuleLoad(&cuModule, module_path.c_str()); | |
214 } | |
215 | |
216 if (error != CUDA_SUCCESS) | |
217 { | |
218 Cleanup(false); | |
219 } | |
220 | |
221 // Get function handle from module | |
222 error = cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"); | |
223 | |
224 if (error != CUDA_SUCCESS) | |
225 { | |
226 Cleanup(false); | |
227 } | |
228 | |
229 // Allocate input vectors h_A and h_B in host memory | |
230 h_A = (float *)malloc(size); | |
231 | |
232 if (h_A == 0) | |
233 { | |
234 Cleanup(false); | |
235 } | |
236 | |
237 h_B = (float *)malloc(size); | |
238 | |
239 if (h_B == 0) | |
240 { | |
241 Cleanup(false); | |
242 } | |
243 | |
244 h_C = (float *)malloc(size); | |
245 | |
246 if (h_C == 0) | |
247 { | |
248 Cleanup(false); | |
249 } | |
250 | |
251 // Initialize input vectors | |
252 RandomInit(h_A, N); | |
253 RandomInit(h_B, N); | |
254 | |
255 // Allocate vectors in device memory | |
256 error = cuMemAlloc(&d_A, size); | |
257 | |
258 if (error != CUDA_SUCCESS) | |
259 { | |
260 Cleanup(false); | |
261 } | |
262 | |
263 error = cuMemAlloc(&d_B, size); | |
264 | |
265 if (error != CUDA_SUCCESS) | |
266 { | |
267 Cleanup(false); | |
268 } | |
269 | |
270 error = cuMemAlloc(&d_C, size); | |
271 | |
272 if (error != CUDA_SUCCESS) | |
273 { | |
274 Cleanup(false); | |
275 } | |
276 | |
277 // Copy vectors from host memory to device memory | |
278 error = cuMemcpyHtoD(d_A, h_A, size); | |
279 | |
280 if (error != CUDA_SUCCESS) | |
281 { | |
282 Cleanup(false); | |
283 } | |
284 | |
285 error = cuMemcpyHtoD(d_B, h_B, size); | |
286 | |
287 if (error != CUDA_SUCCESS) | |
288 { | |
289 Cleanup(false); | |
290 } | |
291 | |
292 #if 1 | |
293 | |
294 if (1) | |
295 { | |
296 // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch (simpler method) | |
297 | |
298 // Grid/Block configuration | |
299 int threadsPerBlock = 256; | |
300 int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; | |
301 | |
302 void *args[] = { &d_A, &d_B, &d_C, &N }; | |
303 | |
304 // Launch the CUDA kernel | |
305 error = cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, | |
306 threadsPerBlock, 1, 1, | |
307 0, | |
308 NULL, args, NULL); | |
309 | |
310 if (error != CUDA_SUCCESS) | |
311 { | |
312 Cleanup(false); | |
313 } | |
314 } | |
315 else | |
316 { | |
317 // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch (advanced method) | |
318 int offset = 0; | |
319 void *argBuffer[16]; | |
320 *((CUdeviceptr *)&argBuffer[offset]) = d_A; | |
321 offset += sizeof(d_A); | |
322 *((CUdeviceptr *)&argBuffer[offset]) = d_B; | |
323 offset += sizeof(d_B); | |
324 *((CUdeviceptr *)&argBuffer[offset]) = d_C; | |
325 offset += sizeof(d_C); | |
326 *((int *)&argBuffer[offset]) = N; | |
327 offset += sizeof(N); | |
328 | |
329 // Grid/Block configuration | |
330 int threadsPerBlock = 256; | |
331 int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; | |
332 | |
333 // Launch the CUDA kernel | |
334 error = cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, | |
335 threadsPerBlock, 1, 1, | |
336 0, | |
337 NULL, NULL, argBuffer); | |
338 | |
339 if (error != CUDA_SUCCESS) | |
340 { | |
341 Cleanup(false); | |
342 } | |
343 } | |
344 | |
345 #else | |
346 { | |
347 char argBuffer[256]; | |
348 | |
349 // pass in launch parameters (not actually de-referencing CUdeviceptr). CUdeviceptr is | |
350 // storing the value of the parameters | |
351 *((CUdeviceptr *)&argBuffer[offset]) = d_A; | |
352 offset += sizeof(d_A); | |
353 *((CUdeviceptr *)&argBuffer[offset]) = d_B; | |
354 offset += sizeof(d_B); | |
355 *((CUdeviceptr *)&argBuffer[offset]) = d_C; | |
356 offset += sizeof(d_C); | |
357 *((int *)&argBuffer[offset]) = N; | |
358 offset += sizeof(N); | |
359 | |
360 void *kernel_launch_config[5] = | |
361 { | |
362 CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, | |
363 CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, | |
364 CU_LAUNCH_PARAM_END | |
365 }; | |
366 | |
367 // Grid/Block configuration | |
368 int threadsPerBlock = 256; | |
369 int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; | |
370 | |
371 // Launch the CUDA kernel | |
372 error = cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, | |
373 threadsPerBlock, 1, 1, | |
374 0, 0, | |
375 NULL, (void **)&kernel_launch_config); | |
376 | |
377 if (error != CUDA_SUCCESS) | |
378 { | |
379 Cleanup(false); | |
380 } | |
381 } | |
382 #endif | |
383 | |
384 #ifdef _DEBUG | |
385 error = cuCtxSynchronize(); | |
386 | |
387 if (error != CUDA_SUCCESS) | |
388 { | |
389 Cleanup(false); | |
390 } | |
391 | |
392 #endif | |
393 | |
394 // Copy result from device memory to host memory | |
395 // h_C contains the result in host memory | |
396 error = cuMemcpyDtoH(h_C, d_C, size); | |
397 | |
398 if (error != CUDA_SUCCESS) | |
399 { | |
400 Cleanup(false); | |
401 } | |
402 | |
403 // Verify result | |
404 int i; | |
405 | |
406 for (i = 0; i < N; ++i) | |
407 { | |
408 float sum = h_A[i] + h_B[i]; | |
409 | |
410 if (fabs(h_C[i] - sum) > 1e-7f) | |
411 { | |
412 break; | |
413 } | |
414 } | |
415 | |
416 printf("%s\n", (i==N) ? "Result = PASS" : "Result = FAIL"); | |
417 | |
418 exit((i==N) ? EXIT_SUCCESS : EXIT_FAILURE); | |
419 } | |
420 | |
421 CUresult CleanupNoFailure() | |
422 { | |
423 CUresult error; | |
424 | |
425 // Free device memory | |
426 if (d_A) | |
427 { | |
428 error = cuMemFree(d_A); | |
429 } | |
430 | |
431 if (d_B) | |
432 { | |
433 error = cuMemFree(d_B); | |
434 } | |
435 | |
436 if (d_C) | |
437 { | |
438 error = cuMemFree(d_C); | |
439 } | |
440 | |
441 // Free host memory | |
442 if (h_A) | |
443 { | |
444 free(h_A); | |
445 } | |
446 | |
447 if (h_B) | |
448 { | |
449 free(h_B); | |
450 } | |
451 | |
452 if (h_C) | |
453 { | |
454 free(h_C); | |
455 } | |
456 | |
457 error = cuCtxDestroy(cuContext); | |
458 | |
459 return error; | |
460 } | |
461 | |
462 void Cleanup(bool noError) | |
463 { | |
464 CUresult error; | |
465 error = CleanupNoFailure(); | |
466 | |
467 if (!noError || error != CUDA_SUCCESS) | |
468 { | |
469 printf("Function call failed\nFAILED\n"); | |
470 exit(EXIT_FAILURE); | |
471 } | |
472 | |
473 if (!noprompt) | |
474 { | |
475 printf("\nPress ENTER to exit...\n"); | |
476 fflush(stdout); | |
477 fflush(stderr); | |
478 getchar(); | |
479 } | |
480 } | |
481 | |
482 | |
// Fills the first n entries of data with rand() / RAND_MAX, i.e. values in
// [0, 1]. The caller owns the storage; nothing is allocated here.
void RandomInit(float *data, int n)
{
    const float denominator = (float)RAND_MAX;
    int idx = 0;

    while (idx < n)
    {
        data[idx] = rand() / denominator;
        ++idx;
    }
}
491 | |
492 bool inline | |
493 findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source) | |
494 { | |
495 char *actual_path = sdkFindFilePath(module_file, argv[0]); | |
496 | |
497 if (actual_path) | |
498 { | |
499 module_path = actual_path; | |
500 } | |
501 else | |
502 { | |
503 printf("> findModulePath file not found: <%s> \n", module_file); | |
504 return false; | |
505 } | |
506 | |
507 if (module_path.empty()) | |
508 { | |
509 printf("> findModulePath could not find file: <%s> \n", module_file); | |
510 return false; | |
511 } | |
512 else | |
513 { | |
514 printf("> findModulePath found file at <%s>\n", module_path.c_str()); | |
515 | |
516 if (module_path.rfind(".ptx") != string::npos) | |
517 { | |
518 FILE *fp = fopen(module_path.c_str(), "rb"); | |
519 fseek(fp, 0, SEEK_END); | |
520 int file_size = ftell(fp); | |
521 char *buf = new char[file_size+1]; | |
522 fseek(fp, 0, SEEK_SET); | |
523 fread(buf, sizeof(char), file_size, fp); | |
524 fclose(fp); | |
525 buf[file_size] = '\0'; | |
526 ptx_source = buf; | |
527 delete[] buf; | |
528 } | |
529 | |
530 return true; | |
531 } | |
532 } | |
533 | |
534 // Parse program arguments | |
535 void ParseArguments(int argc, char **argv) | |
536 { | |
537 for (int i = 0; i < argc; ++i) | |
538 { | |
539 if (strcmp(argv[i], "--noprompt") == 0 || | |
540 strcmp(argv[i], "-noprompt") == 0) | |
541 { | |
542 noprompt = true; | |
543 break; | |
544 } | |
545 } | |
546 } |