comparison src/test/vectorAddDrv.cc @ 297:b46398081fe4

add working example
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Sat, 11 Feb 2017 10:55:36 +0900
parents
children 8bbc0012e1a4
/*
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/* Vector addition: C = A + B.
 *
 * This is a very basic sample that implements element-by-element vector
 * addition. It is the same as the sample illustrating Chapter 3 of the
 * programming guide, with some additions such as error checking.
 *
 */
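
// For reference, a minimal sketch of the device code this host program expects to find in the
// PTX/CUBIN module. This sketch is an assumption based on the kernel name and arguments used
// below (the module must export a kernel named "VecAdd_kernel" taking A, B, C and N); it is
// not part of this file.
//
//   extern "C" __global__ void VecAdd_kernel(const float *A, const float *B,
//                                            float *C, int N)
//   {
//       int i = blockDim.x * blockIdx.x + threadIdx.x;
//
//       if (i < N)
//       {
//           C[i] = A[i] + B[i];
//       }
//   }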

// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <math.h>

// includes, project
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "helper_cuda.h"

// includes, CUDA
#include <builtin_types.h>

#define PTX_FILE "vectorAdd_kernel.ptx"
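
// The PTX module is assumed to be built separately from the kernel source. A sketch of that
// build step, assuming the kernel lives in a file named vectorAdd_kernel.cu (the .cu file name
// is an assumption; only the PTX name above is known here):
//
//   nvcc -ptx vectorAdd_kernel.cu -o vectorAdd_kernel.ptx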


using namespace std;

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;
float *h_A;
float *h_B;
float *h_C;
CUdeviceptr d_A;
CUdeviceptr d_B;
CUdeviceptr d_C;
bool noprompt = false;

// Functions
void Cleanup(bool);
CUresult CleanupNoFailure();
void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, string &);
void ParseArguments(int, char **);

int *pArgc = NULL;
char **pArgv = NULL;


// Host code
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("Vector Addition (Driver API)\n");
    int N = 50000, devID = 0;
    size_t size = N * sizeof(float);

    CUresult error;
    ParseArguments(argc, argv);

    // Initialize
    checkCudaErrors(cuInit(0));

    // This assumes that the user is attempting to specify an explicit device -device=n
    if (argc > 1)
    {
        bool bFound = false;

        for (int param=0; param < argc; param++)
        {
            int string_start = 0;

            while (argv[param][string_start] == '-')
            {
                string_start++;
            }

            char *string_argv = &argv[param][string_start];

            if (!strncmp(string_argv, "device", 6))
            {
                int len=(int)strlen(string_argv);

                while (string_argv[len] != '=')
                {
                    len--;
                }

                devID = atoi(&string_argv[++len]);
                bFound = true;
            }

            if (bFound)
            {
                break;
            }
        }
    }

    // Get number of devices supporting CUDA
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    if (deviceCount == 0)
    {
        printf("There is no device supporting CUDA.\n");
        Cleanup(false);
    }

    if (devID < 0)
    {
        devID = 0;
    }

    if (devID > deviceCount-1)
    {
        fprintf(stderr, "(Device=%d) invalid GPU device. %d GPU device(s) detected.\nexiting...\n", devID, deviceCount);
        CleanupNoFailure();
        exit(EXIT_SUCCESS);
    }
    else
    {
        int major, minor;
        char deviceName[100];
        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID));
        checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), devID));
        printf("> Using Device %d: \"%s\" with Compute %d.%d capability\n", devID, deviceName, major, minor);
    }

    // Pick up the requested device (devID defaults to ordinal zero)
    error = cuDeviceGet(&cuDevice, devID);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    // Create context
    error = cuCtxCreate(&cuContext, 0, cuDevice);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    // first search for the module path before we load the results
    string module_path, ptx_source;

    if (!findModulePath(PTX_FILE, module_path, argv, ptx_source))
    {
        if (!findModulePath("vectorAdd_kernel.cubin", module_path, argv, ptx_source))
        {
            printf("> findModulePath could not find <vectorAdd> ptx or cubin\n");
            Cleanup(false);
        }
    }
    else
    {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    // Create module from binary file (PTX or CUBIN)
    if (module_path.rfind("ptx") != string::npos)
    {
        // in this branch we use compilation with parameters
        const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void *[jitNumOptions];

        // set up size of compilation log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

        // set up pointer to the compilation log buffer
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitLogBuffer[0] = '\0';  // make sure the log prints cleanly even if the JIT writes nothing
        jitOptVals[1] = jitLogBuffer;

        // set up pointer to set the Maximum # of registers for a particular kernel
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2] = (void *)(size_t)jitRegCount;

        error = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);

        printf("> PTX JIT log:\n%s\n", jitLogBuffer);
    }
    else
    {
        error = cuModuleLoad(&cuModule, module_path.c_str());
    }

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    // Get function handle from module
    error = cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel");

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float *)malloc(size);

    if (h_A == 0)
    {
        Cleanup(false);
    }

    h_B = (float *)malloc(size);

    if (h_B == 0)
    {
        Cleanup(false);
    }

    h_C = (float *)malloc(size);

    if (h_C == 0)
    {
        Cleanup(false);
    }

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Allocate vectors in device memory
    error = cuMemAlloc(&d_A, size);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    error = cuMemAlloc(&d_B, size);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    error = cuMemAlloc(&d_C, size);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    // Copy vectors from host memory to device memory
    error = cuMemcpyHtoD(d_A, h_A, size);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    error = cuMemcpyHtoD(d_B, h_B, size);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

#if 1

    if (1)
    {
        // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch (simpler method)

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
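        // e.g. with the defaults above (N = 50000, threadsPerBlock = 256) this rounds up to 196 blocks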

        void *args[] = { &d_A, &d_B, &d_C, &N };

        // Launch the CUDA kernel
        error = cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0,
                               NULL, args, NULL);

        if (error != CUDA_SUCCESS)
        {
            Cleanup(false);
        }
    }
    else
    {
        // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch (advanced method):
        // the arguments are packed into a byte buffer and handed to cuLaunchKernel through the
        // "extra" parameter (kernelParams must then be NULL).
        size_t offset = 0;
        char argBuffer[256];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = N;
        offset += sizeof(N);

        void *kernel_launch_config[] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
            CU_LAUNCH_PARAM_END
        };

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the CUDA kernel
        error = cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0,
                               NULL, NULL, kernel_launch_config);

        if (error != CUDA_SUCCESS)
        {
            Cleanup(false);
        }
    }

#else
    {
        char argBuffer[256];
        size_t offset = 0;

        // pass in launch parameters (not actually de-referencing CUdeviceptr; the CUdeviceptr
        // values themselves are stored in the buffer as the parameters)
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = N;
        offset += sizeof(N);

        void *kernel_launch_config[5] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,  // must point to a size_t
            CU_LAUNCH_PARAM_END
        };

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the CUDA kernel
        error = cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0, 0,
                               NULL, (void **)&kernel_launch_config);

        if (error != CUDA_SUCCESS)
        {
            Cleanup(false);
        }
    }
#endif

#ifdef _DEBUG
    error = cuCtxSynchronize();

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

#endif

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    error = cuMemcpyDtoH(h_C, d_C, size);

    if (error != CUDA_SUCCESS)
    {
        Cleanup(false);
    }

    // Verify result
    int i;

    for (i = 0; i < N; ++i)
    {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f)
        {
            break;
        }
    }

    printf("%s\n", (i==N) ? "Result = PASS" : "Result = FAIL");

    exit((i==N) ? EXIT_SUCCESS : EXIT_FAILURE);
}

CUresult CleanupNoFailure()
{
    CUresult error;

    // Free device memory
    if (d_A)
    {
        error = cuMemFree(d_A);
    }

    if (d_B)
    {
        error = cuMemFree(d_B);
    }

    if (d_C)
    {
        error = cuMemFree(d_C);
    }

    // Free host memory
    if (h_A)
    {
        free(h_A);
    }

    if (h_B)
    {
        free(h_B);
    }

    if (h_C)
    {
        free(h_C);
    }

    error = cuCtxDestroy(cuContext);

    return error;
}

void Cleanup(bool noError)
{
    CUresult error;
    error = CleanupNoFailure();

    if (!noError || error != CUDA_SUCCESS)
    {
        printf("Function call failed\nFAILED\n");
        exit(EXIT_FAILURE);
    }

    if (!noprompt)
    {
        printf("\nPress ENTER to exit...\n");
        fflush(stdout);
        fflush(stderr);
        getchar();
    }
}


// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i)
    {
        data[i] = rand() / (float)RAND_MAX;
    }
}

inline bool
findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]);

    if (actual_path)
    {
        module_path = actual_path;
    }
    else
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty())
    {
        printf("> findModulePath could not find file: <%s> \n", module_file);
        return false;
    }
    else
    {
        printf("> findModulePath found file at <%s>\n", module_path.c_str());

        if (module_path.rfind(".ptx") != string::npos)
        {
            FILE *fp = fopen(module_path.c_str(), "rb");

            if (fp == NULL)
            {
                printf("> findModulePath could not open file: <%s>\n", module_path.c_str());
                return false;
            }

            fseek(fp, 0, SEEK_END);
            int file_size = ftell(fp);
            char *buf = new char[file_size+1];
            fseek(fp, 0, SEEK_SET);
            size_t bytes_read = fread(buf, sizeof(char), file_size, fp);
            fclose(fp);
            buf[bytes_read] = '\0';
            ptx_source = buf;
            delete[] buf;
        }

        return true;
    }
}

// Parse program arguments
void ParseArguments(int argc, char **argv)
{
    for (int i = 0; i < argc; ++i)
    {
        if (strcmp(argv[i], "--noprompt") == 0 ||
            strcmp(argv[i], "-noprompt") == 0)
        {
            noprompt = true;
            break;
        }
    }
}