/*
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 * Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two memcopies
 * with kernel execution. This sample illustrates the usage of CUDA streams to
 * achieve overlapping of kernel execution with copying data to and from the device.
 *
 * Additionally, this sample uses CUDA events to measure elapsed time for
 * CUDA calls. Events are a part of the CUDA API and provide a system-independent
 * way to measure execution times on CUDA devices with approximately 0.5
 * microsecond precision.
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
 */
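
/*
 * The event-based timing pattern used throughout this sample looks roughly
 * like the following minimal sketch (error checking omitted):
 *
 *     cudaEventRecord(start, 0);
 *     ... asynchronous copy or kernel launch on stream 0 ...
 *     cudaEventRecord(stop, 0);
 *     cudaEventSynchronize(stop);
 *     float ms;
 *     cudaEventElapsedTime(&ms, start, stop);
 */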

const char *sSDKname = "simpleMultiCopy";

// includes, system
#include <stdio.h>

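// test1() is declared here but not defined in this file; it is assumed to be
// provided by a separate translation unit linked with this sample.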
extern "C" {
extern void test1();
}
// include CUDA
#include <cuda.h>
#include <cuda_runtime.h>

// includes, project
//#include <helper_cuda.h>
//#include <helper_functions.h>  // helpers for shared functions that are common to CUDA Samples

#include "helper_cuda.h"

// includes, kernels
// The CUDA kernel declared here is the compute workload that main() launches
// on the system.
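// incKernel is a deliberately lightweight, memory-bound workload: repeating
// the increment inner_reps times lengthens kernel execution so that its
// overlap with the host<->device copies is visible in the measurements.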
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N)
    {
        for (int i = 0; i < inner_reps; ++i)
        {
            g_out[idx] = g_in[idx] + 1;
        }
    }
}

#define STREAM_COUNT 4

// Uncomment to simulate data source/sink IO times
//#define SIMULATE_IO

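// h_data_source/h_data_sink stand in for an external data source and sink.
// Each of the STREAM_COUNT pipeline slots additionally gets its own pinned
// host buffers, device buffers, CUDA stream, and "cycle done" event, so the
// loop in processWithStreams() can rotate through the slots without reusing a
// buffer that an earlier, not-yet-finished cycle is still working on.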
int *h_data_source;
int *h_data_sink;

int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];

int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];

cudaEvent_t start, stop;

int N = 1 << 22;
int nreps = 10;       // number of times each experiment is repeated
int inner_reps = 5;

int memsize;

dim3 block(512);
dim3 grid;

int thread_blocks;

float processWithStreams(int streams_used);
void init();
bool test();

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    int cuda_device = 0;
    float scale_factor;
    cudaDeviceProp deviceProp;

    test1();
    printf("[%s] - Starting...\n", sSDKname);

    // Use CUDA device 0 (this version of the sample does not pick the device
    // with the highest Gflops/s)
    cuda_device = 0;
    checkCudaErrors(cudaSetDevice(cuda_device));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
    printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.name, deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    // Anything with fewer than 32 cores gets a scaled-down workload
    scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
    N = (int)((float)N / scale_factor);

    printf("> Device name: %s\n", deviceProp.name);
    printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor,
           deviceProp.multiProcessorCount);
    printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
    printf("> array_size = %d\n\n", N);

    memsize = N * sizeof(int);

    thread_blocks = N / block.x;

    grid.x = thread_blocks % 65535;
    grid.y = (thread_blocks / 65535 + 1);
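
    // Note: grid.x of a kernel launch is limited to 65535 on devices of
    // compute capability < 3.0, hence thread_blocks is folded into grid.x and
    // grid.y. For the default N (4M elements / 512 threads = 8192 blocks),
    // grid.y stays 1 and the kernel's 1D indexing covers the whole array.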

    // Allocate resources

    h_data_source = (int *) malloc(memsize);
    h_data_sink = (int *) malloc(memsize);

    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize,
                                      cudaHostAllocDefault));
        checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));

        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize,
                                      cudaHostAllocDefault));
        checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

        checkCudaErrors(cudaStreamCreate(&stream[i]));
        checkCudaErrors(cudaEventCreate(&cycleDone[i]));

        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    init();

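    // The first launch of a kernel carries one-time overhead (e.g. lazy
    // context/module initialization), so a warmup launch is issued here to
    // keep that cost out of the timed runs below.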
    // Kernel warmup
    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

    // Time copies and kernel
    cudaEventRecord(start, 0);
    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
                                    cudaMemcpyHostToDevice, 0));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_h2d_time;
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    cudaEventRecord(start, 0);
    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
                                    cudaMemcpyDeviceToHost, 0));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_d2h_time;
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

    cudaEventRecord(start, 0);
    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float kernel_time;
    cudaEventElapsedTime(&kernel_time, start, stop);

    printf("\n");
    printf("Relevant properties of this CUDA device\n");
    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution (device property \"deviceOverlap\")\n", deviceProp.deviceOverlap ? "X" : " ");
    //printf("(%s) Can execute several GPU kernels simultaneously (compute capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
           "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro 4000/5000/6000/K5000))\n",
           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1)
           ? "X" : " ");
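
    // asyncEngineCount reports the number of copy engines: a value of 1 means
    // one transfer can overlap with kernel execution; a value of 2 means an
    // H2D and a D2H transfer can both be in flight while a kernel runs.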

    printf("\n");
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n",
           memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n",
           memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n",
           kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);

    printf("\n");
    printf("Theoretical limits for speedup gained from overlapped data transfers:\n");
    printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
           memcpy_h2d_time + memcpy_d2h_time + kernel_time);
    printf("Compute can overlap with one transfer: %f ms\n",
           max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
    printf("Compute can overlap with both data transfers: %f ms\n",
           max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));
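
    // Rationale for the bounds above: with a single copy engine the H2D and
    // D2H transfers share one queue, so the best case per cycle is
    // max(h2d + d2h, kernel); with two copy engines each direction overlaps
    // independently, so the best case per cycle is max(h2d, d2h, kernel).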

    // Process pipelined work
    float serial_time = processWithStreams(1);
    float overlap_time = processWithStreams(STREAM_COUNT);

    printf("\nAverage measured timings over %d repetitions:\n", nreps);
    printf(" Avg. time when execution fully serialized\t: %f ms\n",
           serial_time / nreps);
    printf(" Avg. time when overlapped using %d streams\t: %f ms\n",
           STREAM_COUNT, overlap_time / nreps);
    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
           (serial_time - overlap_time) / nreps);

    printf("\nMeasured throughput:\n");
    printf(" Fully serialized execution\t\t: %f GB/s\n",
           (nreps * (memsize * 2e-6)) / serial_time);
    printf(" Overlapped using %d streams\t\t: %f GB/s\n",
           STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);

    // Verify the results; they determine the final exit status
    bool bResults = test();

    // Free resources
    free(h_data_source);
    free(h_data_sink);

    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);

        cudaFreeHost(h_data_out[i]);
        cudaFree(d_data_out[i]);

        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Test result
    exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

float processWithStreams(int streams_used)
{
    int current_stream = 0;

    float time;

    // Do processing in a loop
    //
    // Note: All memory commands are processed in the order they are issued,
    // independent of the stream they are enqueued in. Hence the pattern by
    // which the copy and kernel commands are enqueued in the stream
    // has an influence on the achieved overlap.
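    //
    // Issue order per iteration of the loop below:
    //   1. wait on the host until next_stream's previous cycle has finished
    //   2. launch the kernel for current_stream      -> stream[current_stream]
    //   3. upload the next input frame               -> stream[next_stream]
    //   4. download the current output frame         -> stream[current_stream]
    //   5. record cycleDone[current_stream]          -> stream[current_stream]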

    cudaEventRecord(start, 0);

    for (int i = 0; i < nreps; ++i)
    {
        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
        // Store the result
        memcpy(h_data_sink, h_data_out[current_stream], memsize);

        // Read new input
        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

        // Ensure that processing and copying of the last cycle has finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        incKernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_out[current_stream],
            d_data_in[current_stream],
            N,
            inner_reps);

        // Upload next frame
        checkCudaErrors(cudaMemcpyAsync(
                            d_data_in[next_stream],
                            h_data_in[next_stream],
                            memsize,
                            cudaMemcpyHostToDevice,
                            stream[next_stream]));

        // Download current frame
        checkCudaErrors(cudaMemcpyAsync(
                            h_data_out[current_stream],
                            d_data_out[current_stream],
                            memsize,
                            cudaMemcpyDeviceToHost,
                            stream[current_stream]));

        checkCudaErrors(cudaEventRecord(
                            cycleDone[current_stream],
                            stream[current_stream]));

        current_stream = next_stream;
    }

    cudaEventRecord(stop, 0);

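    // Wait until all preceding work, including the stop event, has completed,
    // so that the elapsed-time query below sees a finished event.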
    cudaDeviceSynchronize();

    cudaEventElapsedTime(&time, start, stop);

    return time;
}

void init()
{
    for (int i = 0; i < N; ++i)
    {
        h_data_source[i] = 0;
    }

    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        memcpy(h_data_in[i], h_data_source, memsize);
    }
}

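// Verify that every element of every per-stream output buffer equals 1: the
// inputs are initialized to 0 and incKernel writes input + 1, so any other
// value indicates that a copy or a kernel launch went wrong.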
bool test()
{
    bool passed = true;

    for (int j = 0; j < STREAM_COUNT; ++j)
    {
        for (int i = 0; i < N; ++i)
        {
            passed &= (h_data_out[j][i] == 1);
        }
    }

    return passed;
}