annotate src/test/main.cu @ 408:8ee89eefbc6d

Fix twice
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Wed, 06 Sep 2017 21:54:22 +0900
parents 87128b876c63
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
291
87128b876c63 add test
ikkun
parents:
diff changeset
1 /*
87128b876c63 add test
ikkun
parents:
diff changeset
2 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
87128b876c63 add test
ikkun
parents:
diff changeset
3 *
87128b876c63 add test
ikkun
parents:
diff changeset
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
87128b876c63 add test
ikkun
parents:
diff changeset
5 * with this source code for terms and conditions that govern your use of
87128b876c63 add test
ikkun
parents:
diff changeset
6 * this software. Any use, reproduction, disclosure, or distribution of
87128b876c63 add test
ikkun
parents:
diff changeset
7 * this software and related documentation outside the terms of the EULA
87128b876c63 add test
ikkun
parents:
diff changeset
8 * is strictly prohibited.
87128b876c63 add test
ikkun
parents:
diff changeset
9 *
87128b876c63 add test
ikkun
parents:
diff changeset
10 */
87128b876c63 add test
ikkun
parents:
diff changeset
11
87128b876c63 add test
ikkun
parents:
diff changeset
12 /*
87128b876c63 add test
ikkun
parents:
diff changeset
13 * Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two memcopies
87128b876c63 add test
ikkun
parents:
diff changeset
14 * with kernel execution. This sample illustrates the usage of CUDA streams to
87128b876c63 add test
ikkun
parents:
diff changeset
15 * achieve overlapping of kernel execution with copying data to and from the device.
87128b876c63 add test
ikkun
parents:
diff changeset
16 *
87128b876c63 add test
ikkun
parents:
diff changeset
17 * Additionally, this sample uses CUDA events to measure elapsed time for
87128b876c63 add test
ikkun
parents:
diff changeset
18 * CUDA calls. Events are a part of CUDA API and provide a system independent
87128b876c63 add test
ikkun
parents:
diff changeset
19 * way to measure execution times on CUDA devices with approximately 0.5
87128b876c63 add test
ikkun
parents:
diff changeset
20 * microsecond precision.
87128b876c63 add test
ikkun
parents:
diff changeset
21 *
87128b876c63 add test
ikkun
parents:
diff changeset
22 * Elapsed times are averaged over nreps repetitions (10 by default).
87128b876c63 add test
ikkun
parents:
diff changeset
23 *
87128b876c63 add test
ikkun
parents:
diff changeset
24 */
87128b876c63 add test
ikkun
parents:
diff changeset
25
87128b876c63 add test
ikkun
parents:
diff changeset
26 const char *sSDKname = "simpleMultiCopy";
87128b876c63 add test
ikkun
parents:
diff changeset
27
87128b876c63 add test
ikkun
parents:
diff changeset
28 // includes, system
87128b876c63 add test
ikkun
parents:
diff changeset
29 #include <stdio.h>
87128b876c63 add test
ikkun
parents:
diff changeset
30
87128b876c63 add test
ikkun
parents:
diff changeset
31 extern "C" {
87128b876c63 add test
ikkun
parents:
diff changeset
32 extern void test1();
87128b876c63 add test
ikkun
parents:
diff changeset
33 }
87128b876c63 add test
ikkun
parents:
diff changeset
34 // include CUDA
87128b876c63 add test
ikkun
parents:
diff changeset
35 #include <cuda.h>
87128b876c63 add test
ikkun
parents:
diff changeset
36 #include <cuda_runtime.h>
87128b876c63 add test
ikkun
parents:
diff changeset
37
87128b876c63 add test
ikkun
parents:
diff changeset
38 // includes, project
87128b876c63 add test
ikkun
parents:
diff changeset
39 //#include <helper_cuda.h>
87128b876c63 add test
ikkun
parents:
diff changeset
40 //#include <helper_functions.h> // helper for shared that are common to CUDA Samples
87128b876c63 add test
ikkun
parents:
diff changeset
41
87128b876c63 add test
ikkun
parents:
diff changeset
42 #include "helper_cuda.h"
87128b876c63 add test
ikkun
parents:
diff changeset
43
87128b876c63 add test
ikkun
parents:
diff changeset
44 // includes, kernels
87128b876c63 add test
ikkun
parents:
diff changeset
45 // Declare the CUDA kernels here and main() code that is needed to launch
87128b876c63 add test
ikkun
parents:
diff changeset
46 // Compute workload on the system
87128b876c63 add test
ikkun
parents:
diff changeset
47 __global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
87128b876c63 add test
ikkun
parents:
diff changeset
48 {
87128b876c63 add test
ikkun
parents:
diff changeset
49 int idx = blockIdx.x * blockDim.x + threadIdx.x;
87128b876c63 add test
ikkun
parents:
diff changeset
50
87128b876c63 add test
ikkun
parents:
diff changeset
51 if (idx < N)
87128b876c63 add test
ikkun
parents:
diff changeset
52 {
87128b876c63 add test
ikkun
parents:
diff changeset
53 for (int i=0; i<inner_reps; ++i)
87128b876c63 add test
ikkun
parents:
diff changeset
54 {
87128b876c63 add test
ikkun
parents:
diff changeset
55 g_out[idx] = g_in[idx] + 1;
87128b876c63 add test
ikkun
parents:
diff changeset
56 }
87128b876c63 add test
ikkun
parents:
diff changeset
57 }
87128b876c63 add test
ikkun
parents:
diff changeset
58 }
87128b876c63 add test
ikkun
parents:
diff changeset
59
87128b876c63 add test
ikkun
parents:
diff changeset
60 #define STREAM_COUNT 4
87128b876c63 add test
ikkun
parents:
diff changeset
61
87128b876c63 add test
ikkun
parents:
diff changeset
62 // Uncomment to simulate data source/sink IO times
87128b876c63 add test
ikkun
parents:
diff changeset
63 //#define SIMULATE_IO
87128b876c63 add test
ikkun
parents:
diff changeset
64
87128b876c63 add test
ikkun
parents:
diff changeset
65 int *h_data_source;
87128b876c63 add test
ikkun
parents:
diff changeset
66 int *h_data_sink;
87128b876c63 add test
ikkun
parents:
diff changeset
67
87128b876c63 add test
ikkun
parents:
diff changeset
68 int *h_data_in[STREAM_COUNT];
87128b876c63 add test
ikkun
parents:
diff changeset
69 int *d_data_in[STREAM_COUNT];
87128b876c63 add test
ikkun
parents:
diff changeset
70
87128b876c63 add test
ikkun
parents:
diff changeset
71 int *h_data_out[STREAM_COUNT];
87128b876c63 add test
ikkun
parents:
diff changeset
72 int *d_data_out[STREAM_COUNT];
87128b876c63 add test
ikkun
parents:
diff changeset
73
87128b876c63 add test
ikkun
parents:
diff changeset
74
87128b876c63 add test
ikkun
parents:
diff changeset
75 cudaEvent_t cycleDone[STREAM_COUNT];
87128b876c63 add test
ikkun
parents:
diff changeset
76 cudaStream_t stream[STREAM_COUNT];
87128b876c63 add test
ikkun
parents:
diff changeset
77
87128b876c63 add test
ikkun
parents:
diff changeset
78 cudaEvent_t start, stop;
87128b876c63 add test
ikkun
parents:
diff changeset
79
87128b876c63 add test
ikkun
parents:
diff changeset
80 int N = 1 << 22;
87128b876c63 add test
ikkun
parents:
diff changeset
81 int nreps = 10; // number of times each experiment is repeated
87128b876c63 add test
ikkun
parents:
diff changeset
82 int inner_reps = 5;
87128b876c63 add test
ikkun
parents:
diff changeset
83
87128b876c63 add test
ikkun
parents:
diff changeset
84 int memsize;
87128b876c63 add test
ikkun
parents:
diff changeset
85
87128b876c63 add test
ikkun
parents:
diff changeset
86 dim3 block(512);
87128b876c63 add test
ikkun
parents:
diff changeset
87 dim3 grid;
87128b876c63 add test
ikkun
parents:
diff changeset
88
87128b876c63 add test
ikkun
parents:
diff changeset
89 int thread_blocks;
87128b876c63 add test
ikkun
parents:
diff changeset
90
87128b876c63 add test
ikkun
parents:
diff changeset
91 float processWithStreams(int streams_used);
87128b876c63 add test
ikkun
parents:
diff changeset
92 void init();
87128b876c63 add test
ikkun
parents:
diff changeset
93 bool test();
87128b876c63 add test
ikkun
parents:
diff changeset
94
87128b876c63 add test
ikkun
parents:
diff changeset
95 ////////////////////////////////////////////////////////////////////////////////
87128b876c63 add test
ikkun
parents:
diff changeset
96 // Program main
87128b876c63 add test
ikkun
parents:
diff changeset
97 ////////////////////////////////////////////////////////////////////////////////
87128b876c63 add test
ikkun
parents:
diff changeset
98 int main(int argc, char *argv[])
87128b876c63 add test
ikkun
parents:
diff changeset
99 {
87128b876c63 add test
ikkun
parents:
diff changeset
100 int cuda_device = 0;
87128b876c63 add test
ikkun
parents:
diff changeset
101 float scale_factor;
87128b876c63 add test
ikkun
parents:
diff changeset
102 cudaDeviceProp deviceProp;
87128b876c63 add test
ikkun
parents:
diff changeset
103
87128b876c63 add test
ikkun
parents:
diff changeset
104 test1();
87128b876c63 add test
ikkun
parents:
diff changeset
105 printf("[%s] - Starting...\n", sSDKname);
87128b876c63 add test
ikkun
parents:
diff changeset
106
87128b876c63 add test
ikkun
parents:
diff changeset
107 // Otherwise pick the device with the highest Gflops/s
87128b876c63 add test
ikkun
parents:
diff changeset
108 cuda_device = 0;
87128b876c63 add test
ikkun
parents:
diff changeset
109 checkCudaErrors(cudaSetDevice(cuda_device));
87128b876c63 add test
ikkun
parents:
diff changeset
110 checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
87128b876c63 add test
ikkun
parents:
diff changeset
111 printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
87128b876c63 add test
ikkun
parents:
diff changeset
112
87128b876c63 add test
ikkun
parents:
diff changeset
113 checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
87128b876c63 add test
ikkun
parents:
diff changeset
114 printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
87128b876c63 add test
ikkun
parents:
diff changeset
115 deviceProp.name, deviceProp.multiProcessorCount,
87128b876c63 add test
ikkun
parents:
diff changeset
116 _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
87128b876c63 add test
ikkun
parents:
diff changeset
117 _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
87128b876c63 add test
ikkun
parents:
diff changeset
118
87128b876c63 add test
ikkun
parents:
diff changeset
119 // Anything that is less than 32 Cores will have scaled down workload
87128b876c63 add test
ikkun
parents:
diff changeset
120 scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
87128b876c63 add test
ikkun
parents:
diff changeset
121 N = (int)((float)N / scale_factor);
87128b876c63 add test
ikkun
parents:
diff changeset
122
87128b876c63 add test
ikkun
parents:
diff changeset
123 printf("> Device name: %s\n", deviceProp.name);
87128b876c63 add test
ikkun
parents:
diff changeset
124 printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
87128b876c63 add test
ikkun
parents:
diff changeset
125 deviceProp.major, deviceProp.minor,
87128b876c63 add test
ikkun
parents:
diff changeset
126 deviceProp.multiProcessorCount);
87128b876c63 add test
ikkun
parents:
diff changeset
127 printf("> scale_factor = %.2f\n", 1.0f/scale_factor);
87128b876c63 add test
ikkun
parents:
diff changeset
128 printf("> array_size = %d\n\n", N);
87128b876c63 add test
ikkun
parents:
diff changeset
129
87128b876c63 add test
ikkun
parents:
diff changeset
130 memsize = N * sizeof(int);
87128b876c63 add test
ikkun
parents:
diff changeset
131
87128b876c63 add test
ikkun
parents:
diff changeset
132 thread_blocks = N / block.x;
87128b876c63 add test
ikkun
parents:
diff changeset
133
87128b876c63 add test
ikkun
parents:
diff changeset
134 grid.x = thread_blocks % 65535;
87128b876c63 add test
ikkun
parents:
diff changeset
135 grid.y = (thread_blocks / 65535 + 1);
87128b876c63 add test
ikkun
parents:
diff changeset
136
87128b876c63 add test
ikkun
parents:
diff changeset
137
87128b876c63 add test
ikkun
parents:
diff changeset
138 // Allocate resources
87128b876c63 add test
ikkun
parents:
diff changeset
139
87128b876c63 add test
ikkun
parents:
diff changeset
140 h_data_source = (int *) malloc(memsize);
87128b876c63 add test
ikkun
parents:
diff changeset
141 h_data_sink = (int *) malloc(memsize);
87128b876c63 add test
ikkun
parents:
diff changeset
142
87128b876c63 add test
ikkun
parents:
diff changeset
143 for (int i =0; i<STREAM_COUNT; ++i)
87128b876c63 add test
ikkun
parents:
diff changeset
144 {
87128b876c63 add test
ikkun
parents:
diff changeset
145
87128b876c63 add test
ikkun
parents:
diff changeset
146 checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize,
87128b876c63 add test
ikkun
parents:
diff changeset
147 cudaHostAllocDefault));
87128b876c63 add test
ikkun
parents:
diff changeset
148 checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
87128b876c63 add test
ikkun
parents:
diff changeset
149
87128b876c63 add test
ikkun
parents:
diff changeset
150 checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize,
87128b876c63 add test
ikkun
parents:
diff changeset
151 cudaHostAllocDefault));
87128b876c63 add test
ikkun
parents:
diff changeset
152 checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
87128b876c63 add test
ikkun
parents:
diff changeset
153
87128b876c63 add test
ikkun
parents:
diff changeset
154 checkCudaErrors(cudaStreamCreate(&stream[i]));
87128b876c63 add test
ikkun
parents:
diff changeset
155 checkCudaErrors(cudaEventCreate(&cycleDone[i]));
87128b876c63 add test
ikkun
parents:
diff changeset
156
87128b876c63 add test
ikkun
parents:
diff changeset
157 cudaEventRecord(cycleDone[i], stream[i]);
87128b876c63 add test
ikkun
parents:
diff changeset
158 }
87128b876c63 add test
ikkun
parents:
diff changeset
159
87128b876c63 add test
ikkun
parents:
diff changeset
160 cudaEventCreate(&start);
87128b876c63 add test
ikkun
parents:
diff changeset
161 cudaEventCreate(&stop);
87128b876c63 add test
ikkun
parents:
diff changeset
162
87128b876c63 add test
ikkun
parents:
diff changeset
163 init();
87128b876c63 add test
ikkun
parents:
diff changeset
164
87128b876c63 add test
ikkun
parents:
diff changeset
165 // Kernel warmup
87128b876c63 add test
ikkun
parents:
diff changeset
166 incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
87128b876c63 add test
ikkun
parents:
diff changeset
167
87128b876c63 add test
ikkun
parents:
diff changeset
168
87128b876c63 add test
ikkun
parents:
diff changeset
169 // Time copies and kernel
87128b876c63 add test
ikkun
parents:
diff changeset
170 cudaEventRecord(start,0);
87128b876c63 add test
ikkun
parents:
diff changeset
171 checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
87128b876c63 add test
ikkun
parents:
diff changeset
172 cudaMemcpyHostToDevice,0));
87128b876c63 add test
ikkun
parents:
diff changeset
173 cudaEventRecord(stop,0);
87128b876c63 add test
ikkun
parents:
diff changeset
174 cudaEventSynchronize(stop);
87128b876c63 add test
ikkun
parents:
diff changeset
175
87128b876c63 add test
ikkun
parents:
diff changeset
176 float memcpy_h2d_time;
87128b876c63 add test
ikkun
parents:
diff changeset
177 cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
87128b876c63 add test
ikkun
parents:
diff changeset
178
87128b876c63 add test
ikkun
parents:
diff changeset
179 cudaEventRecord(start,0);
87128b876c63 add test
ikkun
parents:
diff changeset
180 checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
87128b876c63 add test
ikkun
parents:
diff changeset
181 cudaMemcpyDeviceToHost, 0));
87128b876c63 add test
ikkun
parents:
diff changeset
182 cudaEventRecord(stop,0);
87128b876c63 add test
ikkun
parents:
diff changeset
183 cudaEventSynchronize(stop);
87128b876c63 add test
ikkun
parents:
diff changeset
184
87128b876c63 add test
ikkun
parents:
diff changeset
185 float memcpy_d2h_time;
87128b876c63 add test
ikkun
parents:
diff changeset
186 cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
87128b876c63 add test
ikkun
parents:
diff changeset
187
87128b876c63 add test
ikkun
parents:
diff changeset
188 cudaEventRecord(start,0);
87128b876c63 add test
ikkun
parents:
diff changeset
189 incKernel<<<grid, block,0,0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
87128b876c63 add test
ikkun
parents:
diff changeset
190 cudaEventRecord(stop,0);
87128b876c63 add test
ikkun
parents:
diff changeset
191 cudaEventSynchronize(stop);
87128b876c63 add test
ikkun
parents:
diff changeset
192
87128b876c63 add test
ikkun
parents:
diff changeset
193 float kernel_time;
87128b876c63 add test
ikkun
parents:
diff changeset
194 cudaEventElapsedTime(&kernel_time, start, stop);
87128b876c63 add test
ikkun
parents:
diff changeset
195
87128b876c63 add test
ikkun
parents:
diff changeset
196 printf("\n");
87128b876c63 add test
ikkun
parents:
diff changeset
197 printf("Relevant properties of this CUDA device\n");
87128b876c63 add test
ikkun
parents:
diff changeset
198 printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution (device property \"deviceOverlap\")\n", deviceProp.deviceOverlap ? "X" : " ");
87128b876c63 add test
ikkun
parents:
diff changeset
199 //printf("(%s) Can execute several GPU kernels simultaneously (compute capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
87128b876c63 add test
ikkun
parents:
diff changeset
200 printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
87128b876c63 add test
ikkun
parents:
diff changeset
201 " (Compute Capability >= 2.0 AND (Tesla product OR Quadro 4000/5000/6000/K5000)\n",
87128b876c63 add test
ikkun
parents:
diff changeset
202 (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1)
87128b876c63 add test
ikkun
parents:
diff changeset
203 ? "X" : " ");
87128b876c63 add test
ikkun
parents:
diff changeset
204
87128b876c63 add test
ikkun
parents:
diff changeset
205 printf("\n");
87128b876c63 add test
ikkun
parents:
diff changeset
206 printf("Measured timings (throughput):\n");
87128b876c63 add test
ikkun
parents:
diff changeset
207 printf(" Memcpy host to device\t: %f ms (%f GB/s)\n",
87128b876c63 add test
ikkun
parents:
diff changeset
208 memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time);
87128b876c63 add test
ikkun
parents:
diff changeset
209 printf(" Memcpy device to host\t: %f ms (%f GB/s)\n",
87128b876c63 add test
ikkun
parents:
diff changeset
210 memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time);
87128b876c63 add test
ikkun
parents:
diff changeset
211 printf(" Kernel\t\t\t: %f ms (%f GB/s)\n",
87128b876c63 add test
ikkun
parents:
diff changeset
212 kernel_time, (inner_reps *memsize * 2e-6)/ kernel_time);
87128b876c63 add test
ikkun
parents:
diff changeset
213
87128b876c63 add test
ikkun
parents:
diff changeset
214 printf("\n");
87128b876c63 add test
ikkun
parents:
diff changeset
215 printf("Theoretical limits for speedup gained from overlapped data transfers:\n");
87128b876c63 add test
ikkun
parents:
diff changeset
216 printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
87128b876c63 add test
ikkun
parents:
diff changeset
217 memcpy_h2d_time + memcpy_d2h_time + kernel_time);
87128b876c63 add test
ikkun
parents:
diff changeset
218 printf("Compute can overlap with one transfer: %f ms\n",
87128b876c63 add test
ikkun
parents:
diff changeset
219 max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
87128b876c63 add test
ikkun
parents:
diff changeset
220 printf("Compute can overlap with both data transfers: %f ms\n",
87128b876c63 add test
ikkun
parents:
diff changeset
221 max(max(memcpy_h2d_time,memcpy_d2h_time), kernel_time));
87128b876c63 add test
ikkun
parents:
diff changeset
222
87128b876c63 add test
ikkun
parents:
diff changeset
223 // Process pipelined work
87128b876c63 add test
ikkun
parents:
diff changeset
224 float serial_time = processWithStreams(1);
87128b876c63 add test
ikkun
parents:
diff changeset
225 float overlap_time = processWithStreams(STREAM_COUNT);
87128b876c63 add test
ikkun
parents:
diff changeset
226
87128b876c63 add test
ikkun
parents:
diff changeset
227 printf("\nAverage measured timings over %d repetitions:\n", nreps);
87128b876c63 add test
ikkun
parents:
diff changeset
228 printf(" Avg. time when execution fully serialized\t: %f ms\n",
87128b876c63 add test
ikkun
parents:
diff changeset
229 serial_time / nreps);
87128b876c63 add test
ikkun
parents:
diff changeset
230 printf(" Avg. time when overlapped using %d streams\t: %f ms\n",
87128b876c63 add test
ikkun
parents:
diff changeset
231 STREAM_COUNT, overlap_time / nreps);
87128b876c63 add test
ikkun
parents:
diff changeset
232 printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
87128b876c63 add test
ikkun
parents:
diff changeset
233 (serial_time - overlap_time) / nreps);
87128b876c63 add test
ikkun
parents:
diff changeset
234
87128b876c63 add test
ikkun
parents:
diff changeset
235 printf("\nMeasured throughput:\n");
87128b876c63 add test
ikkun
parents:
diff changeset
236 printf(" Fully serialized execution\t\t: %f GB/s\n",
87128b876c63 add test
ikkun
parents:
diff changeset
237 (nreps * (memsize * 2e-6))/ serial_time);
87128b876c63 add test
ikkun
parents:
diff changeset
238 printf(" Overlapped using %d streams\t\t: %f GB/s\n",
87128b876c63 add test
ikkun
parents:
diff changeset
239 STREAM_COUNT, (nreps * (memsize * 2e-6))/ overlap_time);
87128b876c63 add test
ikkun
parents:
diff changeset
240
87128b876c63 add test
ikkun
parents:
diff changeset
241 // Verify the results, we will use the results for final output
87128b876c63 add test
ikkun
parents:
diff changeset
242 bool bResults = test();
87128b876c63 add test
ikkun
parents:
diff changeset
243
87128b876c63 add test
ikkun
parents:
diff changeset
244 // Free resources
87128b876c63 add test
ikkun
parents:
diff changeset
245
87128b876c63 add test
ikkun
parents:
diff changeset
246 free(h_data_source);
87128b876c63 add test
ikkun
parents:
diff changeset
247 free(h_data_sink);
87128b876c63 add test
ikkun
parents:
diff changeset
248
87128b876c63 add test
ikkun
parents:
diff changeset
249 for (int i =0; i<STREAM_COUNT; ++i)
87128b876c63 add test
ikkun
parents:
diff changeset
250 {
87128b876c63 add test
ikkun
parents:
diff changeset
251
87128b876c63 add test
ikkun
parents:
diff changeset
252 cudaFreeHost(h_data_in[i]);
87128b876c63 add test
ikkun
parents:
diff changeset
253 cudaFree(d_data_in[i]);
87128b876c63 add test
ikkun
parents:
diff changeset
254
87128b876c63 add test
ikkun
parents:
diff changeset
255 cudaFreeHost(h_data_out[i]);
87128b876c63 add test
ikkun
parents:
diff changeset
256 cudaFree(d_data_out[i]);
87128b876c63 add test
ikkun
parents:
diff changeset
257
87128b876c63 add test
ikkun
parents:
diff changeset
258 cudaStreamDestroy(stream[i]);
87128b876c63 add test
ikkun
parents:
diff changeset
259 cudaEventDestroy(cycleDone[i]);
87128b876c63 add test
ikkun
parents:
diff changeset
260 }
87128b876c63 add test
ikkun
parents:
diff changeset
261
87128b876c63 add test
ikkun
parents:
diff changeset
262 cudaEventDestroy(start);
87128b876c63 add test
ikkun
parents:
diff changeset
263 cudaEventDestroy(stop);
87128b876c63 add test
ikkun
parents:
diff changeset
264
87128b876c63 add test
ikkun
parents:
diff changeset
265 // Test result
87128b876c63 add test
ikkun
parents:
diff changeset
266 exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
87128b876c63 add test
ikkun
parents:
diff changeset
267 }
87128b876c63 add test
ikkun
parents:
diff changeset
268
87128b876c63 add test
ikkun
parents:
diff changeset
269 float processWithStreams(int streams_used)
87128b876c63 add test
ikkun
parents:
diff changeset
270 {
87128b876c63 add test
ikkun
parents:
diff changeset
271
87128b876c63 add test
ikkun
parents:
diff changeset
272 int current_stream = 0;
87128b876c63 add test
ikkun
parents:
diff changeset
273
87128b876c63 add test
ikkun
parents:
diff changeset
274 float time;
87128b876c63 add test
ikkun
parents:
diff changeset
275
87128b876c63 add test
ikkun
parents:
diff changeset
276 // Do processing in a loop
87128b876c63 add test
ikkun
parents:
diff changeset
277 //
87128b876c63 add test
ikkun
parents:
diff changeset
278 // Note: All memory commands are processed in the order they are issued,
87128b876c63 add test
ikkun
parents:
diff changeset
279 // independent of the stream they are enqueued in. Hence the pattern by
87128b876c63 add test
ikkun
parents:
diff changeset
280 // which the copy and kernel commands are enqueued in the stream
87128b876c63 add test
ikkun
parents:
diff changeset
281 // has an influence on the achieved overlap.
87128b876c63 add test
ikkun
parents:
diff changeset
282
87128b876c63 add test
ikkun
parents:
diff changeset
283 cudaEventRecord(start, 0);
87128b876c63 add test
ikkun
parents:
diff changeset
284
87128b876c63 add test
ikkun
parents:
diff changeset
285 for (int i=0; i<nreps; ++i)
87128b876c63 add test
ikkun
parents:
diff changeset
286 {
87128b876c63 add test
ikkun
parents:
diff changeset
287 int next_stream = (current_stream + 1) % streams_used;
87128b876c63 add test
ikkun
parents:
diff changeset
288
87128b876c63 add test
ikkun
parents:
diff changeset
289 #ifdef SIMULATE_IO
87128b876c63 add test
ikkun
parents:
diff changeset
290 // Store the result
87128b876c63 add test
ikkun
parents:
diff changeset
291 memcpy(h_data_sink, h_data_out[current_stream],memsize);
87128b876c63 add test
ikkun
parents:
diff changeset
292
87128b876c63 add test
ikkun
parents:
diff changeset
293 // Read new input
87128b876c63 add test
ikkun
parents:
diff changeset
294 memcpy(h_data_in[next_stream], h_data_source, memsize);
87128b876c63 add test
ikkun
parents:
diff changeset
295 #endif
87128b876c63 add test
ikkun
parents:
diff changeset
296
87128b876c63 add test
ikkun
parents:
diff changeset
297 // Ensure that processing and copying of the last cycle has finished
87128b876c63 add test
ikkun
parents:
diff changeset
298 cudaEventSynchronize(cycleDone[next_stream]);
87128b876c63 add test
ikkun
parents:
diff changeset
299
87128b876c63 add test
ikkun
parents:
diff changeset
300 // Process current frame
87128b876c63 add test
ikkun
parents:
diff changeset
301 incKernel<<<grid, block, 0, stream[current_stream]>>>(
87128b876c63 add test
ikkun
parents:
diff changeset
302 d_data_out[current_stream],
87128b876c63 add test
ikkun
parents:
diff changeset
303 d_data_in[current_stream],
87128b876c63 add test
ikkun
parents:
diff changeset
304 N,
87128b876c63 add test
ikkun
parents:
diff changeset
305 inner_reps);
87128b876c63 add test
ikkun
parents:
diff changeset
306
87128b876c63 add test
ikkun
parents:
diff changeset
307 // Upload next frame
87128b876c63 add test
ikkun
parents:
diff changeset
308 checkCudaErrors(cudaMemcpyAsync(
87128b876c63 add test
ikkun
parents:
diff changeset
309 d_data_in[next_stream],
87128b876c63 add test
ikkun
parents:
diff changeset
310 h_data_in[next_stream],
87128b876c63 add test
ikkun
parents:
diff changeset
311 memsize,
87128b876c63 add test
ikkun
parents:
diff changeset
312 cudaMemcpyHostToDevice,
87128b876c63 add test
ikkun
parents:
diff changeset
313 stream[next_stream]));
87128b876c63 add test
ikkun
parents:
diff changeset
314
87128b876c63 add test
ikkun
parents:
diff changeset
315 // Download current frame
87128b876c63 add test
ikkun
parents:
diff changeset
316 checkCudaErrors(cudaMemcpyAsync(
87128b876c63 add test
ikkun
parents:
diff changeset
317 h_data_out[current_stream],
87128b876c63 add test
ikkun
parents:
diff changeset
318 d_data_out[current_stream],
87128b876c63 add test
ikkun
parents:
diff changeset
319 memsize,
87128b876c63 add test
ikkun
parents:
diff changeset
320 cudaMemcpyDeviceToHost,
87128b876c63 add test
ikkun
parents:
diff changeset
321 stream[current_stream]));
87128b876c63 add test
ikkun
parents:
diff changeset
322
87128b876c63 add test
ikkun
parents:
diff changeset
323 checkCudaErrors(cudaEventRecord(
87128b876c63 add test
ikkun
parents:
diff changeset
324 cycleDone[current_stream],
87128b876c63 add test
ikkun
parents:
diff changeset
325 stream[current_stream]));
87128b876c63 add test
ikkun
parents:
diff changeset
326
87128b876c63 add test
ikkun
parents:
diff changeset
327 current_stream = next_stream;
87128b876c63 add test
ikkun
parents:
diff changeset
328 }
87128b876c63 add test
ikkun
parents:
diff changeset
329
87128b876c63 add test
ikkun
parents:
diff changeset
330 cudaEventRecord(stop, 0);
87128b876c63 add test
ikkun
parents:
diff changeset
331
87128b876c63 add test
ikkun
parents:
diff changeset
332 cudaDeviceSynchronize();
87128b876c63 add test
ikkun
parents:
diff changeset
333
87128b876c63 add test
ikkun
parents:
diff changeset
334 cudaEventElapsedTime(&time, start, stop);
87128b876c63 add test
ikkun
parents:
diff changeset
335
87128b876c63 add test
ikkun
parents:
diff changeset
336 return time;
87128b876c63 add test
ikkun
parents:
diff changeset
337
87128b876c63 add test
ikkun
parents:
diff changeset
338 }
87128b876c63 add test
ikkun
parents:
diff changeset
339
87128b876c63 add test
ikkun
parents:
diff changeset
340 void init()
87128b876c63 add test
ikkun
parents:
diff changeset
341 {
87128b876c63 add test
ikkun
parents:
diff changeset
342 for (int i=0; i<N; ++i)
87128b876c63 add test
ikkun
parents:
diff changeset
343 {
87128b876c63 add test
ikkun
parents:
diff changeset
344 h_data_source[i] = 0;
87128b876c63 add test
ikkun
parents:
diff changeset
345 }
87128b876c63 add test
ikkun
parents:
diff changeset
346
87128b876c63 add test
ikkun
parents:
diff changeset
347 for (int i =0; i<STREAM_COUNT; ++i)
87128b876c63 add test
ikkun
parents:
diff changeset
348 {
87128b876c63 add test
ikkun
parents:
diff changeset
349 memcpy(h_data_in[i], h_data_source, memsize);
87128b876c63 add test
ikkun
parents:
diff changeset
350 }
87128b876c63 add test
ikkun
parents:
diff changeset
351 }
87128b876c63 add test
ikkun
parents:
diff changeset
352
87128b876c63 add test
ikkun
parents:
diff changeset
353
87128b876c63 add test
ikkun
parents:
diff changeset
354 bool test()
87128b876c63 add test
ikkun
parents:
diff changeset
355 {
87128b876c63 add test
ikkun
parents:
diff changeset
356
87128b876c63 add test
ikkun
parents:
diff changeset
357 bool passed = true;
87128b876c63 add test
ikkun
parents:
diff changeset
358
87128b876c63 add test
ikkun
parents:
diff changeset
359 for (int j =0; j<STREAM_COUNT; ++j)
87128b876c63 add test
ikkun
parents:
diff changeset
360 {
87128b876c63 add test
ikkun
parents:
diff changeset
361 for (int i =0; i<N; ++i)
87128b876c63 add test
ikkun
parents:
diff changeset
362 {
87128b876c63 add test
ikkun
parents:
diff changeset
363 passed &= (h_data_out[j][i] == 1);
87128b876c63 add test
ikkun
parents:
diff changeset
364 }
87128b876c63 add test
ikkun
parents:
diff changeset
365 }
87128b876c63 add test
ikkun
parents:
diff changeset
366
87128b876c63 add test
ikkun
parents:
diff changeset
367 return passed;
87128b876c63 add test
ikkun
parents:
diff changeset
368 }