290
|
1 #include <stdio.h>
|
|
2 #include <sys/time.h>
|
|
3 #include <string.h>
|
|
4 #include <stdlib.h>
|
|
5
|
292
|
6 #include <cuda.h>
|
|
7
|
291
|
8 #include <cuda_runtime.h>
|
292
|
9 #include "helper_cuda.h"
|
290
|
10
|
|
// Number of thread blocks in the launch grid: main() passes LENGTH as the
// grid X dimension to cuLaunchKernel.
11 #define LENGTH (10)
|
|
// Number of threads per block: main() passes THREAD as the block X dimension.
// Every host and device vector in this file holds LENGTH*THREAD floats.
12 #define THREAD (10)
|
|
13
|
|
// Returns the current time of day reported by gettimeofday(), expressed in
// seconds as a double (whole seconds plus the microsecond part scaled by 1e-6).
double
getTime() {
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_seconds = now.tv_sec;
    const double microseconds  = (double)now.tv_usec;
    return whole_seconds + microseconds*1e-6;
}
|
|
20
|
|
21 void
|
|
22 check_data(float* A, float B, float* C) {
|
|
23 for (int i=0; i<LENGTH*THREAD; i++) {
|
|
24 if (A[i]*B!=C[i]) {
|
|
25 puts("multiply failure.");
|
|
26 return;
|
|
27 }
|
|
28 }
|
|
29 puts("success.");
|
|
30 }
|
|
31
|
|
32 void print_result(float* C) {
|
|
33 for (int i=0; i<LENGTH*THREAD; i++) {
|
|
34 printf("%f\n",C[i]);
|
|
35 }
|
|
36 }
|
|
37
|
|
38 int main(int args, char* argv[]) {
|
|
39 int num_stream = 1; // number of stream
|
|
40 int num_exec = 16; // number of executed kernel
|
|
41
|
|
42 for (int i=1;argv[i];i++) {
|
|
43 if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
|
|
44 num_stream = atoi(argv[++i]);
|
|
45 }
|
|
46 }
|
|
47
|
|
48 // initialize and load kernel
|
|
49 CUdevice device;
|
|
50 CUcontext context;
|
|
51 CUmodule module;
|
|
52 CUfunction function;
|
|
53 CUstream stream[num_stream];
|
|
54
|
292
|
55 checkCudaErrors(cuInit(0));
|
|
56 checkCudaErrors(cuDeviceGet(&device, 0));
|
|
57 checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
|
|
58 checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
|
|
59 checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
|
290
|
60 for (int i=0;i<num_stream;i++)
|
292
|
61 checkCudaErrors(cuStreamCreate(&stream[i],0));
|
290
|
62
|
|
63 // memory allocate
|
|
64 CUdeviceptr devA;
|
|
65 CUdeviceptr devB[num_exec];
|
|
66 CUdeviceptr devOut[num_exec];
|
|
67
|
292
|
68 checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
|
290
|
69 for (int i=0;i<num_exec;i++) {
|
292
|
70 checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
|
|
71 checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
|
290
|
72 }
|
|
73
|
|
74 // input buffer
|
|
75 float* A = new float[LENGTH*THREAD];
|
|
76 float* B = new float[num_exec];
|
|
77
|
|
78 for (int i=0; i<LENGTH*THREAD; i++)
|
|
79 A[i] = (float)(i+1000);
|
|
80
|
|
81 // output buffer
|
|
82 float** result = new float* [num_exec];
|
|
83
|
|
84 for (int i=0;i<num_exec;i++)
|
|
85 result[i] = new float[LENGTH*THREAD];
|
|
86
|
|
87 // Synchronous data transfer(host to device)
|
292
|
88 checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));
|
290
|
89
|
|
90 // Asynchronous data transfer(host to device)
|
|
91 int cur = 0;
|
|
92
|
|
93 for (int i=0;i<num_exec;i++,cur++) {
|
|
94 if (num_stream <= cur)
|
|
95 cur = 0;
|
|
96 B[i] = (float)(i+1);
|
292
|
97 checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]));
|
290
|
98 }
|
|
99
|
|
100 cur = 0;
|
|
101
|
|
102 // Asynchronous launch kernel
|
|
103 for (int i=0;i<num_exec;i++,cur++) {
|
|
104 if (num_stream <= cur)
|
|
105 cur=0;
|
|
106 B[i] = (float)(i+1);
|
|
107 //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
|
|
108 void* args[] = {&devA, &devB[i], &devOut[i]};
|
292
|
109 checkCudaErrors(cuLaunchKernel(function,
|
290
|
110 LENGTH, 1, 1,
|
|
111 THREAD, 1, 1,
|
|
112 0, stream[cur], args, NULL));
|
|
113 //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
|
|
114 }
|
|
115
|
|
116 cur = 0;
|
|
117
|
|
118
|
|
119 // Asynchronous data transfer(device to host)
|
|
120 for (int i=0;i<num_exec;i++,cur++) {
|
|
121 if (num_stream <= cur)
|
|
122 cur = 0;
|
292
|
123 checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
|
290
|
124 }
|
|
125
|
|
126 // wait for stream
|
|
127 for (int i=0;i<num_stream;i++)
|
292
|
128 checkCudaErrors(cuStreamSynchronize(stream[i]));
|
290
|
129
|
|
130 //printf("%0.6f\n",getTime()-start);
|
|
131
|
|
132 for (int i=0;i<num_exec;i++)
|
|
133 check_data(A,(float)(i+1),result[i]);
|
|
134
|
|
135 // memory release
|
292
|
136 checkCudaErrors(cuMemFree(devA));
|
290
|
137 for (int i=0;i<num_exec;i++) {
|
292
|
138 checkCudaErrors(cuMemFree(devB[i]));
|
|
139 checkCudaErrors(cuMemFree(devOut[i]));
|
290
|
140 }
|
|
141 for (int i=0;i<num_stream;i++)
|
292
|
142 checkCudaErrors(cuStreamDestroy(stream[i]));
|
|
143 checkCudaErrors(cuModuleUnload(module));
|
|
144 checkCudaErrors(cuCtxDestroy(context));
|
290
|
145
|
|
146 delete[] A;
|
|
147 delete[] B;
|
|
148 for (int i=0;i<num_exec;i++)
|
|
149 delete[] result[i];
|
|
150 delete[] result;
|
|
151
|
|
152 return 0;
|
|
153 }
|
|
154 //
|