#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>

extern "C"
{
#include <cuda.h>          /* driver API: cuInit, cuLaunchKernel, cuGetErrorString, ... */
}
#include <cuda_runtime.h>  /* runtime API types: cudaError_t, cudaGetErrorString */

|
|
#define LENGTH (10)   /* grid x-dimension: blocks per kernel launch */
#define THREAD (10)   /* block x-dimension: threads per block */
|
|
14
|
|
/*
 * Print a human-readable CUDA runtime error to stderr.
 *
 * err    - runtime API error code to describe
 * file   - source file name (typically __FILE__)
 * lineNo - source line number (typically __LINE__)
 */
void
report_error(cudaError_t err, const char* file, int lineNo) {
    const char* description = cudaGetErrorString(err);
    fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n",
            description, err, file, lineNo);
}
|
|
19
|
|
/*
 * Check a CUDA driver-API call and abort on failure.
 *
 * Evaluates `func` exactly once; on any result other than CUDA_SUCCESS,
 * prints the error string, code, file and line to stderr and exits with
 * the error code.
 *
 * Fix: the original discarded the CUresult and queried cudaGetLastError()
 * (runtime API), which does not reflect driver-API (cu*) failures and so
 * reported the wrong error. Capture the CUresult itself and translate it
 * with cuGetErrorString.
 */
#define CUDA_CALL(func)                                                        \
    do {                                                                       \
        CUresult cuda_call_res_ = (func);                                      \
        if (cuda_call_res_ != CUDA_SUCCESS) {                                  \
            const char* cuda_call_msg_ = NULL;                                 \
            cuGetErrorString(cuda_call_res_, &cuda_call_msg_);                 \
            fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n", \
                    cuda_call_msg_ ? cuda_call_msg_ : "unknown error",         \
                    (int)cuda_call_res_, __FILE__, __LINE__);                  \
            exit((int)cuda_call_res_);                                         \
        }                                                                      \
    } while(0)
|
|
28
|
|
/*
 * Wall-clock time in seconds since the epoch, with microsecond
 * resolution, obtained from gettimeofday().
 */
double
getTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    double seconds = (double)now.tv_sec;
    return seconds + now.tv_usec / 1e6;
}
|
|
35
|
|
/*
 * Verify that C[i] == A[i] * B for every element of the result.
 *
 * A - host input vector of LENGTH*THREAD floats
 * B - scalar multiplier used for this launch
 * C - device-computed result copied back to the host
 *
 * Prints "multiply failure." on the first mismatch, otherwise "success.".
 *
 * Fix: the original used bit-exact float equality (A[i]*B != C[i]);
 * device results are not guaranteed to be bit-identical to host
 * arithmetic, so compare with a small absolute + relative tolerance
 * (exact matches still pass).
 */
void
check_data(float* A, float B, float* C) {
    for (int i=0; i<LENGTH*THREAD; i++) {
        float expected = A[i] * B;
        float diff = fabsf(C[i] - expected);
        if (diff > 1e-6f + 1e-5f * fabsf(expected)) {
            puts("multiply failure.");
            return;
        }
    }
    puts("success.");
}
|
|
46
|
|
/* Dump all LENGTH*THREAD floats of a result buffer to stdout, one per line. */
void print_result(float* C) {
    const int total = LENGTH * THREAD;
    for (int idx = 0; idx < total; idx++)
        printf("%f\n", C[idx]);
}
|
|
52
|
|
/*
 * Launch `num_exec` scalar-multiply kernels (entry point "multiply",
 * loaded from multiply.ptx) distributed round-robin over `num_stream`
 * CUDA streams, then verify each result on the host.
 *
 * Usage: prog [--stream|-s N]   where N is the number of streams (default 1).
 *
 * Each launch uses a grid of LENGTH blocks of THREAD threads and computes
 * devOut[i] = devA[i] * b for a per-launch scalar b = i+1.
 *
 * Fixes vs. original:
 *  - guard against "--stream" with no following value (atoi(NULL) was UB)
 *    and against non-positive stream counts (zero-length VLA + OOB index);
 *  - B and the result rows are page-locked (cuMemAllocHost) so the
 *    cuMemcpy*Async calls are genuinely asynchronous; async copies on
 *    pageable memory silently degrade to synchronous ones;
 *  - removed the redundant second B[i] write in the launch loop (it
 *    rewrote a host buffer an async copy might still be reading);
 *  - kernel-argument array renamed so it no longer shadows main's
 *    parameter (originally both were named "args").
 */
int main(int argc, char* argv[]) {
    int num_stream = 1; // number of streams
    int num_exec = 16;  // number of executed kernels

    for (int i=1; argv[i]; i++) {
        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
            if (argv[i+1] == NULL) {
                fprintf(stderr, "missing value for %s\n", argv[i]);
                return 1;
            }
            num_stream = atoi(argv[++i]);
            if (num_stream < 1)
                num_stream = 1; // reject "-s 0" / non-numeric input
        }
    }

    // initialize the driver, create a context and load the kernel
    CUdevice device;
    CUcontext context;
    CUmodule module;
    CUfunction function;
    CUstream stream[num_stream];

    CUDA_CALL(cuInit(0));
    CUDA_CALL(cuDeviceGet(&device, 0));
    CUDA_CALL(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
    CUDA_CALL(cuModuleLoad(&module, "multiply.ptx"));
    CUDA_CALL(cuModuleGetFunction(&function, module, "multiply"));
    for (int i=0; i<num_stream; i++)
        CUDA_CALL(cuStreamCreate(&stream[i], 0));

    // device buffers: one shared input vector, per-launch scalar and output
    CUdeviceptr devA;
    CUdeviceptr devB[num_exec];
    CUdeviceptr devOut[num_exec];

    CUDA_CALL(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
    for (int i=0; i<num_exec; i++) {
        CUDA_CALL(cuMemAlloc(&devB[i], sizeof(float)));
        CUDA_CALL(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
    }

    // host input buffers; B is pinned so the H2D async copies below overlap
    float* A = new float[LENGTH*THREAD];
    float* B = NULL;
    CUDA_CALL(cuMemAllocHost((void**)&B, num_exec*sizeof(float)));

    for (int i=0; i<LENGTH*THREAD; i++)
        A[i] = (float)(i+1000);

    // host output buffers (one pinned row per launch)
    float** result = new float* [num_exec];
    for (int i=0; i<num_exec; i++)
        CUDA_CALL(cuMemAllocHost((void**)&result[i], LENGTH*THREAD*sizeof(float)));

    // Synchronous data transfer of the shared input (host to device)
    CUDA_CALL(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));

    // Asynchronous data transfer of the per-launch scalars (host to device),
    // round-robin over the streams
    for (int i=0; i<num_exec; i++) {
        int cur = i % num_stream;
        B[i] = (float)(i+1);
        CUDA_CALL(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]));
    }

    // Asynchronous kernel launches; stream assignment matches the copies
    // above so each launch is ordered after its own scalar's transfer
    for (int i=0; i<num_exec; i++) {
        int cur = i % num_stream;
        void* kernelArgs[] = {&devA, &devB[i], &devOut[i]};
        CUDA_CALL(cuLaunchKernel(function,
                                 LENGTH, 1, 1,   // grid dims
                                 THREAD, 1, 1,   // block dims
                                 0, stream[cur], kernelArgs, NULL));
    }

    // Asynchronous data transfer of the results (device to host)
    for (int i=0; i<num_exec; i++) {
        int cur = i % num_stream;
        CUDA_CALL(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
    }

    // wait for every stream to drain before reading the results
    for (int i=0; i<num_stream; i++)
        CUDA_CALL(cuStreamSynchronize(stream[i]));

    for (int i=0; i<num_exec; i++)
        check_data(A, (float)(i+1), result[i]);

    // release device memory and pinned host memory
    CUDA_CALL(cuMemFree(devA));
    for (int i=0; i<num_exec; i++) {
        CUDA_CALL(cuMemFree(devB[i]));
        CUDA_CALL(cuMemFree(devOut[i]));
        CUDA_CALL(cuMemFreeHost(result[i]));
    }
    CUDA_CALL(cuMemFreeHost(B));
    for (int i=0; i<num_stream; i++)
        CUDA_CALL(cuStreamDestroy(stream[i]));
    CUDA_CALL(cuModuleUnload(module));
    CUDA_CALL(cuCtxDestroy(context));

    delete[] A;
    delete[] result;

    return 0;
}
|
|
169 //
|