Mercurial > hg > Members > Moririn
comparison src/test/twice.cu @ 292:2bc63a22dd21
add twice
author | ikkun |
---|---|
date | Thu, 09 Feb 2017 19:51:32 +0900 |
parents | src/test/main.cc@87128b876c63 |
children |
comparison
equal
deleted
inserted
replaced
291:87128b876c63 | 292:2bc63a22dd21 |
---|---|
1 #include <stdio.h> | |
2 #include <sys/time.h> | |
3 #include <string.h> | |
4 #include <stdlib.h> | |
5 | |
6 #include <cuda.h> | |
7 | |
8 #include <cuda_runtime.h> | |
9 #include "helper_cuda.h" | |
10 | |
11 #define LENGTH (10) | |
12 #define THREAD (10) | |
13 | |
// Returns the current wall-clock time in seconds as a double, built from the
// seconds and microseconds fields filled in by gettimeofday().
double
getTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double whole_seconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}
20 | |
21 void | |
22 check_data(float* A, float B, float* C) { | |
23 for (int i=0; i<LENGTH*THREAD; i++) { | |
24 if (A[i]*B!=C[i]) { | |
25 puts("multiply failure."); | |
26 return; | |
27 } | |
28 } | |
29 puts("success."); | |
30 } | |
31 | |
32 void print_result(float* C) { | |
33 for (int i=0; i<LENGTH*THREAD; i++) { | |
34 printf("%f\n",C[i]); | |
35 } | |
36 } | |
37 | |
// Entry point: loads the "multiply" kernel from multiply.ptx with the CUDA
// driver API, launches it num_exec times distributed round-robin over
// num_stream streams, copies every result back and checks it on the host.
//
// Command line:
//   --stream N | -s N   number of CUDA streams to use (default 1, must be >= 1)
//
// Returns 0 on completion, 1 when the command line is invalid. CUDA failures
// are handled by checkCudaErrors (from helper_cuda.h).
int main(int args, char* argv[]) {
    int num_stream = 1;        // number of streams
    const int num_exec = 16;   // number of executed kernels
    const int num_elem = LENGTH*THREAD;
    const size_t data_size = num_elem*sizeof(float);

    // Parse options. The option value is only consumed when it really exists,
    // so a trailing "--stream" can neither hand NULL to atoi() nor step the
    // index past the end of argv.
    for (int i = 1; i < args && argv[i] != NULL; i++) {
        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
            if (i + 1 >= args || argv[i + 1] == NULL) {
                fprintf(stderr, "%s requires a value\n", argv[i]);
                return 1;
            }
            num_stream = atoi(argv[++i]);
        }
    }

    // Every stream[...] access below needs at least one stream.
    if (num_stream < 1) {
        fprintf(stderr, "number of streams must be at least 1\n");
        return 1;
    }

    // initialize and load kernel
    CUdevice device;
    CUcontext context;
    CUmodule module;
    CUfunction function;
    // Heap allocation instead of a variable-length array (not standard C++).
    CUstream* stream = new CUstream[num_stream];

    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGet(&device, 0));
    checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
    for (int i = 0; i < num_stream; i++)
        checkCudaErrors(cuStreamCreate(&stream[i], 0));

    // memory allocate: one shared input vector, plus one scalar and one
    // output vector per kernel execution
    CUdeviceptr devA;
    CUdeviceptr devB[num_exec];
    CUdeviceptr devOut[num_exec];

    checkCudaErrors(cuMemAlloc(&devA, data_size));
    for (int i = 0; i < num_exec; i++) {
        checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
        checkCudaErrors(cuMemAlloc(&devOut[i], data_size));
    }

    // input buffer
    // NOTE(review): A, B and result are ordinary new[] (pageable) host memory;
    // the *Async copies below may not overlap with other work unless the host
    // buffers are page-locked (e.g. cuMemAllocHost) - confirm if overlap matters.
    float* A = new float[num_elem];
    float* B = new float[num_exec];

    for (int i = 0; i < num_elem; i++)
        A[i] = (float)(i+1000);

    // output buffer
    float** result = new float* [num_exec];

    for (int i = 0; i < num_exec; i++)
        result[i] = new float[num_elem];

    // Synchronous data transfer(host to device)
    checkCudaErrors(cuMemcpyHtoD(devA, A, data_size));

    // Asynchronous data transfer(host to device).
    // Execution i always uses stream[i % num_stream], so its upload, launch
    // and download are queued on the same stream.
    for (int i = 0; i < num_exec; i++) {
        B[i] = (float)(i+1);
        checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[i % num_stream]));
    }

    // Asynchronous launch kernel: LENGTH blocks of THREAD threads each.
    // B[i] is deliberately not rewritten here: its upload was already queued
    // above and the host must not touch the source while it may be in flight.
    for (int i = 0; i < num_exec; i++) {
        // Named kernel_args so it does not shadow main's `args` parameter.
        void* kernel_args[] = {&devA, &devB[i], &devOut[i]};
        checkCudaErrors(cuLaunchKernel(function,
                                       LENGTH, 1, 1,
                                       THREAD, 1, 1,
                                       0, stream[i % num_stream], kernel_args, NULL));
    }

    // Asynchronous data transfer(device to host)
    for (int i = 0; i < num_exec; i++)
        checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], data_size, stream[i % num_stream]));

    // wait for stream: results may only be read on the host after this
    for (int i = 0; i < num_stream; i++)
        checkCudaErrors(cuStreamSynchronize(stream[i]));

    // Execution i is expected to have produced A[j]*(i+1) for every element j.
    for (int i = 0; i < num_exec; i++)
        check_data(A, (float)(i+1), result[i]);

    // memory release
    checkCudaErrors(cuMemFree(devA));
    for (int i = 0; i < num_exec; i++) {
        checkCudaErrors(cuMemFree(devB[i]));
        checkCudaErrors(cuMemFree(devOut[i]));
    }
    for (int i = 0; i < num_stream; i++)
        checkCudaErrors(cuStreamDestroy(stream[i]));
    checkCudaErrors(cuModuleUnload(module));
    checkCudaErrors(cuCtxDestroy(context));

    delete[] stream;
    delete[] A;
    delete[] B;
    for (int i = 0; i < num_exec; i++)
        delete[] result[i];
    delete[] result;

    return 0;
}
154 // |