Mercurial > hg > Game > Cerium
comparison example/cuda_fft/main.cc @ 2010:6fced32f85fd draft
wrong result
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 11 Jun 2014 11:24:58 +0900 |
parents | 2c8eab01cc78 |
children | faaea4e1ce1c |
comparison (legend: equal, deleted, inserted, replaced)
2009:113b1edd2a9a | 2010:6fced32f85fd |
---|---|
25 gettimeofday(&tv, NULL); | 25 gettimeofday(&tv, NULL); |
26 return tv.tv_sec + (double)tv.tv_usec*1e-6; | 26 return tv.tv_sec + (double)tv.tv_usec*1e-6; |
27 } | 27 } |
28 | 28 |
29 int | 29 int |
30 setWorkSize(int* block, int* thread, int x, int y) | 30 setWorkSize(int* xblocks, int* yblocks, int x, int y) |
31 { | 31 { |
32 switch(y) { | 32 switch(y) { |
33 case 1: | 33 case 1: |
34 *block = x; | 34 *xblocks = x; |
35 *thread = 1; | 35 *yblocks = 1; |
36 break; | 36 break; |
37 default: | 37 default: |
38 *block = x; | 38 *xblocks = x; |
39 *thread = y; | 39 *yblocks = y; |
40 break; | 40 break; |
41 } | 41 } |
42 | 42 |
43 return 0; | 43 return 0; |
44 } | 44 } |
45 | |
45 int | 46 int |
46 fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction) | 47 fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction) |
47 { | 48 { |
48 | 49 |
49 unsigned int flag; | 50 unsigned int flag; |
51 case forward:flag = 0x00000000; break; | 52 case forward:flag = 0x00000000; break; |
52 case inverse:flag = 0x80000000; break; | 53 case inverse:flag = 0x80000000; break; |
53 } | 54 } |
54 | 55 |
55 int n = 1<<m; | 56 int n = 1<<m; |
56 int block, thread; | 57 int xblocks, yblocks; |
57 setWorkSize(&block, &thread, n, n); | 58 setWorkSize(&xblocks, &yblocks, n, n); |
58 | 59 |
59 CUfunction bitReverse; | 60 CUfunction bitReverse; |
60 cuModuleGetFunction(&bitReverse, module, "bitReverse"); | 61 cuModuleGetFunction(&bitReverse, module, "bitReverse"); |
61 | 62 |
62 void* bitReverse_args[] = {&dst, &src, &m, &n}; | 63 void* bitReverse_args[] = {&dst, &src, &m, &n}; |
63 | 64 |
64 cuLaunchKernel(bitReverse, | 65 cuLaunchKernel(bitReverse, |
65 block, 1, 1, | 66 xblocks, yblocks, 1, |
66 thread, 1, 1, | 67 1, 1, 1, |
67 0, NULL, bitReverse_args, NULL); | 68 0, NULL, bitReverse_args, NULL); |
68 | 69 |
69 CUfunction butterfly; | 70 CUfunction butterfly; |
70 cuModuleGetFunction(&butterfly, module, "butterfly"); | 71 cuModuleGetFunction(&butterfly, module, "butterfly"); |
71 | 72 |
72 setWorkSize(&block, &thread, n/2, n); | 73 setWorkSize(&xblocks, &yblocks, n/2, n); |
73 void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag}; | 74 void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag}; |
74 for (int i=1;i<=m;i++) { | 75 for (int i=1;i<=m;i++) { |
75 butterfly_args[4] = &i; | 76 butterfly_args[4] = &i; |
76 cuLaunchKernel(butterfly, | 77 cuLaunchKernel(butterfly, |
77 block, 1, 1, | 78 xblocks, yblocks, 1, |
78 thread, 1, 1, | 79 1, 1, 1, |
79 0, NULL, butterfly_args, NULL); | 80 0, NULL, butterfly_args, NULL); |
80 } | 81 } |
81 | 82 |
82 CUfunction norm; | 83 CUfunction norm; |
83 cuModuleGetFunction(&norm, module, "norm"); | 84 cuModuleGetFunction(&norm, module, "norm"); |
84 | 85 |
85 void* norm_args[] = {&dst, &m}; | 86 void* norm_args[] = {&dst, &n}; |
86 if (direction == inverse) { | 87 if (direction == inverse) { |
87 setWorkSize(&block, &thread, n, n); | 88 setWorkSize(&xblocks, &yblocks, n, n); |
88 cuLaunchKernel(norm, | 89 cuLaunchKernel(norm, |
89 block, 1, 1, | 90 xblocks, yblocks, 1, |
90 thread, 1, 1, | 91 1, 1, 1, |
91 0, NULL, norm_args, NULL); | 92 0, NULL, norm_args, NULL); |
92 } | 93 } |
93 | 94 |
94 return 0; | 95 return 0; |
95 } | 96 } |
120 cuDeviceGet(&device, 0); | 121 cuDeviceGet(&device, 0); |
121 | 122 |
122 CUcontext context; | 123 CUcontext context; |
123 cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device); | 124 cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device); |
124 | 125 |
125 cuModuleLoad(&module, "fft.ptx"); | 126 printf("%u\n", cuModuleLoad(&module, "fft.ptx")); |
126 | 127 |
127 char* pgm_file = init(args, argv); | 128 char* pgm_file = init(args, argv); |
128 | 129 |
129 pgm_t ipgm; | 130 pgm_t ipgm; |
130 int err = readPGM(&ipgm, pgm_file); | 131 int err = readPGM(&ipgm, pgm_file); |
150 st_time = getTime(); | 151 st_time = getTime(); |
151 | 152 |
152 // memory allocate | 153 // memory allocate |
153 CUdeviceptr xmobj; | 154 CUdeviceptr xmobj; |
154 cuMemAlloc(&xmobj, n*n*sizeof(float2)); | 155 cuMemAlloc(&xmobj, n*n*sizeof(float2)); |
155 | 156 |
156 CUdeviceptr rmobj; | 157 CUdeviceptr rmobj; |
157 cuMemAlloc(&rmobj, n*n*sizeof(float2)); | 158 cuMemAlloc(&rmobj, n*n*sizeof(float2)); |
158 | 159 |
159 CUdeviceptr wmobj; | 160 CUdeviceptr wmobj; |
160 cuMemAlloc(&wmobj, (n/2)*sizeof(float2)); | 161 cuMemAlloc(&wmobj, n/2*sizeof(float2)); |
162 | |
163 CUfunction spinFact; | |
164 cuModuleGetFunction(&spinFact, module, "spinFact"); | |
165 | |
166 int xblocks, yblocks; | |
167 setWorkSize(&xblocks, &yblocks, n/2, 1); | |
161 | 168 |
162 // Synchronous data transfer(host to device) | 169 // Synchronous data transfer(host to device) |
163 cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2)); | 170 cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2)); |
164 | 171 |
165 CUfunction spinFact; | 172 void* spinFact_args[] = {&wmobj, &n}; |
166 cuModuleGetFunction(&spinFact, module, "spinFact"); | |
167 | |
168 int block, thread; | |
169 setWorkSize(&block, &thread, n/2, 1); | |
170 | |
171 void* spinFact_args[] = {&xmobj, &n}; | |
172 cuLaunchKernel(spinFact, | 173 cuLaunchKernel(spinFact, |
173 block, 1, 1, | 174 xblocks, yblocks, 1, |
174 thread, 1, 1, | 175 1, 1, 1, |
175 0, NULL, spinFact_args, NULL); | 176 0, NULL, spinFact_args, NULL); |
176 | 177 |
178 | |
177 fftCore(rmobj, xmobj, wmobj, m, forward); | 179 fftCore(rmobj, xmobj, wmobj, m, forward); |
178 | 180 |
179 CUfunction transpose; | 181 CUfunction transpose; |
180 cuModuleGetFunction(&transpose, module, "transpose"); | 182 cuModuleGetFunction(&transpose, module, "transpose"); |
181 | 183 |
182 setWorkSize(&block, &thread, n, n); | 184 setWorkSize(&xblocks, &yblocks, n, n); |
183 | 185 |
184 void* transpose_args[] = {&xmobj, &rmobj, &n}; | 186 void* transpose_args[] = {&xmobj, &rmobj, &n}; |
185 cuLaunchKernel(transpose, | 187 cuLaunchKernel(transpose, |
186 block, 1, 1, | 188 xblocks, yblocks, 1, |
187 thread, 1, 1, | 189 1, 1, 1, |
188 0, NULL, transpose_args, NULL); | 190 0, NULL, transpose_args, NULL); |
189 | 191 |
192 | |
190 fftCore(rmobj, xmobj, wmobj, m, forward); | 193 fftCore(rmobj, xmobj, wmobj, m, forward); |
194 | |
191 | 195 |
192 CUfunction highPassFilter; | 196 CUfunction highPassFilter; |
193 cuModuleGetFunction(&highPassFilter, module, "highPassFilter"); | 197 cuModuleGetFunction(&highPassFilter, module, "highPassFilter"); |
194 | 198 |
195 setWorkSize(&block, &thread, n, n); | 199 setWorkSize(&xblocks, &yblocks, n, n); |
196 | 200 |
197 int radius = n/8; | 201 int radius = n/8; |
198 void*highPassFilter_args[] = {&rmobj, &n, &radius}; | 202 void*highPassFilter_args[] = {&rmobj, &n, &radius}; |
199 cuLaunchKernel(highPassFilter, | 203 cuLaunchKernel(highPassFilter, |
200 block, 1, 1, | 204 xblocks, yblocks, 1, |
201 thread, 1, 1, | 205 1, 1, 1, |
202 0, NULL, highPassFilter_args, NULL); | 206 0, NULL, highPassFilter_args, NULL); |
203 | 207 |
208 | |
204 fftCore(xmobj, rmobj, wmobj, m, inverse); | 209 fftCore(xmobj, rmobj, wmobj, m, inverse); |
205 | 210 |
206 setWorkSize(&block, &thread, n, n); | 211 setWorkSize(&xblocks, &yblocks, n, n); |
207 | 212 |
208 void* transpose2_args[] = {&rmobj, &xmobj, &n}; | 213 void* transpose2_args[] = {&rmobj, &xmobj, &n}; |
209 cuLaunchKernel(transpose, | 214 cuLaunchKernel(transpose, |
210 block, 1, 1, | 215 xblocks, yblocks, 1, |
211 thread, 1, 1, | 216 1, 1, 1, |
212 0, NULL, transpose2_args, NULL); | 217 0, NULL, transpose2_args, NULL); |
213 | 218 |
214 fftCore(xmobj, rmobj, wmobj, m, inverse); | 219 fftCore(xmobj, rmobj, wmobj, m, inverse); |
215 | 220 |
221 | |
216 cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2)); | 222 cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2)); |
223 | |
224 cuStreamSynchronize(NULL); | |
217 | 225 |
218 float* ampd; | 226 float* ampd; |
219 ampd = (float*)malloc(n*n*sizeof(float)); | 227 ampd = (float*)malloc(n*n*sizeof(float)); |
220 | 228 |
221 for (int i=0;i<n*n;i++) | 229 for (int i=0;i<n*n;i++) |