comparison example/cuda_fft/main.cc @ 2010:6fced32f85fd draft

wrong result
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Wed, 11 Jun 2014 11:24:58 +0900
parents 2c8eab01cc78
children faaea4e1ce1c
comparison
Legend: equal | deleted | inserted | replaced
2009:113b1edd2a9a 2010:6fced32f85fd
25 gettimeofday(&tv, NULL); 25 gettimeofday(&tv, NULL);
26 return tv.tv_sec + (double)tv.tv_usec*1e-6; 26 return tv.tv_sec + (double)tv.tv_usec*1e-6;
27 } 27 }
28 28
29 int 29 int
30 setWorkSize(int* block, int* thread, int x, int y) 30 setWorkSize(int* xblocks, int* yblocks, int x, int y)
31 { 31 {
32 switch(y) { 32 switch(y) {
33 case 1: 33 case 1:
34 *block = x; 34 *xblocks = x;
35 *thread = 1; 35 *yblocks = 1;
36 break; 36 break;
37 default: 37 default:
38 *block = x; 38 *xblocks = x;
39 *thread = y; 39 *yblocks = y;
40 break; 40 break;
41 } 41 }
42 42
43 return 0; 43 return 0;
44 } 44 }
45
45 int 46 int
46 fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction) 47 fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction)
47 { 48 {
48 49
49 unsigned int flag; 50 unsigned int flag;
51 case forward:flag = 0x00000000; break; 52 case forward:flag = 0x00000000; break;
52 case inverse:flag = 0x80000000; break; 53 case inverse:flag = 0x80000000; break;
53 } 54 }
54 55
55 int n = 1<<m; 56 int n = 1<<m;
56 int block, thread; 57 int xblocks, yblocks;
57 setWorkSize(&block, &thread, n, n); 58 setWorkSize(&xblocks, &yblocks, n, n);
58 59
59 CUfunction bitReverse; 60 CUfunction bitReverse;
60 cuModuleGetFunction(&bitReverse, module, "bitReverse"); 61 cuModuleGetFunction(&bitReverse, module, "bitReverse");
61 62
62 void* bitReverse_args[] = {&dst, &src, &m, &n}; 63 void* bitReverse_args[] = {&dst, &src, &m, &n};
63 64
64 cuLaunchKernel(bitReverse, 65 cuLaunchKernel(bitReverse,
65 block, 1, 1, 66 xblocks, yblocks, 1,
66 thread, 1, 1, 67 1, 1, 1,
67 0, NULL, bitReverse_args, NULL); 68 0, NULL, bitReverse_args, NULL);
68 69
69 CUfunction butterfly; 70 CUfunction butterfly;
70 cuModuleGetFunction(&butterfly, module, "butterfly"); 71 cuModuleGetFunction(&butterfly, module, "butterfly");
71 72
72 setWorkSize(&block, &thread, n/2, n); 73 setWorkSize(&xblocks, &yblocks, n/2, n);
73 void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag}; 74 void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag};
74 for (int i=1;i<=m;i++) { 75 for (int i=1;i<=m;i++) {
75 butterfly_args[4] = &i; 76 butterfly_args[4] = &i;
76 cuLaunchKernel(butterfly, 77 cuLaunchKernel(butterfly,
77 block, 1, 1, 78 xblocks, yblocks, 1,
78 thread, 1, 1, 79 1, 1, 1,
79 0, NULL, butterfly_args, NULL); 80 0, NULL, butterfly_args, NULL);
80 } 81 }
81 82
82 CUfunction norm; 83 CUfunction norm;
83 cuModuleGetFunction(&norm, module, "norm"); 84 cuModuleGetFunction(&norm, module, "norm");
84 85
85 void* norm_args[] = {&dst, &m}; 86 void* norm_args[] = {&dst, &n};
86 if (direction == inverse) { 87 if (direction == inverse) {
87 setWorkSize(&block, &thread, n, n); 88 setWorkSize(&xblocks, &yblocks, n, n);
88 cuLaunchKernel(norm, 89 cuLaunchKernel(norm,
89 block, 1, 1, 90 xblocks, yblocks, 1,
90 thread, 1, 1, 91 1, 1, 1,
91 0, NULL, norm_args, NULL); 92 0, NULL, norm_args, NULL);
92 } 93 }
93 94
94 return 0; 95 return 0;
95 } 96 }
120 cuDeviceGet(&device, 0); 121 cuDeviceGet(&device, 0);
121 122
122 CUcontext context; 123 CUcontext context;
123 cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device); 124 cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);
124 125
125 cuModuleLoad(&module, "fft.ptx"); 126 printf("%u\n", cuModuleLoad(&module, "fft.ptx"));
126 127
127 char* pgm_file = init(args, argv); 128 char* pgm_file = init(args, argv);
128 129
129 pgm_t ipgm; 130 pgm_t ipgm;
130 int err = readPGM(&ipgm, pgm_file); 131 int err = readPGM(&ipgm, pgm_file);
150 st_time = getTime(); 151 st_time = getTime();
151 152
152 // memory allocate 153 // memory allocate
153 CUdeviceptr xmobj; 154 CUdeviceptr xmobj;
154 cuMemAlloc(&xmobj, n*n*sizeof(float2)); 155 cuMemAlloc(&xmobj, n*n*sizeof(float2));
155 156
156 CUdeviceptr rmobj; 157 CUdeviceptr rmobj;
157 cuMemAlloc(&rmobj, n*n*sizeof(float2)); 158 cuMemAlloc(&rmobj, n*n*sizeof(float2));
158 159
159 CUdeviceptr wmobj; 160 CUdeviceptr wmobj;
160 cuMemAlloc(&wmobj, (n/2)*sizeof(float2)); 161 cuMemAlloc(&wmobj, n/2*sizeof(float2));
162
163 CUfunction spinFact;
164 cuModuleGetFunction(&spinFact, module, "spinFact");
165
166 int xblocks, yblocks;
167 setWorkSize(&xblocks, &yblocks, n/2, 1);
161 168
162 // Synchronous data transfer(host to device) 169 // Synchronous data transfer(host to device)
163 cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2)); 170 cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));
164 171
165 CUfunction spinFact; 172 void* spinFact_args[] = {&wmobj, &n};
166 cuModuleGetFunction(&spinFact, module, "spinFact");
167
168 int block, thread;
169 setWorkSize(&block, &thread, n/2, 1);
170
171 void* spinFact_args[] = {&xmobj, &n};
172 cuLaunchKernel(spinFact, 173 cuLaunchKernel(spinFact,
173 block, 1, 1, 174 xblocks, yblocks, 1,
174 thread, 1, 1, 175 1, 1, 1,
175 0, NULL, spinFact_args, NULL); 176 0, NULL, spinFact_args, NULL);
176 177
178
177 fftCore(rmobj, xmobj, wmobj, m, forward); 179 fftCore(rmobj, xmobj, wmobj, m, forward);
178 180
179 CUfunction transpose; 181 CUfunction transpose;
180 cuModuleGetFunction(&transpose, module, "transpose"); 182 cuModuleGetFunction(&transpose, module, "transpose");
181 183
182 setWorkSize(&block, &thread, n, n); 184 setWorkSize(&xblocks, &yblocks, n, n);
183 185
184 void* transpose_args[] = {&xmobj, &rmobj, &n}; 186 void* transpose_args[] = {&xmobj, &rmobj, &n};
185 cuLaunchKernel(transpose, 187 cuLaunchKernel(transpose,
186 block, 1, 1, 188 xblocks, yblocks, 1,
187 thread, 1, 1, 189 1, 1, 1,
188 0, NULL, transpose_args, NULL); 190 0, NULL, transpose_args, NULL);
189 191
192
190 fftCore(rmobj, xmobj, wmobj, m, forward); 193 fftCore(rmobj, xmobj, wmobj, m, forward);
194
191 195
192 CUfunction highPassFilter; 196 CUfunction highPassFilter;
193 cuModuleGetFunction(&highPassFilter, module, "highPassFilter"); 197 cuModuleGetFunction(&highPassFilter, module, "highPassFilter");
194 198
195 setWorkSize(&block, &thread, n, n); 199 setWorkSize(&xblocks, &yblocks, n, n);
196 200
197 int radius = n/8; 201 int radius = n/8;
198 void*highPassFilter_args[] = {&rmobj, &n, &radius}; 202 void*highPassFilter_args[] = {&rmobj, &n, &radius};
199 cuLaunchKernel(highPassFilter, 203 cuLaunchKernel(highPassFilter,
200 block, 1, 1, 204 xblocks, yblocks, 1,
201 thread, 1, 1, 205 1, 1, 1,
202 0, NULL, highPassFilter_args, NULL); 206 0, NULL, highPassFilter_args, NULL);
203 207
208
204 fftCore(xmobj, rmobj, wmobj, m, inverse); 209 fftCore(xmobj, rmobj, wmobj, m, inverse);
205 210
206 setWorkSize(&block, &thread, n, n); 211 setWorkSize(&xblocks, &yblocks, n, n);
207 212
208 void* transpose2_args[] = {&rmobj, &xmobj, &n}; 213 void* transpose2_args[] = {&rmobj, &xmobj, &n};
209 cuLaunchKernel(transpose, 214 cuLaunchKernel(transpose,
210 block, 1, 1, 215 xblocks, yblocks, 1,
211 thread, 1, 1, 216 1, 1, 1,
212 0, NULL, transpose2_args, NULL); 217 0, NULL, transpose2_args, NULL);
213 218
214 fftCore(xmobj, rmobj, wmobj, m, inverse); 219 fftCore(xmobj, rmobj, wmobj, m, inverse);
215 220
221
216 cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2)); 222 cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2));
223
224 cuStreamSynchronize(NULL);
217 225
218 float* ampd; 226 float* ampd;
219 ampd = (float*)malloc(n*n*sizeof(float)); 227 ampd = (float*)malloc(n*n*sizeof(float));
220 228
221 for (int i=0;i<n*n;i++) 229 for (int i=0;i<n*n;i++)