Game/Cerium: example/cuda_fft/main.cc comparison

comparison example/cuda_fft/main.cc @ 2010:6fced32f85fd draft

wrong result

author	Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date	Wed, 11 Jun 2014 11:24:58 +0900
parents	2c8eab01cc78
children	faaea4e1ce1c

comparison

Legend: equal, deleted, inserted, replaced

-:113b1edd2a9a
+:6fced32f85fd
 gettimeofday(&tv, NULL);
 return tv.tv_sec + (double)tv.tv_usec*1e-6;
 }
 int
-setWorkSize(int* block, int* thread, int x, int y)
+setWorkSize(int* xblocks, int* yblocks, int x, int y)
 {
 switch(y) {
 case 1:
-*block = x;
+*xblocks = x;
-*thread = 1;
+*yblocks = 1;
 break;
 default:
-*block = x;
+*xblocks = x;
-*thread = y;
+*yblocks = y;
 break;
 }
 return 0;
 }
 int
 fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction)
 {
 unsigned int flag;
 case forward:flag = 0x00000000; break;
 case inverse:flag = 0x80000000; break;
 }
 int n = 1<<m;
-int block, thread;
+int xblocks, yblocks;
-setWorkSize(&block, &thread, n, n);
+setWorkSize(&xblocks, &yblocks, n, n);
 CUfunction bitReverse;
 cuModuleGetFunction(&bitReverse, module, "bitReverse");
 void* bitReverse_args[] = {&dst, &src, &m, &n};
 cuLaunchKernel(bitReverse,
-block, 1, 1,
+xblocks, yblocks, 1,
-thread, 1, 1,
+1, 1, 1,
 0, NULL, bitReverse_args, NULL);
 CUfunction butterfly;
 cuModuleGetFunction(&butterfly, module, "butterfly");
-setWorkSize(&block, &thread, n/2, n);
+setWorkSize(&xblocks, &yblocks, n/2, n);
 void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag};
 for (int i=1;i<=m;i++) {
 butterfly_args[4] = &i;
 cuLaunchKernel(butterfly,
-block, 1, 1,
+xblocks, yblocks, 1,
-thread, 1, 1,
+1, 1, 1,
 0, NULL, butterfly_args, NULL);
 }
 CUfunction norm;
 cuModuleGetFunction(&norm, module, "norm");
-void* norm_args[] = {&dst, &m};
+void* norm_args[] = {&dst, &n};
 if (direction == inverse) {
-setWorkSize(&block, &thread, n, n);
+setWorkSize(&xblocks, &yblocks, n, n);
 cuLaunchKernel(norm,
-block, 1, 1,
+xblocks, yblocks, 1,
-thread, 1, 1,
+1, 1, 1,
 0, NULL, norm_args, NULL);
 }
 return 0;
 }
 cuDeviceGet(&device, 0);
 CUcontext context;
 cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);
-cuModuleLoad(&module, "fft.ptx");
+printf("%u\n", cuModuleLoad(&module, "fft.ptx"));
 char* pgm_file = init(args, argv);
 pgm_t ipgm;
 int err = readPGM(&ipgm, pgm_file);
 st_time = getTime();
 // memory allocate
 CUdeviceptr xmobj;
 cuMemAlloc(&xmobj, n*n*sizeof(float2));
 CUdeviceptr rmobj;
 cuMemAlloc(&rmobj, n*n*sizeof(float2));
 CUdeviceptr wmobj;
-cuMemAlloc(&wmobj, (n/2)*sizeof(float2));
+cuMemAlloc(&wmobj, n/2*sizeof(float2));
+CUfunction spinFact;
+cuModuleGetFunction(&spinFact, module, "spinFact");
+int xblocks, yblocks;
+setWorkSize(&xblocks, &yblocks, n/2, 1);
 // Synchronous data transfer(host to device)
 cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));
-CUfunction spinFact;
+void* spinFact_args[] = {&wmobj, &n};
-cuModuleGetFunction(&spinFact, module, "spinFact");
-int block, thread;
-setWorkSize(&block, &thread, n/2, 1);
-void* spinFact_args[] = {&xmobj, &n};
 cuLaunchKernel(spinFact,
-block, 1, 1,
+xblocks, yblocks, 1,
-thread, 1, 1,
+1, 1, 1,
 0, NULL, spinFact_args, NULL);
 fftCore(rmobj, xmobj, wmobj, m, forward);
 CUfunction transpose;
 cuModuleGetFunction(&transpose, module, "transpose");
-setWorkSize(&block, &thread, n, n);
+setWorkSize(&xblocks, &yblocks, n, n);
 void* transpose_args[] = {&xmobj, &rmobj, &n};
 cuLaunchKernel(transpose,
-block, 1, 1,
+xblocks, yblocks, 1,
-thread, 1, 1,
+1, 1, 1,
 0, NULL, transpose_args, NULL);
 fftCore(rmobj, xmobj, wmobj, m, forward);
 CUfunction highPassFilter;
 cuModuleGetFunction(&highPassFilter, module, "highPassFilter");
-setWorkSize(&block, &thread, n, n);
+setWorkSize(&xblocks, &yblocks, n, n);
 int radius = n/8;
 void*highPassFilter_args[] = {&rmobj, &n, &radius};
 cuLaunchKernel(highPassFilter,
-block, 1, 1,
+xblocks, yblocks, 1,
-thread, 1, 1,
+1, 1, 1,
 0, NULL, highPassFilter_args, NULL);
 fftCore(xmobj, rmobj, wmobj, m, inverse);
-setWorkSize(&block, &thread, n, n);
+setWorkSize(&xblocks, &yblocks, n, n);
 void* transpose2_args[] = {&rmobj, &xmobj, &n};
 cuLaunchKernel(transpose,
-block, 1, 1,
+xblocks, yblocks, 1,
-thread, 1, 1,
+1, 1, 1,
 0, NULL, transpose2_args, NULL);
 fftCore(xmobj, rmobj, wmobj, m, inverse);
 cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2));
+cuStreamSynchronize(NULL);
 float* ampd;
 ampd = (float*)malloc(n*n*sizeof(float));
 for (int i=0;i<n*n;i++)

Mercurial > hg > Game > Cerium

comparison example/cuda_fft/main.cc @ 2010:6fced32f85fd draft