Mercurial > hg > Game > Cerium
diff example/cuda_fft/main.cc @ 2010:6fced32f85fd draft
wrong result
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 11 Jun 2014 11:24:58 +0900 |
parents | 2c8eab01cc78 |
children | faaea4e1ce1c |
line wrap: on
line diff
--- a/example/cuda_fft/main.cc Tue Jun 03 18:12:25 2014 +0900 +++ b/example/cuda_fft/main.cc Wed Jun 11 11:24:58 2014 +0900 @@ -27,21 +27,22 @@ } int -setWorkSize(int* block, int* thread, int x, int y) +setWorkSize(int* xblocks, int* yblocks, int x, int y) { switch(y) { case 1: - *block = x; - *thread = 1; + *xblocks = x; + *yblocks = 1; break; default: - *block = x; - *thread = y; + *xblocks = x; + *yblocks = y; break; } return 0; } + int fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction) { @@ -53,8 +54,8 @@ } int n = 1<<m; - int block, thread; - setWorkSize(&block, &thread, n, n); + int xblocks, yblocks; + setWorkSize(&xblocks, &yblocks, n, n); CUfunction bitReverse; cuModuleGetFunction(&bitReverse, module, "bitReverse"); @@ -62,32 +63,32 @@ void* bitReverse_args[] = {&dst, &src, &m, &n}; cuLaunchKernel(bitReverse, - block, 1, 1, - thread, 1, 1, + xblocks, yblocks, 1, + 1, 1, 1, 0, NULL, bitReverse_args, NULL); CUfunction butterfly; cuModuleGetFunction(&butterfly, module, "butterfly"); - setWorkSize(&block, &thread, n/2, n); + setWorkSize(&xblocks, &yblocks, n/2, n); void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag}; for (int i=1;i<=m;i++) { butterfly_args[4] = &i; cuLaunchKernel(butterfly, - block, 1, 1, - thread, 1, 1, + xblocks, yblocks, 1, + 1, 1, 1, 0, NULL, butterfly_args, NULL); } CUfunction norm; cuModuleGetFunction(&norm, module, "norm"); - void* norm_args[] = {&dst, &m}; + void* norm_args[] = {&dst, &n}; if (direction == inverse) { - setWorkSize(&block, &thread, n, n); + setWorkSize(&xblocks, &yblocks, n, n); cuLaunchKernel(norm, - block, 1, 1, - thread, 1, 1, + xblocks, yblocks, 1, + 1, 1, 1, 0, NULL, norm_args, NULL); } @@ -122,7 +123,7 @@ CUcontext context; cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device); - cuModuleLoad(&module, "fft.ptx"); + printf("%u\n", cuModuleLoad(&module, "fft.ptx")); char* pgm_file = init(args, argv); @@ -152,69 +153,76 @@ // memory allocate CUdeviceptr xmobj; cuMemAlloc(&xmobj, n*n*sizeof(float2)); - + CUdeviceptr rmobj; cuMemAlloc(&rmobj, n*n*sizeof(float2)); CUdeviceptr wmobj; - cuMemAlloc(&wmobj, (n/2)*sizeof(float2)); + cuMemAlloc(&wmobj, n/2*sizeof(float2)); + + CUfunction spinFact; + cuModuleGetFunction(&spinFact, module, "spinFact"); + + int xblocks, yblocks; + setWorkSize(&xblocks, &yblocks, n/2, 1); // Synchronous data transfer(host to device) cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2)); - CUfunction spinFact; - cuModuleGetFunction(&spinFact, module, "spinFact"); - - int block, thread; - setWorkSize(&block, &thread, n/2, 1); - - void* spinFact_args[] = {&xmobj, &n}; + void* spinFact_args[] = {&wmobj, &n}; cuLaunchKernel(spinFact, - block, 1, 1, - thread, 1, 1, + xblocks, yblocks, 1, + 1, 1, 1, 0, NULL, spinFact_args, NULL); + fftCore(rmobj, xmobj, wmobj, m, forward); CUfunction transpose; cuModuleGetFunction(&transpose, module, "transpose"); - setWorkSize(&block, &thread, n, n); + setWorkSize(&xblocks, &yblocks, n, n); void* transpose_args[] = {&xmobj, &rmobj, &n}; cuLaunchKernel(transpose, - block, 1, 1, - thread, 1, 1, + xblocks, yblocks, 1, + 1, 1, 1, 0, NULL, transpose_args, NULL); + fftCore(rmobj, xmobj, wmobj, m, forward); + CUfunction highPassFilter; cuModuleGetFunction(&highPassFilter, module, "highPassFilter"); - setWorkSize(&block, &thread, n, n); + setWorkSize(&xblocks, &yblocks, n, n); int radius = n/8; void*highPassFilter_args[] = {&rmobj, &n, &radius}; cuLaunchKernel(highPassFilter, - block, 1, 1, - thread, 1, 1, + xblocks, yblocks, 1, + 1, 1, 1, 0, NULL, highPassFilter_args, NULL); + fftCore(xmobj, rmobj, wmobj, m, inverse); - setWorkSize(&block, &thread, n, n); + setWorkSize(&xblocks, &yblocks, n, n); void* transpose2_args[] = {&rmobj, &xmobj, &n}; cuLaunchKernel(transpose, - block, 1, 1, - thread, 1, 1, + xblocks, yblocks, 1, + 1, 1, 1, 0, NULL, transpose2_args, NULL); fftCore(xmobj, rmobj, wmobj, m, inverse); + cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2)); + cuStreamSynchronize(NULL); + float* ampd; ampd = (float*)malloc(n*n*sizeof(float));