diff example/cuda_fft/main.cc @ 2010:6fced32f85fd draft

wrong result
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Wed, 11 Jun 2014 11:24:58 +0900
parents 2c8eab01cc78
children faaea4e1ce1c
line wrap: on
line diff
--- a/example/cuda_fft/main.cc	Tue Jun 03 18:12:25 2014 +0900
+++ b/example/cuda_fft/main.cc	Wed Jun 11 11:24:58 2014 +0900
@@ -27,21 +27,22 @@
 }
 
 int
-setWorkSize(int* block, int* thread, int x, int y)
+setWorkSize(int* xblocks, int* yblocks, int x, int y)
 {
     switch(y) {
     case 1:
-        *block = x;
-        *thread = 1;
+        *xblocks = x;
+        *yblocks = 1;
         break;
     default:
-        *block = x;
-        *thread = y;
+        *xblocks = x;
+        *yblocks = y;
         break;
     }
 
     return 0;
 }
+
 int
 fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction)
 {
@@ -53,8 +54,8 @@
     }
 
     int n = 1<<m;
-    int block, thread;
-    setWorkSize(&block, &thread, n, n);
+    int xblocks, yblocks;
+    setWorkSize(&xblocks, &yblocks, n, n);
 
     CUfunction bitReverse;
     cuModuleGetFunction(&bitReverse, module, "bitReverse");
@@ -62,32 +63,32 @@
     void* bitReverse_args[] = {&dst, &src, &m, &n};
 
     cuLaunchKernel(bitReverse,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, bitReverse_args, NULL);
 
     CUfunction butterfly;
     cuModuleGetFunction(&butterfly, module, "butterfly");
 
-    setWorkSize(&block, &thread, n/2, n);
+    setWorkSize(&xblocks, &yblocks, n/2, n);
     void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag};
     for (int i=1;i<=m;i++) {
         butterfly_args[4] = &i;
         cuLaunchKernel(butterfly,
-                       block, 1, 1,
-                       thread, 1, 1,
+                       xblocks, yblocks, 1,
+                       1, 1, 1,
                        0, NULL, butterfly_args, NULL);
     }
     
     CUfunction norm;
     cuModuleGetFunction(&norm, module, "norm");
 
-    void* norm_args[] = {&dst, &m};
+    void* norm_args[] = {&dst, &n};
     if (direction == inverse) {
-        setWorkSize(&block, &thread, n, n);
+        setWorkSize(&xblocks, &yblocks, n, n);
         cuLaunchKernel(norm,
-                       block, 1, 1,
-                       thread, 1, 1,
+                       xblocks, yblocks, 1,
+                       1, 1, 1,
                        0, NULL, norm_args, NULL);
     }
 
@@ -122,7 +123,7 @@
     CUcontext context;
     cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);
 
-    cuModuleLoad(&module, "fft.ptx");
+    printf("%u\n", cuModuleLoad(&module, "fft.ptx"));
 
     char* pgm_file = init(args, argv);
 
@@ -152,69 +153,76 @@
     // memory allocate
     CUdeviceptr xmobj;
     cuMemAlloc(&xmobj, n*n*sizeof(float2));
-
+    
     CUdeviceptr rmobj;
     cuMemAlloc(&rmobj, n*n*sizeof(float2));
 
     CUdeviceptr wmobj;
-    cuMemAlloc(&wmobj, (n/2)*sizeof(float2));
+    cuMemAlloc(&wmobj, n/2*sizeof(float2));
+
+    CUfunction spinFact;
+    cuModuleGetFunction(&spinFact, module, "spinFact");
+    
+    int xblocks, yblocks;
+    setWorkSize(&xblocks, &yblocks, n/2, 1);
 
     // Synchronous data transfer(host to device)
     cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));
 
-    CUfunction spinFact;
-    cuModuleGetFunction(&spinFact, module, "spinFact");
-    
-    int block, thread;
-    setWorkSize(&block, &thread, n/2, 1);
-
-    void* spinFact_args[] = {&xmobj, &n};
+    void* spinFact_args[] = {&wmobj, &n};
     cuLaunchKernel(spinFact,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, spinFact_args, NULL);
     
+
     fftCore(rmobj, xmobj, wmobj, m, forward);
     
     CUfunction transpose;
     cuModuleGetFunction(&transpose, module, "transpose");
 
-    setWorkSize(&block, &thread, n, n);
+    setWorkSize(&xblocks, &yblocks, n, n);
 
     void* transpose_args[] = {&xmobj, &rmobj, &n};
     cuLaunchKernel(transpose,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, transpose_args, NULL);
 
+
     fftCore(rmobj, xmobj, wmobj, m, forward);
+    
 
     CUfunction highPassFilter;
     cuModuleGetFunction(&highPassFilter, module, "highPassFilter");
 
-    setWorkSize(&block, &thread, n, n);
+    setWorkSize(&xblocks, &yblocks, n, n);
 
     int radius = n/8;
     void*highPassFilter_args[] = {&rmobj, &n, &radius};
     cuLaunchKernel(highPassFilter,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, highPassFilter_args, NULL);
 
+
     fftCore(xmobj, rmobj, wmobj, m, inverse);
 
-    setWorkSize(&block, &thread, n, n);
+    setWorkSize(&xblocks, &yblocks, n, n);
     
     void* transpose2_args[] = {&rmobj, &xmobj, &n};
     cuLaunchKernel(transpose,
-                   block, 1, 1,
-                   thread, 1, 1,
+                   xblocks, yblocks, 1,
+                   1, 1, 1,
                    0, NULL, transpose2_args, NULL);
     
     fftCore(xmobj, rmobj, wmobj, m, inverse);
 
+
     cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2));
 
+    cuStreamSynchronize(NULL);
+
     float* ampd;
     ampd = (float*)malloc(n*n*sizeof(float));