changeset 1975:4cf85b48ab9e draft

running fft with CudaScheduler, but wrong result
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Wed, 26 Feb 2014 19:39:15 +0900
parents 9ebee99a9aef
children a8f4227d6a21
files TaskManager/Gpu/GpuScheduler.cc TaskManager/Gpu/GpuScheduler.h example/fft/Makefile.cuda example/fft/cuda/butterfly.cu example/fft/cuda/highPassFilter.cu example/fft/cuda/norm.cu example/fft/cuda/spinFact.cu example/fft/cuda/task_init.cc example/fft/cuda/transpose.cu example/fft/main.cc
diffstat 10 files changed, 153 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Gpu/GpuScheduler.cc	Wed Feb 26 16:24:28 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc	Wed Feb 26 19:39:15 2014 +0900
@@ -294,7 +294,7 @@
                     int i0 = flag[cur].flip ? i+1 : i ;
                     // flip use memin buffer and memout event
                     ret = clEnqueueReadBuffer(command_queue[cur], mem[cur].buf[i0], CL_FALSE, 0,
-                                              output_buf->size, output_buf->addr, 0, NULL, &memout[cur].event[i]);
+                                              output_buf->size, output_buf->addr, 0, NULL,&memout[cur].event[i]);
                     if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                 }
                 cur++;            // wait write[cur+1]
--- a/TaskManager/Gpu/GpuScheduler.h	Wed Feb 26 16:24:28 2014 +0900
+++ b/TaskManager/Gpu/GpuScheduler.h	Wed Feb 26 19:39:15 2014 +0900
@@ -15,7 +15,7 @@
 
 extern TaskObject gpu_task_list[MAX_TASK_OBJECT];
 
-#define STAGE 1
+#define STAGE 8
 
 class GpuScheduler : public MainScheduler {
  public:
--- a/example/fft/Makefile.cuda	Wed Feb 26 16:24:28 2014 +0900
+++ b/example/fft/Makefile.cuda	Wed Feb 26 19:39:15 2014 +0900
@@ -18,7 +18,7 @@
 CUDA_SRCS = $(filter-out $(CUDA_TASK_DIR)/$(CUDA_SRCS_EXCLUDE),$(CUDA_SRCS_TMP))
 CUDA_OBJS = $(CUDA_SRCS:.cu=.ptx)
 
-CFLAGS += -D__CERIUM_CUDA__
+CFLAGS += -D__CERIUM_CUDA__ -DGPU
 LIBS += `sdl-config --libs` -lCudaManager -F/Library/Frameworks -framework CUDA
 
 INCLUDE += -I/Developer/NVIDIA/CUDA-5.5/include
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/cuda/butterfly.cu	Wed Feb 26 19:39:15 2014 +0900
@@ -0,0 +1,56 @@
+extern "C" {
+    __global__ void
+    butterfly(long* param, float* x_in, float* w, float* x_out)
+    {
+        unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
+        
+        long n = param[0];
+        long direction_flag = param[1];
+        long iter = param[2];
+        
+        int butterflySize = 1 << (iter-1);
+        int butterflyGrpDist = 1 << iter;
+        int butterflyGrpNum = n >> iter;
+        int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist);
+        int butterflyGrpOffset = gid & (butterflySize-1);
+        
+        int a = nid * n + butterflyGrpBase + butterflyGrpOffset;
+        int b = a + butterflySize;
+        
+        int l = butterflyGrpNum * butterflyGrpOffset;
+        
+        float xa[2], xb[2], xbxx[2], xbyy[2], wab[2], wayx[2], wbyx[2], resa[2], resb[2];
+        
+        xa[0] = x_in[2*a];
+        xa[1] = x_in[2*a+1];
+        xb[0] = x_in[2*b];
+        xb[1] = x_in[2*b+1];
+        xbxx[0] = xbxx[1] = xb[0];
+        xbyy[0] = xbyy[1] = xb[1];
+        
+        wab[0] = w[2*l];
+        if(direction_flag == 0x80000000) {
+            wab[1] = -w[2*l+1];
+        } else {
+            wab[1] = w[2*l+1];
+        }
+        
+        wayx[0] = -wab[1];
+        wayx[1] = wab[0];
+        
+        wbyx[0] = wab[1];
+        wbyx[1] = -wab[0];
+        
+        resa[0] = xa[0] + xbxx[0]*wab[0] + xbyy[0]*wayx[0];
+        resa[1] = xa[1] + xbxx[1]*wab[1] + xbyy[1]*wayx[1];
+        
+        resb[0] = xa[0] - xbxx[0]*wab[0] + xbyy[0]*wbyx[0];
+        resb[1] = xa[1] - xbxx[1]*wab[1] + xbyy[1]*wbyx[1];
+
+        x_out[2*a] = resa[0];
+        x_out[2*a+1] = resa[1];
+        x_out[2*b] = resb[0];
+        x_out[2*b+1] = resb[1];
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/cuda/highPassFilter.cu	Wed Feb 26 19:39:15 2014 +0900
@@ -0,0 +1,42 @@
+extern "C" {
+    __global__ void
+    highPassFilter(long* param, float* in, float* image)
+    {
+        unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long ygid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
+
+        long n = param[0];
+        long radius = param[1];
+        
+        int n_2[2];
+        n_2[0] = n_2[1] = n>>1;
+    
+        int mask[2];
+        mask[0] = mask[1] = n-1;
+
+        int gid[2];
+        gid[0] = (xgid + n_2[0]) & mask[0];
+        gid[1] = (ygid + n_2[1]) & mask[1];
+
+        int diff[2];
+        diff[0] = n_2[0] - gid[0];
+        diff[1] = n_2[1] - gid[1];
+    
+        int diff2[2];
+        diff2[0] = diff[0] * diff[0];
+        diff2[1] = diff[1] * diff[1];
+
+        int dist2 = diff2[0] + diff2[1];
+
+        int window[2];
+
+        if (dist2 < radius*radius) {
+            window[0] = window[1] = (int)0L;
+        } else {
+            window[0] = window[1] = (int)-1L;
+        }
+
+        image[(ygid*n+xgid)*2] = (float)((int)in[(ygid*n+xgid)*2] & window[0]);
+        image[(ygid*n+xgid)*2+1] = (float)((int)in[(ygid*n+xgid)*2+1] & window[1]);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/cuda/norm.cu	Wed Feb 26 19:39:15 2014 +0900
@@ -0,0 +1,13 @@
+extern "C" {
+    __global__ void
+    norm(long* param, float* in_x,float* out_x)
+    {
+        unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1);
+        
+        long n = param[0];
+        
+        out_x[(nid*n+gid)*2] = in_x[(nid*n+gid)*2] / (float)n;
+        out_x[(nid*n+gid)*2+1] = in_x[(nid*n+gid)*2+1] / (float)n;
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/cuda/spinFact.cu	Wed Feb 26 19:39:15 2014 +0900
@@ -0,0 +1,21 @@
+extern "C" {
+#include <math.h>
+    
+#define PI 3.14159265358979323846
+#define PI_2 1.57079632679489661923
+    
+    __global__ void
+    spinFact(long* param, float* w)
+    {
+        unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+    
+        long n = param[0];
+
+        float angle[2];
+        angle[0] = (float)(2*i*PI/(float)n);
+        angle[1] = (float)((2*i*PI/(float)n) + PI_2);
+
+        w[2*i] = cos(angle[0]);
+        w[2*i+1] = cos(angle[1]);
+    }
+}
--- a/example/fft/cuda/task_init.cc	Wed Feb 26 16:24:28 2014 +0900
+++ b/example/fft/cuda/task_init.cc	Wed Feb 26 19:39:15 2014 +0900
@@ -10,5 +10,5 @@
     CudaSchedRegister(BIT_REVERSE, "cuda/bitReverse.ptx", "bitReverse");
     CudaSchedRegister(BUTTERFLY, "cuda/butterfly.ptx", "butterfly");
     CudaSchedRegister(TRANSPOSE, "cuda/transpose.ptx", "transpose");
-    CudaSchedRegister(HIGH_PASS_FILTER, "gpu/highPassFilter.ptx", "highPassFilter");
+    CudaSchedRegister(HIGH_PASS_FILTER, "cuda/highPassFilter.ptx", "highPassFilter");
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/cuda/transpose.cu	Wed Feb 26 19:39:15 2014 +0900
@@ -0,0 +1,16 @@
+extern "C" {
+    __global__ void 
+    transpose(long* param, float* src, float* dst)
+    {
+        unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
+        unsigned long ygid = blockIdx.y*blockDim.y*threadIdx.y; // (unsigned long)s->get_param(1);
+
+        long n = param[0];
+
+        unsigned int iid = ygid * n + xgid;
+        unsigned int oid = xgid * n + ygid;
+
+        dst[2*oid] = src[2*iid];
+        dst[2*oid+1] = src[2*iid+1];
+    }
+}
--- a/example/fft/main.cc	Wed Feb 26 16:24:28 2014 +0900
+++ b/example/fft/main.cc	Wed Feb 26 19:39:15 2014 +0900
@@ -125,7 +125,7 @@
     brev->set_param(1,n);
     brev->set_inData(0, src, length_src*sizeof(cl_float2));
     brev->set_outData(0, dst, length_dst*sizeof(cl_float2));
-    brev->set_cpu(SPE_ANY);
+    brev->set_cpu(spe_cpu);
     brev->wait_for(waitTask);
     brev->iterate(gws[0],gws[1]);
 
@@ -140,7 +140,6 @@
         bfly->set_inData(0, dst, length_dst*sizeof(cl_float2));
         bfly->set_inData(1, spin, sizeof(cl_float2)*(n/2));
         bfly->set_outData(0,dst,length_dst*sizeof(cl_float2));
-        //bfly->flip();
         bfly->set_cpu(spe_cpu);
         bfly->wait_for(waitTask);
         bfly->iterate(gws[0],gws[1]);
@@ -153,7 +152,6 @@
         norm->set_inData(0,dst,length_dst*sizeof(cl_float2));
         norm->set_outData(0, dst, length_dst*sizeof(cl_float2));
         norm->set_param(0,n);
-        // norm->flip();
         norm->set_cpu(spe_cpu);
         norm->wait_for(waitTask);
         norm->iterate(gws[0],gws[1]);
@@ -253,7 +251,6 @@
     setWorkSize(gws,lws,n,n);
     hpfl->set_inData(0,rm,length_r*sizeof(cl_float2));
     hpfl->set_outData(0, rm, length_r*sizeof(cl_float2));
-    // hpfl->flip();
     hpfl->set_param(0,n);
     hpfl->set_param(1,(long)radius);
     hpfl->set_cpu(spe_cpu);