changeset 2007:bc2121b09cbc draft

kernel done
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 03 Jun 2014 16:02:06 +0900
parents f6aa6d6a3fa2
children 2c8eab01cc78
files example/cuda_fft/Makefile.def example/cuda_fft/fft.cu example/cuda_fft/main.cc
diffstat 3 files changed, 67 insertions(+), 119 deletions(-) [+]
line wrap: on
line diff
--- a/example/cuda_fft/Makefile.def	Tue Jun 03 12:07:00 2014 +0900
+++ b/example/cuda_fft/Makefile.def	Tue Jun 03 16:02:06 2014 +0900
@@ -1,4 +1,4 @@
-TARGET = multiply
+TARGET = fft
 
 OPT = -g -O0
 
--- a/example/cuda_fft/fft.cu	Tue Jun 03 12:07:00 2014 +0900
+++ b/example/cuda_fft/fft.cu	Tue Jun 03 16:02:06 2014 +0900
@@ -1,16 +1,15 @@
 extern "C" {
     
+#define PI 3.14159265358979323846
+#define PI_2 1.57079632679489661923
+    
     __global__ void
-    bitReverse(long* param, float* src, float* dst)
+    bitReverse(float2* dst, float2* src, int m, int n)
     {
         unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
         unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
         
         unsigned int j = gid;
-        
-        unsigned long m = param[0];
-        unsigned long n = param[1];
-        
         j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1;
         j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2;
         j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4;
@@ -19,46 +18,16 @@
         
         j >>= (32-m);
         
-        dst[(nid*n+j)*2] = src[(nid*n+gid)*2];
-        dst[(nid*n+j)*2+1] = src[(nid*n+gid)*2+1];
+        dst[nid*n+j].x = src[nid*n+gid].x;
+        dst[nid*n+j].y = src[nid*n+gid].y;
     }
-}
-extern "C" {
     
     __global__ void
-    bitReverse(long* param, float* src, float* dst)
+    butterfly(float2* x, float2* w, int m, int n, int iter, unsigned int flag)
     {
         unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
         unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
         
-        unsigned int j = gid;
-        
-        unsigned long m = param[0];
-        unsigned long n = param[1];
-        
-        j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1;
-        j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2;
-        j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4;
-        j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8;
-        j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16;
-        
-        j >>= (32-m);
-        
-        dst[(nid*n+j)*2] = src[(nid*n+gid)*2];
-        dst[(nid*n+j)*2+1] = src[(nid*n+gid)*2+1];
-    }
-}
-extern "C" {
-    __global__ void
-    butterfly(long* param, float* x_in, float* w, float* x_out)
-    {
-        unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
-        unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
-        
-        long n = param[0];
-        long direction_flag = param[1];
-        long iter = param[2];
-        
         int butterflySize = 1 << (iter-1);
         int butterflyGrpDist = 1 << iter;
         int butterflyGrpNum = n >> iter;
@@ -70,129 +39,109 @@
         
         int l = butterflyGrpNum * butterflyGrpOffset;
         
-        float xa[2], xb[2], xbxx[2], xbyy[2], wab[2], wayx[2], wbyx[2], resa[2], resb[2];
-        
-        xa[0] = x_in[2*a];
-        xa[1] = x_in[2*a+1];
-        xb[0] = x_in[2*b];
-        xb[1] = x_in[2*b+1];
-        xbxx[0] = xbxx[1] = xb[0];
-        xbyy[0] = xbyy[1] = xb[1];
+        float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb;
         
-        wab[0] = w[2*l];
-        if(direction_flag == 0x80000000) {
-            wab[1] = -w[2*l+1];
-        } else {
-            wab[1] = w[2*l+1];
-        }
+        xa.x = x[a].x;
+        xa.y = x[a].y;
+        xb.x = x[b].x;
+        xb.y = x[b].y;
+        xbxx.x = xbxx.y = xb.x;
+        xbyy.x = xbyy.y = xb.y;
         
-        wayx[0] = -wab[1];
-        wayx[1] = wab[0];
+        wab.x = (float)((unsigned int)w[l].x);
+        wab.y = (float)((unsigned int)w[l].y ^ (unsigned int)flag);
+        
+        wayx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x80000000);
+        wayx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x0);
         
-        wbyx[0] = wab[1];
-        wbyx[1] = -wab[0];
+        wbyx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x0);
+        wbyx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x80000000);
         
-        resa[0] = xa[0] + xbxx[0]*wab[0] + xbyy[0]*wayx[0];
-        resa[1] = xa[1] + xbxx[1]*wab[1] + xbyy[1]*wayx[1];
+        resa.x = xa.x + xbxx.x*wab.x + xbyy.x*wayx.x;
+        resa.y = xa.y + xbxx.y*wab.y + xbyy.y*wayx.y;
         
-        resb[0] = xa[0] - xbxx[0]*wab[0] + xbyy[0]*wbyx[0];
-        resb[1] = xa[1] - xbxx[1]*wab[1] + xbyy[1]*wbyx[1];
+        resb.x = xa.x - xbxx.x*wab.x + xbyy.x*wbyx.x;
+        resb.y = xa.y - xbxx.y*wab.y + xbyy.y*wbyx.y;
 
-        x_out[2*a] = resa[0];
-        x_out[2*a+1] = resa[1];
-        x_out[2*b] = resb[0];
-        x_out[2*b+1] = resb[1];
+        x[a].x = resa.x;
+        x[a].y = resa.y;
+        x[b].x = resb.x;
+        x[b].y = resb.y;
     }
-}
-extern "C" {
+
     __global__ void
-    highPassFilter(long* param, float* in, float* image)
+    highPassFilter(float2* image, int n, int radius)
     {
         unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
         unsigned long ygid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
 
-        long n = param[0];
-        long radius = param[1];
-        
-        int n_2[2];
-        n_2[0] = n_2[1] = n>>1;
+        int2 n_2;
+        n_2.x = n_2.y = n>>1;
     
-        int mask[2];
-        mask[0] = mask[1] = n-1;
+        int2 mask;
+        mask.x = mask.y = n-1;
+
+        int2 gid;
+        gid.x = (xgid + n_2.x) & mask.x;
+        gid.y = (ygid + n_2.y) & mask.y;
 
-        int gid[2];
-        gid[0] = (xgid + n_2[0]) & mask[0];
-        gid[1] = (ygid + n_2[1]) & mask[1];
-
-        int diff[2];
-        diff[0] = n_2[0] - gid[0];
-        diff[1] = n_2[1] - gid[1];
+        int2 diff;
+        diff.x = n_2.x - gid.x;
+        diff.y = n_2.y - gid.y;
     
-        int diff2[2];
-        diff2[0] = diff[0] * diff[0];
-        diff2[1] = diff[1] * diff[1];
+        int2 diff2;
+        diff2.x = diff.x * diff.x;
+        diff2.y = diff.y * diff.y;
 
-        int dist2 = diff2[0] + diff2[1];
+        int dist2 = diff2.x + diff2.y;
 
-        int window[2];
+        int2 window;
 
         if (dist2 < radius*radius) {
-            window[0] = window[1] = (int)0L;
+            window.x = window.y = (int)0L;
         } else {
-            window[0] = window[1] = (int)-1L;
+            window.x = window.y = (int)-1L;
         }
 
-        image[(ygid*n+xgid)*2] = (float)((int)in[(ygid*n+xgid)*2] & window[0]);
-        image[(ygid*n+xgid)*2+1] = (float)((int)in[(ygid*n+xgid)*2+1] & window[1]);
+        image[ygid*n+xgid].x = (float)((int)image[ygid*n+xgid].x & window.x);
+        image[ygid*n+xgid].y = (float)((int)image[ygid*n+xgid].y & window.y);
     }
-}
-extern "C" {
+
     __global__ void
-    norm(long* param, float* in_x,float* out_x)
+    norm(float2* x, int n)
     {
         unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
         unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1);
         
-        long n = param[0];
-        
-        out_x[(nid*n+gid)*2] = in_x[(nid*n+gid)*2] / (float)n;
-        out_x[(nid*n+gid)*2+1] = in_x[(nid*n+gid)*2+1] / (float)n;
+        x[nid*n+gid].x = x[nid*n+gid].x / (float)n;
+        x[nid*n+gid].y = x[nid*n+gid].y / (float)n;
     }
-}
-extern "C" {
-#include <math.h>
+
     
-#define PI 3.14159265358979323846
-#define PI_2 1.57079632679489661923
     
     __global__ void
-    spinFact(long* param, float* w)
+    spinFact(float2* w, int n)
     {
         unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
     
-        long n = param[0];
-
-        float angle[2];
-        angle[0] = (float)(2*i*PI/(float)n);
-        angle[1] = (float)((2*i*PI/(float)n) + PI_2);
+        float2 angle;
+        angle.x = (float)(2*i*PI/(float)n);
+        angle.y = (float)((2*i*PI/(float)n) + PI_2);
 
-        w[2*i] = cos(angle[0]);
-        w[2*i+1] = cos(angle[1]);
+        w[i].x = cos(angle.x);
+        w[i].y = cos(angle.y);
     }
-}
-extern "C" {
+
     __global__ void 
-    transpose(long* param, float* src, float* dst)
+    transpose(float2* dst, float2* src, int n)
     {
         unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
         unsigned long ygid = blockIdx.y*blockDim.y*threadIdx.y; // (unsigned long)s->get_param(1);
 
-        long n = param[0];
-
         unsigned int iid = ygid * n + xgid;
         unsigned int oid = xgid * n + ygid;
 
-        dst[2*oid] = src[2*iid];
-        dst[2*oid+1] = src[2*iid+1];
+        dst[oid].x = src[iid].x;
+        dst[oid].y = src[iid].y;
     }
 }
--- a/example/cuda_fft/main.cc	Tue Jun 03 12:07:00 2014 +0900
+++ b/example/cuda_fft/main.cc	Tue Jun 03 16:02:06 2014 +0900
@@ -122,7 +122,6 @@
 }
 
 int main(int args, char* argv[]) {
-    // initialize and load kernel
     cuInit(0);
 
     CUdevice device;