changeset 1673:2c3adce7eb40 draft

fix fft on gpu
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Thu, 18 Jul 2013 17:16:43 +0900
parents 32bc4ea3557f
children 614c60736bfd
files example/fft/gpu/bitReverse.cl example/fft/gpu/butterfly.cl example/fft/gpu/fft.cl example/fft/gpu/highPassFilter.cl example/fft/gpu/norm.cl example/fft/gpu/spinFact.cl example/fft/gpu/transpose.cl example/fft/main.cc example/fft/output.pgm example/fft/task_init.cc
diffstat 10 files changed, 157 insertions(+), 123 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/gpu/bitReverse.cl	Thu Jul 18 17:16:43 2013 +0900
@@ -0,0 +1,20 @@
+__kernel
+void bitReverse(__constant long *param, __global float2 *src,__global float2 *dst)
+{
+    unsigned long gid = (unsigned long)get_global_id(0);
+    unsigned long nid = (unsigned long)get_global_id(1);
+    
+    unsigned long m = (unsigned long)param[3];
+    unsigned long n = (unsigned long)param[4];
+    unsigned int j = gid;
+    
+    j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1;
+    j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2;
+    j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4;
+    j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8;
+    j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16;
+
+    j >>= (32-m);
+
+    dst[nid*n+j] = src[nid*n+gid];
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/gpu/butterfly.cl	Thu Jul 18 17:16:43 2013 +0900
@@ -0,0 +1,50 @@
+__kernel
+void butterfly(__constant long *param, __global float2 *x_in, __global float2 *w, __global float2 *x_out)
+{
+    unsigned long gid = (unsigned long)get_global_id(0);
+    unsigned long nid = (unsigned long)get_global_id(1);
+    
+    long n = param[3];
+    unsigned long direction_flag = (unsigned long)param[4];
+    long iter = param[5];
+
+    int butterflySize = 1 << (iter-1);
+    int butterflyGrpDist = 1 << iter;
+    int butterflyGrpNum = n >> iter;
+    int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist);
+    int butterflyGrpOffset = gid & (butterflySize-1);
+
+    int a = nid * n + butterflyGrpBase + butterflyGrpOffset;
+    int b = a + butterflySize;
+
+    int l = butterflyGrpNum * butterflyGrpOffset;
+
+    float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb;
+
+    xa = x_in[a];
+    xb = x_in[b];
+    xbxx.x = xbxx.y = xb.x;
+    xbyy.x = xbyy.y = xb.y;
+    
+    wab.x = w[l].x;
+    if(direction_flag == 0x80000000) {
+        wab.y = -w[l].y;
+    } else {
+        wab.y = w[l].y;
+    }
+
+    wayx.x = -wab.y;
+    wayx.y = wab.x;
+
+    wbyx.x = wab.y;
+    wbyx.y = -wab.x;
+
+    resa.x = xa.x + xbxx.x*wab.x + xbyy.x*wayx.x;
+    resa.y = xa.y + xbxx.y*wab.y + xbyy.y*wayx.y;
+
+    resb.x = xa.x - xbxx.x*wab.x + xbyy.x*wbyx.x;
+    resb.y = xa.y - xbxx.y*wab.y + xbyy.y*wbyx.y;
+
+    x_out[a] = resa;
+    x_out[b] = resb;
+}
--- a/example/fft/gpu/fft.cl	Thu Jul 18 11:20:51 2013 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,116 +0,0 @@
-#define PI 3.14159265358979323846
-#define PI_2 1.57079632679489661923
-
-__kernel void spinFact(__constant long *param,__global float2* w)
-{
-    long n = param[3];
-    unsigned int i = get_global_id(0);
-
-    float2 angle = (float2)(2*i*PI/(float)n,(2*i*PI/(float)n)+PI_2);
-    w[i] = cos(angle);
-}
-
-__kernel void bitReverse(__constant int *param, __global float2 *dst, __global float2 *src) {
-    int m = param[3];
-    int n = param[4];
-    unsigned int gid = get_global_id(0);
-    unsigned int nid = get_global_id(1);
-
-    unsigned int j = gid;
-    j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1;
-    j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2;
-    j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4;
-    j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8;
-    j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16;
-
-    j >>= (32-m);
-
-    dst[nid*n+j] = src[nid*n+gid];
-}
-
-__kernel void norm(__constant int *param, __global float2 *x,__global float2 *x_out)
-{
-    int n = param[3];
-    unsigned int gid = get_global_id(0);
-
-
-    unsigned int nid = get_global_id(1);
-
-    x_out[nid*n+gid] = x[nid*n+gid] / (float2)((float)n, (float)n);
-}
-
-__kernel void butterfly(__constant int *param, __global float2 *x, __global float2* w,__global float2 *x_out)
-{
-    int n     = param[3];
-    uint flag = param[4];
-    int iter  = param[5];
-    
-    unsigned int gid = get_global_id(0);
-    unsigned int nid = get_global_id(1);
-
-    int butterflySize = 1 << (iter-1);
-    int butterflyGrpDist = 1 << iter;
-    int butterflyGrpNum = n >> iter;
-    int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist);
-    int butterflyGrpOffset = gid & (butterflySize-1);
-
-    int a = nid * n + butterflyGrpBase + butterflyGrpOffset;
-    int b = a + butterflySize;
-
-    int l = butterflyGrpNum * butterflyGrpOffset;
-
-    float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb;
-
-    xa = x[a];
-    xb = x[b];
-    xbxx = xb.xx;
-    xbyy = xb.yy;
-
-    wab = as_float2(as_uint2(w[l]) ^ (uint2)(0x0, flag));
-    wayx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x80000000, 0x0));
-    wbyx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x0, 0x80000000));
-
-    resa = xa + xbxx*wab + xbyy*wayx;
-    resb = xa - xbxx*wab + xbyy*wbyx;
-
-    x_out[a] = resa;
-    x_out[b] = resb;
-}
-
-__kernel void transpose(__constant int *param, __global float2* src,__global float2 *dst)
-{
-    int n = param[3];
-    unsigned int xgid = get_global_id(0);
-    unsigned int ygid = get_global_id(1);
-
-    unsigned int iid = ygid * n + xgid;
-    unsigned int oid = xgid * n + ygid;
-
-    dst[oid] = src[iid];
-}
-
-__kernel void highPassFilter(__constant int *param, __global float2* image, __global float2* image_out)
-{
-    int n = param[3];
-    int radius = param[4];
-    unsigned int xgid = get_global_id(0);
-    unsigned int ygid = get_global_id(1);
-
-    int2 n_2 = (int2)(n>>1, n>>1);
-    int2 mask = (int2)(n-1, n-1);
-
-    int2 gid = ((int2)(xgid, ygid) + n_2) & mask;
-
-    int2 diff = n_2 - gid;
-    int2 diff2 = diff * diff;
-    int dist2 = diff2.x + diff2.y;
-
-    int2 window;
-
-    if (dist2 < radius*radius) {
-        window = (int2)(0L, 0L);
-    } else {
-        window = (int2)(-1L, -1L);
-    }
-        image_out[ygid*n+xgid] = as_float2(as_int2(image[ygid*n+xgid]) & window);
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/gpu/highPassFilter.cl	Thu Jul 18 17:16:43 2013 +0900
@@ -0,0 +1,40 @@
+__kernel
+void highPassFilter(__constant long *param, __global float2 *in,__global float2 *image)
+{
+    unsigned long xgid = (unsigned long)get_global_id(0);
+    unsigned long ygid = (unsigned long)get_global_id(1);
+
+    long n      = param[3];
+    long radius = param[4];
+
+    int2 n_2;
+    n_2.x = n_2.y = n>>1;
+    
+    int2 mask;
+    mask.x = mask.y = n-1;
+
+    int2 gid;
+    gid.x = (xgid + n_2.x) & mask.x;
+    gid.y = (ygid + n_2.y) & mask.y;
+
+    int2 diff;
+    diff.x = n_2.x - gid.x;
+    diff.y = n_2.y - gid.y;
+    
+    int2 diff2;
+    diff2.x = diff.x * diff.x;
+    diff2.y = diff.y * diff.y;
+
+    int dist2 = diff2.x + diff2.y;
+
+    int2 window;
+
+    if (dist2 < radius*radius) {
+        window.x = window.y = (int)0L;
+    } else {
+        window.x = window.y = (int)-1L;
+    }
+
+    image[ygid*n+xgid].x = (float)((int)in[ygid*n+xgid].x & window.x);
+    image[ygid*n+xgid].y = (float)((int)in[ygid*n+xgid].y & window.y);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/gpu/norm.cl	Thu Jul 18 17:16:43 2013 +0900
@@ -0,0 +1,11 @@
+__kernel void
+norm(__constant long *param, __global float2 *in_x, __global float2 *out_x)
+{
+    unsigned long gid = (unsigned long)get_global_id(0);
+    unsigned long nid = (unsigned long)get_global_id(1);
+
+    long n = param[3];
+
+    out_x[nid*n+gid].x = in_x[nid*n+gid].x / (float)n;
+    out_x[nid*n+gid].y = in_x[nid*n+gid].y / (float)n;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/gpu/spinFact.cl	Thu Jul 18 17:16:43 2013 +0900
@@ -0,0 +1,16 @@
+#define PI 3.14159265358979323846
+#define PI_2 1.57079632679489661923
+
+__kernel
+void spinFact(__constant long *param, __global float2 *w)
+{
+    unsigned long i = (unsigned long)get_global_id(0);
+    long n =param[3];
+
+    float2 angle;
+    angle.x = (float)(2*i*PI/(float)n);
+    angle.y = (float)((2*i*PI/(float)n) + PI_2);
+
+    w[i].x = cos(angle.x);
+    w[i].y = cos(angle.y);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/fft/gpu/transpose.cl	Thu Jul 18 17:16:43 2013 +0900
@@ -0,0 +1,13 @@
+__kernel
+void transpose(__constant long *param, __global float2 *src, __global float2 *dst)
+{
+    unsigned long xgid = (unsigned long)get_global_id(0);
+    unsigned long ygid = (unsigned long)get_global_id(1);
+
+    long n = (long)param[3];
+
+    unsigned int iid = ygid * n + xgid;
+    unsigned int oid = xgid * n + ygid;
+
+    dst[oid] = src[iid];
+}
--- a/example/fft/main.cc	Thu Jul 18 11:20:51 2013 +0900
+++ b/example/fft/main.cc	Thu Jul 18 17:16:43 2013 +0900
@@ -107,7 +107,7 @@
 HTask*
 fftCore(TaskManager *manager,cl_float2 *dst, cl_float2 *src, cl_float2 *spin, long m, enum Mode direction,HTask* waitTask)
 {
-    unsigned int direction_flag;
+    long direction_flag;
     switch (direction) {
     case forward:direction_flag = 0x00000000; break;
     case inverse:direction_flag = 0x80000000; break;
Binary file example/fft/output.pgm has changed
--- a/example/fft/task_init.cc	Thu Jul 18 11:20:51 2013 +0900
+++ b/example/fft/task_init.cc	Thu Jul 18 17:16:43 2013 +0900
@@ -16,12 +16,12 @@
 task_init(void)
 {
 #ifdef __CERIUM_GPU__
-    GpuSchedRegister(SPIN_FACT, "gpu/fft.cl", "spinFact");
-    GpuSchedRegister(BIT_REVERSE, "gpu/fft.cl", "bitReverse");
-    GpuSchedRegister(NORMALIZATION, "gpu/fft.cl", "norm");
-    GpuSchedRegister(BUTTERFLY, "gpu/fft.cl", "butterfly");
-    GpuSchedRegister(TRANSPOSE, "gpu/fft.cl", "transpose");
-    GpuSchedRegister(HIGH_PASS_FILTER, "gpu/fft.cl", "highPassFilter");
+    GpuSchedRegister(SPIN_FACT, "gpu/spinFact.cl", "spinFact");
+    GpuSchedRegister(BIT_REVERSE, "gpu/bitReverse.cl", "bitReverse");
+    GpuSchedRegister(NORMALIZATION, "gpu/norm.cl", "norm");
+    GpuSchedRegister(BUTTERFLY, "gpu/butterfly.cl", "butterfly");
+    GpuSchedRegister(TRANSPOSE, "gpu/transpose.cl", "transpose");
+    GpuSchedRegister(HIGH_PASS_FILTER, "gpu/highPassFilter.cl", "highPassFilter");
 #else
     SchedRegisterTask(SPIN_FACT,spinFact);
     SchedRegisterTask(NORMALIZATION, norm);