Mercurial > hg > Game > Cerium
view example/cuda_fft/fft.cu @ 2069:26aa08c9a1de draft default tip
cuda example fix
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 12 Feb 2017 10:04:55 +0900 |
parents | faaea4e1ce1c |
children |
line wrap: on
line source
extern "C" { #define PI 3.14159265358979323846 #define PI_2 1.57079632679489661923 __global__ void bitReverse(float2* dst, float2* src, int m, int n) { unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); unsigned int j = gid; j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8; j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16; j >>= (32-m); dst[nid*n+j] = src[nid*n+gid]; } __global__ void butterfly(float2* x, float2* w, int m, int n, int iter, unsigned int flag) { unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); int butterflySize = 1 << (iter-1); int butterflyGrpDist = 1 << iter; int butterflyGrpNum = n >> iter; int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist); int butterflyGrpOffset = gid & (butterflySize-1); int a = nid * n + butterflyGrpBase + butterflyGrpOffset; int b = a + butterflySize; int l = butterflyGrpNum * butterflyGrpOffset; float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb; xa = x[a]; xb = x[b]; xbxx.x = xbxx.y = xb.x; xbyy.x = xbyy.y = xb.y; wab.x = (float)((unsigned int)w[l].x ^ (unsigned int)0x0); wab.y = (float)((unsigned int)w[l].y ^ (unsigned int)flag); wayx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x80000000); wayx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x0); wbyx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x0); wbyx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x80000000); resa.x = xa.x + xbxx.x*wab.x + xbyy.x*wayx.x; resa.y = xa.y + xbxx.y*wab.y + xbyy.y*wayx.y; resb.x = xa.x - xbxx.x*wab.x + xbyy.x*wbyx.x; resb.y = xa.y - xbxx.y*wab.y + xbyy.y*wbyx.y; x[a] = resa; x[b] = resb; } __global__ void highPassFilter(float2* image, int n, int radius) { unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long ygid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); int2 n_2; n_2.x = n_2.y = n>>1; int2 mask; mask.x = mask.y = n-1; int2 gid; gid.x = (xgid + n_2.x) & mask.x; gid.y = (ygid + n_2.y) & mask.y; int2 diff; diff.x = n_2.x - gid.x; diff.y = n_2.y - gid.y; int2 diff2; diff2.x = diff.x * diff.x; diff2.y = diff.y * diff.y; int dist2 = diff2.x + diff2.y; int2 window; if (dist2 < radius*radius) { window.x = window.y = (int)0L; } else { window.x = window.y = (int)-1L; } image[ygid*n+xgid].x = (float)((int)image[ygid*n+xgid].x & window.x); image[ygid*n+xgid].y = (float)((int)image[ygid*n+xgid].y & window.y); } __global__ void norm(float2* x, int n) { unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1); x[nid*n+gid].x = x[nid*n+gid].x / (float)n; x[nid*n+gid].y = x[nid*n+gid].y / (float)n; } __global__ void spinFact(float2* w, int n) { unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); float2 angle; angle.x = (float)(2*i*PI/(float)n); angle.y = (float)((2*i*PI/(float)n) + PI_2); w[i].x = cos(angle.x); w[i].y = cos(angle.y); } __global__ void transpose(float2* dst, float2* src, int n) { unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long ygid = blockIdx.y*blockDim.y*threadIdx.y; // (unsigned long)s->get_param(1); unsigned int iid = ygid * n + xgid; unsigned int oid = xgid * n + ygid; dst[oid] = src[iid]; } }