Mercurial > hg > Game > Cerium
changeset 2007:bc2121b09cbc draft
kernel done
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 03 Jun 2014 16:02:06 +0900 |
parents | f6aa6d6a3fa2 |
children | 2c8eab01cc78 |
files | example/cuda_fft/Makefile.def example/cuda_fft/fft.cu example/cuda_fft/main.cc |
diffstat | 3 files changed, 67 insertions(+), 119 deletions(-) [+] |
line wrap: on
line diff
--- a/example/cuda_fft/Makefile.def Tue Jun 03 12:07:00 2014 +0900 +++ b/example/cuda_fft/Makefile.def Tue Jun 03 16:02:06 2014 +0900 @@ -1,4 +1,4 @@ -TARGET = multiply +TARGET = fft OPT = -g -O0
--- a/example/cuda_fft/fft.cu Tue Jun 03 12:07:00 2014 +0900 +++ b/example/cuda_fft/fft.cu Tue Jun 03 16:02:06 2014 +0900 @@ -1,16 +1,15 @@ extern "C" { +#define PI 3.14159265358979323846 +#define PI_2 1.57079632679489661923 + __global__ void - bitReverse(long* param, float* src, float* dst) + bitReverse(float2* dst, float2* src, int m, int n) { unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); unsigned int j = gid; - - unsigned long m = param[0]; - unsigned long n = param[1]; - j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; @@ -19,46 +18,16 @@ j >>= (32-m); - dst[(nid*n+j)*2] = src[(nid*n+gid)*2]; - dst[(nid*n+j)*2+1] = src[(nid*n+gid)*2+1]; + dst[nid*n+j].x = src[nid*n+gid].x; + dst[nid*n+j].y = src[nid*n+gid].y; } -} -extern "C" { __global__ void - bitReverse(long* param, float* src, float* dst) + butterfly(float2* x, float2* w, int m, int n, int iter, unsigned int flag) { unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); - unsigned int j = gid; - - unsigned long m = param[0]; - unsigned long n = param[1]; - - j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; - j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; - j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; - j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8; - j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16; - - j >>= (32-m); - - dst[(nid*n+j)*2] = src[(nid*n+gid)*2]; - dst[(nid*n+j)*2+1] = src[(nid*n+gid)*2+1]; - } -} -extern "C" { - __global__ void - butterfly(long* param, float* x_in, float* w, float* x_out) - { - unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); - unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); - - long n = param[0]; - long direction_flag = param[1]; - long iter = param[2]; - int butterflySize = 1 << (iter-1); int butterflyGrpDist = 1 << iter; int butterflyGrpNum = n >> iter; @@ -70,129 +39,109 @@ int l = butterflyGrpNum * butterflyGrpOffset; - float xa[2], xb[2], xbxx[2], xbyy[2], wab[2], wayx[2], wbyx[2], resa[2], resb[2]; - - xa[0] = x_in[2*a]; - xa[1] = x_in[2*a+1]; - xb[0] = x_in[2*b]; - xb[1] = x_in[2*b+1]; - xbxx[0] = xbxx[1] = xb[0]; - xbyy[0] = xbyy[1] = xb[1]; + float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb; - wab[0] = w[2*l]; - if(direction_flag == 0x80000000) { - wab[1] = -w[2*l+1]; - } else { - wab[1] = w[2*l+1]; - } + xa.x = x[a].x; + xa.y = x[a].y; + xb.x = x[b].x; + xb.y = x[b].y; + xbxx.x = xbxx.y = xb.x; + xbyy.x = xbyy.y = xb.y; - wayx[0] = -wab[1]; - wayx[1] = wab[0]; + wab.x = (float)((unsigned int)w[l].x); + wab.y = (float)((unsigned int)w[l].y ^ (unsigned int)flag); + + wayx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x80000000); + wayx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x0); - wbyx[0] = wab[1]; - wbyx[1] = -wab[0]; + wbyx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x0); + wbyx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x80000000); - resa[0] = xa[0] + xbxx[0]*wab[0] + xbyy[0]*wayx[0]; - resa[1] = xa[1] + xbxx[1]*wab[1] + xbyy[1]*wayx[1]; + resa.x = xa.x + xbxx.x*wab.x + xbyy.x*wayx.x; + resa.y = xa.y + xbxx.y*wab.y + xbyy.y*wayx.y; - resb[0] = xa[0] - xbxx[0]*wab[0] + xbyy[0]*wbyx[0]; - resb[1] = xa[1] - xbxx[1]*wab[1] + xbyy[1]*wbyx[1]; + resb.x = xa.x - xbxx.x*wab.x + xbyy.x*wbyx.x; + resb.y = xa.y - xbxx.y*wab.y + xbyy.y*wbyx.y; - x_out[2*a] = resa[0]; - x_out[2*a+1] = resa[1]; - x_out[2*b] = resb[0]; - x_out[2*b+1] = resb[1]; + x[a].x = resa.x; + x[a].y = resa.y; + x[b].x = resb.x; + x[b].y = resb.y; } -} -extern "C" { + __global__ void - highPassFilter(long* param, float* in, float* image) + highPassFilter(float2* image, int n, int radius) { unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long ygid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); - long n = param[0]; - long radius = param[1]; - - int n_2[2]; - n_2[0] = n_2[1] = n>>1; + int2 n_2; + n_2.x = n_2.y = n>>1; - int mask[2]; - mask[0] = mask[1] = n-1; + int2 mask; + mask.x = mask.y = n-1; + + int2 gid; + gid.x = (xgid + n_2.x) & mask.x; + gid.y = (ygid + n_2.y) & mask.y; - int gid[2]; - gid[0] = (xgid + n_2[0]) & mask[0]; - gid[1] = (ygid + n_2[1]) & mask[1]; - - int diff[2]; - diff[0] = n_2[0] - gid[0]; - diff[1] = n_2[1] - gid[1]; + int2 diff; + diff.x = n_2.x - gid.x; + diff.y = n_2.y - gid.y; - int diff2[2]; - diff2[0] = diff[0] * diff[0]; - diff2[1] = diff[1] * diff[1]; + int2 diff2; + diff2.x = diff.x * diff.x; + diff2.y = diff.y * diff.y; - int dist2 = diff2[0] + diff2[1]; + int dist2 = diff2.x + diff2.y; - int window[2]; + int2 window; if (dist2 < radius*radius) { - window[0] = window[1] = (int)0L; + window.x = window.y = (int)0L; } else { - window[0] = window[1] = (int)-1L; + window.x = window.y = (int)-1L; } - image[(ygid*n+xgid)*2] = (float)((int)in[(ygid*n+xgid)*2] & window[0]); - image[(ygid*n+xgid)*2+1] = (float)((int)in[(ygid*n+xgid)*2+1] & window[1]); + image[ygid*n+xgid].x = (float)((int)image[ygid*n+xgid].x & window.x); + image[ygid*n+xgid].y = (float)((int)image[ygid*n+xgid].y & window.y); } -} -extern "C" { + __global__ void - norm(long* param, float* in_x,float* out_x) + norm(float2* x, int n) { unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1); - long n = param[0]; - - out_x[(nid*n+gid)*2] = in_x[(nid*n+gid)*2] / (float)n; - out_x[(nid*n+gid)*2+1] = in_x[(nid*n+gid)*2+1] / (float)n; + x[nid*n+gid].x = x[nid*n+gid].x / (float)n; + x[nid*n+gid].y = x[nid*n+gid].y / (float)n; } -} -extern "C" { -#include <math.h> + -#define PI 3.14159265358979323846 -#define PI_2 1.57079632679489661923 __global__ void - spinFact(long* param, float* w) + spinFact(float2* w, int n) { unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); - long n = param[0]; - - float angle[2]; - angle[0] = (float)(2*i*PI/(float)n); - angle[1] = (float)((2*i*PI/(float)n) + PI_2); + float2 angle; + angle.x = (float)(2*i*PI/(float)n); + angle.y = (float)((2*i*PI/(float)n) + PI_2); - w[2*i] = cos(angle[0]); - w[2*i+1] = cos(angle[1]); + w[i].x = cos(angle.x); + w[i].y = cos(angle.y); } -} -extern "C" { + __global__ void - transpose(long* param, float* src, float* dst) + transpose(float2* dst, float2* src, int n) { unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); unsigned long ygid = blockIdx.y*blockDim.y*threadIdx.y; // (unsigned long)s->get_param(1); - long n = param[0]; - unsigned int iid = ygid * n + xgid; unsigned int oid = xgid * n + ygid; - dst[2*oid] = src[2*iid]; - dst[2*oid+1] = src[2*iid+1]; + dst[oid].x = src[iid].x; + dst[oid].y = src[iid].y; } }