Mercurial > hg > Game > Cerium
annotate example/cuda_fft/fft.cu @ 2048:6796d85f3d6b draft
remove error
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 28 Jan 2016 00:05:49 +0900 |
parents | faaea4e1ce1c |
children |
rev | line source |
---|---|
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
1 extern "C" { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
2 |
2007 | 3 #define PI 3.14159265358979323846 |
4 #define PI_2 1.57079632679489661923 | |
5 | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
6 __global__ void |
2007 | 7 bitReverse(float2* dst, float2* src, int m, int n) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
8 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
9 unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
10 unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
11 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
12 unsigned int j = gid; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
13 j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
14 j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
15 j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
16 j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
17 j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
18 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
19 j >>= (32-m); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
20 |
2011 | 21 dst[nid*n+j] = src[nid*n+gid]; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
22 } |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
23 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
24 __global__ void |
2007 | 25 butterfly(float2* x, float2* w, int m, int n, int iter, unsigned int flag) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
26 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
27 unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
28 unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
29 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
30 int butterflySize = 1 << (iter-1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
31 int butterflyGrpDist = 1 << iter; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
32 int butterflyGrpNum = n >> iter; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
33 int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
34 int butterflyGrpOffset = gid & (butterflySize-1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
35 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
36 int a = nid * n + butterflyGrpBase + butterflyGrpOffset; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
37 int b = a + butterflySize; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
38 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
39 int l = butterflyGrpNum * butterflyGrpOffset; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
40 |
2007 | 41 float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
42 |
2011 | 43 xa = x[a]; |
44 xb = x[b]; | |
2007 | 45 xbxx.x = xbxx.y = xb.x; |
46 xbyy.x = xbyy.y = xb.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
47 |
2011 | 48 wab.x = (float)((unsigned int)w[l].x ^ (unsigned int)0x0); |
2007 | 49 wab.y = (float)((unsigned int)w[l].y ^ (unsigned int)flag); |
50 | |
51 wayx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x80000000); | |
52 wayx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x0); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
53 |
2007 | 54 wbyx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x0); |
55 wbyx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x80000000); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
56 |
2007 | 57 resa.x = xa.x + xbxx.x*wab.x + xbyy.x*wayx.x; |
58 resa.y = xa.y + xbxx.y*wab.y + xbyy.y*wayx.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
59 |
2007 | 60 resb.x = xa.x - xbxx.x*wab.x + xbyy.x*wbyx.x; |
61 resb.y = xa.y - xbxx.y*wab.y + xbyy.y*wbyx.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
62 |
2011 | 63 x[a] = resa; |
64 x[b] = resb; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
65 } |
2007 | 66 |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
67 __global__ void |
2007 | 68 highPassFilter(float2* image, int n, int radius) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
69 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
70 unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
71 unsigned long ygid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
72 |
2007 | 73 int2 n_2; |
74 n_2.x = n_2.y = n>>1; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
75 |
2007 | 76 int2 mask; |
77 mask.x = mask.y = n-1; | |
78 | |
79 int2 gid; | |
80 gid.x = (xgid + n_2.x) & mask.x; | |
81 gid.y = (ygid + n_2.y) & mask.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
82 |
2007 | 83 int2 diff; |
84 diff.x = n_2.x - gid.x; | |
85 diff.y = n_2.y - gid.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
86 |
2007 | 87 int2 diff2; |
88 diff2.x = diff.x * diff.x; | |
89 diff2.y = diff.y * diff.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
90 |
2007 | 91 int dist2 = diff2.x + diff2.y; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
92 |
2007 | 93 int2 window; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
94 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
95 if (dist2 < radius*radius) { |
2007 | 96 window.x = window.y = (int)0L; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
97 } else { |
2007 | 98 window.x = window.y = (int)-1L; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
99 } |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
100 |
2007 | 101 image[ygid*n+xgid].x = (float)((int)image[ygid*n+xgid].x & window.x); |
102 image[ygid*n+xgid].y = (float)((int)image[ygid*n+xgid].y & window.y); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
103 } |
2007 | 104 |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
105 __global__ void |
2007 | 106 norm(float2* x, int n) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
107 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
108 unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
109 unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
110 |
2007 | 111 x[nid*n+gid].x = x[nid*n+gid].x / (float)n; |
112 x[nid*n+gid].y = x[nid*n+gid].y / (float)n; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
113 } |
2007 | 114 |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
115 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
116 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
117 __global__ void |
2007 | 118 spinFact(float2* w, int n) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
119 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
120 unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
2007 | 121 float2 angle; |
122 angle.x = (float)(2*i*PI/(float)n); | |
123 angle.y = (float)((2*i*PI/(float)n) + PI_2); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
124 |
2007 | 125 w[i].x = cos(angle.x); |
126 w[i].y = cos(angle.y); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
127 } |
2007 | 128 |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
129 __global__ void |
2007 | 130 transpose(float2* dst, float2* src, int n) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
131 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
132 unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
133 unsigned long ygid = blockIdx.y*blockDim.y*threadIdx.y; // (unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
134 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
135 unsigned int iid = ygid * n + xgid; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
136 unsigned int oid = xgid * n + ygid; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
137 |
2011 | 138 dst[oid] = src[iid]; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
139 } |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
140 } |