Mercurial > hg > Game > Cerium
annotate example/cuda_fft/fft.cu @ 2007:bc2121b09cbc draft
kernel done
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 03 Jun 2014 16:02:06 +0900 |
parents | f6aa6d6a3fa2 |
children | 6fced32f85fd |
rev | line source |
---|---|
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
1 extern "C" { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
2 |
2007 | 3 #define PI 3.14159265358979323846 |
4 #define PI_2 1.57079632679489661923 | |
5 | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
6 __global__ void |
2007 | 7 bitReverse(float2* dst, float2* src, int m, int n) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
8 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
9 unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
10 unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
11 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
12 unsigned int j = gid; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
13 j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
14 j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
15 j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
16 j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
17 j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
18 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
19 j >>= (32-m); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
20 |
2007 | 21 dst[nid*n+j].x = src[nid*n+gid].x; |
22 dst[nid*n+j].y = src[nid*n+gid].y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
23 } |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
24 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
25 __global__ void |
2007 | 26 butterfly(float2* x, float2* w, int m, int n, int iter, unsigned int flag) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
27 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
28 unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
29 unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
30 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
31 int butterflySize = 1 << (iter-1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
32 int butterflyGrpDist = 1 << iter; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
33 int butterflyGrpNum = n >> iter; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
34 int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
35 int butterflyGrpOffset = gid & (butterflySize-1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
36 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
37 int a = nid * n + butterflyGrpBase + butterflyGrpOffset; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
38 int b = a + butterflySize; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
39 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
40 int l = butterflyGrpNum * butterflyGrpOffset; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
41 |
2007 | 42 float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
43 |
2007 | 44 xa.x = x[a].x; |
45 xa.y = x[a].y; | |
46 xb.x = x[b].x; | |
47 xb.y = x[b].y; | |
48 xbxx.x = xbxx.y = xb.x; | |
49 xbyy.x = xbyy.y = xb.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
50 |
2007 | 51 wab.x = (float)((unsigned int)w[l].x); |
52 wab.y = (float)((unsigned int)w[l].y ^ (unsigned int)flag); | |
53 | |
54 wayx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x80000000); | |
55 wayx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x0); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
56 |
2007 | 57 wbyx.x = (float)((unsigned int)wab.y ^ (unsigned int)0x0); |
58 wbyx.y = (float)((unsigned int)wab.x ^ (unsigned int)0x80000000); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
59 |
2007 | 60 resa.x = xa.x + xbxx.x*wab.x + xbyy.x*wayx.x; |
61 resa.y = xa.y + xbxx.y*wab.y + xbyy.y*wayx.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
62 |
2007 | 63 resb.x = xa.x - xbxx.x*wab.x + xbyy.x*wbyx.x; |
64 resb.y = xa.y - xbxx.y*wab.y + xbyy.y*wbyx.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
65 |
2007 | 66 x[a].x = resa.x; |
67 x[a].y = resa.y; | |
68 x[b].x = resb.x; | |
69 x[b].y = resb.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
70 } |
2007 | 71 |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
72 __global__ void |
2007 | 73 highPassFilter(float2* image, int n, int radius) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
74 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
75 unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
76 unsigned long ygid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
77 |
2007 | 78 int2 n_2; |
79 n_2.x = n_2.y = n>>1; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
80 |
2007 | 81 int2 mask; |
82 mask.x = mask.y = n-1; | |
83 | |
84 int2 gid; | |
85 gid.x = (xgid + n_2.x) & mask.x; | |
86 gid.y = (ygid + n_2.y) & mask.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
87 |
2007 | 88 int2 diff; |
89 diff.x = n_2.x - gid.x; | |
90 diff.y = n_2.y - gid.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
91 |
2007 | 92 int2 diff2; |
93 diff2.x = diff.x * diff.x; | |
94 diff2.y = diff.y * diff.y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
95 |
2007 | 96 int dist2 = diff2.x + diff2.y; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
97 |
2007 | 98 int2 window; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
99 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
100 if (dist2 < radius*radius) { |
2007 | 101 window.x = window.y = (int)0L; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
102 } else { |
2007 | 103 window.x = window.y = (int)-1L; |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
104 } |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
105 |
2007 | 106 image[ygid*n+xgid].x = (float)((int)image[ygid*n+xgid].x & window.x); |
107 image[ygid*n+xgid].y = (float)((int)image[ygid*n+xgid].y & window.y); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
108 } |
2007 | 109 |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
110 __global__ void |
2007 | 111 norm(float2* x, int n) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
112 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
113 unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
114 unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
115 |
2007 | 116 x[nid*n+gid].x = x[nid*n+gid].x / (float)n; |
117 x[nid*n+gid].y = x[nid*n+gid].y / (float)n; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
118 } |
2007 | 119 |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
120 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
121 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
122 __global__ void |
2007 | 123 spinFact(float2* w, int n) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
124 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
125 unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
126 |
2007 | 127 float2 angle; |
128 angle.x = (float)(2*i*PI/(float)n); | |
129 angle.y = (float)((2*i*PI/(float)n) + PI_2); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
130 |
2007 | 131 w[i].x = cos(angle.x); |
132 w[i].y = cos(angle.y); | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
133 } |
2007 | 134 |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
135 __global__ void |
2007 | 136 transpose(float2* dst, float2* src, int n) |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
137 { |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
138 unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
139 unsigned long ygid = blockIdx.y*blockDim.y*threadIdx.y; // (unsigned long)s->get_param(1); |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
140 |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
141 unsigned int iid = ygid * n + xgid; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
142 unsigned int oid = xgid * n + ygid; |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
143 |
2007 | 144 dst[oid].x = src[iid].x; |
145 dst[oid].y = src[iid].y; | |
2006
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
146 } |
f6aa6d6a3fa2
add fft using cuda, not running
Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
147 } |