150
|
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
2 ; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
|
|
3 ; RUN: llc -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM %s
|
|
4
|
|
5 ; Loosely based on test/CodeGen/{X86,AArch64}/extract-lowbits.ll,
|
|
6 ; but with all 64-bit tests, and tests with loads dropped.
|
|
7
|
|
; Patterns:
;   a) x & ((1 << nbits) - 1)
;   b) x & ~(-1 << nbits)
;   c) x & (-1 >> (32 - nbits))
;   d) (x << (32 - nbits)) >> (32 - nbits)
; are equivalent (all right shifts are logical).
|
|
14
|
|
15 ; ---------------------------------------------------------------------------- ;
|
|
16 ; Pattern a. 32-bit
|
|
17 ; ---------------------------------------------------------------------------- ;
|
|
18
|
|
; Pattern a, baseline: %mask = (1 << %numlowbits) - 1, and %mask & %val is
; stored through %out. The autogenerated checks below expect the shl/add/and
; sequence to be selected as a single BFE_UINT under both check prefixes.
define amdgpu_kernel void @bzhi32_a0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_a0:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
;
; CM-LABEL: bzhi32_a0:
; CM: ; %bb.0:
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
  %onebit = shl i32 1, %numlowbits
  %mask = add nsw i32 %onebit, -1
  %masked = and i32 %mask, %val
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
47
|
|
; Pattern a with a narrow bit count: %numlowbits is an `i8 zeroext` argument
; that is zero-extended to i32 before being used as the shift amount.
; The autogenerated checks expect an 8-bit fetch of the argument followed by
; a BFE_UINT that performs the masking.
define amdgpu_kernel void @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_a1_indexzext:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: bzhi32_a1_indexzext:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W,
; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %conv = zext i8 %numlowbits to i32
  %onebit = shl i32 1, %conv
  %mask = add nsw i32 %onebit, -1
  %masked = and i32 %mask, %val
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
93
|
|
; Pattern a with the `and` operands commuted (%val first, %mask second).
; The autogenerated checks expect the same single BFE_UINT selection as the
; non-commuted form, i.e. operand order must not defeat the match.
define amdgpu_kernel void @bzhi32_a4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_a4_commutative:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
;
; CM-LABEL: bzhi32_a4_commutative:
; CM: ; %bb.0:
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
  %onebit = shl i32 1, %numlowbits
  %mask = add nsw i32 %onebit, -1
  %masked = and i32 %val, %mask ; swapped order
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
122
|
|
123 ; ---------------------------------------------------------------------------- ;
|
|
124 ; Pattern b. 32-bit
|
|
125 ; ---------------------------------------------------------------------------- ;
|
|
126
|
|
; Pattern b, baseline: %mask = ~(-1 << %numlowbits), with the bitwise-not
; spelled as `xor ..., -1`; %mask & %val is stored through %out.
; The autogenerated checks expect the shl/xor/and sequence to be selected as
; a single BFE_UINT under both check prefixes.
define amdgpu_kernel void @bzhi32_b0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_b0:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
;
; CM-LABEL: bzhi32_b0:
; CM: ; %bb.0:
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
  %notmask = shl i32 -1, %numlowbits
  %mask = xor i32 %notmask, -1
  %masked = and i32 %mask, %val
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
155
|
|
; Pattern b with a narrow bit count: %numlowbits is an `i8 zeroext` argument
; that is zero-extended to i32 before being used as the shift amount.
; The autogenerated checks expect an 8-bit fetch of the argument followed by
; a BFE_UINT that performs the masking.
define amdgpu_kernel void @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_b1_indexzext:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: bzhi32_b1_indexzext:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W,
; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %conv = zext i8 %numlowbits to i32
  %notmask = shl i32 -1, %conv
  %mask = xor i32 %notmask, -1
  %masked = and i32 %mask, %val
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
201
|
|
; Pattern b with the `and` operands commuted (%val first, %mask second).
; The autogenerated checks expect the same single BFE_UINT selection as the
; non-commuted form.
define amdgpu_kernel void @bzhi32_b4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_b4_commutative:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
;
; CM-LABEL: bzhi32_b4_commutative:
; CM: ; %bb.0:
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
  %notmask = shl i32 -1, %numlowbits
  %mask = xor i32 %notmask, -1
  %masked = and i32 %val, %mask ; swapped order
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
230
|
|
231 ; ---------------------------------------------------------------------------- ;
|
|
232 ; Pattern c. 32-bit
|
|
233 ; ---------------------------------------------------------------------------- ;
|
|
234
|
|
; Pattern c, baseline: %mask = -1 >> (32 - %numlowbits) using a logical
; shift right; %mask & %val is stored through %out.
; The autogenerated checks expect the sub/lshr/and sequence to be selected as
; a single BFE_UINT under both check prefixes.
define amdgpu_kernel void @bzhi32_c0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_c0:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
;
; CM-LABEL: bzhi32_c0:
; CM: ; %bb.0:
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
  %numhighbits = sub i32 32, %numlowbits
  %mask = lshr i32 -1, %numhighbits
  %masked = and i32 %mask, %val
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
263
|
|
; Pattern c with a narrow bit count: `32 - %numlowbits` is computed in i8 and
; only then zero-extended to i32 for use as the shift amount.
; The autogenerated checks show this form is NOT selected as a bitfield
; extract: the expected output keeps SUB_INT, an AND_INT with 255, the LSHR
; of -1 and a final AND_INT.
; NOTE(review): unlike the a1/b1 "indexzext" variants, %numlowbits carries no
; `zeroext` attribute here - confirm this is intentional.
define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_c1_indexzext:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: SUB_INT * T0.W, literal.x, T0.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
; EG-NEXT: -1(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.X, PV.W, KC0[2].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: bzhi32_c1_indexzext:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: SUB_INT * T0.W, literal.x, T0.X,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T0.W, PV.W, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: LSHR * T0.W, literal.x, PV.W,
; CM-NEXT: -1(nan), 0(0.000000e+00)
; CM-NEXT: AND_INT * T0.X, PV.W, KC0[2].Y,
; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %numhighbits = sub i8 32, %numlowbits
  %sh_prom = zext i8 %numhighbits to i32
  %mask = lshr i32 -1, %sh_prom
  %masked = and i32 %mask, %val
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
317
|
|
; Pattern c with the `and` operands commuted (%val first, %mask second).
; The autogenerated checks expect the same single BFE_UINT selection as the
; non-commuted form.
define amdgpu_kernel void @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_c4_commutative:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
;
; CM-LABEL: bzhi32_c4_commutative:
; CM: ; %bb.0:
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
  %numhighbits = sub i32 32, %numlowbits
  %mask = lshr i32 -1, %numhighbits
  %masked = and i32 %val, %mask ; swapped order
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
346
|
|
; ---------------------------------------------------------------------------- ;
; Pattern d. 32-bit
; ---------------------------------------------------------------------------- ;
|
|
350
|
|
; Pattern d, baseline: %val is shifted left by (32 - %numlowbits) and then
; logically shifted right by the same amount, which clears the high bits
; without forming an explicit mask; the result is stored through %out.
; The autogenerated checks expect the sub/shl/lshr sequence to be selected as
; a single BFE_UINT under both check prefixes.
define amdgpu_kernel void @bzhi32_d0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_d0:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
;
; CM-LABEL: bzhi32_d0:
; CM: ; %bb.0:
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
  %numhighbits = sub i32 32, %numlowbits
  %highbitscleared = shl i32 %val, %numhighbits
  %masked = lshr i32 %highbitscleared, %numhighbits
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|
|
379
|
|
; Pattern d with a narrow bit count: `32 - %numlowbits` is computed in i8 and
; only then zero-extended to i32 for use as both shift amounts.
; The autogenerated checks show this form is NOT selected as a bitfield
; extract: the expected output keeps SUB_INT, an AND_INT with 255, and the
; explicit LSHL / LSHR pair.
; NOTE(review): unlike the a1/b1 "indexzext" variants, %numlowbits carries no
; `zeroext` attribute here - confirm this is intentional.
define amdgpu_kernel void @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
; EG-LABEL: bzhi32_d1_indexzext:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: SUB_INT * T0.W, literal.x, T0.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
; EG-NEXT: LSHR T0.X, PV.W, T0.W,
; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: bzhi32_d1_indexzext:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: SUB_INT * T0.W, literal.x, T0.X,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T0.W, PV.W, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
; CM-NEXT: LSHR * T0.X, PV.W, T0.W,
; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %numhighbits = sub i8 32, %numlowbits
  %sh_prom = zext i8 %numhighbits to i32
  %highbitscleared = shl i32 %val, %sh_prom
  %masked = lshr i32 %highbitscleared, %sh_prom
  store i32 %masked, i32 addrspace(1)* %out
  ret void
}
|