; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,GCN,SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,VI

declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone

; Scalar i32 bswap: on both SI and VI this lowers to two v_alignbit_b32
; rotations (by 8 and 24) merged with v_bfi_b32 under mask 0xff00ff.
define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v1, s4, s4, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
; VI-NEXT:    v_alignbit_b32 v1, s4, s4, 24
; VI-NEXT:    s_mov_b32 s4, 0xff00ff
; VI-NEXT:    v_bfi_b32 v0, s4, v1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
  store i32 %bswap, i32 addrspace(1)* %out, align 4
  ret void
}

; <2 x i32> bswap: same alignbit/bfi pattern applied per element.
define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v1, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v2, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v3, s4, s4, 24
; SI-NEXT:    v_bfi_b32 v1, s6, v1, v0
; SI-NEXT:    v_bfi_b32 v0, s6, v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s8, 0xff00ff
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_alignbit_b32 v0, s5, s5, 8
; VI-NEXT:    v_alignbit_b32 v1, s5, s5, 24
; VI-NEXT:    v_alignbit_b32 v2, s4, s4, 8
; VI-NEXT:    v_alignbit_b32 v3, s4, s4, 24
; VI-NEXT:    v_bfi_b32 v1, s8, v1, v0
; VI-NEXT:    v_bfi_b32 v0, s8, v3, v2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
  store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> bswap: per-element alignbit/bfi; SI and VI differ only in
; scheduling and register assignment.
define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s8, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v1, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v2, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v4, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v5, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v6, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v7, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v8, s4, s4, 24
; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s8, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s8, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s8, v8, v7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s8, 0xff00ff
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_alignbit_b32 v0, s7, s7, 8
; VI-NEXT:    v_alignbit_b32 v1, s7, s7, 24
; VI-NEXT:    v_bfi_b32 v3, s8, v1, v0
; VI-NEXT:    v_alignbit_b32 v2, s6, s6, 8
; VI-NEXT:    v_alignbit_b32 v4, s6, s6, 24
; VI-NEXT:    v_alignbit_b32 v0, s5, s5, 8
; VI-NEXT:    v_alignbit_b32 v1, s5, s5, 24
; VI-NEXT:    v_bfi_b32 v2, s8, v4, v2
; VI-NEXT:    v_bfi_b32 v1, s8, v1, v0
; VI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
; VI-NEXT:    v_alignbit_b32 v4, s4, s4, 24
; VI-NEXT:    v_bfi_b32 v0, s8, v4, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
  store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; <8 x i32> bswap: per-element alignbit/bfi, stored as two dwordx4 halves.
define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v8i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s12, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s3, s3, 8
; SI-NEXT:    v_alignbit_b32 v1, s3, s3, 24
; SI-NEXT:    v_alignbit_b32 v2, s2, s2, 8
; SI-NEXT:    v_alignbit_b32 v4, s2, s2, 24
; SI-NEXT:    v_alignbit_b32 v5, s1, s1, 8
; SI-NEXT:    v_alignbit_b32 v6, s1, s1, 24
; SI-NEXT:    v_alignbit_b32 v7, s0, s0, 8
; SI-NEXT:    v_alignbit_b32 v8, s0, s0, 24
; SI-NEXT:    v_alignbit_b32 v9, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v10, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v11, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v12, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v13, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v14, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v15, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v16, s4, s4, 24
; SI-NEXT:    v_bfi_b32 v3, s12, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s12, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s12, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s12, v8, v7
; SI-NEXT:    v_bfi_b32 v7, s12, v10, v9
; SI-NEXT:    v_bfi_b32 v6, s12, v12, v11
; SI-NEXT:    v_bfi_b32 v5, s12, v14, v13
; SI-NEXT:    v_bfi_b32 v4, s12, v16, v15
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v8i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s12, 0xff00ff
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_alignbit_b32 v0, s7, s7, 8
; VI-NEXT:    v_alignbit_b32 v1, s7, s7, 24
; VI-NEXT:    v_bfi_b32 v3, s12, v1, v0
; VI-NEXT:    v_alignbit_b32 v2, s6, s6, 8
; VI-NEXT:    v_alignbit_b32 v4, s6, s6, 24
; VI-NEXT:    v_alignbit_b32 v0, s5, s5, 8
; VI-NEXT:    v_alignbit_b32 v1, s5, s5, 24
; VI-NEXT:    v_bfi_b32 v2, s12, v4, v2
; VI-NEXT:    v_bfi_b32 v1, s12, v1, v0
; VI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
; VI-NEXT:    v_alignbit_b32 v4, s4, s4, 24
; VI-NEXT:    v_bfi_b32 v0, s12, v4, v0
; VI-NEXT:    v_alignbit_b32 v4, s11, s11, 8
; VI-NEXT:    v_alignbit_b32 v5, s11, s11, 24
; VI-NEXT:    v_bfi_b32 v7, s12, v5, v4
; VI-NEXT:    v_alignbit_b32 v4, s10, s10, 8
; VI-NEXT:    v_alignbit_b32 v5, s10, s10, 24
; VI-NEXT:    v_bfi_b32 v6, s12, v5, v4
; VI-NEXT:    v_alignbit_b32 v4, s9, s9, 8
; VI-NEXT:    v_alignbit_b32 v5, s9, s9, 24
; VI-NEXT:    v_bfi_b32 v5, s12, v5, v4
; VI-NEXT:    v_alignbit_b32 v4, s8, s8, 8
; VI-NEXT:    v_alignbit_b32 v8, s8, s8, 24
; VI-NEXT:    v_bfi_b32 v4, s12, v8, v4
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
  %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
  store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

; Scalar i64 bswap: expanded into shift/and/or sequences over the 64-bit
; register pair rather than the compact 32-bit alignbit/bfi form.
define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s19, 0xff0000
; SI-NEXT:    s_mov_b32 s9, 0
; SI-NEXT:    s_mov_b32 s15, 0xff00
; SI-NEXT:    s_mov_b32 s11, s9
; SI-NEXT:    s_mov_b32 s12, s9
; SI-NEXT:    s_mov_b32 s14, s9
; SI-NEXT:    s_mov_b32 s16, s9
; SI-NEXT:    s_mov_b32 s18, s9
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 24
; SI-NEXT:    v_alignbit_b32 v0, s7, v0, 8
; SI-NEXT:    s_lshr_b32 s8, s7, 24
; SI-NEXT:    s_lshr_b32 s10, s7, 8
; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 8
; SI-NEXT:    s_lshl_b64 s[20:21], s[6:7], 24
; SI-NEXT:    s_lshl_b32 s17, s6, 24
; SI-NEXT:    s_lshl_b32 s4, s6, 8
; SI-NEXT:    v_and_b32_e32 v1, s19, v1
; SI-NEXT:    v_and_b32_e32 v0, 0xff000000, v0
; SI-NEXT:    s_and_b32 s10, s10, s15
; SI-NEXT:    s_and_b32 s13, s5, 0xff
; SI-NEXT:    s_and_b32 s15, s21, s15
; SI-NEXT:    s_and_b32 s19, s4, s19
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    s_or_b64 s[4:5], s[10:11], s[8:9]
; SI-NEXT:    s_or_b64 s[6:7], s[14:15], s[12:13]
; SI-NEXT:    s_or_b64 s[8:9], s[16:17], s[18:19]
; SI-NEXT:    v_or_b32_e32 v0, s4, v0
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[6:7]
; SI-NEXT:    v_or_b32_e32 v0, s4, v0
; SI-NEXT:    v_or_b32_e32 v1, s5, v1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s12, 0xff0000
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s5, 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s9, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, 24
; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 8
; VI-NEXT:    s_bfe_u32 s8, s7, 0x80010
; VI-NEXT:    v_and_b32_e32 v1, s12, v1
; VI-NEXT:    v_and_b32_e32 v0, 0xff000000, v0
; VI-NEXT:    s_lshr_b32 s4, s7, 24
; VI-NEXT:    s_lshl_b32 s8, s8, 8
; VI-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_lshl_b64 s[10:11], s[6:7], 24
; VI-NEXT:    v_or_b32_e32 v0, s8, v0
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    s_lshl_b64 s[8:9], s[6:7], 8
; VI-NEXT:    s_lshl_b32 s4, s6, 8
; VI-NEXT:    s_and_b32 s9, s9, 0xff
; VI-NEXT:    s_mov_b32 s8, s5
; VI-NEXT:    s_and_b32 s11, s11, 0xff00
; VI-NEXT:    s_mov_b32 s10, s5
; VI-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
; VI-NEXT:    s_lshl_b32 s11, s6, 24
; VI-NEXT:    s_and_b32 s7, s4, s12
; VI-NEXT:    s_mov_b32 s6, s5
; VI-NEXT:    s_or_b64 s[4:5], s[10:11], s[6:7]
; VI-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
; VI-NEXT:    v_or_b32_e32 v0, s4, v0
; VI-NEXT:    v_or_b32_e32 v1, s5, v1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load i64, i64 addrspace(1)* %in, align 8
  %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
  store i64 %bswap, i64 addrspace(1)* %out, align 8
  ret void
}

; <2 x i64> bswap: the i64 shift/and/or expansion applied to both elements.
define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s31, 0xff0000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
; SI-NEXT:    s_mov_b32 s7, 0
; SI-NEXT:    s_mov_b32 s22, 0xff000000
; SI-NEXT:    s_mov_b32 s27, 0xff00
; SI-NEXT:    s_movk_i32 s25, 0xff
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s14, s7
; SI-NEXT:    s_mov_b32 s16, s7
; SI-NEXT:    s_mov_b32 s18, s7
; SI-NEXT:    s_mov_b32 s20, s7
; SI-NEXT:    s_mov_b32 s23, s7
; SI-NEXT:    s_mov_b32 s24, s7
; SI-NEXT:    s_mov_b32 s26, s7
; SI-NEXT:    s_mov_b32 s28, s7
; SI-NEXT:    s_mov_b32 s30, s7
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_alignbit_b32 v1, s11, v0, 24
; SI-NEXT:    v_alignbit_b32 v0, s11, v0, 8
; SI-NEXT:    s_lshr_b32 s6, s11, 24
; SI-NEXT:    s_lshr_b32 s12, s11, 8
; SI-NEXT:    s_lshl_b64 s[4:5], s[10:11], 8
; SI-NEXT:    s_lshl_b64 s[32:33], s[10:11], 24
; SI-NEXT:    s_lshl_b32 s19, s10, 24
; SI-NEXT:    s_lshl_b32 s21, s10, 8
; SI-NEXT:    v_mov_b32_e32 v2, s8
; SI-NEXT:    v_alignbit_b32 v3, s9, v2, 24
; SI-NEXT:    v_alignbit_b32 v2, s9, v2, 8
; SI-NEXT:    s_lshr_b32 s32, s9, 8
; SI-NEXT:    s_lshl_b64 s[10:11], s[8:9], 8
; SI-NEXT:    s_and_b32 s15, s5, s25
; SI-NEXT:    s_lshl_b64 s[4:5], s[8:9], 24
; SI-NEXT:    s_lshl_b32 s29, s8, 24
; SI-NEXT:    s_lshl_b32 s4, s8, 8
; SI-NEXT:    v_and_b32_e32 v1, s31, v1
; SI-NEXT:    v_and_b32_e32 v0, s22, v0
; SI-NEXT:    s_and_b32 s12, s12, s27
; SI-NEXT:    s_and_b32 s17, s33, s27
; SI-NEXT:    s_and_b32 s21, s21, s31
; SI-NEXT:    v_and_b32_e32 v3, s31, v3
; SI-NEXT:    v_and_b32_e32 v2, s22, v2
; SI-NEXT:    s_and_b32 s22, s32, s27
; SI-NEXT:    s_and_b32 s25, s11, s25
; SI-NEXT:    s_and_b32 s27, s5, s27
; SI-NEXT:    s_and_b32 s31, s4, s31
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    s_or_b64 s[4:5], s[12:13], s[6:7]
; SI-NEXT:    s_or_b64 s[10:11], s[16:17], s[14:15]
; SI-NEXT:    s_or_b64 s[12:13], s[18:19], s[20:21]
; SI-NEXT:    v_or_b32_e32 v1, v2, v3
; SI-NEXT:    s_lshr_b32 s6, s9, 24
; SI-NEXT:    s_or_b64 s[8:9], s[26:27], s[24:25]
; SI-NEXT:    s_or_b64 s[14:15], s[28:29], s[30:31]
; SI-NEXT:    v_or_b32_e32 v0, s4, v0
; SI-NEXT:    v_mov_b32_e32 v3, s5
; SI-NEXT:    s_or_b64 s[4:5], s[12:13], s[10:11]
; SI-NEXT:    s_or_b64 s[6:7], s[22:23], s[6:7]
; SI-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
; SI-NEXT:    v_or_b32_e32 v2, s4, v0
; SI-NEXT:    v_or_b32_e32 v3, s5, v3
; SI-NEXT:    v_or_b32_e32 v0, s6, v1
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_or_b32_e32 v0, s8, v0
; SI-NEXT:    v_or_b32_e32 v1, s9, v1
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s9, 0
; VI-NEXT:    s_mov_b32 s14, 0xff0000
; VI-NEXT:    s_mov_b32 s15, 0xff000000
; VI-NEXT:    s_mov_b32 s11, s9
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
; VI-NEXT:    s_movk_i32 s16, 0xff
; VI-NEXT:    s_mov_b32 s17, 0xff00
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, 24
; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 8
; VI-NEXT:    s_bfe_u32 s10, s7, 0x80010
; VI-NEXT:    v_and_b32_e32 v1, s14, v1
; VI-NEXT:    v_and_b32_e32 v0, s15, v0
; VI-NEXT:    s_lshr_b32 s8, s7, 24
; VI-NEXT:    s_lshl_b32 s10, s10, 8
; VI-NEXT:    s_or_b64 s[10:11], s[10:11], s[8:9]
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_lshl_b64 s[12:13], s[6:7], 24
; VI-NEXT:    v_or_b32_e32 v0, s10, v0
; VI-NEXT:    v_mov_b32_e32 v1, s11
; VI-NEXT:    s_lshl_b64 s[10:11], s[6:7], 8
; VI-NEXT:    s_and_b32 s11, s11, s16
; VI-NEXT:    s_mov_b32 s10, s9
; VI-NEXT:    s_and_b32 s13, s13, s17
; VI-NEXT:    s_mov_b32 s12, s9
; VI-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
; VI-NEXT:    s_lshl_b32 s13, s6, 24
; VI-NEXT:    s_lshl_b32 s6, s6, 8
; VI-NEXT:    s_and_b32 s7, s6, s14
; VI-NEXT:    s_mov_b32 s6, s9
; VI-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
; VI-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
; VI-NEXT:    v_or_b32_e32 v2, s6, v0
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_or_b32_e32 v3, s7, v1
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 24
; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 8
; VI-NEXT:    s_bfe_u32 s6, s5, 0x80010
; VI-NEXT:    v_and_b32_e32 v1, s14, v1
; VI-NEXT:    v_and_b32_e32 v0, s15, v0
; VI-NEXT:    s_lshr_b32 s8, s5, 24
; VI-NEXT:    s_lshl_b32 s6, s6, 8
; VI-NEXT:    s_mov_b32 s7, s9
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
; VI-NEXT:    s_lshl_b64 s[10:11], s[4:5], 24
; VI-NEXT:    v_or_b32_e32 v0, s6, v0
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    s_lshl_b64 s[6:7], s[4:5], 8
; VI-NEXT:    s_and_b32 s7, s7, s16
; VI-NEXT:    s_mov_b32 s6, s9
; VI-NEXT:    s_and_b32 s11, s11, s17
; VI-NEXT:    s_mov_b32 s10, s9
; VI-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
; VI-NEXT:    s_lshl_b32 s11, s4, 24
; VI-NEXT:    s_lshl_b32 s4, s4, 8
; VI-NEXT:    s_and_b32 s5, s4, s14
; VI-NEXT:    s_mov_b32 s4, s9
; VI-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
; VI-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    v_or_b32_e32 v0, s4, v0
; VI-NEXT:    v_or_b32_e32 v1, s5, v1
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
  store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
  ret void
}

; <4 x i64> bswap: the i64 expansion over four elements, stored as two
; dwordx4 halves.
define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s31, 0xff0000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx8 s[4:11], s[14:15], 0x0
; SI-NEXT:    s_mov_b32 s27, 0xff000000
; SI-NEXT:    s_mov_b32 s34, 0xff00
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_movk_i32 s36, 0xff
; SI-NEXT:    s_mov_b32 s16, s14
; SI-NEXT:    s_mov_b32 s18, s14
; SI-NEXT:    s_mov_b32 s20, s14
; SI-NEXT:    s_mov_b32 s22, s14
; SI-NEXT:    s_mov_b32 s24, s14
; SI-NEXT:    s_mov_b32 s26, s14
; SI-NEXT:    s_mov_b32 s28, s14
; SI-NEXT:    s_mov_b32 s30, s14
; SI-NEXT:    s_mov_b32 s0, s12
; SI-NEXT:    s_mov_b32 s1, s13
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 24
; SI-NEXT:    v_alignbit_b32 v0, s7, v0, 8
; SI-NEXT:    s_lshr_b32 s35, s7, 24
; SI-NEXT:    s_lshr_b32 s37, s7, 8
; SI-NEXT:    v_mov_b32_e32 v2, s4
; SI-NEXT:    v_alignbit_b32 v3, s5, v2, 24
; SI-NEXT:    v_alignbit_b32 v2, s5, v2, 8
; SI-NEXT:    s_lshr_b32 s38, s5, 24
; SI-NEXT:    s_lshr_b32 s39, s5, 8
; SI-NEXT:    s_lshl_b64 s[12:13], s[6:7], 8
; SI-NEXT:    s_lshl_b64 s[32:33], s[6:7], 24
; SI-NEXT:    s_lshl_b32 s7, s6, 8
; SI-NEXT:    s_and_b32 s15, s13, s36
; SI-NEXT:    s_lshl_b64 s[12:13], s[4:5], 8
; SI-NEXT:    s_and_b32 s17, s33, s34
; SI-NEXT:    s_lshl_b64 s[32:33], s[4:5], 24
; SI-NEXT:    s_lshl_b32 s5, s4, 8
; SI-NEXT:    v_mov_b32_e32 v4, s10
; SI-NEXT:    v_alignbit_b32 v5, s11, v4, 24
; SI-NEXT:    v_alignbit_b32 v4, s11, v4, 8
; SI-NEXT:    s_and_b32 s21, s33, s34
; SI-NEXT:    s_lshl_b64 s[32:33], s[10:11], 24
; SI-NEXT:    s_and_b32 s25, s33, s34
; SI-NEXT:    s_lshl_b64 s[32:33], s[8:9], 24
; SI-NEXT:    s_and_b32 s29, s33, s34
; SI-NEXT:    s_lshr_b32 s12, s11, 24
; SI-NEXT:    s_lshr_b32 s40, s11, 8
; SI-NEXT:    v_mov_b32_e32 v6, s8
; SI-NEXT:    v_alignbit_b32 v7, s9, v6, 24
; SI-NEXT:    v_alignbit_b32 v6, s9, v6, 8
; SI-NEXT:    s_and_b32 s19, s7, s31
; SI-NEXT:    s_lshr_b32 s7, s9, 24
; SI-NEXT:    s_and_b32 s23, s5, s31
; SI-NEXT:    s_lshr_b32 s5, s9, 8
; SI-NEXT:    v_and_b32_e32 v0, s27, v0
; SI-NEXT:    v_and_b32_e32 v2, s27, v2
; SI-NEXT:    v_and_b32_e32 v4, s27, v4
; SI-NEXT:    v_and_b32_e32 v6, s27, v6
; SI-NEXT:    s_lshl_b32 s27, s10, 8
; SI-NEXT:    s_and_b32 s27, s27, s31
; SI-NEXT:    s_lshl_b32 s32, s8, 8
; SI-NEXT:    v_and_b32_e32 v1, s31, v1
; SI-NEXT:    v_and_b32_e32 v3, s31, v3
; SI-NEXT:    v_and_b32_e32 v5, s31, v5
; SI-NEXT:    v_and_b32_e32 v7, s31, v7
; SI-NEXT:    s_and_b32 s31, s32, s31
; SI-NEXT:    s_lshl_b64 s[32:33], s[10:11], 8
; SI-NEXT:    s_and_b32 s11, s37, s34
; SI-NEXT:    s_and_b32 s32, s39, s34
; SI-NEXT:    s_and_b32 s37, s40, s34
; SI-NEXT:    s_and_b32 s5, s5, s34
; SI-NEXT:    s_or_b32 s11, s11, s35
; SI-NEXT:    s_lshl_b64 s[34:35], s[8:9], 8
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_or_b32_e32 v1, v2, v3
; SI-NEXT:    s_or_b32 s9, s32, s38
; SI-NEXT:    s_or_b64 s[16:17], s[16:17], s[14:15]
; SI-NEXT:    s_lshl_b32 s15, s6, 24
; SI-NEXT:    v_or_b32_e32 v3, v4, v5
; SI-NEXT:    s_or_b32 s12, s37, s12
; SI-NEXT:    v_or_b32_e32 v4, v6, v7
; SI-NEXT:    s_or_b32 s32, s5, s7
; SI-NEXT:    v_or_b32_e32 v2, s11, v0
; SI-NEXT:    v_or_b32_e32 v0, s9, v1
; SI-NEXT:    s_or_b64 s[6:7], s[14:15], s[18:19]
; SI-NEXT:    s_and_b32 s15, s13, s36
; SI-NEXT:    v_or_b32_e32 v6, s12, v3
; SI-NEXT:    s_or_b64 s[6:7], s[6:7], s[16:17]
; SI-NEXT:    s_or_b64 s[12:13], s[20:21], s[14:15]
; SI-NEXT:    s_lshl_b32 s15, s4, 24
; SI-NEXT:    s_or_b64 s[4:5], s[14:15], s[22:23]
; SI-NEXT:    s_and_b32 s15, s33, s36
; SI-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
; SI-NEXT:    s_or_b64 s[12:13], s[24:25], s[14:15]
; SI-NEXT:    s_lshl_b32 s15, s10, 24
; SI-NEXT:    s_or_b64 s[10:11], s[14:15], s[26:27]
; SI-NEXT:    s_and_b32 s15, s35, s36
; SI-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
; SI-NEXT:    s_or_b64 s[12:13], s[28:29], s[14:15]
; SI-NEXT:    s_lshl_b32 s15, s8, 24
; SI-NEXT:    s_or_b64 s[8:9], s[14:15], s[30:31]
; SI-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
; SI-NEXT:    v_or_b32_e32 v4, s32, v4
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v7, s11
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s16, 0xff0000
; VI-NEXT:    s_mov_b32 s17, 0xff000000
; VI-NEXT:    s_movk_i32 s18, 0xff
; VI-NEXT:    s_mov_b32 s19, 0xff00
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, 24
; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 8
; VI-NEXT:    s_bfe_u32 s13, s7, 0x80010
; VI-NEXT:    v_and_b32_e32 v1, s16, v1
; VI-NEXT:    v_and_b32_e32 v0, s17, v0
; VI-NEXT:    s_lshr_b32 s12, s7, 24
; VI-NEXT:    s_lshl_b32 s13, s13, 8
; VI-NEXT:    s_or_b32 s12, s13, s12
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_or_b32_e32 v2, s12, v0
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 24
; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 8
; VI-NEXT:    s_bfe_u32 s13, s5, 0x80010
; VI-NEXT:    v_and_b32_e32 v1, s16, v1
; VI-NEXT:    v_and_b32_e32 v0, s17, v0
; VI-NEXT:    s_lshr_b32 s12, s5, 24
; VI-NEXT:    s_lshl_b32 s13, s13, 8
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_or_b32 s12, s13, s12
; VI-NEXT:    v_or_b32_e32 v0, s12, v0
; VI-NEXT:    s_lshl_b64 s[12:13], s[6:7], 8
; VI-NEXT:    s_lshl_b64 s[14:15], s[6:7], 24
; VI-NEXT:    s_mov_b32 s12, 0
; VI-NEXT:    s_and_b32 s13, s13, s18
; VI-NEXT:    s_and_b32 s15, s15, s19
; VI-NEXT:    s_mov_b32 s14, s12
; VI-NEXT:    s_or_b64 s[14:15], s[14:15], s[12:13]
; VI-NEXT:    s_lshl_b32 s13, s6, 24
; VI-NEXT:    s_lshl_b32 s6, s6, 8
; VI-NEXT:    s_and_b32 s7, s6, s16
; VI-NEXT:    s_mov_b32 s6, s12
; VI-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
; VI-NEXT:    s_or_b64 s[6:7], s[6:7], s[14:15]
; VI-NEXT:    s_lshl_b64 s[14:15], s[4:5], 8
; VI-NEXT:    s_and_b32 s13, s15, s18
; VI-NEXT:    s_lshl_b64 s[14:15], s[4:5], 24
; VI-NEXT:    s_and_b32 s15, s15, s19
; VI-NEXT:    s_mov_b32 s14, s12
; VI-NEXT:    s_or_b64 s[14:15], s[14:15], s[12:13]
; VI-NEXT:    s_lshl_b32 s13, s4, 24
; VI-NEXT:    s_lshl_b32 s4, s4, 8
; VI-NEXT:    s_and_b32 s5, s4, s16
; VI-NEXT:    s_mov_b32 s4, s12
; VI-NEXT:    s_or_b64 s[4:5], s[12:13], s[4:5]
; VI-NEXT:    v_mov_b32_e32 v1, s10
; VI-NEXT:    v_alignbit_b32 v3, s11, v1, 24
; VI-NEXT:    s_or_b64 s[4:5], s[4:5], s[14:15]
; VI-NEXT:    v_alignbit_b32 v1, s11, v1, 8
; VI-NEXT:    s_bfe_u32 s6, s11, 0x80010
; VI-NEXT:    v_and_b32_e32 v3, s16, v3
; VI-NEXT:    v_and_b32_e32 v1, s17, v1
; VI-NEXT:    s_lshr_b32 s4, s11, 24
; VI-NEXT:    s_lshl_b32 s6, s6, 8
; VI-NEXT:    s_or_b32 s4, s6, s4
; VI-NEXT:    v_or_b32_e32 v1, v1, v3
; VI-NEXT:    v_or_b32_e32 v6, s4, v1
; VI-NEXT:    v_mov_b32_e32 v1, s8
; VI-NEXT:    v_alignbit_b32 v3, s9, v1, 24
; VI-NEXT:    v_alignbit_b32 v1, s9, v1, 8
; VI-NEXT:    s_bfe_u32 s6, s9, 0x80010
; VI-NEXT:    s_lshl_b64 s[14:15], s[10:11], 8
; VI-NEXT:    v_and_b32_e32 v3, s16, v3
; VI-NEXT:    v_and_b32_e32 v1, s17, v1
; VI-NEXT:    s_lshr_b32 s4, s9, 24
; VI-NEXT:    s_lshl_b32 s6, s6, 8
; VI-NEXT:    v_or_b32_e32 v1, v1, v3
; VI-NEXT:    s_or_b32 s4, s6, s4
; VI-NEXT:    s_and_b32 s13, s15, s18
; VI-NEXT:    s_lshl_b64 s[14:15], s[10:11], 24
; VI-NEXT:    v_or_b32_e32 v4, s4, v1
; VI-NEXT:    s_lshl_b32 s4, s10, 8
; VI-NEXT:    s_and_b32 s15, s15, s19
; VI-NEXT:    s_mov_b32 s14, s12
; VI-NEXT:    s_or_b64 s[14:15], s[14:15], s[12:13]
; VI-NEXT:    s_lshl_b32 s13, s10, 24
; VI-NEXT:    s_and_b32 s11, s4, s16
; VI-NEXT:    s_mov_b32 s10, s12
; VI-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
; VI-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
; VI-NEXT:    s_lshl_b64 s[14:15], s[8:9], 8
; VI-NEXT:    s_and_b32 s13, s15, s18
; VI-NEXT:    s_lshl_b64 s[14:15], s[8:9], 24
; VI-NEXT:    s_lshl_b32 s4, s8, 8
; VI-NEXT:    s_and_b32 s15, s15, s19
; VI-NEXT:    s_mov_b32 s14, s12
; VI-NEXT:    s_or_b64 s[14:15], s[14:15], s[12:13]
; VI-NEXT:    s_lshl_b32 s13, s8, 24
; VI-NEXT:    s_and_b32 s9, s4, s16
; VI-NEXT:    s_mov_b32 s8, s12
; VI-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
; VI-NEXT:    s_or_b64 s[8:9], s[8:9], s[14:15]
; VI-NEXT:    v_mov_b32_e32 v5, s9
; VI-NEXT:    v_mov_b32_e32 v7, s11
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
  %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
  store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
  ret void
}

; i16 bswap on a truncated i32, promoted back through half->float; checks
; that the high-half extraction (lshr 16 on SI, sdwa WORD_1 on VI) is kept.
define float @missing_truncate_promote_bswap(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bswap:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: missing_truncate_promote_bswap:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; VI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; VI-NEXT:    s_mov_b32 s4, 0xff00ff
; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}