Mercurial > hg > CbC > CbC_llvm
comparison llvm/test/CodeGen/AMDGPU/bswap.ll @ 150:1d019706d866
LLVM10
author | anatofuz
date | Thu, 13 Feb 2020 15:10:13 +0900
parents | (none)
children | 0572611fdcc8
comparison legend: equal · deleted · inserted · replaced
comparing revision 147:c2174574ed3a with revision 150:1d019706d866
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | |
2 ; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,GCN,SI | |
3 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,VI | |
4 | |
5 declare i16 @llvm.bswap.i16(i16) nounwind readnone | |
6 declare i32 @llvm.bswap.i32(i32) nounwind readnone | |
7 declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone | |
8 declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone | |
9 declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone | |
10 declare i64 @llvm.bswap.i64(i64) nounwind readnone | |
11 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone | |
12 declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone | |
13 | |
14 define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { | |
15 ; SI-LABEL: test_bswap_i32: | |
16 ; SI: ; %bb.0: | |
17 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | |
18 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
19 ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 | |
20 ; SI-NEXT: s_mov_b32 s3, 0xf000 | |
21 ; SI-NEXT: s_mov_b32 s2, -1 | |
22 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
23 ; SI-NEXT: v_alignbit_b32 v0, s4, s4, 8 | |
24 ; SI-NEXT: v_alignbit_b32 v1, s4, s4, 24 | |
25 ; SI-NEXT: s_mov_b32 s4, 0xff00ff | |
26 ; SI-NEXT: v_bfi_b32 v0, s4, v1, v0 | |
27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 | |
28 ; SI-NEXT: s_endpgm | |
29 ; | |
30 ; VI-LABEL: test_bswap_i32: | |
31 ; VI: ; %bb.0: | |
32 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
33 ; VI-NEXT: s_mov_b32 s3, 0xf000 | |
34 ; VI-NEXT: s_mov_b32 s2, -1 | |
35 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
36 ; VI-NEXT: s_mov_b32 s0, s4 | |
37 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0 | |
38 ; VI-NEXT: s_mov_b32 s1, s5 | |
39 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
40 ; VI-NEXT: v_alignbit_b32 v0, s4, s4, 8 | |
41 ; VI-NEXT: v_alignbit_b32 v1, s4, s4, 24 | |
42 ; VI-NEXT: s_mov_b32 s4, 0xff00ff | |
43 ; VI-NEXT: v_bfi_b32 v0, s4, v1, v0 | |
44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 | |
45 ; VI-NEXT: s_endpgm | |
46 %val = load i32, i32 addrspace(1)* %in, align 4 | |
47 %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone | |
48 store i32 %bswap, i32 addrspace(1)* %out, align 4 | |
49 ret void | |
50 } | |
51 | |
52 define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { | |
53 ; SI-LABEL: test_bswap_v2i32: | |
54 ; SI: ; %bb.0: | |
55 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | |
56 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
57 ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 | |
58 ; SI-NEXT: s_mov_b32 s3, 0xf000 | |
59 ; SI-NEXT: s_mov_b32 s2, -1 | |
60 ; SI-NEXT: s_mov_b32 s6, 0xff00ff | |
61 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
62 ; SI-NEXT: v_alignbit_b32 v0, s5, s5, 8 | |
63 ; SI-NEXT: v_alignbit_b32 v1, s5, s5, 24 | |
64 ; SI-NEXT: v_alignbit_b32 v2, s4, s4, 8 | |
65 ; SI-NEXT: v_alignbit_b32 v3, s4, s4, 24 | |
66 ; SI-NEXT: v_bfi_b32 v1, s6, v1, v0 | |
67 ; SI-NEXT: v_bfi_b32 v0, s6, v3, v2 | |
68 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 | |
69 ; SI-NEXT: s_endpgm | |
70 ; | |
71 ; VI-LABEL: test_bswap_v2i32: | |
72 ; VI: ; %bb.0: | |
73 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
74 ; VI-NEXT: s_mov_b32 s8, 0xff00ff | |
75 ; VI-NEXT: s_mov_b32 s3, 0xf000 | |
76 ; VI-NEXT: s_mov_b32 s2, -1 | |
77 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
78 ; VI-NEXT: s_mov_b32 s0, s4 | |
79 ; VI-NEXT: s_mov_b32 s1, s5 | |
80 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 | |
81 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
82 ; VI-NEXT: v_alignbit_b32 v0, s5, s5, 8 | |
83 ; VI-NEXT: v_alignbit_b32 v1, s5, s5, 24 | |
84 ; VI-NEXT: v_alignbit_b32 v2, s4, s4, 8 | |
85 ; VI-NEXT: v_alignbit_b32 v3, s4, s4, 24 | |
86 ; VI-NEXT: v_bfi_b32 v1, s8, v1, v0 | |
87 ; VI-NEXT: v_bfi_b32 v0, s8, v3, v2 | |
88 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 | |
89 ; VI-NEXT: s_endpgm | |
90 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 | |
91 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone | |
92 store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 | |
93 ret void | |
94 } | |
95 | |
96 define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { | |
97 ; SI-LABEL: test_bswap_v4i32: | |
98 ; SI: ; %bb.0: | |
99 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | |
100 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
101 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 | |
102 ; SI-NEXT: s_mov_b32 s3, 0xf000 | |
103 ; SI-NEXT: s_mov_b32 s2, -1 | |
104 ; SI-NEXT: s_mov_b32 s8, 0xff00ff | |
105 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
106 ; SI-NEXT: v_alignbit_b32 v0, s7, s7, 8 | |
107 ; SI-NEXT: v_alignbit_b32 v1, s7, s7, 24 | |
108 ; SI-NEXT: v_alignbit_b32 v2, s6, s6, 8 | |
109 ; SI-NEXT: v_alignbit_b32 v4, s6, s6, 24 | |
110 ; SI-NEXT: v_alignbit_b32 v5, s5, s5, 8 | |
111 ; SI-NEXT: v_alignbit_b32 v6, s5, s5, 24 | |
112 ; SI-NEXT: v_alignbit_b32 v7, s4, s4, 8 | |
113 ; SI-NEXT: v_alignbit_b32 v8, s4, s4, 24 | |
114 ; SI-NEXT: v_bfi_b32 v3, s8, v1, v0 | |
115 ; SI-NEXT: v_bfi_b32 v2, s8, v4, v2 | |
116 ; SI-NEXT: v_bfi_b32 v1, s8, v6, v5 | |
117 ; SI-NEXT: v_bfi_b32 v0, s8, v8, v7 | |
118 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 | |
119 ; SI-NEXT: s_endpgm | |
120 ; | |
121 ; VI-LABEL: test_bswap_v4i32: | |
122 ; VI: ; %bb.0: | |
123 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
124 ; VI-NEXT: s_mov_b32 s8, 0xff00ff | |
125 ; VI-NEXT: s_mov_b32 s3, 0xf000 | |
126 ; VI-NEXT: s_mov_b32 s2, -1 | |
127 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
128 ; VI-NEXT: s_mov_b32 s0, s4 | |
129 ; VI-NEXT: s_mov_b32 s1, s5 | |
130 ; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 | |
131 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
132 ; VI-NEXT: v_alignbit_b32 v0, s7, s7, 8 | |
133 ; VI-NEXT: v_alignbit_b32 v1, s7, s7, 24 | |
134 ; VI-NEXT: v_bfi_b32 v3, s8, v1, v0 | |
135 ; VI-NEXT: v_alignbit_b32 v2, s6, s6, 8 | |
136 ; VI-NEXT: v_alignbit_b32 v4, s6, s6, 24 | |
137 ; VI-NEXT: v_alignbit_b32 v0, s5, s5, 8 | |
138 ; VI-NEXT: v_alignbit_b32 v1, s5, s5, 24 | |
139 ; VI-NEXT: v_bfi_b32 v2, s8, v4, v2 | |
140 ; VI-NEXT: v_bfi_b32 v1, s8, v1, v0 | |
141 ; VI-NEXT: v_alignbit_b32 v0, s4, s4, 8 | |
142 ; VI-NEXT: v_alignbit_b32 v4, s4, s4, 24 | |
143 ; VI-NEXT: v_bfi_b32 v0, s8, v4, v0 | |
144 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 | |
145 ; VI-NEXT: s_endpgm | |
146 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 | |
147 %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone | |
148 store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 | |
149 ret void | |
150 } | |
151 | |
152 define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { | |
153 ; SI-LABEL: test_bswap_v8i32: | |
154 ; SI: ; %bb.0: | |
155 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 | |
156 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
157 ; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 | |
158 ; SI-NEXT: s_mov_b32 s11, 0xf000 | |
159 ; SI-NEXT: s_mov_b32 s10, -1 | |
160 ; SI-NEXT: s_mov_b32 s12, 0xff00ff | |
161 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
162 ; SI-NEXT: v_alignbit_b32 v0, s3, s3, 8 | |
163 ; SI-NEXT: v_alignbit_b32 v1, s3, s3, 24 | |
164 ; SI-NEXT: v_alignbit_b32 v2, s2, s2, 8 | |
165 ; SI-NEXT: v_alignbit_b32 v4, s2, s2, 24 | |
166 ; SI-NEXT: v_alignbit_b32 v5, s1, s1, 8 | |
167 ; SI-NEXT: v_alignbit_b32 v6, s1, s1, 24 | |
168 ; SI-NEXT: v_alignbit_b32 v7, s0, s0, 8 | |
169 ; SI-NEXT: v_alignbit_b32 v8, s0, s0, 24 | |
170 ; SI-NEXT: v_alignbit_b32 v9, s7, s7, 8 | |
171 ; SI-NEXT: v_alignbit_b32 v10, s7, s7, 24 | |
172 ; SI-NEXT: v_alignbit_b32 v11, s6, s6, 8 | |
173 ; SI-NEXT: v_alignbit_b32 v12, s6, s6, 24 | |
174 ; SI-NEXT: v_alignbit_b32 v13, s5, s5, 8 | |
175 ; SI-NEXT: v_alignbit_b32 v14, s5, s5, 24 | |
176 ; SI-NEXT: v_alignbit_b32 v15, s4, s4, 8 | |
177 ; SI-NEXT: v_alignbit_b32 v16, s4, s4, 24 | |
178 ; SI-NEXT: v_bfi_b32 v3, s12, v1, v0 | |
179 ; SI-NEXT: v_bfi_b32 v2, s12, v4, v2 | |
180 ; SI-NEXT: v_bfi_b32 v1, s12, v6, v5 | |
181 ; SI-NEXT: v_bfi_b32 v0, s12, v8, v7 | |
182 ; SI-NEXT: v_bfi_b32 v7, s12, v10, v9 | |
183 ; SI-NEXT: v_bfi_b32 v6, s12, v12, v11 | |
184 ; SI-NEXT: v_bfi_b32 v5, s12, v14, v13 | |
185 ; SI-NEXT: v_bfi_b32 v4, s12, v16, v15 | |
186 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 | |
187 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 | |
188 ; SI-NEXT: s_endpgm | |
189 ; | |
190 ; VI-LABEL: test_bswap_v8i32: | |
191 ; VI: ; %bb.0: | |
192 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
193 ; VI-NEXT: s_mov_b32 s12, 0xff00ff | |
194 ; VI-NEXT: s_mov_b32 s3, 0xf000 | |
195 ; VI-NEXT: s_mov_b32 s2, -1 | |
196 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
197 ; VI-NEXT: s_mov_b32 s0, s4 | |
198 ; VI-NEXT: s_mov_b32 s1, s5 | |
199 ; VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 | |
200 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
201 ; VI-NEXT: v_alignbit_b32 v0, s7, s7, 8 | |
202 ; VI-NEXT: v_alignbit_b32 v1, s7, s7, 24 | |
203 ; VI-NEXT: v_bfi_b32 v3, s12, v1, v0 | |
204 ; VI-NEXT: v_alignbit_b32 v2, s6, s6, 8 | |
205 ; VI-NEXT: v_alignbit_b32 v4, s6, s6, 24 | |
206 ; VI-NEXT: v_alignbit_b32 v0, s5, s5, 8 | |
207 ; VI-NEXT: v_alignbit_b32 v1, s5, s5, 24 | |
208 ; VI-NEXT: v_bfi_b32 v2, s12, v4, v2 | |
209 ; VI-NEXT: v_bfi_b32 v1, s12, v1, v0 | |
210 ; VI-NEXT: v_alignbit_b32 v0, s4, s4, 8 | |
211 ; VI-NEXT: v_alignbit_b32 v4, s4, s4, 24 | |
212 ; VI-NEXT: v_bfi_b32 v0, s12, v4, v0 | |
213 ; VI-NEXT: v_alignbit_b32 v4, s11, s11, 8 | |
214 ; VI-NEXT: v_alignbit_b32 v5, s11, s11, 24 | |
215 ; VI-NEXT: v_bfi_b32 v7, s12, v5, v4 | |
216 ; VI-NEXT: v_alignbit_b32 v4, s10, s10, 8 | |
217 ; VI-NEXT: v_alignbit_b32 v5, s10, s10, 24 | |
218 ; VI-NEXT: v_bfi_b32 v6, s12, v5, v4 | |
219 ; VI-NEXT: v_alignbit_b32 v4, s9, s9, 8 | |
220 ; VI-NEXT: v_alignbit_b32 v5, s9, s9, 24 | |
221 ; VI-NEXT: v_bfi_b32 v5, s12, v5, v4 | |
222 ; VI-NEXT: v_alignbit_b32 v4, s8, s8, 8 | |
223 ; VI-NEXT: v_alignbit_b32 v8, s8, s8, 24 | |
224 ; VI-NEXT: v_bfi_b32 v4, s12, v8, v4 | |
225 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 | |
226 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 | |
227 ; VI-NEXT: s_endpgm | |
228 %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 | |
229 %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone | |
230 store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 | |
231 ret void | |
232 } | |
233 | |
234 define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { | |
235 ; SI-LABEL: test_bswap_i64: | |
236 ; SI: ; %bb.0: | |
237 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | |
238 ; SI-NEXT: s_mov_b32 s3, 0xf000 | |
239 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
240 ; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 | |
241 ; SI-NEXT: s_mov_b32 s2, -1 | |
242 ; SI-NEXT: s_mov_b32 s19, 0xff0000 | |
243 ; SI-NEXT: s_mov_b32 s9, 0 | |
244 ; SI-NEXT: s_mov_b32 s15, 0xff00 | |
245 ; SI-NEXT: s_mov_b32 s11, s9 | |
246 ; SI-NEXT: s_mov_b32 s12, s9 | |
247 ; SI-NEXT: s_mov_b32 s14, s9 | |
248 ; SI-NEXT: s_mov_b32 s16, s9 | |
249 ; SI-NEXT: s_mov_b32 s18, s9 | |
250 ; SI-NEXT: s_mov_b32 s0, s4 | |
251 ; SI-NEXT: s_mov_b32 s1, s5 | |
252 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
253 ; SI-NEXT: v_mov_b32_e32 v0, s6 | |
254 ; SI-NEXT: v_alignbit_b32 v1, s7, v0, 24 | |
255 ; SI-NEXT: v_alignbit_b32 v0, s7, v0, 8 | |
256 ; SI-NEXT: s_lshr_b32 s8, s7, 24 | |
257 ; SI-NEXT: s_lshr_b32 s10, s7, 8 | |
258 ; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 8 | |
259 ; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 24 | |
260 ; SI-NEXT: s_lshl_b32 s17, s6, 24 | |
261 ; SI-NEXT: s_lshl_b32 s4, s6, 8 | |
262 ; SI-NEXT: v_and_b32_e32 v1, s19, v1 | |
263 ; SI-NEXT: v_and_b32_e32 v0, 0xff000000, v0 | |
264 ; SI-NEXT: s_and_b32 s10, s10, s15 | |
265 ; SI-NEXT: s_and_b32 s13, s5, 0xff | |
266 ; SI-NEXT: s_and_b32 s15, s21, s15 | |
267 ; SI-NEXT: s_and_b32 s19, s4, s19 | |
268 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 | |
269 ; SI-NEXT: s_or_b64 s[4:5], s[10:11], s[8:9] | |
270 ; SI-NEXT: s_or_b64 s[6:7], s[14:15], s[12:13] | |
271 ; SI-NEXT: s_or_b64 s[8:9], s[16:17], s[18:19] | |
272 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 | |
273 ; SI-NEXT: v_mov_b32_e32 v1, s5 | |
274 ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[6:7] | |
275 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 | |
276 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 | |
277 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 | |
278 ; SI-NEXT: s_endpgm | |
279 ; | |
280 ; VI-LABEL: test_bswap_i64: | |
281 ; VI: ; %bb.0: | |
282 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
283 ; VI-NEXT: s_mov_b32 s12, 0xff0000 | |
284 ; VI-NEXT: s_mov_b32 s3, 0xf000 | |
285 ; VI-NEXT: s_mov_b32 s2, -1 | |
286 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
287 ; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 | |
288 ; VI-NEXT: s_mov_b32 s1, s5 | |
289 ; VI-NEXT: s_mov_b32 s5, 0 | |
290 ; VI-NEXT: s_mov_b32 s0, s4 | |
291 ; VI-NEXT: s_mov_b32 s9, s5 | |
292 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
293 ; VI-NEXT: v_mov_b32_e32 v0, s6 | |
294 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, 24 | |
295 ; VI-NEXT: v_alignbit_b32 v0, s7, v0, 8 | |
296 ; VI-NEXT: s_bfe_u32 s8, s7, 0x80010 | |
297 ; VI-NEXT: v_and_b32_e32 v1, s12, v1 | |
298 ; VI-NEXT: v_and_b32_e32 v0, 0xff000000, v0 | |
299 ; VI-NEXT: s_lshr_b32 s4, s7, 24 | |
300 ; VI-NEXT: s_lshl_b32 s8, s8, 8 | |
301 ; VI-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5] | |
302 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 | |
303 ; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24 | |
304 ; VI-NEXT: v_or_b32_e32 v0, s8, v0 | |
305 ; VI-NEXT: v_mov_b32_e32 v1, s9 | |
306 ; VI-NEXT: s_lshl_b64 s[8:9], s[6:7], 8 | |
307 ; VI-NEXT: s_lshl_b32 s4, s6, 8 | |
308 ; VI-NEXT: s_and_b32 s9, s9, 0xff | |
309 ; VI-NEXT: s_mov_b32 s8, s5 | |
310 ; VI-NEXT: s_and_b32 s11, s11, 0xff00 | |
311 ; VI-NEXT: s_mov_b32 s10, s5 | |
312 ; VI-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] | |
313 ; VI-NEXT: s_lshl_b32 s11, s6, 24 | |
314 ; VI-NEXT: s_and_b32 s7, s4, s12 | |
315 ; VI-NEXT: s_mov_b32 s6, s5 | |
316 ; VI-NEXT: s_or_b64 s[4:5], s[10:11], s[6:7] | |
317 ; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] | |
318 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 | |
319 ; VI-NEXT: v_or_b32_e32 v1, s5, v1 | |
320 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 | |
321 ; VI-NEXT: s_endpgm | |
322 %val = load i64, i64 addrspace(1)* %in, align 8 | |
323 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone | |
324 store i64 %bswap, i64 addrspace(1)* %out, align 8 | |
325 ret void | |
326 } | |
327 | |
328 define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { | |
329 ; SI-LABEL: test_bswap_v2i64: | |
330 ; SI: ; %bb.0: | |
331 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | |
332 ; SI-NEXT: s_mov_b32 s3, 0xf000 | |
333 ; SI-NEXT: s_mov_b32 s2, -1 | |
334 ; SI-NEXT: s_mov_b32 s31, 0xff0000 | |
335 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
336 ; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 | |
337 ; SI-NEXT: s_mov_b32 s7, 0 | |
338 ; SI-NEXT: s_mov_b32 s22, 0xff000000 | |
339 ; SI-NEXT: s_mov_b32 s27, 0xff00 | |
340 ; SI-NEXT: s_movk_i32 s25, 0xff | |
341 ; SI-NEXT: s_mov_b32 s13, s7 | |
342 ; SI-NEXT: s_mov_b32 s14, s7 | |
343 ; SI-NEXT: s_mov_b32 s16, s7 | |
344 ; SI-NEXT: s_mov_b32 s18, s7 | |
345 ; SI-NEXT: s_mov_b32 s20, s7 | |
346 ; SI-NEXT: s_mov_b32 s23, s7 | |
347 ; SI-NEXT: s_mov_b32 s24, s7 | |
348 ; SI-NEXT: s_mov_b32 s26, s7 | |
349 ; SI-NEXT: s_mov_b32 s28, s7 | |
350 ; SI-NEXT: s_mov_b32 s30, s7 | |
351 ; SI-NEXT: s_mov_b32 s0, s4 | |
352 ; SI-NEXT: s_mov_b32 s1, s5 | |
353 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
354 ; SI-NEXT: v_mov_b32_e32 v0, s10 | |
355 ; SI-NEXT: v_alignbit_b32 v1, s11, v0, 24 | |
356 ; SI-NEXT: v_alignbit_b32 v0, s11, v0, 8 | |
357 ; SI-NEXT: s_lshr_b32 s6, s11, 24 | |
358 ; SI-NEXT: s_lshr_b32 s12, s11, 8 | |
359 ; SI-NEXT: s_lshl_b64 s[4:5], s[10:11], 8 | |
360 ; SI-NEXT: s_lshl_b64 s[32:33], s[10:11], 24 | |
361 ; SI-NEXT: s_lshl_b32 s19, s10, 24 | |
362 ; SI-NEXT: s_lshl_b32 s21, s10, 8 | |
363 ; SI-NEXT: v_mov_b32_e32 v2, s8 | |
364 ; SI-NEXT: v_alignbit_b32 v3, s9, v2, 24 | |
365 ; SI-NEXT: v_alignbit_b32 v2, s9, v2, 8 | |
366 ; SI-NEXT: s_lshr_b32 s32, s9, 8 | |
367 ; SI-NEXT: s_lshl_b64 s[10:11], s[8:9], 8 | |
368 ; SI-NEXT: s_and_b32 s15, s5, s25 | |
369 ; SI-NEXT: s_lshl_b64 s[4:5], s[8:9], 24 | |
370 ; SI-NEXT: s_lshl_b32 s29, s8, 24 | |
371 ; SI-NEXT: s_lshl_b32 s4, s8, 8 | |
372 ; SI-NEXT: v_and_b32_e32 v1, s31, v1 | |
373 ; SI-NEXT: v_and_b32_e32 v0, s22, v0 | |
374 ; SI-NEXT: s_and_b32 s12, s12, s27 | |
375 ; SI-NEXT: s_and_b32 s17, s33, s27 | |
376 ; SI-NEXT: s_and_b32 s21, s21, s31 | |
377 ; SI-NEXT: v_and_b32_e32 v3, s31, v3 | |
378 ; SI-NEXT: v_and_b32_e32 v2, s22, v2 | |
379 ; SI-NEXT: s_and_b32 s22, s32, s27 | |
380 ; SI-NEXT: s_and_b32 s25, s11, s25 | |
381 ; SI-NEXT: s_and_b32 s27, s5, s27 | |
382 ; SI-NEXT: s_and_b32 s31, s4, s31 | |
383 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 | |
384 ; SI-NEXT: s_or_b64 s[4:5], s[12:13], s[6:7] | |
385 ; SI-NEXT: s_or_b64 s[10:11], s[16:17], s[14:15] | |
386 ; SI-NEXT: s_or_b64 s[12:13], s[18:19], s[20:21] | |
387 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 | |
388 ; SI-NEXT: s_lshr_b32 s6, s9, 24 | |
389 ; SI-NEXT: s_or_b64 s[8:9], s[26:27], s[24:25] | |
390 ; SI-NEXT: s_or_b64 s[14:15], s[28:29], s[30:31] | |
391 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 | |
392 ; SI-NEXT: v_mov_b32_e32 v3, s5 | |
393 ; SI-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] | |
394 ; SI-NEXT: s_or_b64 s[6:7], s[22:23], s[6:7] | |
395 ; SI-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] | |
396 ; SI-NEXT: v_or_b32_e32 v2, s4, v0 | |
397 ; SI-NEXT: v_or_b32_e32 v3, s5, v3 | |
398 ; SI-NEXT: v_or_b32_e32 v0, s6, v1 | |
399 ; SI-NEXT: v_mov_b32_e32 v1, s7 | |
400 ; SI-NEXT: v_or_b32_e32 v0, s8, v0 | |
401 ; SI-NEXT: v_or_b32_e32 v1, s9, v1 | |
402 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 | |
403 ; SI-NEXT: s_endpgm | |
404 ; | |
405 ; VI-LABEL: test_bswap_v2i64: | |
406 ; VI: ; %bb.0: | |
407 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
408 ; VI-NEXT: s_mov_b32 s9, 0 | |
409 ; VI-NEXT: s_mov_b32 s14, 0xff0000 | |
410 ; VI-NEXT: s_mov_b32 s15, 0xff000000 | |
411 ; VI-NEXT: s_mov_b32 s11, s9 | |
412 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
413 ; VI-NEXT: s_mov_b32 s0, s4 | |
414 ; VI-NEXT: s_mov_b32 s1, s5 | |
415 ; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 | |
416 ; VI-NEXT: s_movk_i32 s16, 0xff | |
417 ; VI-NEXT: s_mov_b32 s17, 0xff00 | |
418 ; VI-NEXT: s_mov_b32 s3, 0xf000 | |
419 ; VI-NEXT: s_mov_b32 s2, -1 | |
420 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
421 ; VI-NEXT: v_mov_b32_e32 v0, s6 | |
422 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, 24 | |
423 ; VI-NEXT: v_alignbit_b32 v0, s7, v0, 8 | |
424 ; VI-NEXT: s_bfe_u32 s10, s7, 0x80010 | |
425 ; VI-NEXT: v_and_b32_e32 v1, s14, v1 | |
426 ; VI-NEXT: v_and_b32_e32 v0, s15, v0 | |
427 ; VI-NEXT: s_lshr_b32 s8, s7, 24 | |
428 ; VI-NEXT: s_lshl_b32 s10, s10, 8 | |
429 ; VI-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9] | |
430 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 | |
431 ; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 24 | |
432 ; VI-NEXT: v_or_b32_e32 v0, s10, v0 | |
433 ; VI-NEXT: v_mov_b32_e32 v1, s11 | |
434 ; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 8 | |
435 ; VI-NEXT: s_and_b32 s11, s11, s16 | |
436 ; VI-NEXT: s_mov_b32 s10, s9 | |
437 ; VI-NEXT: s_and_b32 s13, s13, s17 | |
438 ; VI-NEXT: s_mov_b32 s12, s9 | |
439 ; VI-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] | |
440 ; VI-NEXT: s_lshl_b32 s13, s6, 24 | |
441 ; VI-NEXT: s_lshl_b32 s6, s6, 8 | |
442 ; VI-NEXT: s_and_b32 s7, s6, s14 | |
443 ; VI-NEXT: s_mov_b32 s6, s9 | |
444 ; VI-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] | |
445 ; VI-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] | |
446 ; VI-NEXT: v_or_b32_e32 v2, s6, v0 | |
447 ; VI-NEXT: v_mov_b32_e32 v0, s4 | |
448 ; VI-NEXT: v_or_b32_e32 v3, s7, v1 | |
449 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 24 | |
450 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 8 | |
451 ; VI-NEXT: s_bfe_u32 s6, s5, 0x80010 | |
452 ; VI-NEXT: v_and_b32_e32 v1, s14, v1 | |
453 ; VI-NEXT: v_and_b32_e32 v0, s15, v0 | |
454 ; VI-NEXT: s_lshr_b32 s8, s5, 24 | |
455 ; VI-NEXT: s_lshl_b32 s6, s6, 8 | |
456 ; VI-NEXT: s_mov_b32 s7, s9 | |
457 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 | |
458 ; VI-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] | |
459 ; VI-NEXT: s_lshl_b64 s[10:11], s[4:5], 24 | |
460 ; VI-NEXT: v_or_b32_e32 v0, s6, v0 | |
461 ; VI-NEXT: v_mov_b32_e32 v1, s7 | |
462 ; VI-NEXT: s_lshl_b64 s[6:7], s[4:5], 8 | |
463 ; VI-NEXT: s_and_b32 s7, s7, s16 | |
464 ; VI-NEXT: s_mov_b32 s6, s9 | |
465 ; VI-NEXT: s_and_b32 s11, s11, s17 | |
466 ; VI-NEXT: s_mov_b32 s10, s9 | |
467 ; VI-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] | |
468 ; VI-NEXT: s_lshl_b32 s11, s4, 24 | |
469 ; VI-NEXT: s_lshl_b32 s4, s4, 8 | |
470 ; VI-NEXT: s_and_b32 s5, s4, s14 | |
471 ; VI-NEXT: s_mov_b32 s4, s9 | |
472 ; VI-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] | |
473 ; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] | |
474 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 | |
475 ; VI-NEXT: v_or_b32_e32 v1, s5, v1 | |
476 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 | |
477 ; VI-NEXT: s_endpgm | |
478 %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 | |
479 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone | |
480 store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 | |
481 ret void | |
482 } | |
483 | |
484 define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { | |
485 ; SI-LABEL: test_bswap_v4i64: | |
486 ; SI: ; %bb.0: | |
487 ; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x9 | |
488 ; SI-NEXT: s_mov_b32 s3, 0xf000 | |
489 ; SI-NEXT: s_mov_b32 s2, -1 | |
490 ; SI-NEXT: s_mov_b32 s31, 0xff0000 | |
491 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
492 ; SI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 | |
493 ; SI-NEXT: s_mov_b32 s27, 0xff000000 | |
494 ; SI-NEXT: s_mov_b32 s34, 0xff00 | |
495 ; SI-NEXT: s_mov_b32 s14, 0 | |
496 ; SI-NEXT: s_movk_i32 s36, 0xff | |
497 ; SI-NEXT: s_mov_b32 s16, s14 | |
498 ; SI-NEXT: s_mov_b32 s18, s14 | |
499 ; SI-NEXT: s_mov_b32 s20, s14 | |
500 ; SI-NEXT: s_mov_b32 s22, s14 | |
501 ; SI-NEXT: s_mov_b32 s24, s14 | |
502 ; SI-NEXT: s_mov_b32 s26, s14 | |
503 ; SI-NEXT: s_mov_b32 s28, s14 | |
504 ; SI-NEXT: s_mov_b32 s30, s14 | |
505 ; SI-NEXT: s_mov_b32 s0, s12 | |
506 ; SI-NEXT: s_mov_b32 s1, s13 | |
507 ; SI-NEXT: s_waitcnt lgkmcnt(0) | |
508 ; SI-NEXT: v_mov_b32_e32 v0, s6 | |
509 ; SI-NEXT: v_alignbit_b32 v1, s7, v0, 24 | |
510 ; SI-NEXT: v_alignbit_b32 v0, s7, v0, 8 | |
511 ; SI-NEXT: s_lshr_b32 s35, s7, 24 | |
512 ; SI-NEXT: s_lshr_b32 s37, s7, 8 | |
513 ; SI-NEXT: v_mov_b32_e32 v2, s4 | |
514 ; SI-NEXT: v_alignbit_b32 v3, s5, v2, 24 | |
515 ; SI-NEXT: v_alignbit_b32 v2, s5, v2, 8 | |
516 ; SI-NEXT: s_lshr_b32 s38, s5, 24 | |
517 ; SI-NEXT: s_lshr_b32 s39, s5, 8 | |
518 ; SI-NEXT: s_lshl_b64 s[12:13], s[6:7], 8 | |
519 ; SI-NEXT: s_lshl_b64 s[32:33], s[6:7], 24 | |
520 ; SI-NEXT: s_lshl_b32 s7, s6, 8 | |
521 ; SI-NEXT: s_and_b32 s15, s13, s36 | |
522 ; SI-NEXT: s_lshl_b64 s[12:13], s[4:5], 8 | |
523 ; SI-NEXT: s_and_b32 s17, s33, s34 | |
524 ; SI-NEXT: s_lshl_b64 s[32:33], s[4:5], 24 | |
525 ; SI-NEXT: s_lshl_b32 s5, s4, 8 | |
526 ; SI-NEXT: v_mov_b32_e32 v4, s10 | |
527 ; SI-NEXT: v_alignbit_b32 v5, s11, v4, 24 | |
528 ; SI-NEXT: v_alignbit_b32 v4, s11, v4, 8 | |
529 ; SI-NEXT: s_and_b32 s21, s33, s34 | |
530 ; SI-NEXT: s_lshl_b64 s[32:33], s[10:11], 24 | |
531 ; SI-NEXT: s_and_b32 s25, s33, s34 | |
532 ; SI-NEXT: s_lshl_b64 s[32:33], s[8:9], 24 | |
533 ; SI-NEXT: s_and_b32 s29, s33, s34 | |
534 ; SI-NEXT: s_lshr_b32 s12, s11, 24 | |
535 ; SI-NEXT: s_lshr_b32 s40, s11, 8 | |
536 ; SI-NEXT: v_mov_b32_e32 v6, s8 | |
537 ; SI-NEXT: v_alignbit_b32 v7, s9, v6, 24 | |
538 ; SI-NEXT: v_alignbit_b32 v6, s9, v6, 8 | |
539 ; SI-NEXT: s_and_b32 s19, s7, s31 | |
540 ; SI-NEXT: s_lshr_b32 s7, s9, 24 | |
541 ; SI-NEXT: s_and_b32 s23, s5, s31 | |
542 ; SI-NEXT: s_lshr_b32 s5, s9, 8 | |
543 ; SI-NEXT: v_and_b32_e32 v0, s27, v0 | |
544 ; SI-NEXT: v_and_b32_e32 v2, s27, v2 | |
545 ; SI-NEXT: v_and_b32_e32 v4, s27, v4 | |
546 ; SI-NEXT: v_and_b32_e32 v6, s27, v6 | |
547 ; SI-NEXT: s_lshl_b32 s27, s10, 8 | |
548 ; SI-NEXT: s_and_b32 s27, s27, s31 | |
549 ; SI-NEXT: s_lshl_b32 s32, s8, 8 | |
550 ; SI-NEXT: v_and_b32_e32 v1, s31, v1 | |
551 ; SI-NEXT: v_and_b32_e32 v3, s31, v3 | |
552 ; SI-NEXT: v_and_b32_e32 v5, s31, v5 | |
553 ; SI-NEXT: v_and_b32_e32 v7, s31, v7 | |
554 ; SI-NEXT: s_and_b32 s31, s32, s31 | |
555 ; SI-NEXT: s_lshl_b64 s[32:33], s[10:11], 8 | |
556 ; SI-NEXT: s_and_b32 s11, s37, s34 | |
557 ; SI-NEXT: s_and_b32 s32, s39, s34 | |
558 ; SI-NEXT: s_and_b32 s37, s40, s34 | |
559 ; SI-NEXT: s_and_b32 s5, s5, s34 | |
560 ; SI-NEXT: s_or_b32 s11, s11, s35 | |
561 ; SI-NEXT: s_lshl_b64 s[34:35], s[8:9], 8 | |
562 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 | |
563 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 | |
564 ; SI-NEXT: s_or_b32 s9, s32, s38 | |
565 ; SI-NEXT: s_or_b64 s[16:17], s[16:17], s[14:15] | |
566 ; SI-NEXT: s_lshl_b32 s15, s6, 24 | |
567 ; SI-NEXT: v_or_b32_e32 v3, v4, v5 | |
568 ; SI-NEXT: s_or_b32 s12, s37, s12 | |
569 ; SI-NEXT: v_or_b32_e32 v4, v6, v7 | |
570 ; SI-NEXT: s_or_b32 s32, s5, s7 | |
571 ; SI-NEXT: v_or_b32_e32 v2, s11, v0 | |
572 ; SI-NEXT: v_or_b32_e32 v0, s9, v1 | |
573 ; SI-NEXT: s_or_b64 s[6:7], s[14:15], s[18:19] | |
574 ; SI-NEXT: s_and_b32 s15, s13, s36 | |
575 ; SI-NEXT: v_or_b32_e32 v6, s12, v3 | |
576 ; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[16:17] | |
577 ; SI-NEXT: s_or_b64 s[12:13], s[20:21], s[14:15] | |
578 ; SI-NEXT: s_lshl_b32 s15, s4, 24 | |
579 ; SI-NEXT: s_or_b64 s[4:5], s[14:15], s[22:23] | |
580 ; SI-NEXT: s_and_b32 s15, s33, s36 | |
581 ; SI-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] | |
582 ; SI-NEXT: s_or_b64 s[12:13], s[24:25], s[14:15] | |
583 ; SI-NEXT: s_lshl_b32 s15, s10, 24 | |
584 ; SI-NEXT: s_or_b64 s[10:11], s[14:15], s[26:27] | |
585 ; SI-NEXT: s_and_b32 s15, s35, s36 | |
586 ; SI-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] | |
587 ; SI-NEXT: s_or_b64 s[12:13], s[28:29], s[14:15] | |
588 ; SI-NEXT: s_lshl_b32 s15, s8, 24 | |
589 ; SI-NEXT: s_or_b64 s[8:9], s[14:15], s[30:31] | |
590 ; SI-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] | |
591 ; SI-NEXT: v_or_b32_e32 v4, s32, v4 | |
592 ; SI-NEXT: v_mov_b32_e32 v3, s7 | |
593 ; SI-NEXT: v_mov_b32_e32 v1, s5 | |
594 ; SI-NEXT: v_mov_b32_e32 v7, s11 | |
595 ; SI-NEXT: v_mov_b32_e32 v5, s9 | |
596 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 | |
597 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 | |
598 ; SI-NEXT: s_endpgm | |
599 ; | |
600 ; VI-LABEL: test_bswap_v4i64: | |
601 ; VI: ; %bb.0: | |
602 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
603 ; VI-NEXT: s_mov_b32 s16, 0xff0000 | |
604 ; VI-NEXT: s_mov_b32 s17, 0xff000000 | |
605 ; VI-NEXT: s_movk_i32 s18, 0xff | |
606 ; VI-NEXT: s_mov_b32 s19, 0xff00 | |
607 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
608 ; VI-NEXT: s_mov_b32 s0, s4 | |
609 ; VI-NEXT: s_mov_b32 s1, s5 | |
610 ; VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 | |
611 ; VI-NEXT: s_mov_b32 s3, 0xf000 | |
612 ; VI-NEXT: s_mov_b32 s2, -1 | |
613 ; VI-NEXT: s_waitcnt lgkmcnt(0) | |
614 ; VI-NEXT: v_mov_b32_e32 v0, s6 | |
615 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, 24 | |
616 ; VI-NEXT: v_alignbit_b32 v0, s7, v0, 8 | |
617 ; VI-NEXT: s_bfe_u32 s13, s7, 0x80010 | |
618 ; VI-NEXT: v_and_b32_e32 v1, s16, v1 | |
619 ; VI-NEXT: v_and_b32_e32 v0, s17, v0 | |
620 ; VI-NEXT: s_lshr_b32 s12, s7, 24 | |
621 ; VI-NEXT: s_lshl_b32 s13, s13, 8 | |
622 ; VI-NEXT: s_or_b32 s12, s13, s12 | |
623 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 | |
624 ; VI-NEXT: v_or_b32_e32 v2, s12, v0 | |
625 ; VI-NEXT: v_mov_b32_e32 v0, s4 | |
626 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 24 | |
627 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 8 | |
628 ; VI-NEXT: s_bfe_u32 s13, s5, 0x80010 | |
629 ; VI-NEXT: v_and_b32_e32 v1, s16, v1 | |
630 ; VI-NEXT: v_and_b32_e32 v0, s17, v0 | |
631 ; VI-NEXT: s_lshr_b32 s12, s5, 24 | |
632 ; VI-NEXT: s_lshl_b32 s13, s13, 8 | |
633 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 | |
634 ; VI-NEXT: s_or_b32 s12, s13, s12 | |
635 ; VI-NEXT: v_or_b32_e32 v0, s12, v0 | |
636 ; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 8 | |
637 ; VI-NEXT: s_lshl_b64 s[14:15], s[6:7], 24 | |
638 ; VI-NEXT: s_mov_b32 s12, 0 | |
639 ; VI-NEXT: s_and_b32 s13, s13, s18 | |
640 ; VI-NEXT: s_and_b32 s15, s15, s19 | |
641 ; VI-NEXT: s_mov_b32 s14, s12 | |
642 ; VI-NEXT: s_or_b64 s[14:15], s[14:15], s[12:13] | |
643 ; VI-NEXT: s_lshl_b32 s13, s6, 24 | |
644 ; VI-NEXT: s_lshl_b32 s6, s6, 8 | |
645 ; VI-NEXT: s_and_b32 s7, s6, s16 | |
646 ; VI-NEXT: s_mov_b32 s6, s12 | |
647 ; VI-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] | |
648 ; VI-NEXT: s_or_b64 s[6:7], s[6:7], s[14:15] | |
649 ; VI-NEXT: s_lshl_b64 s[14:15], s[4:5], 8 | |
650 ; VI-NEXT: s_and_b32 s13, s15, s18 | |
651 ; VI-NEXT: s_lshl_b64 s[14:15], s[4:5], 24 | |
652 ; VI-NEXT: s_and_b32 s15, s15, s19 | |
653 ; VI-NEXT: s_mov_b32 s14, s12 | |
654 ; VI-NEXT: s_or_b64 s[14:15], s[14:15], s[12:13] | |
655 ; VI-NEXT: s_lshl_b32 s13, s4, 24 | |
656 ; VI-NEXT: s_lshl_b32 s4, s4, 8 | |
657 ; VI-NEXT: s_and_b32 s5, s4, s16 | |
658 ; VI-NEXT: s_mov_b32 s4, s12 | |
659 ; VI-NEXT: s_or_b64 s[4:5], s[12:13], s[4:5] | |
660 ; VI-NEXT: v_mov_b32_e32 v1, s10 | |
661 ; VI-NEXT: v_alignbit_b32 v3, s11, v1, 24 | |
662 ; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[14:15] | |
663 ; VI-NEXT: v_alignbit_b32 v1, s11, v1, 8 | |
664 ; VI-NEXT: s_bfe_u32 s6, s11, 0x80010 | |
665 ; VI-NEXT: v_and_b32_e32 v3, s16, v3 | |
666 ; VI-NEXT: v_and_b32_e32 v1, s17, v1 | |
667 ; VI-NEXT: s_lshr_b32 s4, s11, 24 | |
668 ; VI-NEXT: s_lshl_b32 s6, s6, 8 | |
669 ; VI-NEXT: s_or_b32 s4, s6, s4 | |
670 ; VI-NEXT: v_or_b32_e32 v1, v1, v3 | |
671 ; VI-NEXT: v_or_b32_e32 v6, s4, v1 | |
672 ; VI-NEXT: v_mov_b32_e32 v1, s8 | |
673 ; VI-NEXT: v_alignbit_b32 v3, s9, v1, 24 | |
674 ; VI-NEXT: v_alignbit_b32 v1, s9, v1, 8 | |
675 ; VI-NEXT: s_bfe_u32 s6, s9, 0x80010 | |
676 ; VI-NEXT: s_lshl_b64 s[14:15], s[10:11], 8 | |
677 ; VI-NEXT: v_and_b32_e32 v3, s16, v3 | |
678 ; VI-NEXT: v_and_b32_e32 v1, s17, v1 | |
679 ; VI-NEXT: s_lshr_b32 s4, s9, 24 | |
680 ; VI-NEXT: s_lshl_b32 s6, s6, 8 | |
681 ; VI-NEXT: v_or_b32_e32 v1, v1, v3 | |
682 ; VI-NEXT: s_or_b32 s4, s6, s4 | |
683 ; VI-NEXT: s_and_b32 s13, s15, s18 | |
684 ; VI-NEXT: s_lshl_b64 s[14:15], s[10:11], 24 | |
685 ; VI-NEXT: v_or_b32_e32 v4, s4, v1 | |
686 ; VI-NEXT: s_lshl_b32 s4, s10, 8 | |
687 ; VI-NEXT: s_and_b32 s15, s15, s19 | |
688 ; VI-NEXT: s_mov_b32 s14, s12 | |
689 ; VI-NEXT: s_or_b64 s[14:15], s[14:15], s[12:13] | |
690 ; VI-NEXT: s_lshl_b32 s13, s10, 24 | |
691 ; VI-NEXT: s_and_b32 s11, s4, s16 | |
692 ; VI-NEXT: s_mov_b32 s10, s12 | |
693 ; VI-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] | |
694 ; VI-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] | |
695 ; VI-NEXT: s_lshl_b64 s[14:15], s[8:9], 8 | |
696 ; VI-NEXT: s_and_b32 s13, s15, s18 | |
697 ; VI-NEXT: s_lshl_b64 s[14:15], s[8:9], 24 | |
698 ; VI-NEXT: s_lshl_b32 s4, s8, 8 | |
699 ; VI-NEXT: s_and_b32 s15, s15, s19 | |
700 ; VI-NEXT: s_mov_b32 s14, s12 | |
701 ; VI-NEXT: s_or_b64 s[14:15], s[14:15], s[12:13] | |
702 ; VI-NEXT: s_lshl_b32 s13, s8, 24 | |
703 ; VI-NEXT: s_and_b32 s9, s4, s16 | |
704 ; VI-NEXT: s_mov_b32 s8, s12 | |
705 ; VI-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] | |
706 ; VI-NEXT: s_or_b64 s[8:9], s[8:9], s[14:15] | |
707 ; VI-NEXT: v_mov_b32_e32 v5, s9 | |
708 ; VI-NEXT: v_mov_b32_e32 v7, s11 | |
709 ; VI-NEXT: v_mov_b32_e32 v1, s5 | |
710 ; VI-NEXT: v_mov_b32_e32 v3, s7 | |
711 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 | |
712 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 | |
713 ; VI-NEXT: s_endpgm | |
714 %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 | |
715 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone | |
716 store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 | |
717 ret void | |
718 } | |
719 | |
720 define float @missing_truncate_promote_bswap(i32 %arg) { | |
721 ; SI-LABEL: missing_truncate_promote_bswap: | |
722 ; SI: ; %bb.0: ; %bb | |
723 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
724 ; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 | |
725 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 | |
726 ; SI-NEXT: s_mov_b32 s4, 0xff00ff | |
727 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 | |
728 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 | |
729 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 | |
730 ; SI-NEXT: s_setpc_b64 s[30:31] | |
731 ; | |
732 ; VI-LABEL: missing_truncate_promote_bswap: | |
733 ; VI: ; %bb.0: ; %bb | |
734 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
735 ; VI-NEXT: v_alignbit_b32 v1, v0, v0, 8 | |
736 ; VI-NEXT: v_alignbit_b32 v0, v0, v0, 24 | |
737 ; VI-NEXT: s_mov_b32 s4, 0xff00ff | |
738 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 | |
739 ; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 | |
740 ; VI-NEXT: s_setpc_b64 s[30:31] | |
741 bb: | |
742 %tmp = trunc i32 %arg to i16 | |
743 %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp) | |
744 %tmp2 = bitcast i16 %tmp1 to half | |
745 %tmp3 = fpext half %tmp2 to float | |
746 ret float %tmp3 | |
747 } |