comparison llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @ 150:1d019706d866

LLVM10
author anatofuz
date Thu, 13 Feb 2020 15:10:13 +0900
parents
children 0572611fdcc8
comparison
equal deleted inserted replaced
147:c2174574ed3a 150:1d019706d866
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
4 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
5
; s_shl_v2i16: shl where both <2 x i16> operands are kernel arguments
; (%lhs shifted left by %rhs); the result is stored to %out.
; Expected code per check prefix below:
;   GFX9 - a single packed v_pk_lshlrev_b16 on the two loaded dwords.
;   VI and CI - the 16-bit halves are separated with s_lshr_b32 / s_and_b32,
;     shifted individually with s_lshl_b32, then recombined with s_or_b32.
6 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
7 ; GFX9-LABEL: s_shl_v2i16:
8 ; GFX9: ; %bb.0:
9 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
10 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
11 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30
12 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
13 ; GFX9-NEXT: s_mov_b32 s6, -1
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
16 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0
17 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
18 ; GFX9-NEXT: s_endpgm
19 ;
20 ; VI-LABEL: s_shl_v2i16:
21 ; VI: ; %bb.0:
22 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
23 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
24 ; VI-NEXT: s_load_dword s0, s[0:1], 0x30
25 ; VI-NEXT: s_mov_b32 s3, 0xffff
26 ; VI-NEXT: s_mov_b32 s7, 0xf000
27 ; VI-NEXT: s_mov_b32 s6, -1
28 ; VI-NEXT: s_waitcnt lgkmcnt(0)
29 ; VI-NEXT: s_lshr_b32 s1, s2, 16
30 ; VI-NEXT: s_lshr_b32 s8, s0, 16
31 ; VI-NEXT: s_and_b32 s2, s2, s3
32 ; VI-NEXT: s_and_b32 s0, s0, s3
33 ; VI-NEXT: s_lshl_b32 s0, s2, s0
34 ; VI-NEXT: s_lshl_b32 s1, s1, s8
35 ; VI-NEXT: s_lshl_b32 s1, s1, 16
36 ; VI-NEXT: s_and_b32 s0, s0, s3
37 ; VI-NEXT: s_or_b32 s0, s0, s1
38 ; VI-NEXT: v_mov_b32_e32 v0, s0
39 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
40 ; VI-NEXT: s_endpgm
41 ;
42 ; CI-LABEL: s_shl_v2i16:
43 ; CI: ; %bb.0:
44 ; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
45 ; CI-NEXT: s_load_dword s2, s[0:1], 0xb
46 ; CI-NEXT: s_load_dword s0, s[0:1], 0xc
47 ; CI-NEXT: s_mov_b32 s3, 0xffff
48 ; CI-NEXT: s_mov_b32 s7, 0xf000
49 ; CI-NEXT: s_mov_b32 s6, -1
50 ; CI-NEXT: s_waitcnt lgkmcnt(0)
51 ; CI-NEXT: s_lshr_b32 s1, s2, 16
52 ; CI-NEXT: s_and_b32 s8, s0, s3
53 ; CI-NEXT: s_lshr_b32 s0, s0, 16
54 ; CI-NEXT: s_lshl_b32 s0, s1, s0
55 ; CI-NEXT: s_lshl_b32 s1, s2, s8
56 ; CI-NEXT: s_lshl_b32 s0, s0, 16
57 ; CI-NEXT: s_and_b32 s1, s1, s3
58 ; CI-NEXT: s_or_b32 s0, s1, s0
59 ; CI-NEXT: v_mov_b32_e32 v0, s0
60 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
61 ; CI-NEXT: s_endpgm
; IR under test: one vector shl of the two by-value arguments, then a store.
62 %result = shl <2 x i16> %lhs, %rhs
63 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
64 ret void
65 }
66
; v_shl_v2i16: shl where both <2 x i16> operands are loaded from memory.
; Each work item (indexed by llvm.amdgcn.workitem.id.x) loads the vector at
; %in[tid] and the vector immediately after it, shifts the first by the
; second, and stores the result to %out[tid].
; Expected code per check prefix below:
;   GFX9 - one packed v_pk_lshlrev_b16.
;   VI - v_lshlrev_b16_e32 for the low half plus v_lshlrev_b16_sdwa selecting
;     WORD_1 for the high half, combined with v_or_b32.
;   CI - 32-bit v_lshl_b32 shifts on the separated halves with 0xffff masking.
67 define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
68 ; GFX9-LABEL: v_shl_v2i16:
69 ; GFX9: ; %bb.0:
70 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
71 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
74 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
75 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
76 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
77 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
78 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
79 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
80 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
81 ; GFX9-NEXT: s_waitcnt vmcnt(0)
82 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3
83 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
84 ; GFX9-NEXT: s_endpgm
85 ;
86 ; VI-LABEL: v_shl_v2i16:
87 ; VI: ; %bb.0:
88 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
89 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
90 ; VI-NEXT: s_waitcnt lgkmcnt(0)
91 ; VI-NEXT: v_mov_b32_e32 v1, s3
92 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
93 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
94 ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
95 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
96 ; VI-NEXT: flat_load_dword v5, v[0:1]
97 ; VI-NEXT: flat_load_dword v2, v[2:3]
98 ; VI-NEXT: v_mov_b32_e32 v1, s1
99 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
100 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
101 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
102 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5
103 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
104 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
105 ; VI-NEXT: flat_store_dword v[0:1], v2
106 ; VI-NEXT: s_endpgm
107 ;
108 ; CI-LABEL: v_shl_v2i16:
109 ; CI: ; %bb.0:
110 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
111 ; CI-NEXT: s_mov_b32 s7, 0xf000
112 ; CI-NEXT: s_mov_b32 s6, 0
113 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
114 ; CI-NEXT: v_mov_b32_e32 v1, 0
115 ; CI-NEXT: s_waitcnt lgkmcnt(0)
116 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
117 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
118 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
119 ; CI-NEXT: s_mov_b32 s8, 0xffff
120 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
121 ; CI-NEXT: s_waitcnt vmcnt(1)
122 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
123 ; CI-NEXT: s_waitcnt vmcnt(0)
124 ; CI-NEXT: v_and_b32_e32 v5, s8, v3
125 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
126 ; CI-NEXT: v_lshl_b32_e32 v3, v4, v3
127 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v5
128 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
129 ; CI-NEXT: v_and_b32_e32 v2, s8, v2
130 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
131 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
132 ; CI-NEXT: s_endpgm
; IR under test: %a is the vector at %in[tid]; %b is the next vector, one
; element past %in.gep (that gep, unlike the others, is not inbounds).
133 %tid = call i32 @llvm.amdgcn.workitem.id.x()
134 %tid.ext = sext i32 %tid to i64
135 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
136 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
137 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
138 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
139 %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
140 %result = shl <2 x i16> %a, %b
141 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
142 ret void
143 }
144
; shl_v_s_v2i16: the shifted value is loaded from %in[tid] and the shift
; amount is the by-value kernel argument %sgpr; the result goes to %out[tid].
; Expected code per check prefix below:
;   GFX9 - v_pk_lshlrev_b16 with the scalar register (the loaded argument) as
;     the first source operand and the loaded vector register as the second.
;   VI - s_lshr_b32 extracts the high half of the argument, which is copied to
;     a vector register for the v_lshlrev_b16_sdwa high-half shift.
;   CI - the argument is split with s_lshr_b32 / s_and_b32 and each half of
;     the loaded value is shifted with a 32-bit v_lshlrev_b32.
145 define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
146 ; GFX9-LABEL: shl_v_s_v2i16:
147 ; GFX9: ; %bb.0:
148 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
149 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
150 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
151 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
153 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
154 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
155 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
156 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
157 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
158 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
159 ; GFX9-NEXT: s_waitcnt vmcnt(0)
160 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3
161 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
162 ; GFX9-NEXT: s_endpgm
163 ;
164 ; VI-LABEL: shl_v_s_v2i16:
165 ; VI: ; %bb.0:
166 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
167 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
168 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
169 ; VI-NEXT: s_waitcnt lgkmcnt(0)
170 ; VI-NEXT: v_mov_b32_e32 v1, s7
171 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
172 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
173 ; VI-NEXT: flat_load_dword v3, v[0:1]
174 ; VI-NEXT: s_lshr_b32 s1, s0, 16
175 ; VI-NEXT: v_mov_b32_e32 v4, s1
176 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
177 ; VI-NEXT: v_mov_b32_e32 v1, s5
178 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
179 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
180 ; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3
181 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
182 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
183 ; VI-NEXT: flat_store_dword v[0:1], v2
184 ; VI-NEXT: s_endpgm
185 ;
186 ; CI-LABEL: shl_v_s_v2i16:
187 ; CI: ; %bb.0:
188 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
189 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd
190 ; CI-NEXT: s_mov_b32 s3, 0xf000
191 ; CI-NEXT: s_mov_b32 s2, 0
192 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
193 ; CI-NEXT: s_waitcnt lgkmcnt(0)
194 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
195 ; CI-NEXT: v_mov_b32_e32 v1, 0
196 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
197 ; CI-NEXT: s_mov_b32 s9, 0xffff
198 ; CI-NEXT: s_lshr_b32 s10, s8, 16
199 ; CI-NEXT: s_and_b32 s8, s8, s9
200 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
201 ; CI-NEXT: s_waitcnt vmcnt(0)
202 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
203 ; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
204 ; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3
205 ; CI-NEXT: v_and_b32_e32 v2, s9, v2
206 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
207 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
208 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
209 ; CI-NEXT: s_endpgm
; IR under test: loaded %vgpr is the value, argument %sgpr is the amount.
210 %tid = call i32 @llvm.amdgcn.workitem.id.x()
211 %tid.ext = sext i32 %tid to i64
212 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
213 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
214 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
215 %result = shl <2 x i16> %vgpr, %sgpr
216 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
217 ret void
218 }
219
; shl_s_v_v2i16: operand roles are swapped relative to shl_v_s_v2i16. The
; shifted value is the by-value kernel argument %sgpr and the shift amount is
; loaded from %in[tid]; the result goes to %out[tid].
; Expected code per check prefix below:
;   GFX9 - v_pk_lshlrev_b16 with the loaded vector register first and the
;     scalar register second.
;   VI - v_lshlrev_b16_e64 for the low half plus v_lshlrev_b16_sdwa (WORD_1 of
;     the loaded amount) for the high half, combined with v_or_b32.
;   CI - 32-bit v_lshl_b32 shifts with 0xffff masking of the loaded amount.
220 define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
221 ; GFX9-LABEL: shl_s_v_v2i16:
222 ; GFX9: ; %bb.0:
223 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
224 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
225 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
227 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
228 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
229 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
230 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
231 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
232 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
233 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
234 ; GFX9-NEXT: s_waitcnt vmcnt(0)
235 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0
236 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
237 ; GFX9-NEXT: s_endpgm
238 ;
239 ; VI-LABEL: shl_s_v_v2i16:
240 ; VI: ; %bb.0:
241 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
242 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
243 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
244 ; VI-NEXT: s_waitcnt lgkmcnt(0)
245 ; VI-NEXT: v_mov_b32_e32 v1, s7
246 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
247 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
248 ; VI-NEXT: flat_load_dword v3, v[0:1]
249 ; VI-NEXT: s_lshr_b32 s1, s0, 16
250 ; VI-NEXT: v_mov_b32_e32 v4, s1
251 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
252 ; VI-NEXT: v_mov_b32_e32 v1, s5
253 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
254 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
255 ; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0
256 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
257 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
258 ; VI-NEXT: flat_store_dword v[0:1], v2
259 ; VI-NEXT: s_endpgm
260 ;
261 ; CI-LABEL: shl_s_v_v2i16:
262 ; CI: ; %bb.0:
263 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
264 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd
265 ; CI-NEXT: s_mov_b32 s3, 0xf000
266 ; CI-NEXT: s_mov_b32 s2, 0
267 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
268 ; CI-NEXT: s_waitcnt lgkmcnt(0)
269 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
270 ; CI-NEXT: v_mov_b32_e32 v1, 0
271 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
272 ; CI-NEXT: s_mov_b32 s0, 0xffff
273 ; CI-NEXT: s_lshr_b32 s1, s8, 16
274 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
275 ; CI-NEXT: s_waitcnt vmcnt(0)
276 ; CI-NEXT: v_and_b32_e32 v3, s0, v2
277 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
278 ; CI-NEXT: v_lshl_b32_e32 v2, s1, v2
279 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3
280 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
281 ; CI-NEXT: v_and_b32_e32 v3, s0, v3
282 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
283 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
284 ; CI-NEXT: s_endpgm
; IR under test: argument %sgpr is the value, loaded %vgpr is the amount.
285 %tid = call i32 @llvm.amdgcn.workitem.id.x()
286 %tid.ext = sext i32 %tid to i64
287 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
288 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
289 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
290 %result = shl <2 x i16> %sgpr, %vgpr
291 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
292 ret void
293 }
294
; shl_imm_v_v2i16: the shifted value is the constant vector <8, 8> and the
; shift amount is loaded from %in[tid]; the result goes to %out[tid].
; Expected code per check prefix below:
;   GFX9 - v_pk_lshlrev_b16 with the literal 8 as the second source operand
;     and op_sel_hi:[1,0].
;   VI - the constant 8 is materialized in a vector register (v_mov_b32) for
;     the v_lshlrev_b16_sdwa high-half shift; the low half uses
;     v_lshlrev_b16_e64 with the literal.
;   CI - 32-bit v_lshl_b32 shifts of the literal 8 by each separated half.
295 define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
296 ; GFX9-LABEL: shl_imm_v_v2i16:
297 ; GFX9: ; %bb.0:
298 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
299 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
301 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
302 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
303 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
304 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
305 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
306 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
307 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
308 ; GFX9-NEXT: s_waitcnt vmcnt(0)
309 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0]
310 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
311 ; GFX9-NEXT: s_endpgm
312 ;
313 ; VI-LABEL: shl_imm_v_v2i16:
314 ; VI: ; %bb.0:
315 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
316 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
317 ; VI-NEXT: v_mov_b32_e32 v3, 8
318 ; VI-NEXT: s_waitcnt lgkmcnt(0)
319 ; VI-NEXT: v_mov_b32_e32 v1, s3
320 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
321 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
322 ; VI-NEXT: flat_load_dword v4, v[0:1]
323 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
324 ; VI-NEXT: v_mov_b32_e32 v1, s1
325 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
326 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
327 ; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8
328 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
329 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
330 ; VI-NEXT: flat_store_dword v[0:1], v2
331 ; VI-NEXT: s_endpgm
332 ;
333 ; CI-LABEL: shl_imm_v_v2i16:
334 ; CI: ; %bb.0:
335 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
336 ; CI-NEXT: s_mov_b32 s7, 0xf000
337 ; CI-NEXT: s_mov_b32 s6, 0
338 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
339 ; CI-NEXT: v_mov_b32_e32 v1, 0
340 ; CI-NEXT: s_waitcnt lgkmcnt(0)
341 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
342 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
343 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
344 ; CI-NEXT: s_waitcnt vmcnt(0)
345 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
346 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
347 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2
348 ; CI-NEXT: v_lshl_b32_e32 v3, 8, v3
349 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
350 ; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3
351 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
352 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
353 ; CI-NEXT: s_endpgm
; IR under test: constant <8, 8> is the value, loaded %vgpr is the amount.
354 %tid = call i32 @llvm.amdgcn.workitem.id.x()
355 %tid.ext = sext i32 %tid to i64
356 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
357 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
358 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
359 %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
360 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
361 ret void
362 }
363
; shl_v_imm_v2i16: the shifted value is loaded from %in[tid] and the shift
; amount is the constant vector <8, 8>; the result goes to %out[tid].
; Expected code per check prefix below:
;   GFX9 - v_pk_lshlrev_b16 with the literal 8 as the first source operand and
;     op_sel_hi:[0,1].
;   VI - a 32-bit shift masked with 0xff000000 supplies the high half and
;     v_lshlrev_b16_e32 supplies the low half; the two are ORed together.
;   CI - a single 32-bit shift by 8 followed by a 0xff00ff00 mask.
364 define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
365 ; GFX9-LABEL: shl_v_imm_v2i16:
366 ; GFX9: ; %bb.0:
367 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
368 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
370 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
371 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
372 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
373 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
374 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
375 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
376 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
377 ; GFX9-NEXT: s_waitcnt vmcnt(0)
378 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
379 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
380 ; GFX9-NEXT: s_endpgm
381 ;
382 ; VI-LABEL: shl_v_imm_v2i16:
383 ; VI: ; %bb.0:
384 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
385 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
386 ; VI-NEXT: s_waitcnt lgkmcnt(0)
387 ; VI-NEXT: v_mov_b32_e32 v1, s3
388 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
389 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
390 ; VI-NEXT: flat_load_dword v3, v[0:1]
391 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
392 ; VI-NEXT: v_mov_b32_e32 v1, s1
393 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
394 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
395 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
396 ; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
397 ; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
398 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
399 ; VI-NEXT: flat_store_dword v[0:1], v2
400 ; VI-NEXT: s_endpgm
401 ;
402 ; CI-LABEL: shl_v_imm_v2i16:
403 ; CI: ; %bb.0:
404 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
405 ; CI-NEXT: s_mov_b32 s7, 0xf000
406 ; CI-NEXT: s_mov_b32 s6, 0
407 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
408 ; CI-NEXT: v_mov_b32_e32 v1, 0
409 ; CI-NEXT: s_waitcnt lgkmcnt(0)
410 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
411 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
412 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
413 ; CI-NEXT: s_waitcnt vmcnt(0)
414 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
415 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
416 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
417 ; CI-NEXT: s_endpgm
; IR under test: loaded %vgpr is the value, constant <8, 8> is the amount.
418 %tid = call i32 @llvm.amdgcn.workitem.id.x()
419 %tid.ext = sext i32 %tid to i64
420 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
421 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
422 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
423 %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
424 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
425 ret void
426 }
427
; v_shl_v4i16: <4 x i16> counterpart of v_shl_v2i16. Each work item loads the
; vector at %in[tid] and the vector immediately after it, shifts the first by
; the second, and stores the result to %out[tid].
; Expected code per check prefix below:
;   GFX9 - two v_pk_lshlrev_b16, one per dword of the 64-bit vector.
;   VI - for each dword, v_lshlrev_b16_e32 plus v_lshlrev_b16_sdwa on WORD_1,
;     combined with v_or_b32.
;   CI - four 32-bit v_lshl_b32 shifts on the separated halves with 0xffff
;     masking, recombined with v_or_b32.
428 define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
429 ; GFX9-LABEL: v_shl_v4i16:
430 ; GFX9: ; %bb.0:
431 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
432 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
433 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
434 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
435 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
436 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
437 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
438 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
439 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
440 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
441 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
442 ; GFX9-NEXT: s_waitcnt vmcnt(0)
443 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3
444 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2
445 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
446 ; GFX9-NEXT: s_endpgm
447 ;
448 ; VI-LABEL: v_shl_v4i16:
449 ; VI: ; %bb.0:
450 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
451 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
452 ; VI-NEXT: s_waitcnt lgkmcnt(0)
453 ; VI-NEXT: v_mov_b32_e32 v1, s3
454 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
455 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
456 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
457 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
458 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
459 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
460 ; VI-NEXT: v_mov_b32_e32 v5, s1
461 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
462 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
463 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
464 ; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
465 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
466 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
467 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
468 ; VI-NEXT: v_or_b32_e32 v1, v6, v1
469 ; VI-NEXT: v_or_b32_e32 v0, v3, v0
470 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
471 ; VI-NEXT: s_endpgm
472 ;
473 ; CI-LABEL: v_shl_v4i16:
474 ; CI: ; %bb.0:
475 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
476 ; CI-NEXT: s_mov_b32 s7, 0xf000
477 ; CI-NEXT: s_mov_b32 s6, 0
478 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
479 ; CI-NEXT: v_mov_b32_e32 v1, 0
480 ; CI-NEXT: s_waitcnt lgkmcnt(0)
481 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
482 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
483 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
484 ; CI-NEXT: s_mov_b32 s8, 0xffff
485 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
486 ; CI-NEXT: s_waitcnt vmcnt(1)
487 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
488 ; CI-NEXT: s_waitcnt vmcnt(0)
489 ; CI-NEXT: v_and_b32_e32 v8, s8, v4
490 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
491 ; CI-NEXT: v_and_b32_e32 v9, s8, v5
492 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
493 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
494 ; CI-NEXT: v_lshl_b32_e32 v5, v7, v5
495 ; CI-NEXT: v_lshl_b32_e32 v3, v3, v9
496 ; CI-NEXT: v_lshl_b32_e32 v4, v6, v4
497 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v8
498 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
499 ; CI-NEXT: v_and_b32_e32 v3, s8, v3
500 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
501 ; CI-NEXT: v_and_b32_e32 v2, s8, v2
502 ; CI-NEXT: v_or_b32_e32 v3, v3, v5
503 ; CI-NEXT: v_or_b32_e32 v2, v2, v4
504 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
505 ; CI-NEXT: s_endpgm
; IR under test: %a is the vector at %in[tid]; %b is the next vector, one
; element past %in.gep (that gep, unlike the others, is not inbounds).
506 %tid = call i32 @llvm.amdgcn.workitem.id.x()
507 %tid.ext = sext i32 %tid to i64
508 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
509 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
510 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
511 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
512 %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
513 %result = shl <4 x i16> %a, %b
514 store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
515 ret void
516 }
517
; shl_v_imm_v4i16: <4 x i16> counterpart of shl_v_imm_v2i16. The shifted value
; is loaded from %in[tid] and every lane is shifted left by the constant 8;
; the result goes to %out[tid].
; Expected code per check prefix below:
;   GFX9 - two v_pk_lshlrev_b16 with the literal 8 and op_sel_hi:[0,1].
;   VI - for each dword, a 32-bit shift masked with 0xff000000 (held in a
;     scalar register) for the high half, ORed with a v_lshlrev_b16_e32 result
;     for the low half.
;   CI - the two dwords are lowered differently: one uses a single 32-bit
;     shift and a 0xff00ff00 mask, the other uses shifts and 0xff00 masks
;     recombined with v_or_b32.
518 define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
519 ; GFX9-LABEL: shl_v_imm_v4i16:
520 ; GFX9: ; %bb.0:
521 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
522 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
523 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
524 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
525 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
526 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
527 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
528 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
529 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
530 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
531 ; GFX9-NEXT: s_waitcnt vmcnt(0)
532 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
533 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
534 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
535 ; GFX9-NEXT: s_endpgm
536 ;
537 ; VI-LABEL: shl_v_imm_v4i16:
538 ; VI: ; %bb.0:
539 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
540 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
541 ; VI-NEXT: s_mov_b32 s4, 0xff000000
542 ; VI-NEXT: s_waitcnt lgkmcnt(0)
543 ; VI-NEXT: v_mov_b32_e32 v1, s3
544 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
545 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
546 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
547 ; VI-NEXT: v_mov_b32_e32 v3, s1
548 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
549 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
550 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
551 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
552 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
553 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
554 ; VI-NEXT: v_and_b32_e32 v0, s4, v0
555 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
556 ; VI-NEXT: v_and_b32_e32 v4, s4, v4
557 ; VI-NEXT: v_or_b32_e32 v1, v1, v4
558 ; VI-NEXT: v_or_b32_e32 v0, v5, v0
559 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
560 ; VI-NEXT: s_endpgm
561 ;
562 ; CI-LABEL: shl_v_imm_v4i16:
563 ; CI: ; %bb.0:
564 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
565 ; CI-NEXT: s_mov_b32 s7, 0xf000
566 ; CI-NEXT: s_mov_b32 s6, 0
567 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
568 ; CI-NEXT: v_mov_b32_e32 v1, 0
569 ; CI-NEXT: s_waitcnt lgkmcnt(0)
570 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
571 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
572 ; CI-NEXT: s_mov_b32 s8, 0xff00
573 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
574 ; CI-NEXT: s_waitcnt vmcnt(0)
575 ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3
576 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
577 ; CI-NEXT: v_and_b32_e32 v4, s8, v4
578 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
579 ; CI-NEXT: v_and_b32_e32 v3, s8, v3
580 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
581 ; CI-NEXT: v_or_b32_e32 v3, v3, v4
582 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
583 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
584 ; CI-NEXT: s_endpgm
; IR under test: loaded %vgpr is the value, constant 8 per lane is the amount.
585 %tid = call i32 @llvm.amdgcn.workitem.id.x()
586 %tid.ext = sext i32 %tid to i64
587 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
588 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
589 %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
590 %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
591 store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
592 ret void
593 }
594
; Declaration of the intrinsic that the memory-operand kernels in this file
; call to obtain the i32 %tid they use to index %in and %out.
595 declare i32 @llvm.amdgcn.workitem.id.x() #1
596
; Attribute group #0 is attached to every kernel definition in this file;
; group #1 is attached to the intrinsic declaration.
597 attributes #0 = { nounwind }
598 attributes #1 = { nounwind readnone }