comparison llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @ 173:0572611fdcc8 llvm10 llvm12

reorganization done
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 25 May 2020 11:55:54 +0900
parents 1d019706d866
children 2e18cbf3894f
comparison
equal deleted inserted replaced
172:9fbae9c8bf63 173:0572611fdcc8
71 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 71 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 73 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
74 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 74 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
75 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 75 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
76 ; GFX9-NEXT: global_load_dword v3, v[0:1], off 76 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
77 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 77 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
78 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 78 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
79 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 79 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
80 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 80 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
81 ; GFX9-NEXT: s_waitcnt vmcnt(0) 81 ; GFX9-NEXT: s_waitcnt vmcnt(0)
82 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3 82 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4
83 ; GFX9-NEXT: global_store_dword v[0:1], v2, off 83 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
84 ; GFX9-NEXT: s_endpgm 84 ; GFX9-NEXT: s_endpgm
85 ; 85 ;
86 ; VI-LABEL: v_shl_v2i16: 86 ; VI-LABEL: v_shl_v2i16:
87 ; VI: ; %bb.0: 87 ; VI: ; %bb.0:
88 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 88 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
89 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 89 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
90 ; VI-NEXT: s_waitcnt lgkmcnt(0) 90 ; VI-NEXT: s_waitcnt lgkmcnt(0)
91 ; VI-NEXT: v_mov_b32_e32 v1, s3 91 ; VI-NEXT: v_mov_b32_e32 v1, s3
92 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 92 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
93 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 93 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
94 ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 94 ; VI-NEXT: v_mov_b32_e32 v3, s1
95 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 95 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
96 ; VI-NEXT: flat_load_dword v5, v[0:1] 96 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
97 ; VI-NEXT: flat_load_dword v2, v[2:3] 97 ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
98 ; VI-NEXT: v_mov_b32_e32 v1, s1 98 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
99 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 99 ; VI-NEXT: flat_load_dword v0, v[0:1]
100 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 100 ; VI-NEXT: flat_load_dword v1, v[4:5]
101 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 101 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
102 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 102 ; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
103 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 103 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
104 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 104 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
105 ; VI-NEXT: flat_store_dword v[0:1], v2 105 ; VI-NEXT: flat_store_dword v[2:3], v0
106 ; VI-NEXT: s_endpgm 106 ; VI-NEXT: s_endpgm
107 ; 107 ;
108 ; CI-LABEL: v_shl_v2i16: 108 ; CI-LABEL: v_shl_v2i16:
109 ; CI: ; %bb.0: 109 ; CI: ; %bb.0:
110 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 110 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
111 ; CI-NEXT: s_mov_b32 s7, 0xf000 111 ; CI-NEXT: s_mov_b32 s3, 0xf000
112 ; CI-NEXT: s_mov_b32 s6, 0 112 ; CI-NEXT: s_mov_b32 s2, 0
113 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 113 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
114 ; CI-NEXT: v_mov_b32_e32 v1, 0 114 ; CI-NEXT: v_mov_b32_e32 v1, 0
115 ; CI-NEXT: s_waitcnt lgkmcnt(0) 115 ; CI-NEXT: s_waitcnt lgkmcnt(0)
116 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] 116 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
117 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 117 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
118 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 118 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
119 ; CI-NEXT: s_mov_b32 s8, 0xffff 119 ; CI-NEXT: s_mov_b32 s8, 0xffff
120 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 120 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
121 ; CI-NEXT: s_waitcnt vmcnt(1) 121 ; CI-NEXT: s_waitcnt vmcnt(1)
122 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 122 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
123 ; CI-NEXT: s_waitcnt vmcnt(0) 123 ; CI-NEXT: s_waitcnt vmcnt(0)
124 ; CI-NEXT: v_and_b32_e32 v5, s8, v3 124 ; CI-NEXT: v_and_b32_e32 v5, s8, v3
125 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 125 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
126 ; CI-NEXT: v_lshl_b32_e32 v3, v4, v3 126 ; CI-NEXT: v_lshl_b32_e32 v3, v4, v3
127 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v5 127 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v5
128 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 128 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
129 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 129 ; CI-NEXT: v_and_b32_e32 v2, s8, v2
130 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 130 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
131 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 131 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
132 ; CI-NEXT: s_endpgm 132 ; CI-NEXT: s_endpgm
133 %tid = call i32 @llvm.amdgcn.workitem.id.x() 133 %tid = call i32 @llvm.amdgcn.workitem.id.x()
134 %tid.ext = sext i32 %tid to i64 134 %tid.ext = sext i32 %tid to i64
135 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 135 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
136 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 136 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
150 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 150 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
151 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 151 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 152 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
153 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 153 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
154 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 154 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
155 ; GFX9-NEXT: global_load_dword v3, v[0:1], off 155 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
156 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 156 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
157 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 157 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
158 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 158 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
159 ; GFX9-NEXT: s_waitcnt vmcnt(0) 159 ; GFX9-NEXT: s_waitcnt vmcnt(0)
160 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3 160 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0
161 ; GFX9-NEXT: global_store_dword v[0:1], v2, off 161 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
162 ; GFX9-NEXT: s_endpgm 162 ; GFX9-NEXT: s_endpgm
163 ; 163 ;
164 ; VI-LABEL: shl_v_s_v2i16: 164 ; VI-LABEL: shl_v_s_v2i16:
165 ; VI: ; %bb.0: 165 ; VI: ; %bb.0:
166 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 166 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
168 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 168 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
169 ; VI-NEXT: s_waitcnt lgkmcnt(0) 169 ; VI-NEXT: s_waitcnt lgkmcnt(0)
170 ; VI-NEXT: v_mov_b32_e32 v1, s7 170 ; VI-NEXT: v_mov_b32_e32 v1, s7
171 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 171 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
172 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 172 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
173 ; VI-NEXT: flat_load_dword v3, v[0:1] 173 ; VI-NEXT: flat_load_dword v0, v[0:1]
174 ; VI-NEXT: s_lshr_b32 s1, s0, 16 174 ; VI-NEXT: s_lshr_b32 s1, s0, 16
175 ; VI-NEXT: v_mov_b32_e32 v4, s1 175 ; VI-NEXT: v_mov_b32_e32 v4, s1
176 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 176 ; VI-NEXT: v_mov_b32_e32 v3, s5
177 ; VI-NEXT: v_mov_b32_e32 v1, s5 177 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
178 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 178 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
179 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 179 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
180 ; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3 180 ; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0
181 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 181 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
182 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 182 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
183 ; VI-NEXT: flat_store_dword v[0:1], v2 183 ; VI-NEXT: flat_store_dword v[2:3], v0
184 ; VI-NEXT: s_endpgm 184 ; VI-NEXT: s_endpgm
185 ; 185 ;
186 ; CI-LABEL: shl_v_s_v2i16: 186 ; CI-LABEL: shl_v_s_v2i16:
187 ; CI: ; %bb.0: 187 ; CI: ; %bb.0:
188 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 188 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
189 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd 189 ; CI-NEXT: s_load_dword s0, s[0:1], 0xd
190 ; CI-NEXT: s_mov_b32 s8, 0xffff
190 ; CI-NEXT: s_mov_b32 s3, 0xf000 191 ; CI-NEXT: s_mov_b32 s3, 0xf000
191 ; CI-NEXT: s_mov_b32 s2, 0 192 ; CI-NEXT: s_mov_b32 s2, 0
192 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 193 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
193 ; CI-NEXT: s_waitcnt lgkmcnt(0) 194 ; CI-NEXT: s_waitcnt lgkmcnt(0)
195 ; CI-NEXT: s_lshr_b32 s9, s0, 16
196 ; CI-NEXT: s_and_b32 s10, s0, s8
194 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] 197 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
195 ; CI-NEXT: v_mov_b32_e32 v1, 0 198 ; CI-NEXT: v_mov_b32_e32 v1, 0
196 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 199 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
197 ; CI-NEXT: s_mov_b32 s9, 0xffff
198 ; CI-NEXT: s_lshr_b32 s10, s8, 16
199 ; CI-NEXT: s_and_b32 s8, s8, s9
200 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 200 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
201 ; CI-NEXT: s_waitcnt vmcnt(0) 201 ; CI-NEXT: s_waitcnt vmcnt(0)
202 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 202 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
203 ; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 203 ; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2
204 ; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3 204 ; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3
205 ; CI-NEXT: v_and_b32_e32 v2, s9, v2 205 ; CI-NEXT: v_and_b32_e32 v2, s8, v2
206 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 206 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
207 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 207 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
208 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 208 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
209 ; CI-NEXT: s_endpgm 209 ; CI-NEXT: s_endpgm
210 %tid = call i32 @llvm.amdgcn.workitem.id.x() 210 %tid = call i32 @llvm.amdgcn.workitem.id.x()
225 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 225 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
227 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 227 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
228 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 228 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
229 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 229 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
230 ; GFX9-NEXT: global_load_dword v3, v[0:1], off 230 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
231 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 231 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
232 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 232 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
233 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 233 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
234 ; GFX9-NEXT: s_waitcnt vmcnt(0) 234 ; GFX9-NEXT: s_waitcnt vmcnt(0)
235 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0 235 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0
236 ; GFX9-NEXT: global_store_dword v[0:1], v2, off 236 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
237 ; GFX9-NEXT: s_endpgm 237 ; GFX9-NEXT: s_endpgm
238 ; 238 ;
239 ; VI-LABEL: shl_s_v_v2i16: 239 ; VI-LABEL: shl_s_v_v2i16:
240 ; VI: ; %bb.0: 240 ; VI: ; %bb.0:
241 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 241 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
243 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 243 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
244 ; VI-NEXT: s_waitcnt lgkmcnt(0) 244 ; VI-NEXT: s_waitcnt lgkmcnt(0)
245 ; VI-NEXT: v_mov_b32_e32 v1, s7 245 ; VI-NEXT: v_mov_b32_e32 v1, s7
246 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 246 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
247 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 247 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
248 ; VI-NEXT: flat_load_dword v3, v[0:1] 248 ; VI-NEXT: flat_load_dword v0, v[0:1]
249 ; VI-NEXT: s_lshr_b32 s1, s0, 16 249 ; VI-NEXT: s_lshr_b32 s1, s0, 16
250 ; VI-NEXT: v_mov_b32_e32 v4, s1 250 ; VI-NEXT: v_mov_b32_e32 v4, s1
251 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 251 ; VI-NEXT: v_mov_b32_e32 v3, s5
252 ; VI-NEXT: v_mov_b32_e32 v1, s5 252 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
253 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 253 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
254 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 254 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
255 ; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0 255 ; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0
256 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 256 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
257 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 257 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
258 ; VI-NEXT: flat_store_dword v[0:1], v2 258 ; VI-NEXT: flat_store_dword v[2:3], v0
259 ; VI-NEXT: s_endpgm 259 ; VI-NEXT: s_endpgm
260 ; 260 ;
261 ; CI-LABEL: shl_s_v_v2i16: 261 ; CI-LABEL: shl_s_v_v2i16:
262 ; CI: ; %bb.0: 262 ; CI: ; %bb.0:
263 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 263 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
268 ; CI-NEXT: s_waitcnt lgkmcnt(0) 268 ; CI-NEXT: s_waitcnt lgkmcnt(0)
269 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] 269 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
270 ; CI-NEXT: v_mov_b32_e32 v1, 0 270 ; CI-NEXT: v_mov_b32_e32 v1, 0
271 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 271 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
272 ; CI-NEXT: s_mov_b32 s0, 0xffff 272 ; CI-NEXT: s_mov_b32 s0, 0xffff
273 ; CI-NEXT: s_lshr_b32 s1, s8, 16 273 ; CI-NEXT: s_lshr_b32 s9, s8, 16
274 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 274 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
275 ; CI-NEXT: s_waitcnt vmcnt(0) 275 ; CI-NEXT: s_waitcnt vmcnt(0)
276 ; CI-NEXT: v_and_b32_e32 v3, s0, v2 276 ; CI-NEXT: v_and_b32_e32 v3, s0, v2
277 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 277 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
278 ; CI-NEXT: v_lshl_b32_e32 v2, s1, v2 278 ; CI-NEXT: v_lshl_b32_e32 v2, s9, v2
279 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 279 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3
280 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 280 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
281 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 281 ; CI-NEXT: v_and_b32_e32 v3, s0, v3
282 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 282 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
283 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 283 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
299 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 299 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
301 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 301 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
302 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 302 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
303 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 303 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
304 ; GFX9-NEXT: global_load_dword v3, v[0:1], off 304 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
305 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 305 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
306 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 306 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
307 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 307 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
308 ; GFX9-NEXT: s_waitcnt vmcnt(0) 308 ; GFX9-NEXT: s_waitcnt vmcnt(0)
309 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0] 309 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0]
310 ; GFX9-NEXT: global_store_dword v[0:1], v2, off 310 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
311 ; GFX9-NEXT: s_endpgm 311 ; GFX9-NEXT: s_endpgm
312 ; 312 ;
313 ; VI-LABEL: shl_imm_v_v2i16: 313 ; VI-LABEL: shl_imm_v_v2i16:
314 ; VI: ; %bb.0: 314 ; VI: ; %bb.0:
315 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 315 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
316 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 316 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
317 ; VI-NEXT: v_mov_b32_e32 v3, 8 317 ; VI-NEXT: v_mov_b32_e32 v4, 8
318 ; VI-NEXT: s_waitcnt lgkmcnt(0) 318 ; VI-NEXT: s_waitcnt lgkmcnt(0)
319 ; VI-NEXT: v_mov_b32_e32 v1, s3 319 ; VI-NEXT: v_mov_b32_e32 v1, s3
320 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 320 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
321 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 321 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
322 ; VI-NEXT: flat_load_dword v4, v[0:1] 322 ; VI-NEXT: flat_load_dword v0, v[0:1]
323 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 323 ; VI-NEXT: v_mov_b32_e32 v3, s1
324 ; VI-NEXT: v_mov_b32_e32 v1, s1 324 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
325 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 325 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
326 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 326 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
327 ; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8 327 ; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8
328 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 328 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
329 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 329 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
330 ; VI-NEXT: flat_store_dword v[0:1], v2 330 ; VI-NEXT: flat_store_dword v[2:3], v0
331 ; VI-NEXT: s_endpgm 331 ; VI-NEXT: s_endpgm
332 ; 332 ;
333 ; CI-LABEL: shl_imm_v_v2i16: 333 ; CI-LABEL: shl_imm_v_v2i16:
334 ; CI: ; %bb.0: 334 ; CI: ; %bb.0:
335 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 335 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
336 ; CI-NEXT: s_mov_b32 s7, 0xf000 336 ; CI-NEXT: s_mov_b32 s3, 0xf000
337 ; CI-NEXT: s_mov_b32 s6, 0 337 ; CI-NEXT: s_mov_b32 s2, 0
338 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 338 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
339 ; CI-NEXT: v_mov_b32_e32 v1, 0 339 ; CI-NEXT: v_mov_b32_e32 v1, 0
340 ; CI-NEXT: s_waitcnt lgkmcnt(0) 340 ; CI-NEXT: s_waitcnt lgkmcnt(0)
341 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] 341 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
342 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 342 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
343 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 343 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
344 ; CI-NEXT: s_waitcnt vmcnt(0) 344 ; CI-NEXT: s_waitcnt vmcnt(0)
345 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2 345 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
346 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 346 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
347 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2 347 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2
348 ; CI-NEXT: v_lshl_b32_e32 v3, 8, v3 348 ; CI-NEXT: v_lshl_b32_e32 v3, 8, v3
349 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 349 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
350 ; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3 350 ; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3
351 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 351 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
352 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 352 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
353 ; CI-NEXT: s_endpgm 353 ; CI-NEXT: s_endpgm
354 %tid = call i32 @llvm.amdgcn.workitem.id.x() 354 %tid = call i32 @llvm.amdgcn.workitem.id.x()
355 %tid.ext = sext i32 %tid to i64 355 %tid.ext = sext i32 %tid to i64
356 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 356 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
357 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 357 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
368 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 368 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
370 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 370 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
371 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 371 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
372 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 372 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
373 ; GFX9-NEXT: global_load_dword v3, v[0:1], off 373 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
374 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 374 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
375 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 375 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
376 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 376 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
377 ; GFX9-NEXT: s_waitcnt vmcnt(0) 377 ; GFX9-NEXT: s_waitcnt vmcnt(0)
378 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] 378 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
379 ; GFX9-NEXT: global_store_dword v[0:1], v2, off 379 ; GFX9-NEXT: global_store_dword v[2:3], v0, off
380 ; GFX9-NEXT: s_endpgm 380 ; GFX9-NEXT: s_endpgm
381 ; 381 ;
382 ; VI-LABEL: shl_v_imm_v2i16: 382 ; VI-LABEL: shl_v_imm_v2i16:
383 ; VI: ; %bb.0: 383 ; VI: ; %bb.0:
384 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 384 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
385 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 385 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
386 ; VI-NEXT: s_waitcnt lgkmcnt(0) 386 ; VI-NEXT: s_waitcnt lgkmcnt(0)
387 ; VI-NEXT: v_mov_b32_e32 v1, s3 387 ; VI-NEXT: v_mov_b32_e32 v1, s3
388 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 388 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
389 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 389 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
390 ; VI-NEXT: flat_load_dword v3, v[0:1] 390 ; VI-NEXT: flat_load_dword v0, v[0:1]
391 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 391 ; VI-NEXT: v_mov_b32_e32 v3, s1
392 ; VI-NEXT: v_mov_b32_e32 v1, s1 392 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
393 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 393 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
394 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 394 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
395 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 395 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
396 ; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2 396 ; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1
397 ; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 397 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
398 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 398 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
399 ; VI-NEXT: flat_store_dword v[0:1], v2 399 ; VI-NEXT: flat_store_dword v[2:3], v0
400 ; VI-NEXT: s_endpgm 400 ; VI-NEXT: s_endpgm
401 ; 401 ;
402 ; CI-LABEL: shl_v_imm_v2i16: 402 ; CI-LABEL: shl_v_imm_v2i16:
403 ; CI: ; %bb.0: 403 ; CI: ; %bb.0:
404 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 404 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
405 ; CI-NEXT: s_mov_b32 s7, 0xf000 405 ; CI-NEXT: s_mov_b32 s3, 0xf000
406 ; CI-NEXT: s_mov_b32 s6, 0 406 ; CI-NEXT: s_mov_b32 s2, 0
407 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 407 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
408 ; CI-NEXT: v_mov_b32_e32 v1, 0 408 ; CI-NEXT: v_mov_b32_e32 v1, 0
409 ; CI-NEXT: s_waitcnt lgkmcnt(0) 409 ; CI-NEXT: s_waitcnt lgkmcnt(0)
410 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] 410 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
411 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 411 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
412 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 412 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
413 ; CI-NEXT: s_waitcnt vmcnt(0) 413 ; CI-NEXT: s_waitcnt vmcnt(0)
414 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 414 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
415 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 415 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
416 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 416 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
417 ; CI-NEXT: s_endpgm 417 ; CI-NEXT: s_endpgm
418 %tid = call i32 @llvm.amdgcn.workitem.id.x() 418 %tid = call i32 @llvm.amdgcn.workitem.id.x()
419 %tid.ext = sext i32 %tid to i64 419 %tid.ext = sext i32 %tid to i64
420 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 420 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
421 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 421 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
427 427
428 define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { 428 define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
429 ; GFX9-LABEL: v_shl_v4i16: 429 ; GFX9-LABEL: v_shl_v4i16:
430 ; GFX9: ; %bb.0: 430 ; GFX9: ; %bb.0:
431 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 431 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
432 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 432 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
433 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 433 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
434 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 434 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
435 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 435 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
436 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 436 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
437 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off 437 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
438 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 438 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
439 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 439 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
440 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 440 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
441 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 441 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
442 ; GFX9-NEXT: s_waitcnt vmcnt(0) 442 ; GFX9-NEXT: s_waitcnt vmcnt(0)
443 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 443 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5
444 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 444 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4
445 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 445 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
446 ; GFX9-NEXT: s_endpgm 446 ; GFX9-NEXT: s_endpgm
447 ; 447 ;
448 ; VI-LABEL: v_shl_v4i16: 448 ; VI-LABEL: v_shl_v4i16:
449 ; VI: ; %bb.0: 449 ; VI: ; %bb.0:
450 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 450 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
451 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 451 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
452 ; VI-NEXT: s_waitcnt lgkmcnt(0) 452 ; VI-NEXT: s_waitcnt lgkmcnt(0)
453 ; VI-NEXT: v_mov_b32_e32 v1, s3 453 ; VI-NEXT: v_mov_b32_e32 v1, s3
454 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 454 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
455 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 455 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
456 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 456 ; VI-NEXT: v_mov_b32_e32 v3, s1
457 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 457 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
458 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
459 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
460 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
458 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 461 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
459 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 462 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
460 ; VI-NEXT: v_mov_b32_e32 v5, s1 463 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
461 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 464 ; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1
462 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 465 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
463 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 466 ; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0
464 ; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 467 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
465 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
466 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
467 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
468 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 468 ; VI-NEXT: v_or_b32_e32 v1, v6, v1
469 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 469 ; VI-NEXT: v_or_b32_e32 v0, v5, v0
470 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 470 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
471 ; VI-NEXT: s_endpgm 471 ; VI-NEXT: s_endpgm
472 ; 472 ;
473 ; CI-LABEL: v_shl_v4i16: 473 ; CI-LABEL: v_shl_v4i16:
474 ; CI: ; %bb.0: 474 ; CI: ; %bb.0:
475 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 475 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
476 ; CI-NEXT: s_mov_b32 s7, 0xf000 476 ; CI-NEXT: s_mov_b32 s3, 0xf000
477 ; CI-NEXT: s_mov_b32 s6, 0 477 ; CI-NEXT: s_mov_b32 s2, 0
478 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 478 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
479 ; CI-NEXT: v_mov_b32_e32 v1, 0 479 ; CI-NEXT: v_mov_b32_e32 v1, 0
480 ; CI-NEXT: s_waitcnt lgkmcnt(0) 480 ; CI-NEXT: s_waitcnt lgkmcnt(0)
481 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] 481 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
482 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 482 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
483 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 483 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
484 ; CI-NEXT: s_mov_b32 s8, 0xffff 484 ; CI-NEXT: s_mov_b32 s8, 0xffff
485 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 485 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
486 ; CI-NEXT: s_waitcnt vmcnt(1) 486 ; CI-NEXT: s_waitcnt vmcnt(1)
487 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 487 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
488 ; CI-NEXT: s_waitcnt vmcnt(0) 488 ; CI-NEXT: s_waitcnt vmcnt(0)
489 ; CI-NEXT: v_and_b32_e32 v8, s8, v4 489 ; CI-NEXT: v_and_b32_e32 v8, s8, v4
490 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 490 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
499 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 499 ; CI-NEXT: v_and_b32_e32 v3, s8, v3
500 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 500 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
501 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 501 ; CI-NEXT: v_and_b32_e32 v2, s8, v2
502 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 502 ; CI-NEXT: v_or_b32_e32 v3, v3, v5
503 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 503 ; CI-NEXT: v_or_b32_e32 v2, v2, v4
504 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 504 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
505 ; CI-NEXT: s_endpgm 505 ; CI-NEXT: s_endpgm
506 %tid = call i32 @llvm.amdgcn.workitem.id.x() 506 %tid = call i32 @llvm.amdgcn.workitem.id.x()
507 %tid.ext = sext i32 %tid to i64 507 %tid.ext = sext i32 %tid to i64
508 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext 508 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
509 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext 509 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
536 ; 536 ;
537 ; VI-LABEL: shl_v_imm_v4i16: 537 ; VI-LABEL: shl_v_imm_v4i16:
538 ; VI: ; %bb.0: 538 ; VI: ; %bb.0:
539 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 539 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
540 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 540 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
541 ; VI-NEXT: s_mov_b32 s4, 0xff000000
542 ; VI-NEXT: s_waitcnt lgkmcnt(0) 541 ; VI-NEXT: s_waitcnt lgkmcnt(0)
543 ; VI-NEXT: v_mov_b32_e32 v1, s3 542 ; VI-NEXT: v_mov_b32_e32 v1, s3
544 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 543 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
545 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 544 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
546 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 545 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
546 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
547 ; VI-NEXT: s_mov_b32 s0, 0xff000000
547 ; VI-NEXT: v_mov_b32_e32 v3, s1 548 ; VI-NEXT: v_mov_b32_e32 v3, s1
548 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
549 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 549 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
550 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 550 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
551 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 551 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
552 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 552 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
553 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 553 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
554 ; VI-NEXT: v_and_b32_e32 v0, s4, v0 554 ; VI-NEXT: v_and_b32_e32 v0, s0, v0
555 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 555 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
556 ; VI-NEXT: v_and_b32_e32 v4, s4, v4 556 ; VI-NEXT: v_and_b32_e32 v4, s0, v4
557 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 557 ; VI-NEXT: v_or_b32_e32 v1, v1, v4
558 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 558 ; VI-NEXT: v_or_b32_e32 v0, v5, v0
559 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 559 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
560 ; VI-NEXT: s_endpgm 560 ; VI-NEXT: s_endpgm
561 ; 561 ;
562 ; CI-LABEL: shl_v_imm_v4i16: 562 ; CI-LABEL: shl_v_imm_v4i16:
563 ; CI: ; %bb.0: 563 ; CI: ; %bb.0:
564 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 564 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
565 ; CI-NEXT: s_mov_b32 s7, 0xf000 565 ; CI-NEXT: s_mov_b32 s3, 0xf000
566 ; CI-NEXT: s_mov_b32 s6, 0 566 ; CI-NEXT: s_mov_b32 s2, 0
567 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 567 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
568 ; CI-NEXT: v_mov_b32_e32 v1, 0 568 ; CI-NEXT: v_mov_b32_e32 v1, 0
569 ; CI-NEXT: s_waitcnt lgkmcnt(0) 569 ; CI-NEXT: s_waitcnt lgkmcnt(0)
570 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] 570 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
571 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 571 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
572 ; CI-NEXT: s_mov_b32 s8, 0xff00 572 ; CI-NEXT: s_mov_b32 s8, 0xff00
573 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 573 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
574 ; CI-NEXT: s_waitcnt vmcnt(0) 574 ; CI-NEXT: s_waitcnt vmcnt(0)
575 ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 575 ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3
576 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 576 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
577 ; CI-NEXT: v_and_b32_e32 v4, s8, v4 577 ; CI-NEXT: v_and_b32_e32 v4, s8, v4
578 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 578 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
579 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 579 ; CI-NEXT: v_and_b32_e32 v3, s8, v3
580 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 580 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
581 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 581 ; CI-NEXT: v_or_b32_e32 v3, v3, v4
582 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 582 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
583 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 583 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
584 ; CI-NEXT: s_endpgm 584 ; CI-NEXT: s_endpgm
585 %tid = call i32 @llvm.amdgcn.workitem.id.x() 585 %tid = call i32 @llvm.amdgcn.workitem.id.x()
586 %tid.ext = sext i32 %tid to i64 586 %tid.ext = sext i32 %tid to i64
587 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext 587 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
588 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext 588 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext