; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s

; s_shl_v2i16 - <2 x i16> shl where both operands are by-value kernel arguments.
; Computes %lhs << %rhs element-wise and stores the result through %out.
; The GFX9 and GFX10 checks expect a single v_pk_lshlrev_b16. The VI and CI
; checks expect each 16-bit half to be shifted with 32-bit scalar shifts and
; then recombined with and/or.
define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_shl_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x30
; VI-NEXT: s_mov_b32 s3, 0xffff
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s2, 16
; VI-NEXT: s_lshr_b32 s8, s0, 16
; VI-NEXT: s_and_b32 s2, s2, s3
; VI-NEXT: s_and_b32 s0, s0, s3
; VI-NEXT: s_lshl_b32 s0, s2, s0
; VI-NEXT: s_lshl_b32 s1, s1, s8
; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_and_b32 s0, s0, s3
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_shl_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dword s2, s[0:1], 0xb
; CI-NEXT: s_load_dword s0, s[0:1], 0xc
; CI-NEXT: s_mov_b32 s3, 0xffff
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s2, 16
; CI-NEXT: s_and_b32 s8, s0, s3
; CI-NEXT: s_lshr_b32 s0, s0, 16
; CI-NEXT: s_lshl_b32 s0, s1, s0
; CI-NEXT: s_lshl_b32 s1, s2, s8
; CI-NEXT: s_lshl_b32 s0, s0, 16
; CI-NEXT: s_and_b32 s1, s1, s3
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: s_shl_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s3, s[0:1], 0x30
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2
; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
  %result = shl <2 x i16> %lhs, %rhs
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; v_shl_v2i16 - <2 x i16> shl where both operands are loaded from memory.
; Each work item indexes %in and %out by its workitem id x, loads the value
; from %in[tid] and the shift amount from the following element, and stores
; value << amount to %out[tid].
; The GFX9 and GFX10 checks expect a single v_pk_lshlrev_b16. The VI checks
; expect v_lshlrev_b16 for the low half plus v_lshlrev_b16_sdwa for the high
; half. The CI checks expect 32-bit shifts with explicit masking.
define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5
; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v5, s0, v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshl_b32_e32 v3, v4, v3
; CI-NEXT: v_lshl_b32_e32 v2, v2, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: v_shl_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = shl <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; shl_v_s_v2i16 - <2 x i16> shl of a loaded value by a kernel-argument amount.
; Each work item loads %in[tid] (indexed by workitem id x), shifts it left by
; the by-value argument %sgpr, and stores the result to %out[tid].
; The GFX9 and GFX10 checks expect v_pk_lshlrev_b16 with the scalar register as
; the first source operand. The VI checks expect v_lshlrev_b16 plus
; v_lshlrev_b16_sdwa. The CI checks expect 32-bit shifts with masking.
define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_v_s_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_s_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3
; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_v_s_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, s0, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; shl_s_v_v2i16 - <2 x i16> shl of a kernel-argument value by a loaded amount.
; Mirror of shl_v_s_v2i16: each work item loads %in[tid] (indexed by workitem
; id x) and uses it as the shift amount applied to the by-value argument %sgpr,
; then stores the result to %out[tid].
; The GFX9 and GFX10 checks expect v_pk_lshlrev_b16 with the scalar register as
; the second source operand. The VI checks expect v_lshlrev_b16_e64 plus
; v_lshlrev_b16_sdwa. The CI checks expect 32-bit shifts with masking.
define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_s_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_s_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0
; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, s0, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_lshl_b32_e32 v2, s1, v2
; CI-NEXT: v_lshl_b32_e32 v3, s8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_s_v_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; shl_imm_v_v2i16 - <2 x i16> shl of the constant <8, 8> by a loaded amount.
; Each work item loads %in[tid] (indexed by workitem id x) and uses it as the
; shift amount applied to the splat constant 8, then stores to %out[tid].
; The GFX9 and GFX10 checks expect v_pk_lshlrev_b16 with the inline constant 8
; as the second source and op_sel_hi:[1,0]. The VI checks expect the constant
; to be materialized in a VGPR for the v_lshlrev_b16_sdwa half. The CI checks
; expect 32-bit shifts with masking.
define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_lshl_b32_e32 v2, 8, v2
; CI-NEXT: v_lshl_b32_e32 v3, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_imm_v_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; shl_v_imm_v2i16 - <2 x i16> shl of a loaded value by the constant <8, 8>.
; Each work item loads %in[tid] (indexed by workitem id x), shifts both
; elements left by 8, and stores the result to %out[tid].
; The GFX9 and GFX10 checks expect v_pk_lshlrev_b16 with the inline constant 8
; as the first source and op_sel_hi:[0,1]. The CI checks expect one 32-bit
; shift followed by an and with 0xff00ff00. The VI checks expect a 32-bit
; shift masked with 0xff000000 combined with a v_lshlrev_b16 of the low half.
define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_v_imm_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; v_shl_v4i16 - <4 x i16> shl where both operands are loaded from memory.
; Four-element counterpart of v_shl_v2i16: each work item loads the value from
; %in[tid] (indexed by workitem id x) and the shift amount from the following
; element, and stores value << amount to %out[tid].
; The GFX9 and GFX10 checks expect two v_pk_lshlrev_b16, one per 32-bit half.
; The VI checks expect a v_lshlrev_b16 / v_lshlrev_b16_sdwa pair per half. The
; CI checks expect 32-bit shifts with explicit masking.
define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v8, s0, v4
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; CI-NEXT: v_and_b32_e32 v9, s0, v5
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; CI-NEXT: v_lshl_b32_e32 v5, v7, v5
; CI-NEXT: v_lshl_b32_e32 v3, v3, v9
; CI-NEXT: v_lshl_b32_e32 v4, v6, v4
; CI-NEXT: v_lshl_b32_e32 v2, v2, v8
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: v_shl_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = shl <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; shl_v_imm_v4i16 - <4 x i16> shl of a loaded value by the constant splat 8.
; Four-element counterpart of shl_v_imm_v2i16: each work item loads %in[tid]
; (indexed by workitem id x), shifts every element left by 8, and stores the
; result to %out[tid].
; The GFX9 and GFX10 checks expect two v_pk_lshlrev_b16 with the inline
; constant 8 and op_sel_hi:[0,1]. The VI and CI checks expect 32-bit and
; 16-bit shifts by 8 combined with byte masks.
define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xff000000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT: v_and_b32_e32 v0, s2, v0
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_and_b32_e32 v4, s2, v4
; VI-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_imm_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xff00
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; CI-NEXT: v_and_b32_e32 v4, s0, v4
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_v_imm_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Intrinsic the vector-operand kernels in this file call to obtain the
; per-work-item index used to address %in and %out.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; #0 is attached to every kernel definition in this file; #1 is attached to
; the intrinsic declaration above.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }