Mercurial > hg > CbC > CbC_llvm
comparison llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @ 173:0572611fdcc8 llvm10 llvm12
reorganization done
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 25 May 2020 11:55:54 +0900 |
parents | 1d019706d866 |
children | 2e18cbf3894f |
comparison
equal
deleted
inserted
replaced
172:9fbae9c8bf63 | 173:0572611fdcc8 |
---|---|
71 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 71 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
73 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 73 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
74 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 | 74 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 |
75 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 75 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
76 ; GFX9-NEXT: global_load_dword v3, v[0:1], off | 76 ; GFX9-NEXT: global_load_dword v4, v[0:1], off |
77 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 | 77 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 |
78 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 | 78 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 |
79 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 | 79 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 |
80 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 80 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
81 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 81 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
82 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3 | 82 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 |
83 ; GFX9-NEXT: global_store_dword v[0:1], v2, off | 83 ; GFX9-NEXT: global_store_dword v[2:3], v0, off |
84 ; GFX9-NEXT: s_endpgm | 84 ; GFX9-NEXT: s_endpgm |
85 ; | 85 ; |
86 ; VI-LABEL: v_shl_v2i16: | 86 ; VI-LABEL: v_shl_v2i16: |
87 ; VI: ; %bb.0: | 87 ; VI: ; %bb.0: |
88 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 88 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
89 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 | 89 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
90 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 90 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
91 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 91 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
92 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 | 92 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 |
93 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 93 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
94 ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 | 94 ; VI-NEXT: v_mov_b32_e32 v3, s1 |
95 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc | 95 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 |
96 ; VI-NEXT: flat_load_dword v5, v[0:1] | 96 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
97 ; VI-NEXT: flat_load_dword v2, v[2:3] | 97 ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 |
98 ; VI-NEXT: v_mov_b32_e32 v1, s1 | 98 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc |
99 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 | 99 ; VI-NEXT: flat_load_dword v0, v[0:1] |
100 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 100 ; VI-NEXT: flat_load_dword v1, v[4:5] |
101 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 101 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
102 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 | 102 ; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0 |
103 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | 103 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 |
104 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 | 104 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 |
105 ; VI-NEXT: flat_store_dword v[0:1], v2 | 105 ; VI-NEXT: flat_store_dword v[2:3], v0 |
106 ; VI-NEXT: s_endpgm | 106 ; VI-NEXT: s_endpgm |
107 ; | 107 ; |
108 ; CI-LABEL: v_shl_v2i16: | 108 ; CI-LABEL: v_shl_v2i16: |
109 ; CI: ; %bb.0: | 109 ; CI: ; %bb.0: |
110 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | 110 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
111 ; CI-NEXT: s_mov_b32 s7, 0xf000 | 111 ; CI-NEXT: s_mov_b32 s3, 0xf000 |
112 ; CI-NEXT: s_mov_b32 s6, 0 | 112 ; CI-NEXT: s_mov_b32 s2, 0 |
113 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | 113 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
114 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 114 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
115 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 115 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
116 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] | 116 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
117 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 | 117 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
118 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 | 118 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 |
119 ; CI-NEXT: s_mov_b32 s8, 0xffff | 119 ; CI-NEXT: s_mov_b32 s8, 0xffff |
120 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] | 120 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
121 ; CI-NEXT: s_waitcnt vmcnt(1) | 121 ; CI-NEXT: s_waitcnt vmcnt(1) |
122 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 | 122 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 |
123 ; CI-NEXT: s_waitcnt vmcnt(0) | 123 ; CI-NEXT: s_waitcnt vmcnt(0) |
124 ; CI-NEXT: v_and_b32_e32 v5, s8, v3 | 124 ; CI-NEXT: v_and_b32_e32 v5, s8, v3 |
125 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 | 125 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
126 ; CI-NEXT: v_lshl_b32_e32 v3, v4, v3 | 126 ; CI-NEXT: v_lshl_b32_e32 v3, v4, v3 |
127 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v5 | 127 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v5 |
128 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 | 128 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
129 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 | 129 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 |
130 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 | 130 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 |
131 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 | 131 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
132 ; CI-NEXT: s_endpgm | 132 ; CI-NEXT: s_endpgm |
133 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 133 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
134 %tid.ext = sext i32 %tid to i64 | 134 %tid.ext = sext i32 %tid to i64 |
135 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext | 135 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext |
136 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext | 136 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext |
150 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 150 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
151 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 151 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
152 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 | 152 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 |
153 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 | 153 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 |
154 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 154 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
155 ; GFX9-NEXT: global_load_dword v3, v[0:1], off | 155 ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
156 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 | 156 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 |
157 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 | 157 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 |
158 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 158 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
159 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 159 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
160 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3 | 160 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 |
161 ; GFX9-NEXT: global_store_dword v[0:1], v2, off | 161 ; GFX9-NEXT: global_store_dword v[2:3], v0, off |
162 ; GFX9-NEXT: s_endpgm | 162 ; GFX9-NEXT: s_endpgm |
163 ; | 163 ; |
164 ; VI-LABEL: shl_v_s_v2i16: | 164 ; VI-LABEL: shl_v_s_v2i16: |
165 ; VI: ; %bb.0: | 165 ; VI: ; %bb.0: |
166 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | 166 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 |
168 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 168 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
169 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 169 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
170 ; VI-NEXT: v_mov_b32_e32 v1, s7 | 170 ; VI-NEXT: v_mov_b32_e32 v1, s7 |
171 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 | 171 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 |
172 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 172 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
173 ; VI-NEXT: flat_load_dword v3, v[0:1] | 173 ; VI-NEXT: flat_load_dword v0, v[0:1] |
174 ; VI-NEXT: s_lshr_b32 s1, s0, 16 | 174 ; VI-NEXT: s_lshr_b32 s1, s0, 16 |
175 ; VI-NEXT: v_mov_b32_e32 v4, s1 | 175 ; VI-NEXT: v_mov_b32_e32 v4, s1 |
176 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 | 176 ; VI-NEXT: v_mov_b32_e32 v3, s5 |
177 ; VI-NEXT: v_mov_b32_e32 v1, s5 | 177 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 |
178 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 178 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
179 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 179 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
180 ; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3 | 180 ; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0 |
181 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 | 181 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
182 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 | 182 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 |
183 ; VI-NEXT: flat_store_dword v[0:1], v2 | 183 ; VI-NEXT: flat_store_dword v[2:3], v0 |
184 ; VI-NEXT: s_endpgm | 184 ; VI-NEXT: s_endpgm |
185 ; | 185 ; |
186 ; CI-LABEL: shl_v_s_v2i16: | 186 ; CI-LABEL: shl_v_s_v2i16: |
187 ; CI: ; %bb.0: | 187 ; CI: ; %bb.0: |
188 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | 188 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
189 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd | 189 ; CI-NEXT: s_load_dword s0, s[0:1], 0xd |
190 ; CI-NEXT: s_mov_b32 s8, 0xffff | |
190 ; CI-NEXT: s_mov_b32 s3, 0xf000 | 191 ; CI-NEXT: s_mov_b32 s3, 0xf000 |
191 ; CI-NEXT: s_mov_b32 s2, 0 | 192 ; CI-NEXT: s_mov_b32 s2, 0 |
192 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | 193 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
193 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 194 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
195 ; CI-NEXT: s_lshr_b32 s9, s0, 16 | |
196 ; CI-NEXT: s_and_b32 s10, s0, s8 | |
194 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] | 197 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
195 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 198 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
196 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 | 199 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
197 ; CI-NEXT: s_mov_b32 s9, 0xffff | |
198 ; CI-NEXT: s_lshr_b32 s10, s8, 16 | |
199 ; CI-NEXT: s_and_b32 s8, s8, s9 | |
200 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] | 200 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
201 ; CI-NEXT: s_waitcnt vmcnt(0) | 201 ; CI-NEXT: s_waitcnt vmcnt(0) |
202 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 | 202 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 |
203 ; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 | 203 ; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2 |
204 ; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3 | 204 ; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3 |
205 ; CI-NEXT: v_and_b32_e32 v2, s9, v2 | 205 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 |
206 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 | 206 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
207 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 | 207 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 |
208 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 | 208 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
209 ; CI-NEXT: s_endpgm | 209 ; CI-NEXT: s_endpgm |
210 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 210 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
225 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 225 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
227 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 | 227 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 |
228 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 | 228 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 |
229 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 229 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
230 ; GFX9-NEXT: global_load_dword v3, v[0:1], off | 230 ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
231 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 | 231 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 |
232 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 | 232 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 |
233 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 233 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
234 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 234 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
235 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0 | 235 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 |
236 ; GFX9-NEXT: global_store_dword v[0:1], v2, off | 236 ; GFX9-NEXT: global_store_dword v[2:3], v0, off |
237 ; GFX9-NEXT: s_endpgm | 237 ; GFX9-NEXT: s_endpgm |
238 ; | 238 ; |
239 ; VI-LABEL: shl_s_v_v2i16: | 239 ; VI-LABEL: shl_s_v_v2i16: |
240 ; VI: ; %bb.0: | 240 ; VI: ; %bb.0: |
241 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | 241 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 |
243 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 243 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
244 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 244 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
245 ; VI-NEXT: v_mov_b32_e32 v1, s7 | 245 ; VI-NEXT: v_mov_b32_e32 v1, s7 |
246 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 | 246 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 |
247 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 247 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
248 ; VI-NEXT: flat_load_dword v3, v[0:1] | 248 ; VI-NEXT: flat_load_dword v0, v[0:1] |
249 ; VI-NEXT: s_lshr_b32 s1, s0, 16 | 249 ; VI-NEXT: s_lshr_b32 s1, s0, 16 |
250 ; VI-NEXT: v_mov_b32_e32 v4, s1 | 250 ; VI-NEXT: v_mov_b32_e32 v4, s1 |
251 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 | 251 ; VI-NEXT: v_mov_b32_e32 v3, s5 |
252 ; VI-NEXT: v_mov_b32_e32 v1, s5 | 252 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 |
253 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 253 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
254 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 254 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
255 ; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0 | 255 ; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0 |
256 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 256 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD |
257 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 | 257 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 |
258 ; VI-NEXT: flat_store_dword v[0:1], v2 | 258 ; VI-NEXT: flat_store_dword v[2:3], v0 |
259 ; VI-NEXT: s_endpgm | 259 ; VI-NEXT: s_endpgm |
260 ; | 260 ; |
261 ; CI-LABEL: shl_s_v_v2i16: | 261 ; CI-LABEL: shl_s_v_v2i16: |
262 ; CI: ; %bb.0: | 262 ; CI: ; %bb.0: |
263 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | 263 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
268 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 268 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
269 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] | 269 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
270 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 270 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
271 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 | 271 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
272 ; CI-NEXT: s_mov_b32 s0, 0xffff | 272 ; CI-NEXT: s_mov_b32 s0, 0xffff |
273 ; CI-NEXT: s_lshr_b32 s1, s8, 16 | 273 ; CI-NEXT: s_lshr_b32 s9, s8, 16 |
274 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] | 274 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
275 ; CI-NEXT: s_waitcnt vmcnt(0) | 275 ; CI-NEXT: s_waitcnt vmcnt(0) |
276 ; CI-NEXT: v_and_b32_e32 v3, s0, v2 | 276 ; CI-NEXT: v_and_b32_e32 v3, s0, v2 |
277 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 | 277 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
278 ; CI-NEXT: v_lshl_b32_e32 v2, s1, v2 | 278 ; CI-NEXT: v_lshl_b32_e32 v2, s9, v2 |
279 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 | 279 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 |
280 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 | 280 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
281 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 | 281 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 |
282 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 | 282 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 |
283 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 | 283 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
299 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 299 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
301 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 301 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
302 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 | 302 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 |
303 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 303 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
304 ; GFX9-NEXT: global_load_dword v3, v[0:1], off | 304 ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
305 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 | 305 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 |
306 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 | 306 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 |
307 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 307 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
308 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 308 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
309 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0] | 309 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0] |
310 ; GFX9-NEXT: global_store_dword v[0:1], v2, off | 310 ; GFX9-NEXT: global_store_dword v[2:3], v0, off |
311 ; GFX9-NEXT: s_endpgm | 311 ; GFX9-NEXT: s_endpgm |
312 ; | 312 ; |
313 ; VI-LABEL: shl_imm_v_v2i16: | 313 ; VI-LABEL: shl_imm_v_v2i16: |
314 ; VI: ; %bb.0: | 314 ; VI: ; %bb.0: |
315 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 315 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
316 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 316 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
317 ; VI-NEXT: v_mov_b32_e32 v3, 8 | 317 ; VI-NEXT: v_mov_b32_e32 v4, 8 |
318 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 318 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
319 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 319 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
320 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | 320 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 |
321 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 321 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
322 ; VI-NEXT: flat_load_dword v4, v[0:1] | 322 ; VI-NEXT: flat_load_dword v0, v[0:1] |
323 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 | 323 ; VI-NEXT: v_mov_b32_e32 v3, s1 |
324 ; VI-NEXT: v_mov_b32_e32 v1, s1 | 324 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 |
325 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 325 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
326 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 326 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
327 ; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8 | 327 ; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8 |
328 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 328 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD |
329 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 | 329 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 |
330 ; VI-NEXT: flat_store_dword v[0:1], v2 | 330 ; VI-NEXT: flat_store_dword v[2:3], v0 |
331 ; VI-NEXT: s_endpgm | 331 ; VI-NEXT: s_endpgm |
332 ; | 332 ; |
333 ; CI-LABEL: shl_imm_v_v2i16: | 333 ; CI-LABEL: shl_imm_v_v2i16: |
334 ; CI: ; %bb.0: | 334 ; CI: ; %bb.0: |
335 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | 335 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
336 ; CI-NEXT: s_mov_b32 s7, 0xf000 | 336 ; CI-NEXT: s_mov_b32 s3, 0xf000 |
337 ; CI-NEXT: s_mov_b32 s6, 0 | 337 ; CI-NEXT: s_mov_b32 s2, 0 |
338 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | 338 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
339 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 339 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
340 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 340 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
341 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] | 341 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
342 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 | 342 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
343 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] | 343 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
344 ; CI-NEXT: s_waitcnt vmcnt(0) | 344 ; CI-NEXT: s_waitcnt vmcnt(0) |
345 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2 | 345 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2 |
346 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 | 346 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
347 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2 | 347 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2 |
348 ; CI-NEXT: v_lshl_b32_e32 v3, 8, v3 | 348 ; CI-NEXT: v_lshl_b32_e32 v3, 8, v3 |
349 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 | 349 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
350 ; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3 | 350 ; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3 |
351 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 | 351 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 |
352 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 | 352 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
353 ; CI-NEXT: s_endpgm | 353 ; CI-NEXT: s_endpgm |
354 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 354 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
355 %tid.ext = sext i32 %tid to i64 | 355 %tid.ext = sext i32 %tid to i64 |
356 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext | 356 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext |
357 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext | 357 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext |
368 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 368 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
370 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 370 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
371 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 | 371 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 |
372 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 372 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
373 ; GFX9-NEXT: global_load_dword v3, v[0:1], off | 373 ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
374 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 | 374 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 |
375 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 | 375 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 |
376 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 376 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
377 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 377 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
378 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] | 378 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] |
379 ; GFX9-NEXT: global_store_dword v[0:1], v2, off | 379 ; GFX9-NEXT: global_store_dword v[2:3], v0, off |
380 ; GFX9-NEXT: s_endpgm | 380 ; GFX9-NEXT: s_endpgm |
381 ; | 381 ; |
382 ; VI-LABEL: shl_v_imm_v2i16: | 382 ; VI-LABEL: shl_v_imm_v2i16: |
383 ; VI: ; %bb.0: | 383 ; VI: ; %bb.0: |
384 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 384 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
385 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 385 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
386 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 386 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
387 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 387 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
388 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | 388 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 |
389 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 389 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
390 ; VI-NEXT: flat_load_dword v3, v[0:1] | 390 ; VI-NEXT: flat_load_dword v0, v[0:1] |
391 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 | 391 ; VI-NEXT: v_mov_b32_e32 v3, s1 |
392 ; VI-NEXT: v_mov_b32_e32 v1, s1 | 392 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 |
393 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 393 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
394 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 394 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
395 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 | 395 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 |
396 ; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2 | 396 ; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1 |
397 ; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 | 397 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 |
398 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 | 398 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 |
399 ; VI-NEXT: flat_store_dword v[0:1], v2 | 399 ; VI-NEXT: flat_store_dword v[2:3], v0 |
400 ; VI-NEXT: s_endpgm | 400 ; VI-NEXT: s_endpgm |
401 ; | 401 ; |
402 ; CI-LABEL: shl_v_imm_v2i16: | 402 ; CI-LABEL: shl_v_imm_v2i16: |
403 ; CI: ; %bb.0: | 403 ; CI: ; %bb.0: |
404 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | 404 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
405 ; CI-NEXT: s_mov_b32 s7, 0xf000 | 405 ; CI-NEXT: s_mov_b32 s3, 0xf000 |
406 ; CI-NEXT: s_mov_b32 s6, 0 | 406 ; CI-NEXT: s_mov_b32 s2, 0 |
407 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | 407 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
408 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 408 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
409 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 409 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
410 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] | 410 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
411 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 | 411 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
412 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] | 412 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
413 ; CI-NEXT: s_waitcnt vmcnt(0) | 413 ; CI-NEXT: s_waitcnt vmcnt(0) |
414 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 | 414 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 |
415 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 | 415 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 |
416 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 | 416 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
417 ; CI-NEXT: s_endpgm | 417 ; CI-NEXT: s_endpgm |
418 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 418 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
419 %tid.ext = sext i32 %tid to i64 | 419 %tid.ext = sext i32 %tid to i64 |
420 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext | 420 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext |
421 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext | 421 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext |
427 | 427 |
428 define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { | 428 define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { |
429 ; GFX9-LABEL: v_shl_v4i16: | 429 ; GFX9-LABEL: v_shl_v4i16: |
430 ; GFX9: ; %bb.0: | 430 ; GFX9: ; %bb.0: |
431 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 431 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
432 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 | 432 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
433 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 433 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
434 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 434 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
435 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 | 435 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 |
436 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 436 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
437 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off | 437 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off |
438 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 | 438 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 |
439 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 | 439 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 |
440 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 | 440 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 |
441 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc | 441 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
442 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 442 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
443 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 | 443 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5 |
444 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 | 444 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 |
445 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off | 445 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off |
446 ; GFX9-NEXT: s_endpgm | 446 ; GFX9-NEXT: s_endpgm |
447 ; | 447 ; |
448 ; VI-LABEL: v_shl_v4i16: | 448 ; VI-LABEL: v_shl_v4i16: |
449 ; VI: ; %bb.0: | 449 ; VI: ; %bb.0: |
450 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 450 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
451 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 | 451 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
452 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 452 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
453 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 453 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
454 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 | 454 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 |
455 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 455 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
456 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 | 456 ; VI-NEXT: v_mov_b32_e32 v3, s1 |
457 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc | 457 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 |
458 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | |
459 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 | |
460 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc | |
458 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] | 461 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] |
459 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] | 462 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] |
460 ; VI-NEXT: v_mov_b32_e32 v5, s1 | 463 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
461 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 | 464 ; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1 |
462 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc | 465 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 |
463 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 466 ; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0 |
464 ; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 | 467 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 |
465 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | |
466 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 | |
467 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | |
468 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 | 468 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 |
469 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 | 469 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 |
470 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] | 470 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
471 ; VI-NEXT: s_endpgm | 471 ; VI-NEXT: s_endpgm |
472 ; | 472 ; |
473 ; CI-LABEL: v_shl_v4i16: | 473 ; CI-LABEL: v_shl_v4i16: |
474 ; CI: ; %bb.0: | 474 ; CI: ; %bb.0: |
475 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | 475 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
476 ; CI-NEXT: s_mov_b32 s7, 0xf000 | 476 ; CI-NEXT: s_mov_b32 s3, 0xf000 |
477 ; CI-NEXT: s_mov_b32 s6, 0 | 477 ; CI-NEXT: s_mov_b32 s2, 0 |
478 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 | 478 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
479 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 479 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
480 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 480 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
481 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] | 481 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
482 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 | 482 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 |
483 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 | 483 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 |
484 ; CI-NEXT: s_mov_b32 s8, 0xffff | 484 ; CI-NEXT: s_mov_b32 s8, 0xffff |
485 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] | 485 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
486 ; CI-NEXT: s_waitcnt vmcnt(1) | 486 ; CI-NEXT: s_waitcnt vmcnt(1) |
487 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 | 487 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 |
488 ; CI-NEXT: s_waitcnt vmcnt(0) | 488 ; CI-NEXT: s_waitcnt vmcnt(0) |
489 ; CI-NEXT: v_and_b32_e32 v8, s8, v4 | 489 ; CI-NEXT: v_and_b32_e32 v8, s8, v4 |
490 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 | 490 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
499 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 | 499 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 |
500 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 | 500 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
501 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 | 501 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 |
502 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 | 502 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 |
503 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 | 503 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 |
504 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 | 504 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 |
505 ; CI-NEXT: s_endpgm | 505 ; CI-NEXT: s_endpgm |
506 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 506 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
507 %tid.ext = sext i32 %tid to i64 | 507 %tid.ext = sext i32 %tid to i64 |
508 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext | 508 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext |
509 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext | 509 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext |
536 ; | 536 ; |
537 ; VI-LABEL: shl_v_imm_v4i16: | 537 ; VI-LABEL: shl_v_imm_v4i16: |
538 ; VI: ; %bb.0: | 538 ; VI: ; %bb.0: |
539 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 539 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
540 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 | 540 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
541 ; VI-NEXT: s_mov_b32 s4, 0xff000000 | |
542 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 541 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
543 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 542 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
544 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | 543 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 |
545 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 544 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
546 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] | 545 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] |
546 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 | |
547 ; VI-NEXT: s_mov_b32 s0, 0xff000000 | |
547 ; VI-NEXT: v_mov_b32_e32 v3, s1 | 548 ; VI-NEXT: v_mov_b32_e32 v3, s1 |
548 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 | |
549 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | 549 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
550 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 550 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
551 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 | 551 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 |
552 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 | 552 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 |
553 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 | 553 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 |
554 ; VI-NEXT: v_and_b32_e32 v0, s4, v0 | 554 ; VI-NEXT: v_and_b32_e32 v0, s0, v0 |
555 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 | 555 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 |
556 ; VI-NEXT: v_and_b32_e32 v4, s4, v4 | 556 ; VI-NEXT: v_and_b32_e32 v4, s0, v4 |
557 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 | 557 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 |
558 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 | 558 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 |
559 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] | 559 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
560 ; VI-NEXT: s_endpgm | 560 ; VI-NEXT: s_endpgm |
561 ; | 561 ; |
562 ; CI-LABEL: shl_v_imm_v4i16: | 562 ; CI-LABEL: shl_v_imm_v4i16: |
563 ; CI: ; %bb.0: | 563 ; CI: ; %bb.0: |
564 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 | 564 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
565 ; CI-NEXT: s_mov_b32 s7, 0xf000 | 565 ; CI-NEXT: s_mov_b32 s3, 0xf000 |
566 ; CI-NEXT: s_mov_b32 s6, 0 | 566 ; CI-NEXT: s_mov_b32 s2, 0 |
567 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 | 567 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
568 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 568 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
569 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 569 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
570 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] | 570 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
571 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 | 571 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 |
572 ; CI-NEXT: s_mov_b32 s8, 0xff00 | 572 ; CI-NEXT: s_mov_b32 s8, 0xff00 |
573 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] | 573 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
574 ; CI-NEXT: s_waitcnt vmcnt(0) | 574 ; CI-NEXT: s_waitcnt vmcnt(0) |
575 ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 | 575 ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 |
576 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 | 576 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 |
577 ; CI-NEXT: v_and_b32_e32 v4, s8, v4 | 577 ; CI-NEXT: v_and_b32_e32 v4, s8, v4 |
578 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 | 578 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 |
579 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 | 579 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 |
580 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 | 580 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
581 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 | 581 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 |
582 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 | 582 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 |
583 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 | 583 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 |
584 ; CI-NEXT: s_endpgm | 584 ; CI-NEXT: s_endpgm |
585 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 585 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
586 %tid.ext = sext i32 %tid to i64 | 586 %tid.ext = sext i32 %tid to i64 |
587 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext | 587 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext |
588 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext | 588 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext |