Mercurial > hg > CbC > CbC_llvm
comparison llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @ 236:c4bab56944e8 llvm-original
LLVM 16
author | kono |
---|---|
date | Wed, 09 Nov 2022 17:45:10 +0900 |
parents | 5f17cb93ff66 |
children | 1f2b6ac9f198 |
comparison
equal
deleted
inserted
replaced
232:70dce7da266c | 236:c4bab56944e8 |
---|---|
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s | 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s |
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s | 3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s |
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s | 4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s |
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s | 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s |
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s | |
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s | |
6 | 8 |
7 define <2 x half> @chain_hi_to_lo_private() { | 9 define <2 x half> @chain_hi_to_lo_private() { |
8 ; GFX900-LABEL: chain_hi_to_lo_private: | 10 ; GFX900-LABEL: chain_hi_to_lo_private: |
9 ; GFX900: ; %bb.0: ; %bb | 11 ; GFX900: ; %bb.0: ; %bb |
10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 12 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
45 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 | 47 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 |
46 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) | 48 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
47 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 | 49 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 |
48 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) | 50 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
49 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] | 51 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] |
52 ; | |
53 ; GFX11-LABEL: chain_hi_to_lo_private: | |
54 ; GFX11: ; %bb.0: ; %bb | |
55 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
56 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
57 ; GFX11-NEXT: s_mov_b32 s0, 2 | |
58 ; GFX11-NEXT: scratch_load_u16 v0, off, s0 | |
59 ; GFX11-NEXT: s_mov_b32 s0, 0 | |
60 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
61 ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0 | |
62 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
63 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
50 bb: | 64 bb: |
51 %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 | 65 %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 |
52 %load_lo = load half, half addrspace(5)* %gep_lo | 66 %load_lo = load half, half addrspace(5)* %gep_lo |
53 %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0 | 67 %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0 |
54 %load_hi = load half, half addrspace(5)* %gep_hi | 68 %load_hi = load half, half addrspace(5)* %gep_hi |
95 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off | 109 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off |
96 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) | 110 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
97 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off | 111 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off |
98 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) | 112 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
99 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] | 113 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] |
114 ; | |
115 ; GFX11-LABEL: chain_hi_to_lo_private_different_bases: | |
116 ; GFX11: ; %bb.0: ; %bb | |
117 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
118 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
119 ; GFX11-NEXT: scratch_load_u16 v0, v0, off | |
120 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
121 ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off | |
122 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
123 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
100 bb: | 124 bb: |
101 %load_lo = load half, half addrspace(5)* %base_lo | 125 %load_lo = load half, half addrspace(5)* %base_lo |
102 %load_hi = load half, half addrspace(5)* %base_hi | 126 %load_hi = load half, half addrspace(5)* %base_hi |
103 | 127 |
104 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 | 128 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 |
143 ; FLATSCR_GFX10-NEXT: v_add_f16_e32 v1, 1.0, v1 | 167 ; FLATSCR_GFX10-NEXT: v_add_f16_e32 v1, 1.0, v1 |
144 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off | 168 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off |
145 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) | 169 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
146 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 | 170 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 |
147 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] | 171 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] |
172 ; | |
173 ; GFX11-LABEL: chain_hi_to_lo_arithmatic: | |
174 ; GFX11: ; %bb.0: ; %bb | |
175 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
176 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
177 ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 | |
178 ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off | |
179 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
180 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 | |
181 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
148 bb: | 182 bb: |
149 %arith_lo = fadd half %in, 1.0 | 183 %arith_lo = fadd half %in, 1.0 |
150 %load_hi = load half, half addrspace(5)* %base | 184 %load_hi = load half, half addrspace(5)* %base |
151 | 185 |
152 %temp = insertelement <2 x half> undef, half %arith_lo, i32 0 | 186 %temp = insertelement <2 x half> undef, half %arith_lo, i32 0 |
174 ; GFX10-NEXT: ds_read_u16 v0, v1 offset:2 | 208 ; GFX10-NEXT: ds_read_u16 v0, v1 offset:2 |
175 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | 209 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
176 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 | 210 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 |
177 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | 211 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
178 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 212 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
213 ; | |
214 ; GFX11-LABEL: chain_hi_to_lo_group: | |
215 ; GFX11: ; %bb.0: ; %bb | |
216 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
217 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
218 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 | |
219 ; GFX11-NEXT: ds_load_u16 v0, v1 offset:2 | |
220 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
221 ; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 | |
222 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
223 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
179 bb: | 224 bb: |
180 %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1 | 225 %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1 |
181 %load_lo = load half, half addrspace(3)* %gep_lo | 226 %load_lo = load half, half addrspace(3)* %gep_lo |
182 %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0 | 227 %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0 |
183 %load_hi = load half, half addrspace(3)* %gep_hi | 228 %load_hi = load half, half addrspace(3)* %gep_hi |
205 ; GFX10-NEXT: ds_read_u16 v0, v0 | 250 ; GFX10-NEXT: ds_read_u16 v0, v0 |
206 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | 251 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
207 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 | 252 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 |
208 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | 253 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
209 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 254 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
255 ; | |
256 ; GFX11-LABEL: chain_hi_to_lo_group_different_bases: | |
257 ; GFX11: ; %bb.0: ; %bb | |
258 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
259 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
260 ; GFX11-NEXT: ds_load_u16 v0, v0 | |
261 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
262 ; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 | |
263 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
264 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
210 bb: | 265 bb: |
211 %load_lo = load half, half addrspace(3)* %base_lo | 266 %load_lo = load half, half addrspace(3)* %base_lo |
212 %load_hi = load half, half addrspace(3)* %base_hi | 267 %load_hi = load half, half addrspace(3)* %base_hi |
213 | 268 |
214 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 | 269 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 |
222 ; GCN: ; %bb.0: ; %bb | 277 ; GCN: ; %bb.0: ; %bb |
223 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 278 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
224 ; GCN-NEXT: v_mov_b32_e32 v0, 2 | 279 ; GCN-NEXT: v_mov_b32_e32 v0, 2 |
225 ; GCN-NEXT: v_mov_b32_e32 v1, 0 | 280 ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
226 ; GCN-NEXT: global_load_ushort v0, v[0:1], off | 281 ; GCN-NEXT: global_load_ushort v0, v[0:1], off |
282 ; GCN-NEXT: v_mov_b32_e32 v1, 0 | |
227 ; GCN-NEXT: v_mov_b32_e32 v2, 0 | 283 ; GCN-NEXT: v_mov_b32_e32 v2, 0 |
228 ; GCN-NEXT: v_mov_b32_e32 v3, 0 | |
229 ; GCN-NEXT: s_waitcnt vmcnt(0) | 284 ; GCN-NEXT: s_waitcnt vmcnt(0) |
230 ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off | 285 ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off |
231 ; GCN-NEXT: s_waitcnt vmcnt(0) | 286 ; GCN-NEXT: s_waitcnt vmcnt(0) |
232 ; GCN-NEXT: s_setpc_b64 s[30:31] | 287 ; GCN-NEXT: s_setpc_b64 s[30:31] |
233 ; | 288 ; |
234 ; GFX10-LABEL: chain_hi_to_lo_global: | 289 ; GFX10-LABEL: chain_hi_to_lo_global: |
235 ; GFX10: ; %bb.0: ; %bb | 290 ; GFX10: ; %bb.0: ; %bb |
242 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 | 297 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
243 ; GFX10-NEXT: s_waitcnt vmcnt(0) | 298 ; GFX10-NEXT: s_waitcnt vmcnt(0) |
244 ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off | 299 ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off |
245 ; GFX10-NEXT: s_waitcnt vmcnt(0) | 300 ; GFX10-NEXT: s_waitcnt vmcnt(0) |
246 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 301 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
302 ; | |
303 ; GFX11-LABEL: chain_hi_to_lo_global: | |
304 ; GFX11: ; %bb.0: ; %bb | |
305 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
306 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
307 ; GFX11-NEXT: v_mov_b32_e32 v0, 2 | |
308 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 | |
309 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off | |
310 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 | |
311 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 | |
312 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
313 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off | |
314 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
315 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
247 bb: | 316 bb: |
248 %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1 | 317 %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1 |
249 %load_lo = load half, half addrspace(1)* %gep_lo | 318 %load_lo = load half, half addrspace(1)* %gep_lo |
250 %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0 | 319 %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0 |
251 %load_hi = load half, half addrspace(1)* %gep_hi | 320 %load_hi = load half, half addrspace(1)* %gep_hi |
273 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off | 342 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off |
274 ; GFX10-NEXT: s_waitcnt vmcnt(0) | 343 ; GFX10-NEXT: s_waitcnt vmcnt(0) |
275 ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off | 344 ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off |
276 ; GFX10-NEXT: s_waitcnt vmcnt(0) | 345 ; GFX10-NEXT: s_waitcnt vmcnt(0) |
277 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 346 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
347 ; | |
348 ; GFX11-LABEL: chain_hi_to_lo_global_different_bases: | |
349 ; GFX11: ; %bb.0: ; %bb | |
350 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
351 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
352 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off | |
353 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
354 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off | |
355 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
356 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
278 bb: | 357 bb: |
279 %load_lo = load half, half addrspace(1)* %base_lo | 358 %load_lo = load half, half addrspace(1)* %base_lo |
280 %load_hi = load half, half addrspace(1)* %base_hi | 359 %load_hi = load half, half addrspace(1)* %base_hi |
281 | 360 |
282 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 | 361 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 |
290 ; GCN: ; %bb.0: ; %bb | 369 ; GCN: ; %bb.0: ; %bb |
291 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 370 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
292 ; GCN-NEXT: v_mov_b32_e32 v0, 2 | 371 ; GCN-NEXT: v_mov_b32_e32 v0, 2 |
293 ; GCN-NEXT: v_mov_b32_e32 v1, 0 | 372 ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
294 ; GCN-NEXT: flat_load_ushort v0, v[0:1] | 373 ; GCN-NEXT: flat_load_ushort v0, v[0:1] |
374 ; GCN-NEXT: v_mov_b32_e32 v1, 0 | |
295 ; GCN-NEXT: v_mov_b32_e32 v2, 0 | 375 ; GCN-NEXT: v_mov_b32_e32 v2, 0 |
296 ; GCN-NEXT: v_mov_b32_e32 v3, 0 | |
297 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 376 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
298 ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3] | 377 ; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2] |
299 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 378 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
300 ; GCN-NEXT: s_setpc_b64 s[30:31] | 379 ; GCN-NEXT: s_setpc_b64 s[30:31] |
301 ; | 380 ; |
302 ; GFX10-LABEL: chain_hi_to_lo_flat: | 381 ; GFX10-LABEL: chain_hi_to_lo_flat: |
303 ; GFX10: ; %bb.0: ; %bb | 382 ; GFX10: ; %bb.0: ; %bb |
310 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 | 389 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
311 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 390 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
312 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2] | 391 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2] |
313 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 392 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
314 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 393 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
394 ; | |
395 ; GFX11-LABEL: chain_hi_to_lo_flat: | |
396 ; GFX11: ; %bb.0: ; %bb | |
397 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
398 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
399 ; GFX11-NEXT: v_mov_b32_e32 v0, 2 | |
400 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 | |
401 ; GFX11-NEXT: flat_load_u16 v0, v[0:1] | |
402 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 | |
403 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 | |
404 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | |
405 ; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2] | |
406 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | |
407 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
315 bb: | 408 bb: |
316 %gep_lo = getelementptr inbounds half, half* null, i64 1 | 409 %gep_lo = getelementptr inbounds half, half* null, i64 1 |
317 %load_lo = load half, half* %gep_lo | 410 %load_lo = load half, half* %gep_lo |
318 %gep_hi = getelementptr inbounds half, half* null, i64 0 | 411 %gep_hi = getelementptr inbounds half, half* null, i64 0 |
319 %load_hi = load half, half* %gep_hi | 412 %load_hi = load half, half* %gep_hi |
341 ; GFX10-NEXT: flat_load_ushort v0, v[0:1] | 434 ; GFX10-NEXT: flat_load_ushort v0, v[0:1] |
342 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 435 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
343 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3] | 436 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3] |
344 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 437 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
345 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 438 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
439 ; | |
440 ; GFX11-LABEL: chain_hi_to_lo_flat_different_bases: | |
441 ; GFX11: ; %bb.0: ; %bb | |
442 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
443 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
444 ; GFX11-NEXT: flat_load_u16 v0, v[0:1] | |
445 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | |
446 ; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3] | |
447 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | |
448 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
346 bb: | 449 bb: |
347 %load_lo = load half, half* %base_lo | 450 %load_lo = load half, half* %base_lo |
348 %load_hi = load half, half* %base_hi | 451 %load_hi = load half, half* %base_hi |
349 | 452 |
350 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 | 453 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 |
369 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 | 472 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 |
370 ; GFX900-NEXT: s_waitcnt vmcnt(0) | 473 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
371 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6 | 474 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6 |
372 ; GFX900-NEXT: s_waitcnt vmcnt(0) | 475 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
373 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 | 476 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 |
477 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 | |
374 ; GFX900-NEXT: s_waitcnt vmcnt(0) | 478 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
375 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 | 479 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 |
376 ; GFX900-NEXT: s_waitcnt vmcnt(0) | 480 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
377 ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 | 481 ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 |
378 ; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 | 482 ; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 |
379 ; GFX900-NEXT: s_waitcnt vmcnt(1) | 483 ; GFX900-NEXT: s_waitcnt vmcnt(1) |
380 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 | 484 ; GFX900-NEXT: v_mov_b32_e32 v1, v0 |
381 ; GFX900-NEXT: s_waitcnt vmcnt(0) | |
382 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 | |
383 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 | 485 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 |
384 ; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0 | 486 ; GFX900-NEXT: s_waitcnt vmcnt(1) |
487 ; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 | |
385 ; GFX900-NEXT: s_waitcnt vmcnt(0) | 488 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
386 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] | 489 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] |
387 ; GFX900-NEXT: s_endpgm | 490 ; GFX900-NEXT: s_endpgm |
388 ; | 491 ; |
389 ; FLATSCR-LABEL: vload2_private: | 492 ; FLATSCR-LABEL: vload2_private: |
439 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 | 542 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 |
440 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 | 543 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 |
441 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) | 544 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) |
442 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 | 545 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 |
443 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) | 546 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
444 ; GFX10_DEFAULT-NEXT: v_and_b32_e32 v3, 0xffff, v3 | 547 ; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 |
445 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 | 548 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 |
446 ; GFX10_DEFAULT-NEXT: v_lshl_or_b32 v0, v0, 16, v3 | |
447 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) | 549 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
448 ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] | 550 ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] |
449 ; GFX10_DEFAULT-NEXT: s_endpgm | 551 ; GFX10_DEFAULT-NEXT: s_endpgm |
450 ; | 552 ; |
451 ; FLATSCR_GFX10-LABEL: vload2_private: | 553 ; FLATSCR_GFX10-LABEL: vload2_private: |
481 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 | 583 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 |
482 ; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, vcc_lo offset:6 | 584 ; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, vcc_lo offset:6 |
483 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) | 585 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
484 ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] | 586 ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] |
485 ; FLATSCR_GFX10-NEXT: s_endpgm | 587 ; FLATSCR_GFX10-NEXT: s_endpgm |
588 ; | |
589 ; GFX11-LABEL: vload2_private: | |
590 ; GFX11: ; %bb.0: ; %entry | |
591 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 | |
592 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 | |
593 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
594 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] | |
595 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
596 ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc | |
597 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
598 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2 | |
599 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
600 ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:6 dlc | |
601 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
602 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4 | |
603 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
604 ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:8 dlc | |
605 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
606 ; GFX11-NEXT: s_clause 0x1 | |
607 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 | |
608 ; GFX11-NEXT: scratch_load_b32 v1, off, off offset:6 | |
609 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
610 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] | |
611 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | |
612 ; GFX11-NEXT: s_endpgm | |
486 entry: | 613 entry: |
487 %loc = alloca [3 x i16], align 2, addrspace(5) | 614 %loc = alloca [3 x i16], align 2, addrspace(5) |
488 %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* | 615 %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* |
489 %tmp = load i16, i16 addrspace(1)* %in, align 2 | 616 %tmp = load i16, i16 addrspace(1)* %in, align 2 |
490 %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0 | 617 %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0 |
532 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] | 659 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] |
533 ; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2 | 660 ; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2 |
534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | 661 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
535 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 | 662 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 |
536 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 663 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
664 ; | |
665 ; GFX11-LABEL: chain_hi_to_lo_group_other_dep: | |
666 ; GFX11: ; %bb.0: ; %bb | |
667 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
668 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
669 ; GFX11-NEXT: ds_load_u16_d16_hi v1, v0 | |
670 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
671 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] | |
672 ; GFX11-NEXT: ds_load_u16_d16 v1, v0 offset:2 | |
673 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
674 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 | |
675 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
537 bb: | 676 bb: |
538 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 | 677 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 |
539 %load_lo = load i16, i16 addrspace(3)* %gep_lo | 678 %load_lo = load i16, i16 addrspace(3)* %gep_lo |
540 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 | 679 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 |
541 %load_hi = load i16, i16 addrspace(3)* %gep_hi | 680 %load_hi = load i16, i16 addrspace(3)* %gep_hi |
545 ret <2 x i16> %result | 684 ret <2 x i16> %result |
546 } | 685 } |
547 | 686 |
548 ; The volatile operations aren't put on the same chain | 687 ; The volatile operations aren't put on the same chain |
549 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) { | 688 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) { |
550 ; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: | 689 ; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: |
551 ; GCN: ; %bb.0: ; %bb | 690 ; GFX900: ; %bb.0: ; %bb |
552 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 691 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
553 ; GCN-NEXT: ds_read_u16 v1, v0 offset:2 | 692 ; GFX900-NEXT: ds_read_u16 v1, v0 offset:2 |
554 ; GCN-NEXT: ds_read_u16_d16_hi v0, v0 | 693 ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 |
555 ; GCN-NEXT: v_mov_b32_e32 v2, 0xffff | 694 ; GFX900-NEXT: s_mov_b32 s4, 0xffff |
556 ; GCN-NEXT: s_waitcnt lgkmcnt(0) | 695 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) |
557 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | 696 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] |
558 ; GCN-NEXT: v_bfi_b32 v0, v2, v1, v0 | 697 ; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 |
559 ; GCN-NEXT: s_setpc_b64 s[30:31] | 698 ; GFX900-NEXT: s_setpc_b64 s[30:31] |
699 ; | |
700 ; FLATSCR-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: | |
701 ; FLATSCR: ; %bb.0: ; %bb | |
702 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
703 ; FLATSCR-NEXT: ds_read_u16 v1, v0 offset:2 | |
704 ; FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0 | |
705 ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff | |
706 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) | |
707 ; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | |
708 ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v1, v0 | |
709 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] | |
560 ; | 710 ; |
561 ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: | 711 ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: |
562 ; GFX10: ; %bb.0: ; %bb | 712 ; GFX10: ; %bb.0: ; %bb |
563 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 713 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
564 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 | 714 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
566 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 | 716 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 |
567 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | 717 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
568 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | 718 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] |
569 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 | 719 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 |
570 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 720 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
721 ; | |
722 ; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: | |
723 ; GFX11: ; %bb.0: ; %bb | |
724 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
725 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
726 ; GFX11-NEXT: ds_load_u16 v1, v0 offset:2 | |
727 ; GFX11-NEXT: ds_load_u16_d16_hi v0, v0 | |
728 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
729 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | |
730 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | |
731 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 | |
732 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
571 bb: | 733 bb: |
572 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 | 734 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 |
573 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo | 735 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo |
574 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 | 736 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 |
575 %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi | 737 %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi |
623 ; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] | 785 ; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] |
624 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 | 786 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 |
625 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) | 787 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
626 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 | 788 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 |
627 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] | 789 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] |
790 ; | |
791 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep: | |
792 ; GFX11: ; %bb.0: ; %bb | |
793 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
794 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
795 ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off | |
796 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
797 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] | |
798 ; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2 | |
799 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
800 ; GFX11-NEXT: v_mov_b32_e32 v0, v1 | |
801 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
628 bb: | 802 bb: |
629 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1 | 803 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1 |
630 %load_lo = load i16, i16 addrspace(5)* %gep_lo | 804 %load_lo = load i16, i16 addrspace(5)* %gep_lo |
631 %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0 | 805 %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0 |
632 %load_hi = load i16, i16 addrspace(5)* %gep_hi | 806 %load_hi = load i16, i16 addrspace(5)* %gep_hi |
635 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 | 809 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 |
636 ret <2 x i16> %result | 810 ret <2 x i16> %result |
637 } | 811 } |
638 | 812 |
639 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) { | 813 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) { |
640 ; GCN-LABEL: chain_hi_to_lo_global_other_dep: | 814 ; GFX900-LABEL: chain_hi_to_lo_global_other_dep: |
641 ; GCN: ; %bb.0: ; %bb | 815 ; GFX900: ; %bb.0: ; %bb |
642 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 816 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
643 ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc | 817 ; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc |
644 ; GCN-NEXT: s_waitcnt vmcnt(0) | 818 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
645 ; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off glc | 819 ; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off glc |
646 ; GCN-NEXT: s_waitcnt vmcnt(0) | 820 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
647 ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff | 821 ; GFX900-NEXT: s_mov_b32 s4, 0xffff |
648 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | 822 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] |
649 ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 | 823 ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 |
650 ; GCN-NEXT: s_setpc_b64 s[30:31] | 824 ; GFX900-NEXT: s_setpc_b64 s[30:31] |
825 ; | |
826 ; FLATSCR-LABEL: chain_hi_to_lo_global_other_dep: | |
827 ; FLATSCR: ; %bb.0: ; %bb | |
828 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
829 ; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc | |
830 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) | |
831 ; FLATSCR-NEXT: global_load_short_d16_hi v0, v[0:1], off glc | |
832 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) | |
833 ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff | |
834 ; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | |
835 ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 | |
836 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] | |
651 ; | 837 ; |
652 ; GFX10-LABEL: chain_hi_to_lo_global_other_dep: | 838 ; GFX10-LABEL: chain_hi_to_lo_global_other_dep: |
653 ; GFX10: ; %bb.0: ; %bb | 839 ; GFX10: ; %bb.0: ; %bb |
654 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 840 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
655 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 | 841 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
658 ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc | 844 ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc |
659 ; GFX10-NEXT: s_waitcnt vmcnt(0) | 845 ; GFX10-NEXT: s_waitcnt vmcnt(0) |
660 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | 846 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] |
661 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 | 847 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 |
662 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 848 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
849 ; | |
850 ; GFX11-LABEL: chain_hi_to_lo_global_other_dep: | |
851 ; GFX11: ; %bb.0: ; %bb | |
852 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
853 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
854 ; GFX11-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc | |
855 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
856 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc | |
857 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
858 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | |
859 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | |
860 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 | |
861 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
663 bb: | 862 bb: |
664 %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1 | 863 %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1 |
665 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo | 864 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo |
666 %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0 | 865 %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0 |
667 %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi | 866 %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi |
670 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 | 869 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 |
671 ret <2 x i16> %result | 870 ret <2 x i16> %result |
672 } | 871 } |
673 | 872 |
674 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) { | 873 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) { |
675 ; GCN-LABEL: chain_hi_to_lo_flat_other_dep: | 874 ; GFX900-LABEL: chain_hi_to_lo_flat_other_dep: |
676 ; GCN: ; %bb.0: ; %bb | 875 ; GFX900: ; %bb.0: ; %bb |
677 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 876 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
678 ; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc | 877 ; GFX900-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc |
679 ; GCN-NEXT: s_waitcnt vmcnt(0) | 878 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
680 ; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] glc | 879 ; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1] glc |
681 ; GCN-NEXT: s_waitcnt vmcnt(0) | 880 ; GFX900-NEXT: s_waitcnt vmcnt(0) |
682 ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff | 881 ; GFX900-NEXT: s_mov_b32 s4, 0xffff |
683 ; GCN-NEXT: s_waitcnt lgkmcnt(0) | 882 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) |
684 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | 883 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] |
685 ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 | 884 ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 |
686 ; GCN-NEXT: s_setpc_b64 s[30:31] | 885 ; GFX900-NEXT: s_setpc_b64 s[30:31] |
886 ; | |
887 ; FLATSCR-LABEL: chain_hi_to_lo_flat_other_dep: | |
888 ; FLATSCR: ; %bb.0: ; %bb | |
889 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
890 ; FLATSCR-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc | |
891 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) | |
892 ; FLATSCR-NEXT: flat_load_short_d16_hi v0, v[0:1] glc | |
893 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) | |
894 ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff | |
895 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) | |
896 ; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | |
897 ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 | |
898 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] | |
687 ; | 899 ; |
688 ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep: | 900 ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep: |
689 ; GFX10: ; %bb.0: ; %bb | 901 ; GFX10: ; %bb.0: ; %bb |
690 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 902 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
691 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 | 903 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
696 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc | 908 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc |
697 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 909 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
698 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | 910 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] |
699 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 | 911 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 |
700 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 912 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
913 ; | |
914 ; GFX11-LABEL: chain_hi_to_lo_flat_other_dep: | |
915 ; GFX11: ; %bb.0: ; %bb | |
916 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
917 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
918 ; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc | |
919 ; GFX11-NEXT: s_waitcnt vmcnt(0) | |
920 ; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc | |
921 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | |
922 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] | |
923 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | |
924 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 | |
925 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
701 bb: | 926 bb: |
702 %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1 | 927 %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1 |
703 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo | 928 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo |
704 %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0 | 929 %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0 |
705 %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi | 930 %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi |
708 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 | 933 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 |
709 ret <2 x i16> %result | 934 ret <2 x i16> %result |
710 } | 935 } |
711 | 936 |
712 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) { | 937 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) { |
713 ; GCN-LABEL: chain_hi_to_lo_group_may_alias_store: | 938 ; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store: |
714 ; GCN: ; %bb.0: ; %bb | 939 ; GFX900: ; %bb.0: ; %bb |
715 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 940 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
716 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7b | 941 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7b |
717 ; GCN-NEXT: ds_read_u16 v2, v0 | 942 ; GFX900-NEXT: ds_read_u16 v2, v0 |
718 ; GCN-NEXT: ds_write_b16 v1, v3 | 943 ; GFX900-NEXT: ds_write_b16 v1, v3 |
719 ; GCN-NEXT: ds_read_u16 v0, v0 offset:2 | 944 ; GFX900-NEXT: ds_read_u16 v0, v0 offset:2 |
720 ; GCN-NEXT: s_waitcnt lgkmcnt(0) | 945 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 |
721 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 | 946 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) |
722 ; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0 | 947 ; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 |
723 ; GCN-NEXT: s_setpc_b64 s[30:31] | 948 ; GFX900-NEXT: s_setpc_b64 s[30:31] |
949 ; | |
950 ; FLATSCR-LABEL: chain_hi_to_lo_group_may_alias_store: | |
951 ; FLATSCR: ; %bb.0: ; %bb | |
952 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
953 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0x7b | |
954 ; FLATSCR-NEXT: ds_read_u16 v2, v0 | |
955 ; FLATSCR-NEXT: ds_write_b16 v1, v3 | |
956 ; FLATSCR-NEXT: ds_read_u16 v0, v0 offset:2 | |
957 ; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 | |
958 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) | |
959 ; FLATSCR-NEXT: v_perm_b32 v0, v2, v0, s0 | |
960 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] | |
724 ; | 961 ; |
725 ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store: | 962 ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store: |
726 ; GFX10: ; %bb.0: ; %bb | 963 ; GFX10: ; %bb.0: ; %bb |
727 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 964 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
728 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 | 965 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
729 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b | 966 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b |
730 ; GFX10-NEXT: ds_read_u16 v3, v0 | 967 ; GFX10-NEXT: ds_read_u16 v3, v0 |
731 ; GFX10-NEXT: ds_write_b16 v1, v2 | 968 ; GFX10-NEXT: ds_write_b16 v1, v2 |
732 ; GFX10-NEXT: ds_read_u16 v0, v0 offset:2 | 969 ; GFX10-NEXT: ds_read_u16 v0, v0 offset:2 |
733 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | 970 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
734 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 | 971 ; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 |
735 ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 | 972 ; GFX10-NEXT: s_setpc_b64 s[30:31] |
736 ; GFX10-NEXT: s_setpc_b64 s[30:31] | 973 ; |
974 ; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store: | |
975 ; GFX11: ; %bb.0: ; %bb | |
976 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |
977 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 | |
978 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x7b | |
979 ; GFX11-NEXT: ds_load_u16 v3, v0 | |
980 ; GFX11-NEXT: ds_store_b16 v1, v2 | |
981 ; GFX11-NEXT: ds_load_u16 v0, v0 offset:2 | |
982 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) | |
983 ; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 | |
984 ; GFX11-NEXT: s_setpc_b64 s[30:31] | |
737 bb: | 985 bb: |
738 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 | 986 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 |
739 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 | 987 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 |
740 %load_hi = load i16, i16 addrspace(3)* %gep_hi | 988 %load_hi = load i16, i16 addrspace(3)* %gep_hi |
741 store i16 123, i16 addrspace(3)* %may.alias | 989 store i16 123, i16 addrspace(3)* %may.alias |