comparison llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @ 236:c4bab56944e8 llvm-original

LLVM 16
author kono
date Wed, 09 Nov 2022 17:45:10 +0900
parents 5f17cb93ff66
children 1f2b6ac9f198
comparison
equal deleted inserted replaced
232:70dce7da266c 236:c4bab56944e8
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s 3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s 4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s
6 8
7 define <2 x half> @chain_hi_to_lo_private() { 9 define <2 x half> @chain_hi_to_lo_private() {
8 ; GFX900-LABEL: chain_hi_to_lo_private: 10 ; GFX900-LABEL: chain_hi_to_lo_private:
9 ; GFX900: ; %bb.0: ; %bb 11 ; GFX900: ; %bb.0: ; %bb
10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 47 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0
46 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 48 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
47 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 49 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0
48 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 50 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
49 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] 51 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
52 ;
53 ; GFX11-LABEL: chain_hi_to_lo_private:
54 ; GFX11: ; %bb.0: ; %bb
55 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
57 ; GFX11-NEXT: s_mov_b32 s0, 2
58 ; GFX11-NEXT: scratch_load_u16 v0, off, s0
59 ; GFX11-NEXT: s_mov_b32 s0, 0
60 ; GFX11-NEXT: s_waitcnt vmcnt(0)
61 ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0
62 ; GFX11-NEXT: s_waitcnt vmcnt(0)
63 ; GFX11-NEXT: s_setpc_b64 s[30:31]
50 bb: 64 bb:
51 %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 65 %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
52 %load_lo = load half, half addrspace(5)* %gep_lo 66 %load_lo = load half, half addrspace(5)* %gep_lo
53 %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0 67 %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0
54 %load_hi = load half, half addrspace(5)* %gep_hi 68 %load_hi = load half, half addrspace(5)* %gep_hi
95 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off 109 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off
96 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 110 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
97 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off 111 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off
98 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 112 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
99 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] 113 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
114 ;
115 ; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
116 ; GFX11: ; %bb.0: ; %bb
117 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
119 ; GFX11-NEXT: scratch_load_u16 v0, v0, off
120 ; GFX11-NEXT: s_waitcnt vmcnt(0)
121 ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off
122 ; GFX11-NEXT: s_waitcnt vmcnt(0)
123 ; GFX11-NEXT: s_setpc_b64 s[30:31]
100 bb: 124 bb:
101 %load_lo = load half, half addrspace(5)* %base_lo 125 %load_lo = load half, half addrspace(5)* %base_lo
102 %load_hi = load half, half addrspace(5)* %base_hi 126 %load_hi = load half, half addrspace(5)* %base_hi
103 127
104 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 128 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
143 ; FLATSCR_GFX10-NEXT: v_add_f16_e32 v1, 1.0, v1 167 ; FLATSCR_GFX10-NEXT: v_add_f16_e32 v1, 1.0, v1
144 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off 168 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off
145 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 169 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
146 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 170 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
147 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] 171 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
172 ;
173 ; GFX11-LABEL: chain_hi_to_lo_arithmatic:
174 ; GFX11: ; %bb.0: ; %bb
175 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
176 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
177 ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
178 ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
179 ; GFX11-NEXT: s_waitcnt vmcnt(0)
180 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
181 ; GFX11-NEXT: s_setpc_b64 s[30:31]
148 bb: 182 bb:
149 %arith_lo = fadd half %in, 1.0 183 %arith_lo = fadd half %in, 1.0
150 %load_hi = load half, half addrspace(5)* %base 184 %load_hi = load half, half addrspace(5)* %base
151 185
152 %temp = insertelement <2 x half> undef, half %arith_lo, i32 0 186 %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
174 ; GFX10-NEXT: ds_read_u16 v0, v1 offset:2 208 ; GFX10-NEXT: ds_read_u16 v0, v1 offset:2
175 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 209 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
176 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 210 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1
177 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 211 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
178 ; GFX10-NEXT: s_setpc_b64 s[30:31] 212 ; GFX10-NEXT: s_setpc_b64 s[30:31]
213 ;
214 ; GFX11-LABEL: chain_hi_to_lo_group:
215 ; GFX11: ; %bb.0: ; %bb
216 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
218 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
219 ; GFX11-NEXT: ds_load_u16 v0, v1 offset:2
220 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
221 ; GFX11-NEXT: ds_load_u16_d16_hi v0, v1
222 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
223 ; GFX11-NEXT: s_setpc_b64 s[30:31]
179 bb: 224 bb:
180 %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1 225 %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
181 %load_lo = load half, half addrspace(3)* %gep_lo 226 %load_lo = load half, half addrspace(3)* %gep_lo
182 %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0 227 %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0
183 %load_hi = load half, half addrspace(3)* %gep_hi 228 %load_hi = load half, half addrspace(3)* %gep_hi
205 ; GFX10-NEXT: ds_read_u16 v0, v0 250 ; GFX10-NEXT: ds_read_u16 v0, v0
206 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 251 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 252 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1
208 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 253 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
209 ; GFX10-NEXT: s_setpc_b64 s[30:31] 254 ; GFX10-NEXT: s_setpc_b64 s[30:31]
255 ;
256 ; GFX11-LABEL: chain_hi_to_lo_group_different_bases:
257 ; GFX11: ; %bb.0: ; %bb
258 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
260 ; GFX11-NEXT: ds_load_u16 v0, v0
261 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
262 ; GFX11-NEXT: ds_load_u16_d16_hi v0, v1
263 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
264 ; GFX11-NEXT: s_setpc_b64 s[30:31]
210 bb: 265 bb:
211 %load_lo = load half, half addrspace(3)* %base_lo 266 %load_lo = load half, half addrspace(3)* %base_lo
212 %load_hi = load half, half addrspace(3)* %base_hi 267 %load_hi = load half, half addrspace(3)* %base_hi
213 268
214 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 269 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
222 ; GCN: ; %bb.0: ; %bb 277 ; GCN: ; %bb.0: ; %bb
223 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 278 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224 ; GCN-NEXT: v_mov_b32_e32 v0, 2 279 ; GCN-NEXT: v_mov_b32_e32 v0, 2
225 ; GCN-NEXT: v_mov_b32_e32 v1, 0 280 ; GCN-NEXT: v_mov_b32_e32 v1, 0
226 ; GCN-NEXT: global_load_ushort v0, v[0:1], off 281 ; GCN-NEXT: global_load_ushort v0, v[0:1], off
282 ; GCN-NEXT: v_mov_b32_e32 v1, 0
227 ; GCN-NEXT: v_mov_b32_e32 v2, 0 283 ; GCN-NEXT: v_mov_b32_e32 v2, 0
228 ; GCN-NEXT: v_mov_b32_e32 v3, 0
229 ; GCN-NEXT: s_waitcnt vmcnt(0) 284 ; GCN-NEXT: s_waitcnt vmcnt(0)
230 ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off 285 ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off
231 ; GCN-NEXT: s_waitcnt vmcnt(0) 286 ; GCN-NEXT: s_waitcnt vmcnt(0)
232 ; GCN-NEXT: s_setpc_b64 s[30:31] 287 ; GCN-NEXT: s_setpc_b64 s[30:31]
233 ; 288 ;
234 ; GFX10-LABEL: chain_hi_to_lo_global: 289 ; GFX10-LABEL: chain_hi_to_lo_global:
235 ; GFX10: ; %bb.0: ; %bb 290 ; GFX10: ; %bb.0: ; %bb
242 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 297 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
243 ; GFX10-NEXT: s_waitcnt vmcnt(0) 298 ; GFX10-NEXT: s_waitcnt vmcnt(0)
244 ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off 299 ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off
245 ; GFX10-NEXT: s_waitcnt vmcnt(0) 300 ; GFX10-NEXT: s_waitcnt vmcnt(0)
246 ; GFX10-NEXT: s_setpc_b64 s[30:31] 301 ; GFX10-NEXT: s_setpc_b64 s[30:31]
302 ;
303 ; GFX11-LABEL: chain_hi_to_lo_global:
304 ; GFX11: ; %bb.0: ; %bb
305 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
306 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
307 ; GFX11-NEXT: v_mov_b32_e32 v0, 2
308 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
309 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
310 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
311 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
312 ; GFX11-NEXT: s_waitcnt vmcnt(0)
313 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
314 ; GFX11-NEXT: s_waitcnt vmcnt(0)
315 ; GFX11-NEXT: s_setpc_b64 s[30:31]
247 bb: 316 bb:
248 %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1 317 %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
249 %load_lo = load half, half addrspace(1)* %gep_lo 318 %load_lo = load half, half addrspace(1)* %gep_lo
250 %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0 319 %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0
251 %load_hi = load half, half addrspace(1)* %gep_hi 320 %load_hi = load half, half addrspace(1)* %gep_hi
273 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off 342 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
274 ; GFX10-NEXT: s_waitcnt vmcnt(0) 343 ; GFX10-NEXT: s_waitcnt vmcnt(0)
275 ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off 344 ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off
276 ; GFX10-NEXT: s_waitcnt vmcnt(0) 345 ; GFX10-NEXT: s_waitcnt vmcnt(0)
277 ; GFX10-NEXT: s_setpc_b64 s[30:31] 346 ; GFX10-NEXT: s_setpc_b64 s[30:31]
347 ;
348 ; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
349 ; GFX11: ; %bb.0: ; %bb
350 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
352 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
353 ; GFX11-NEXT: s_waitcnt vmcnt(0)
354 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
355 ; GFX11-NEXT: s_waitcnt vmcnt(0)
356 ; GFX11-NEXT: s_setpc_b64 s[30:31]
278 bb: 357 bb:
279 %load_lo = load half, half addrspace(1)* %base_lo 358 %load_lo = load half, half addrspace(1)* %base_lo
280 %load_hi = load half, half addrspace(1)* %base_hi 359 %load_hi = load half, half addrspace(1)* %base_hi
281 360
282 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 361 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
290 ; GCN: ; %bb.0: ; %bb 369 ; GCN: ; %bb.0: ; %bb
291 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 370 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292 ; GCN-NEXT: v_mov_b32_e32 v0, 2 371 ; GCN-NEXT: v_mov_b32_e32 v0, 2
293 ; GCN-NEXT: v_mov_b32_e32 v1, 0 372 ; GCN-NEXT: v_mov_b32_e32 v1, 0
294 ; GCN-NEXT: flat_load_ushort v0, v[0:1] 373 ; GCN-NEXT: flat_load_ushort v0, v[0:1]
374 ; GCN-NEXT: v_mov_b32_e32 v1, 0
295 ; GCN-NEXT: v_mov_b32_e32 v2, 0 375 ; GCN-NEXT: v_mov_b32_e32 v2, 0
296 ; GCN-NEXT: v_mov_b32_e32 v3, 0
297 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 376 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
298 ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3] 377 ; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2]
299 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 378 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
300 ; GCN-NEXT: s_setpc_b64 s[30:31] 379 ; GCN-NEXT: s_setpc_b64 s[30:31]
301 ; 380 ;
302 ; GFX10-LABEL: chain_hi_to_lo_flat: 381 ; GFX10-LABEL: chain_hi_to_lo_flat:
303 ; GFX10: ; %bb.0: ; %bb 382 ; GFX10: ; %bb.0: ; %bb
310 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 389 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
311 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 390 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
312 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2] 391 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2]
313 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 392 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
314 ; GFX10-NEXT: s_setpc_b64 s[30:31] 393 ; GFX10-NEXT: s_setpc_b64 s[30:31]
394 ;
395 ; GFX11-LABEL: chain_hi_to_lo_flat:
396 ; GFX11: ; %bb.0: ; %bb
397 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
398 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
399 ; GFX11-NEXT: v_mov_b32_e32 v0, 2
400 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
401 ; GFX11-NEXT: flat_load_u16 v0, v[0:1]
402 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
403 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
404 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
405 ; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
406 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
407 ; GFX11-NEXT: s_setpc_b64 s[30:31]
315 bb: 408 bb:
316 %gep_lo = getelementptr inbounds half, half* null, i64 1 409 %gep_lo = getelementptr inbounds half, half* null, i64 1
317 %load_lo = load half, half* %gep_lo 410 %load_lo = load half, half* %gep_lo
318 %gep_hi = getelementptr inbounds half, half* null, i64 0 411 %gep_hi = getelementptr inbounds half, half* null, i64 0
319 %load_hi = load half, half* %gep_hi 412 %load_hi = load half, half* %gep_hi
341 ; GFX10-NEXT: flat_load_ushort v0, v[0:1] 434 ; GFX10-NEXT: flat_load_ushort v0, v[0:1]
342 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 435 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
343 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3] 436 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3]
344 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 437 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
345 ; GFX10-NEXT: s_setpc_b64 s[30:31] 438 ; GFX10-NEXT: s_setpc_b64 s[30:31]
439 ;
440 ; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
441 ; GFX11: ; %bb.0: ; %bb
442 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
444 ; GFX11-NEXT: flat_load_u16 v0, v[0:1]
445 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
446 ; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
447 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
448 ; GFX11-NEXT: s_setpc_b64 s[30:31]
346 bb: 449 bb:
347 %load_lo = load half, half* %base_lo 450 %load_lo = load half, half* %base_lo
348 %load_hi = load half, half* %base_hi 451 %load_hi = load half, half* %base_hi
349 452
350 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 453 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
369 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 472 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
370 ; GFX900-NEXT: s_waitcnt vmcnt(0) 473 ; GFX900-NEXT: s_waitcnt vmcnt(0)
371 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6 474 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
372 ; GFX900-NEXT: s_waitcnt vmcnt(0) 475 ; GFX900-NEXT: s_waitcnt vmcnt(0)
373 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 476 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
477 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
374 ; GFX900-NEXT: s_waitcnt vmcnt(0) 478 ; GFX900-NEXT: s_waitcnt vmcnt(0)
375 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 479 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
376 ; GFX900-NEXT: s_waitcnt vmcnt(0) 480 ; GFX900-NEXT: s_waitcnt vmcnt(0)
377 ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 481 ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
378 ; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 482 ; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
379 ; GFX900-NEXT: s_waitcnt vmcnt(1) 483 ; GFX900-NEXT: s_waitcnt vmcnt(1)
380 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 484 ; GFX900-NEXT: v_mov_b32_e32 v1, v0
381 ; GFX900-NEXT: s_waitcnt vmcnt(0)
382 ; GFX900-NEXT: v_mov_b32_e32 v1, v3
383 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 485 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
384 ; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0 486 ; GFX900-NEXT: s_waitcnt vmcnt(1)
487 ; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
385 ; GFX900-NEXT: s_waitcnt vmcnt(0) 488 ; GFX900-NEXT: s_waitcnt vmcnt(0)
386 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 489 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
387 ; GFX900-NEXT: s_endpgm 490 ; GFX900-NEXT: s_endpgm
388 ; 491 ;
389 ; FLATSCR-LABEL: vload2_private: 492 ; FLATSCR-LABEL: vload2_private:
439 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 542 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
440 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 543 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
441 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) 544 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1)
442 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 545 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0
443 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 546 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
444 ; GFX10_DEFAULT-NEXT: v_and_b32_e32 v3, 0xffff, v3 547 ; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
445 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 548 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
446 ; GFX10_DEFAULT-NEXT: v_lshl_or_b32 v0, v0, 16, v3
447 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 549 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
448 ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 550 ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
449 ; GFX10_DEFAULT-NEXT: s_endpgm 551 ; GFX10_DEFAULT-NEXT: s_endpgm
450 ; 552 ;
451 ; FLATSCR_GFX10-LABEL: vload2_private: 553 ; FLATSCR_GFX10-LABEL: vload2_private:
481 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 583 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0
482 ; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, vcc_lo offset:6 584 ; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, vcc_lo offset:6
483 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 585 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
484 ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 586 ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
485 ; FLATSCR_GFX10-NEXT: s_endpgm 587 ; FLATSCR_GFX10-NEXT: s_endpgm
588 ;
589 ; GFX11-LABEL: vload2_private:
590 ; GFX11: ; %bb.0: ; %entry
591 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
592 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
593 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
594 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
595 ; GFX11-NEXT: s_waitcnt vmcnt(0)
596 ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
597 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
598 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
599 ; GFX11-NEXT: s_waitcnt vmcnt(0)
600 ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:6 dlc
601 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
602 ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
603 ; GFX11-NEXT: s_waitcnt vmcnt(0)
604 ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:8 dlc
605 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
606 ; GFX11-NEXT: s_clause 0x1
607 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4
608 ; GFX11-NEXT: scratch_load_b32 v1, off, off offset:6
609 ; GFX11-NEXT: s_waitcnt vmcnt(0)
610 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
611 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
612 ; GFX11-NEXT: s_endpgm
486 entry: 613 entry:
487 %loc = alloca [3 x i16], align 2, addrspace(5) 614 %loc = alloca [3 x i16], align 2, addrspace(5)
488 %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* 615 %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
489 %tmp = load i16, i16 addrspace(1)* %in, align 2 616 %tmp = load i16, i16 addrspace(1)* %in, align 2
490 %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0 617 %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
532 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] 659 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
533 ; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2 660 ; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2
534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 661 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
535 ; GFX10-NEXT: v_mov_b32_e32 v0, v1 662 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
536 ; GFX10-NEXT: s_setpc_b64 s[30:31] 663 ; GFX10-NEXT: s_setpc_b64 s[30:31]
664 ;
665 ; GFX11-LABEL: chain_hi_to_lo_group_other_dep:
666 ; GFX11: ; %bb.0: ; %bb
667 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
668 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
669 ; GFX11-NEXT: ds_load_u16_d16_hi v1, v0
670 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
671 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
672 ; GFX11-NEXT: ds_load_u16_d16 v1, v0 offset:2
673 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
674 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
675 ; GFX11-NEXT: s_setpc_b64 s[30:31]
537 bb: 676 bb:
538 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 677 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
539 %load_lo = load i16, i16 addrspace(3)* %gep_lo 678 %load_lo = load i16, i16 addrspace(3)* %gep_lo
540 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 679 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
541 %load_hi = load i16, i16 addrspace(3)* %gep_hi 680 %load_hi = load i16, i16 addrspace(3)* %gep_hi
545 ret <2 x i16> %result 684 ret <2 x i16> %result
546 } 685 }
547 686
548 ; The volatile operations aren't put on the same chain 687 ; The volatile operations aren't put on the same chain
549 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) { 688 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
550 ; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: 689 ; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
551 ; GCN: ; %bb.0: ; %bb 690 ; GFX900: ; %bb.0: ; %bb
552 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 691 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
553 ; GCN-NEXT: ds_read_u16 v1, v0 offset:2 692 ; GFX900-NEXT: ds_read_u16 v1, v0 offset:2
554 ; GCN-NEXT: ds_read_u16_d16_hi v0, v0 693 ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
555 ; GCN-NEXT: v_mov_b32_e32 v2, 0xffff 694 ; GFX900-NEXT: s_mov_b32 s4, 0xffff
556 ; GCN-NEXT: s_waitcnt lgkmcnt(0) 695 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
557 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] 696 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
558 ; GCN-NEXT: v_bfi_b32 v0, v2, v1, v0 697 ; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0
559 ; GCN-NEXT: s_setpc_b64 s[30:31] 698 ; GFX900-NEXT: s_setpc_b64 s[30:31]
699 ;
700 ; FLATSCR-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
701 ; FLATSCR: ; %bb.0: ; %bb
702 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703 ; FLATSCR-NEXT: ds_read_u16 v1, v0 offset:2
704 ; FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0
705 ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff
706 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
707 ; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
708 ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v1, v0
709 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
560 ; 710 ;
561 ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: 711 ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
562 ; GFX10: ; %bb.0: ; %bb 712 ; GFX10: ; %bb.0: ; %bb
563 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 713 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
564 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 714 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
566 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 716 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0
567 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 717 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] 718 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
569 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 719 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
570 ; GFX10-NEXT: s_setpc_b64 s[30:31] 720 ; GFX10-NEXT: s_setpc_b64 s[30:31]
721 ;
722 ; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
723 ; GFX11: ; %bb.0: ; %bb
724 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
725 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
726 ; GFX11-NEXT: ds_load_u16 v1, v0 offset:2
727 ; GFX11-NEXT: ds_load_u16_d16_hi v0, v0
728 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
729 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
730 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
731 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
732 ; GFX11-NEXT: s_setpc_b64 s[30:31]
571 bb: 733 bb:
572 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 734 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
573 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo 735 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
574 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 736 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
575 %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi 737 %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
623 ; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] 785 ; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
624 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 786 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2
625 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 787 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
626 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 788 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
627 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] 789 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
790 ;
791 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
792 ; GFX11: ; %bb.0: ; %bb
793 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
794 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
795 ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
796 ; GFX11-NEXT: s_waitcnt vmcnt(0)
797 ; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
798 ; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2
799 ; GFX11-NEXT: s_waitcnt vmcnt(0)
800 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
801 ; GFX11-NEXT: s_setpc_b64 s[30:31]
628 bb: 802 bb:
629 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1 803 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
630 %load_lo = load i16, i16 addrspace(5)* %gep_lo 804 %load_lo = load i16, i16 addrspace(5)* %gep_lo
631 %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0 805 %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
632 %load_hi = load i16, i16 addrspace(5)* %gep_hi 806 %load_hi = load i16, i16 addrspace(5)* %gep_hi
635 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 809 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
636 ret <2 x i16> %result 810 ret <2 x i16> %result
637 } 811 }
638 812
639 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) { 813 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
640 ; GCN-LABEL: chain_hi_to_lo_global_other_dep: 814 ; GFX900-LABEL: chain_hi_to_lo_global_other_dep:
641 ; GCN: ; %bb.0: ; %bb 815 ; GFX900: ; %bb.0: ; %bb
642 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 816 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
643 ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc 817 ; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc
644 ; GCN-NEXT: s_waitcnt vmcnt(0) 818 ; GFX900-NEXT: s_waitcnt vmcnt(0)
645 ; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off glc 819 ; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off glc
646 ; GCN-NEXT: s_waitcnt vmcnt(0) 820 ; GFX900-NEXT: s_waitcnt vmcnt(0)
647 ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff 821 ; GFX900-NEXT: s_mov_b32 s4, 0xffff
648 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] 822 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
649 ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 823 ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0
650 ; GCN-NEXT: s_setpc_b64 s[30:31] 824 ; GFX900-NEXT: s_setpc_b64 s[30:31]
825 ;
826 ; FLATSCR-LABEL: chain_hi_to_lo_global_other_dep:
827 ; FLATSCR: ; %bb.0: ; %bb
828 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
829 ; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc
830 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
831 ; FLATSCR-NEXT: global_load_short_d16_hi v0, v[0:1], off glc
832 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
833 ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff
834 ; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
835 ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0
836 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
651 ; 837 ;
652 ; GFX10-LABEL: chain_hi_to_lo_global_other_dep: 838 ; GFX10-LABEL: chain_hi_to_lo_global_other_dep:
653 ; GFX10: ; %bb.0: ; %bb 839 ; GFX10: ; %bb.0: ; %bb
654 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 840 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
655 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 841 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
658 ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc 844 ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc
659 ; GFX10-NEXT: s_waitcnt vmcnt(0) 845 ; GFX10-NEXT: s_waitcnt vmcnt(0)
660 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] 846 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
661 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 847 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
662 ; GFX10-NEXT: s_setpc_b64 s[30:31] 848 ; GFX10-NEXT: s_setpc_b64 s[30:31]
849 ;
850 ; GFX11-LABEL: chain_hi_to_lo_global_other_dep:
851 ; GFX11: ; %bb.0: ; %bb
852 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
853 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
854 ; GFX11-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
855 ; GFX11-NEXT: s_waitcnt vmcnt(0)
856 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
857 ; GFX11-NEXT: s_waitcnt vmcnt(0)
858 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
859 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
860 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
861 ; GFX11-NEXT: s_setpc_b64 s[30:31]
663 bb: 862 bb:
664 %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1 863 %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
665 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo 864 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
666 %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0 865 %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
667 %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi 866 %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
670 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 869 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
671 ret <2 x i16> %result 870 ret <2 x i16> %result
672 } 871 }
673 872
674 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) { 873 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
675 ; GCN-LABEL: chain_hi_to_lo_flat_other_dep: 874 ; GFX900-LABEL: chain_hi_to_lo_flat_other_dep:
676 ; GCN: ; %bb.0: ; %bb 875 ; GFX900: ; %bb.0: ; %bb
677 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 876 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
678 ; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc 877 ; GFX900-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc
679 ; GCN-NEXT: s_waitcnt vmcnt(0) 878 ; GFX900-NEXT: s_waitcnt vmcnt(0)
680 ; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] glc 879 ; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1] glc
681 ; GCN-NEXT: s_waitcnt vmcnt(0) 880 ; GFX900-NEXT: s_waitcnt vmcnt(0)
682 ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff 881 ; GFX900-NEXT: s_mov_b32 s4, 0xffff
683 ; GCN-NEXT: s_waitcnt lgkmcnt(0) 882 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
684 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] 883 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
685 ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 884 ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0
686 ; GCN-NEXT: s_setpc_b64 s[30:31] 885 ; GFX900-NEXT: s_setpc_b64 s[30:31]
886 ;
887 ; FLATSCR-LABEL: chain_hi_to_lo_flat_other_dep:
888 ; FLATSCR: ; %bb.0: ; %bb
889 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
890 ; FLATSCR-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc
891 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
892 ; FLATSCR-NEXT: flat_load_short_d16_hi v0, v[0:1] glc
893 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
894 ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff
895 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
896 ; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
897 ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0
898 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
687 ; 899 ;
688 ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep: 900 ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep:
689 ; GFX10: ; %bb.0: ; %bb 901 ; GFX10: ; %bb.0: ; %bb
690 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 902 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
691 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 903 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
696 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc 908 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc
697 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 909 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
698 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] 910 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
699 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 911 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
700 ; GFX10-NEXT: s_setpc_b64 s[30:31] 912 ; GFX10-NEXT: s_setpc_b64 s[30:31]
913 ;
914 ; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
915 ; GFX11: ; %bb.0: ; %bb
916 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
917 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
918 ; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
919 ; GFX11-NEXT: s_waitcnt vmcnt(0)
920 ; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
921 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
922 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
923 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
924 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
925 ; GFX11-NEXT: s_setpc_b64 s[30:31]
701 bb: 926 bb:
702 %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1 927 %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
703 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo 928 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
704 %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0 929 %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
705 %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi 930 %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
708 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 933 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
709 ret <2 x i16> %result 934 ret <2 x i16> %result
710 } 935 }
711 936
712 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) { 937 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
713 ; GCN-LABEL: chain_hi_to_lo_group_may_alias_store: 938 ; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store:
714 ; GCN: ; %bb.0: ; %bb 939 ; GFX900: ; %bb.0: ; %bb
715 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 940 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7b 941 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7b
717 ; GCN-NEXT: ds_read_u16 v2, v0 942 ; GFX900-NEXT: ds_read_u16 v2, v0
718 ; GCN-NEXT: ds_write_b16 v1, v3 943 ; GFX900-NEXT: ds_write_b16 v1, v3
719 ; GCN-NEXT: ds_read_u16 v0, v0 offset:2 944 ; GFX900-NEXT: ds_read_u16 v0, v0 offset:2
720 ; GCN-NEXT: s_waitcnt lgkmcnt(0) 945 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
721 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 946 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
722 ; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0 947 ; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4
723 ; GCN-NEXT: s_setpc_b64 s[30:31] 948 ; GFX900-NEXT: s_setpc_b64 s[30:31]
949 ;
950 ; FLATSCR-LABEL: chain_hi_to_lo_group_may_alias_store:
951 ; FLATSCR: ; %bb.0: ; %bb
952 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
953 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0x7b
954 ; FLATSCR-NEXT: ds_read_u16 v2, v0
955 ; FLATSCR-NEXT: ds_write_b16 v1, v3
956 ; FLATSCR-NEXT: ds_read_u16 v0, v0 offset:2
957 ; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100
958 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
959 ; FLATSCR-NEXT: v_perm_b32 v0, v2, v0, s0
960 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
724 ; 961 ;
725 ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store: 962 ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store:
726 ; GFX10: ; %bb.0: ; %bb 963 ; GFX10: ; %bb.0: ; %bb
727 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 964 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 965 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
729 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b 966 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
730 ; GFX10-NEXT: ds_read_u16 v3, v0 967 ; GFX10-NEXT: ds_read_u16 v3, v0
731 ; GFX10-NEXT: ds_write_b16 v1, v2 968 ; GFX10-NEXT: ds_write_b16 v1, v2
732 ; GFX10-NEXT: ds_read_u16 v0, v0 offset:2 969 ; GFX10-NEXT: ds_read_u16 v0, v0 offset:2
733 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 970 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
734 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 971 ; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
735 ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 972 ; GFX10-NEXT: s_setpc_b64 s[30:31]
736 ; GFX10-NEXT: s_setpc_b64 s[30:31] 973 ;
974 ; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store:
975 ; GFX11: ; %bb.0: ; %bb
976 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
977 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
978 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x7b
979 ; GFX11-NEXT: ds_load_u16 v3, v0
980 ; GFX11-NEXT: ds_store_b16 v1, v2
981 ; GFX11-NEXT: ds_load_u16 v0, v0 offset:2
982 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
983 ; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
984 ; GFX11-NEXT: s_setpc_b64 s[30:31]
737 bb: 985 bb:
738 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 986 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
739 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 987 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
740 %load_hi = load i16, i16 addrspace(3)* %gep_hi 988 %load_hi = load i16, i16 addrspace(3)* %gep_hi
741 store i16 123, i16 addrspace(3)* %may.alias 989 store i16 123, i16 addrspace(3)* %may.alias