Mercurial > hg > CbC > CbC_llvm
diff llvm/test/CodeGen/AMDGPU/select.f16.ll @ 207:2e18cbf3894f
LLVM12
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 08 Jun 2021 06:07:14 +0900 |
parents | 0572611fdcc8 |
children | c4bab56944e8 |
line wrap: on
line diff
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll Mon May 25 11:55:54 2020 +0900 +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll Tue Jun 08 06:07:14 2021 +0900 @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI -; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: @@ -24,19 +24,19 @@ ; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 -; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc @@ -52,28 +52,30 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 +; VI-NEXT: s_mov_b32 s16, s6 +; VI-NEXT: s_mov_b32 s17, s7 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s20, s8 +; VI-NEXT: s_mov_b32 s21, s9 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 -; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -112,16 +114,16 @@ ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc @@ -137,24 +139,25 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -191,16 +194,16 @@ ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc @@ -216,24 +219,25 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -270,16 +274,16 @@ ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc @@ -295,26 +299,27 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -350,16 +355,16 @@ ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc @@ -375,26 +380,27 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -474,25 +480,25 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 +; VI-NEXT: s_mov_b32 s16, s6 +; VI-NEXT: s_mov_b32 s17, s7 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s20, s8 +; VI-NEXT: s_mov_b32 s21, s9 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -534,15 +540,15 @@ ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 @@ -580,22 +586,22 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; VI-NEXT: s_movk_i32 s0, 0x3900 +; VI-NEXT: s_movk_i32 s2, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 @@ -603,7 +609,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, s0, v3 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -633,15 +639,15 @@ ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 @@ -679,22 +685,22 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; VI-NEXT: s_movk_i32 s0, 0x3900 +; VI-NEXT: s_movk_i32 s2, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 @@ -702,7 +708,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_cmp_gt_f16_e32 vcc, s0, v3 +; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -731,38 +737,39 @@ ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s4 -; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0 -; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5 -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -777,32 +784,33 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 -; VI-NEXT: v_mov_b32_e32 v3, 0x3900 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: v_mov_b32_e32 v4, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v4 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 @@ -830,41 +838,41 @@ ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s4 -; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0 -; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 -; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -877,32 +885,33 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 -; VI-NEXT: v_mov_b32_e32 v3, 0x3900 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: v_mov_b32_e32 v4, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v4 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0