diff llvm/test/CodeGen/AMDGPU/select.f16.ll @ 207:2e18cbf3894f

LLVM12
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Tue, 08 Jun 2021 06:07:14 +0900
parents 0572611fdcc8
children c4bab56944e8
line wrap: on
line diff
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll	Mon May 25 11:55:54 2020 +0900
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll	Tue Jun 08 06:07:14 2021 +0900
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs  | FileCheck %s -check-prefixes=GCN,SI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs  | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
 
 define amdgpu_kernel void @select_f16(
 ; SI-LABEL: select_f16:
@@ -24,19 +24,19 @@
 ; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    s_mov_b32 s14, s2
 ; SI-NEXT:    s_mov_b32 s15, s3
-; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
-; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0
-; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
+; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
@@ -52,28 +52,30 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s6, s2
-; VI-NEXT:    s_mov_b32 s7, s3
-; VI-NEXT:    s_mov_b32 s16, s8
-; VI-NEXT:    s_mov_b32 s17, s9
+; VI-NEXT:    s_mov_b32 s16, s6
+; VI-NEXT:    s_mov_b32 s17, s7
+; VI-NEXT:    s_mov_b32 s19, s3
+; VI-NEXT:    s_mov_b32 s20, s8
+; VI-NEXT:    s_mov_b32 s21, s9
 ; VI-NEXT:    s_mov_b32 s8, s10
 ; VI-NEXT:    s_mov_b32 s9, s11
-; VI-NEXT:    s_mov_b32 s19, s3
+; VI-NEXT:    s_mov_b32 s22, s2
+; VI-NEXT:    s_mov_b32 s23, s3
 ; VI-NEXT:    s_mov_b32 s10, s2
 ; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s14, s2
 ; VI-NEXT:    s_mov_b32 s15, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
-; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
-; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -112,16 +114,16 @@
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
-; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
@@ -137,24 +139,25 @@
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s6, s10
+; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s0, s2
-; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
-; VI-NEXT:    s_mov_b32 s12, s4
-; VI-NEXT:    s_mov_b32 s13, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s6, s10
-; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
-; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
@@ -191,16 +194,16 @@
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
-; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
@@ -216,24 +219,25 @@
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s6, s10
+; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s0, s2
-; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
-; VI-NEXT:    s_mov_b32 s12, s4
-; VI-NEXT:    s_mov_b32 s13, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s6, s10
-; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
-; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
@@ -270,16 +274,16 @@
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
-; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
@@ -295,26 +299,27 @@
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s6, s10
+; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s0, s2
-; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
-; VI-NEXT:    s_mov_b32 s12, s4
-; VI-NEXT:    s_mov_b32 s13, s5
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s6, s10
-; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
-; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
-; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
     half addrspace(1)* %r,
@@ -350,16 +355,16 @@
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
-; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
@@ -375,26 +380,27 @@
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s6, s10
+; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s0, s2
-; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
-; VI-NEXT:    s_mov_b32 s12, s4
-; VI-NEXT:    s_mov_b32 s13, s5
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s6, s10
-; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
-; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
-; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
     half addrspace(1)* %r,
@@ -474,25 +480,25 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_mov_b32 s4, s6
-; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s6, s2
-; VI-NEXT:    s_mov_b32 s7, s3
-; VI-NEXT:    s_mov_b32 s16, s8
-; VI-NEXT:    s_mov_b32 s17, s9
+; VI-NEXT:    s_mov_b32 s16, s6
+; VI-NEXT:    s_mov_b32 s17, s7
+; VI-NEXT:    s_mov_b32 s19, s3
+; VI-NEXT:    s_mov_b32 s20, s8
+; VI-NEXT:    s_mov_b32 s21, s9
 ; VI-NEXT:    s_mov_b32 s8, s10
 ; VI-NEXT:    s_mov_b32 s9, s11
-; VI-NEXT:    s_mov_b32 s19, s3
+; VI-NEXT:    s_mov_b32 s22, s2
+; VI-NEXT:    s_mov_b32 s23, s3
 ; VI-NEXT:    s_mov_b32 s10, s2
 ; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_mov_b32 s14, s2
 ; VI-NEXT:    s_mov_b32 s15, s3
-; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
 ; VI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
 ; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -534,15 +540,15 @@
 ; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s12, s2
+; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s7
-; SI-NEXT:    s_mov_b32 s13, s3
+; SI-NEXT:    s_mov_b32 s18, s10
+; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    s_mov_b32 s18, s10
-; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
 ; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
@@ -580,22 +586,22 @@
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s0
-; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s0, s2
-; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s12, s4
-; VI-NEXT:    s_mov_b32 s13, s5
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
 ; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT:    s_movk_i32 s0, 0x3900
+; VI-NEXT:    s_movk_i32 s2, 0x3900
+; VI-NEXT:    s_mov_b32 s8, s0
+; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
@@ -603,7 +609,7 @@
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s0, v3
+; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -633,15 +639,15 @@
 ; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s12, s2
+; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s16, s4
 ; SI-NEXT:    s_mov_b32 s17, s5
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s7
-; SI-NEXT:    s_mov_b32 s13, s3
+; SI-NEXT:    s_mov_b32 s18, s10
+; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    s_mov_b32 s18, s10
-; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
 ; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
@@ -679,22 +685,22 @@
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s0
-; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s0, s2
-; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s12, s4
-; VI-NEXT:    s_mov_b32 s13, s5
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
 ; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT:    s_movk_i32 s0, 0x3900
+; VI-NEXT:    s_movk_i32 s2, 0x3900
+; VI-NEXT:    s_mov_b32 s8, s0
+; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
@@ -702,7 +708,7 @@
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s0, v3
+; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -731,38 +737,39 @@
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s16, s4
-; SI-NEXT:    s_mov_b32 s17, s5
 ; SI-NEXT:    s_mov_b32 s12, s2
 ; SI-NEXT:    s_mov_b32 s13, s3
+; SI-NEXT:    s_mov_b32 s16, s4
+; SI-NEXT:    s_mov_b32 s17, s5
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
-; SI-NEXT:    buffer_load_dword v3, off, s[16:19], 0
-; SI-NEXT:    v_mov_b32_e32 v2, 0x3f200000
+; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
-; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
-; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v3
+; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
+; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v2, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -777,32 +784,33 @@
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s0
-; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s0, s2
-; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s12, s4
-; VI-NEXT:    s_mov_b32 s13, s5
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
-; VI-NEXT:    buffer_load_dword v4, off, s[12:15], 0
-; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
-; VI-NEXT:    v_mov_b32_e32 v3, 0x3900
+; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
+; VI-NEXT:    s_mov_b32 s8, s0
+; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
@@ -830,41 +838,41 @@
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s16, s4
-; SI-NEXT:    s_mov_b32 s17, s5
 ; SI-NEXT:    s_mov_b32 s12, s2
 ; SI-NEXT:    s_mov_b32 s13, s3
+; SI-NEXT:    s_mov_b32 s16, s4
+; SI-NEXT:    s_mov_b32 s17, s5
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
-; SI-NEXT:    s_mov_b32 s4, s6
-; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
-; SI-NEXT:    buffer_load_dword v3, off, s[16:19], 0
-; SI-NEXT:    v_mov_b32_e32 v2, 0x3f200000
+; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
-; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
-; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v1, vcc
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
@@ -877,32 +885,33 @@
 ; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s0
-; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s0, s2
-; VI-NEXT:    s_mov_b32 s1, s3
-; VI-NEXT:    s_mov_b32 s12, s4
-; VI-NEXT:    s_mov_b32 s13, s5
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
-; VI-NEXT:    buffer_load_dword v4, off, s[12:15], 0
-; VI-NEXT:    v_mov_b32_e32 v2, 0x3800
-; VI-NEXT:    v_mov_b32_e32 v3, 0x3900
+; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
+; VI-NEXT:    s_mov_b32 s8, s0
+; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0