diff llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @ 236:c4bab56944e8 llvm-original

LLVM 16
author kono
date Wed, 09 Nov 2022 17:45:10 +0900
parents 5f17cb93ff66
children 1f2b6ac9f198
line wrap: on
line diff
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll	Wed Jul 21 10:27:27 2021 +0900
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll	Wed Nov 09 17:45:10 2022 +0900
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck -check-prefix=GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
 
 define i32 @s_add_co_select_user() {
 ; GFX7-LABEL: s_add_co_select_user:
@@ -13,11 +14,13 @@
 ; GFX7-NEXT:    v_add_i32_e64 v0, s[4:5], s6, s6
 ; GFX7-NEXT:    s_or_b32 s4, s4, s5
 ; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX7-NEXT:    s_addc_u32 s4, s6, 0
+; GFX7-NEXT:    s_addc_u32 s7, s6, 0
+; GFX7-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT:    s_cselect_b32 s4, s7, 0
+; GFX7-NEXT:    s_cmp_gt_u32 s6, 31
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    s_cselect_b64 vcc, 1, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX7-NEXT:    v_cmp_gt_u32_e64 vcc, s6, 31
+; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -29,11 +32,12 @@
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], s6, s6
 ; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s4, s6, 0
-; GFX9-NEXT:    s_cselect_b64 vcc, 1, 0
+; GFX9-NEXT:    s_addc_u32 s7, s6, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s7, 0
+; GFX9-NEXT:    s_cmp_gt_u32 s6, 31
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    s_cmp_gt_u32 s6, 31
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -48,12 +52,32 @@
 ; GFX10-NEXT:    v_add_co_u32 v0, s5, s4, s4
 ; GFX10-NEXT:    s_cmpk_lg_u32 s5, 0x0
 ; GFX10-NEXT:    s_addc_u32 s5, s4, 0
-; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX10-NEXT:    s_cmp_gt_u32 s4, 31
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, s5, s6
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_add_co_select_user:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, s1, s0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmpk_lg_u32 s1, 0x0
+; GFX11-NEXT:    s_addc_u32 s1, s0, 0
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX11-NEXT:    s_cmp_gt_u32 s0, 31
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v0, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = load volatile i32, i32 addrspace(4)* null, align 8
   %i1 = add i32 %i, %i
@@ -71,25 +95,24 @@
 define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_i32 s1, s0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v0
-; GFX7-NEXT:    s_or_b32 s1, vcc_lo, vcc_hi
-; GFX7-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_addc_u32 s0, s0, 0
+; GFX7-NEXT:    s_add_i32 s0, s2, s2
+; GFX7-NEXT:    s_cmp_lt_u32 s0, s2
+; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_addc_u32 s0, s2, 0
 ; GFX7-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v0
-; GFX7-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX7-NEXT:    s_cbranch_vccnz BB1_2
+; GFX7-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX7-NEXT:  ; %bb.1: ; %bb0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:  BB1_2: ; %bb1
+; GFX7-NEXT:  .LBB1_2: ; %bb1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 10
@@ -99,24 +122,23 @@
 ;
 ; GFX9-LABEL: s_add_co_br_user:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s1, s0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_addc_u32 s0, s0, 0
+; GFX9-NEXT:    s_add_i32 s0, s2, s2
+; GFX9-NEXT:    s_cmp_lt_u32 s0, s2
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    s_addc_u32 s0, s2, 0
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v0
-; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX9-NEXT:    s_cbranch_vccnz BB1_2
+; GFX9-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:  BB1_2: ; %bb1
+; GFX9-NEXT:  .LBB1_2: ; %bb1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 10
@@ -129,26 +151,53 @@
 ; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_i32 s1, s0, s0
-; GFX10-NEXT:    v_cmp_lt_u32_e64 s1, s1, s0
+; GFX10-NEXT:    s_cmp_lt_u32 s1, s0
+; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
 ; GFX10-NEXT:    s_cmpk_lg_u32 s1, 0x0
 ; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s0, v0
-; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    s_cbranch_vccnz BB1_2
+; GFX10-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %bb0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:  BB1_2: ; %bb1
+; GFX10-NEXT:  .LBB1_2: ; %bb1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: s_add_co_br_user:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s1, s0, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lt_u32 s1, s0
+; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    s_cmpk_lg_u32 s1, 0x0
+; GFX11-NEXT:    s_addc_u32 s0, s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s0, v0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX11-NEXT:  ; %bb.1: ; %bb0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 9
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:  .LBB1_2: ; %bb1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 10
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 bb:
   %i1 = add i32 %i, %i
   %i2 = icmp ult i32 %i1, %i