Mercurial > hg > CbC > CbC_llvm
view llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @ 266:00f31e85ec16 default tip
Added tag current for changeset 31d058e83c98
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Sat, 14 Oct 2023 10:13:55 +0900 |
parents | 1f2b6ac9f198 |
children |
line wrap: on
line source
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600,EG %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=R600,CM %s

; Reciprocal lowering checks for f32 fdiv with a 1.0 or -1.0 numerator.
; The four llc invocations above cover amdgcn (tahiti, tonga) and r600
; (cypress, cayman); each one is matched through the check prefixes it passes
; to FileCheck. The assertion lines are autogenerated - regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; For every plain (non-amdgpu_kernel) function below, the r600 runs expect
; only CF_END followed by PAD.
;
; NOTE(review) - attribute groups #0, #2, #3, #4 and metadata node !0 are
; referenced here but defined elsewhere in the file. Per the function names,
; #3 is presumably an IEEE-denormal mode, #0 a flush-denormal (daz) mode and
; !0 a 2.5 ulp accuracy bound; confirm against the definitions.

; 1.0 / x under #3 with no fast-math flags and no !fpmath. The SI and VI runs
; expect the full v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup
; expansion.
define float @v_rcp_f32_ieee(float %x) #3 {
; SI-LABEL: v_rcp_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float 1.0, %x
  ret float %rcp
}

; Same division under attribute group #4. Both amdgcn runs expect a single
; v_rcp_f32. NOTE(review) - #4 presumably enables unsafe FP math; confirm.
define float @v_rcp_f32_ieee_unsafe(float %x) #4 {
; GCN-LABEL: v_rcp_f32_ieee_unsafe:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_unsafe:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float 1.0, %x
  ret float %rcp
}

; The argument carries nofpclass(sub); the expected expansion is the same
; full sequence as for v_rcp_f32_ieee.
define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_known_not_denormal:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_known_not_denormal:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_known_not_denormal:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float 1.0, %x
  ret float %rcp
}

; nnan and ninf flags on the fdiv; the expected expansion is still the full
; sequence.
define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_nnan_ninf:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_nnan_ninf:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_nnan_ninf:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv nnan ninf float 1.0, %x
  ret float %rcp
}

; -1.0 / x under #3. Same expansion, with -1.0 as the numerator operand of
; v_div_scale and v_div_fixup.
define float @v_neg_rcp_f32_ieee(float %x) #3 {
; SI-LABEL: v_neg_rcp_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_neg_rcp_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_ieee:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float -1.0, %x
  ret float %rcp
}

; 1.0 / x under #0. Same expansion as the #3 variant, except that the FMA
; chain is bracketed by s_setreg_imm32_b32 writes (3 before, 0 after) to
; hwreg(HW_REG_MODE, 4, 2).
; NOTE(review) - presumably this switches FP32 denormal handling on for the
; refinement steps; confirm against the ISA documentation.
define float @v_rcp_f32_daz(float %x) #0 {
; SI-LABEL: v_rcp_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_daz:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float 1.0, %x
  ret float %rcp
}

; -1.0 / x under #0. Same as v_rcp_f32_daz with a -1.0 numerator.
define float @v_neg_rcp_f32_daz(float %x) #0 {
; SI-LABEL: v_neg_rcp_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; SI-NEXT: v_fma_f32 v2, v4, v2, v2
; SI-NEXT: v_mul_f32_e32 v4, v3, v2
; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
; SI-NEXT: v_fma_f32 v4, v5, v2, v4
; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_neg_rcp_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_daz:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float -1.0, %x
  ret float %rcp
}

; 1.0 / x with !fpmath !0 under #3. The expected code wraps v_rcp_f32 in
; v_frexp_mant / v_frexp_exp / v_ldexp scaling. The tahiti run additionally
; compares |x| against 0x7f800000 and selects with v_cndmask before the rcp.
define float @v_rcp_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_ulp25:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float 1.0, %x, !fpmath !0
  ret float %rcp
}

; As v_rcp_f32_ieee_ulp25 but with a nofpclass(sub) argument; the expected
; output is the same scaled sequence.
define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float 1.0, %x, !fpmath !0
  ret float %rcp
}

; -1.0 / x with !fpmath !0 and a nofpclass(sub) argument. The negation is
; expected as a source modifier on v_frexp_mant (and v_cndmask on tahiti).
define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
; SI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float -1.0, %x, !fpmath !0
  ret float %rcp
}

; ninf and nnan flags plus !fpmath !0; the expected output is the same scaled
; sequence as v_rcp_f32_ieee_ulp25.
define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 {
; SI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv ninf nnan float 1.0, %x, !fpmath !0
  ret float %rcp
}

; 1.0 / x with !fpmath !0 under #0. Both amdgcn runs expect a bare v_rcp_f32.
define float @v_rcp_f32_daz_ulp25(float %x) #0 {
; GCN-LABEL: v_rcp_f32_daz_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_f32_daz_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float 1.0, %x, !fpmath !0
  ret float %rcp
}

; -1.0 / x with !fpmath !0 under #3. Scaled sequence with the negation as a
; source modifier.
define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_neg_rcp_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_neg_rcp_f32_ieee_ulp25:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_ieee_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float -1.0, %x, !fpmath !0
  ret float %rcp
}

; -1.0 / x with !fpmath !0 under #0. Expected as one v_rcp_f32 with a negated
; source operand.
define float @v_neg_rcp_f32_daz_ulp25(float %x) #0 {
; GCN-LABEL: v_neg_rcp_f32_daz_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e64 v0, -v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_neg_rcp_f32_daz_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %rcp = fdiv float -1.0, %x, !fpmath !0
  ret float %rcp
}

; 1.0 / fabs(x) under #3. Full expansion; the fabs is expected as an explicit
; v_and_b32 with 0x7fffffff feeding v_div_scale, and as an |v0| modifier on
; v_div_fixup.
define float @v_rcp_fabs_f32_ieee(float %x) #3 {
; SI-LABEL: v_rcp_fabs_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; SI-NEXT: v_fma_f32 v3, v4, v3, v3
; SI-NEXT: v_mul_f32_e32 v4, v1, v3
; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
; SI-NEXT: v_fma_f32 v4, v5, v3, v4
; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_fabs_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v1, v3
; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_fabs_f32_ieee:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %fabs.x = call float @llvm.fabs.f32(float %x)
  %rcp = fdiv float 1.0, %fabs.x
  ret float %rcp
}

; 1.0 / fabs(x) under #0. As v_rcp_fabs_f32_ieee, with the FMA chain bracketed
; by the s_setreg_imm32_b32 writes to HW_REG_MODE.
define float @v_rcp_fabs_f32_daz(float %x) #0 {
; SI-LABEL: v_rcp_fabs_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; SI-NEXT: v_fma_f32 v3, v4, v3, v3
; SI-NEXT: v_mul_f32_e32 v4, v1, v3
; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
; SI-NEXT: v_fma_f32 v4, v5, v3, v4
; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_fabs_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v1, v3
; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_fabs_f32_daz:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %fabs.x = call float @llvm.fabs.f32(float %x)
  %rcp = fdiv float 1.0, %fabs.x
  ret float %rcp
}

; 1.0 / fabs(x) with !fpmath !0 under #3. Scaled sequence with the fabs folded
; into an |v0| source modifier.
define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %fabs.x = call float @llvm.fabs.f32(float %x)
  %rcp = fdiv float 1.0, %fabs.x, !fpmath !0
  ret float %rcp
}

; 1.0 / fabs(x) with !fpmath !0 under #0. Expected as one v_rcp_f32 with an
; |v0| source operand.
define float @v_rcp_fabs_f32_daz_ulp25(float %x) #0 {
; GCN-LABEL: v_rcp_fabs_f32_daz_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e64 v0, |v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_fabs_f32_daz_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %fabs.x = call float @llvm.fabs.f32(float %x)
  %rcp = fdiv float 1.0, %fabs.x, !fpmath !0
  ret float %rcp
}

; -1.0 / fabs(x) under #3. Full expansion with a -1.0 numerator and the fabs
; materialised through v_and_b32.
define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 {
; SI-LABEL: v_rcp_neg_fabs_f32_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; SI-NEXT: v_fma_f32 v3, v4, v3, v3
; SI-NEXT: v_mul_f32_e32 v4, v1, v3
; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
; SI-NEXT: v_fma_f32 v4, v5, v3, v4
; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_neg_fabs_f32_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v1, v3
; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_neg_fabs_f32_ieee:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %fabs.x = call float @llvm.fabs.f32(float %x)
  %rcp = fdiv float -1.0, %fabs.x
  ret float %rcp
}

; -1.0 / fabs(x) under #0. As v_rcp_neg_fabs_f32_ieee, with the FMA chain
; bracketed by the s_setreg_imm32_b32 writes to HW_REG_MODE.
define float @v_rcp_neg_fabs_f32_daz(float %x) #0 {
; SI-LABEL: v_rcp_neg_fabs_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; SI-NEXT: v_fma_f32 v3, v4, v3, v3
; SI-NEXT: v_mul_f32_e32 v4, v1, v3
; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
; SI-NEXT: v_fma_f32 v4, v5, v3, v4
; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_neg_fabs_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v1, v3
; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_neg_fabs_f32_daz:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %fabs.x = call float @llvm.fabs.f32(float %x)
  %rcp = fdiv float -1.0, %fabs.x
  ret float %rcp
}

; -1.0 / fabs(x) with !fpmath !0 under #3. Scaled sequence with a combined
; -|v0| source modifier.
define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x7f800000
; SI-NEXT: v_frexp_mant_f32_e64 v1, -|v0|
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_frexp_mant_f32_e64 v1, -|v0|
; VI-NEXT: v_rcp_f32_e32 v1, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %fabs.x = call float @llvm.fabs.f32(float %x)
  %rcp = fdiv float -1.0, %fabs.x, !fpmath !0
  ret float %rcp
}

; -1.0 / fabs(x) with !fpmath !0 under #0. Expected as one v_rcp_f32 with a
; -|v0| source operand.
define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 {
; GCN-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e64 v0, -|v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
  %fabs.x = call float @llvm.fabs.f32(float %x)
  %rcp = fdiv float -1.0, %fabs.x, !fpmath !0
  ret float %rcp
}

; Kernel variant - 1.0 / %src with !fpmath !0 under #0, result stored to %out.
; The amdgcn runs expect a single v_rcp_f32 on the loaded scalar. The r600
; runs expect RECIP_IEEE (cayman repeats it across X/Y/Z/W with three of the
; four results masked).
define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
  %rcp = fdiv float 1.0, %src, !fpmath !0
  store float %rcp, ptr addrspace(1) %out, align 4
  ret void
}

; Body is identical to s_rcp_pat_f32_daz (same fdiv with !fpmath !0); the
; expected output is the same.
define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_ulp25_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_ulp25_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_ulp25_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
  %rcp = fdiv float 1.0, %src, !fpmath !0
  store float %rcp, ptr addrspace(1) %out, align 4
  ret void
}

; Same kernel with the `fast` flag on the fdiv; the expected output is
; unchanged.
define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
  %rcp = fdiv fast float 1.0, %src, !fpmath !0
  store float %rcp, ptr addrspace(1) %out, align 4
  ret void
}

; Same kernel with the `arcp` flag on the fdiv; the expected output is
; unchanged.
define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
  %rcp = fdiv arcp float 1.0, %src, !fpmath !0
  store float %rcp, ptr addrspace(1) %out, align 4
  ret void
}

; Same fdiv with no per-instruction flags, but under attribute group #2.
; NOTE(review) - #2 presumably enables fast math function-wide, per the
; "global_fast" name; confirm. The expected output is unchanged.
define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 {
; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v0, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
  %rcp = fdiv float 1.0, %src, !fpmath !0
  store float %rcp, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel variant of 1.0 / fabs(%src) with !fpmath !0. The fabs is expected as
; a source modifier - |s2| on amdgcn, |KC0[2].Z| on r600.
define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_rcp_fabs_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e64 v0, |s2|
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_rcp_fabs_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e64 v2, |s2|
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_rcp_fabs_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: RECIP_IEEE * T1.X, |KC0[2].Z|,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: s_rcp_fabs_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: RECIP_IEEE T1.X, |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T1.Y (MASKED), |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE T1.Z (MASKED), |KC0[2].Z|,
; CM-NEXT: RECIP_IEEE * T1.W (MASKED), |KC0[2].Z|,
  %src.fabs = call float @llvm.fabs.f32(float %src)
  %rcp = fdiv float 1.0, %src.fabs, !fpmath !0
  store float %rcp, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
; SI-LABEL: s_neg_rcp_pat_f32_daz:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_rcp_f32_e64 v0, -s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_neg_rcp_pat_f32_daz:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rcp_f32_e64 v2, -s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_neg_rcp_pat_f32_daz:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: RECIP_IEEE * T0.X, KC0[2].Z,
; EG-NEXT: MUL_IEEE T0.X, literal.x, PS,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45)
;
; CM-LABEL: s_neg_rcp_pat_f32_daz:
; CM: ; %bb.0:
; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:
MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: ; CM-NEXT: RECIP_IEEE T0.X, KC0[2].Z, ; CM-NEXT: RECIP_IEEE T0.Y (MASKED), KC0[2].Z, ; CM-NEXT: RECIP_IEEE T0.Z (MASKED), KC0[2].Z, ; CM-NEXT: RECIP_IEEE * T0.W (MASKED), KC0[2].Z, ; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X, ; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %rcp = fdiv float -1.0, %src, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_rcp_f32_e64 v0, -|s2| ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rcp_f32_e64 v2, -|s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: ; EG-NEXT: RECIP_IEEE * T0.X, |KC0[2].Z|, ; EG-NEXT: MUL_IEEE T0.X, literal.x, PS, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45) ; ; CM-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; CM: ; %bb.0: ; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: ; CM-NEXT: 
RECIP_IEEE T0.X, |KC0[2].Z|, ; CM-NEXT: RECIP_IEEE T0.Y (MASKED), |KC0[2].Z|, ; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|, ; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|, ; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X, ; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fneg float %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_rcp_f32_e64 v0, -|s4| ; SI-NEXT: v_mul_f32_e64 v1, s4, -|s4| ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rcp_f32_e64 v2, -|s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2| ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: ALU clause starting at 4: ; EG-NEXT: MUL_IEEE T0.X, KC0[2].Z, -|KC0[2].Z|, ; EG-NEXT: RECIP_IEEE * 
T0.Y, |KC0[2].Z|, ; EG-NEXT: MUL_IEEE T1.X, literal.x, PS, ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.y, ; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45) ; ; CM-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; CM: ; %bb.0: ; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X ; CM-NEXT: CF_END ; CM-NEXT: ALU clause starting at 4: ; CM-NEXT: MUL_IEEE * T0.X, KC0[2].Z, -|KC0[2].Z|, ; CM-NEXT: RECIP_IEEE T0.X (MASKED), |KC0[2].Z|, ; CM-NEXT: RECIP_IEEE T0.Y, |KC0[2].Z|, ; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|, ; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|, ; CM-NEXT: MUL_IEEE * T1.X, literal.x, PV.Y, ; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) ; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fneg float %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 store volatile float %rcp, ptr addrspace(1) %out, align 4 %other = fmul float %src, %src.fabs.fneg store volatile float %other, ptr addrspace(1) %out, align 4 ret void } define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 { ; SI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mul_f32_e64 v0, s4, 0.5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_div_arcp_2_x_pat_f32_daz: ; EG: ; %bb.0: ; EG-NEXT: TEX 0 @4 
; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 4: ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 6: ; EG-NEXT: MUL_IEEE T0.X, T0.X, 0.5, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: s_div_arcp_2_x_pat_f32_daz: ; CM: ; %bb.0: ; CM-NEXT: TEX 0 @4 ; CM-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 4: ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; CM-NEXT: ALU clause starting at 6: ; CM-NEXT: MUL_IEEE * T0.X, T0.X, 0.5, ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %x = load float, ptr addrspace(1) undef %rcp = fdiv arcp float %x, 2.0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 { ; SI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, s4, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_div_arcp_k_x_pat_f32_daz: ; EG: ; %bb.0: ; EG-NEXT: TEX 0 @4 ; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 4: ; EG-NEXT: 
VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 6: ; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 1036831949(1.000000e-01), 2(2.802597e-45) ; ; CM-LABEL: s_div_arcp_k_x_pat_f32_daz: ; CM: ; %bb.0: ; CM-NEXT: TEX 0 @4 ; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 4: ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; CM-NEXT: ALU clause starting at 6: ; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x, ; CM-NEXT: 1036831949(1.000000e-01), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %x = load float, ptr addrspace(1) undef %rcp = fdiv arcp float %x, 10.0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 { ; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, s4, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; EG: ; %bb.0: ; EG-NEXT: TEX 0 @4 ; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 4: ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 6: ; 
EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1110651699(-1.000000e-01), 2(2.802597e-45) ; ; CM-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; CM: ; %bb.0: ; CM-NEXT: TEX 0 @4 ; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 4: ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; CM-NEXT: ALU clause starting at 6: ; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x, ; CM-NEXT: -1110651699(-1.000000e-01), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %x = load float, ptr addrspace(1) undef %rcp = fdiv arcp float %x, -10.0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } declare float @llvm.fabs.f32(float) #1 declare float @llvm.sqrt.f32(float) #1 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" } !0 = !{float 2.500000e+00}