diff llvm/test/CodeGen/AMDGPU/wqm.ll @ 221:79ff65ed7e25
LLVM12 Original
author:   Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date:     Tue, 15 Jun 2021 19:15:29 +0900
parents:  0572611fdcc8
children: c4bab56944e8
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll Tue Jun 15 19:13:43 2021 +0900
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll Tue Jun 15 19:15:29 2021 +0900
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
 
 ; Check that WQM isn't triggered by image load/store intrinsics.
 ;
@@ -93,11 +93,10 @@
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: v_mul_lo_u32 [[MUL:v[0-9]+]], v0, v1
+;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;CHECK: store
-;CHECK: s_wqm_b64 exec, exec
-;CHECK: image_sample
-;CHECK: image_sample
 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
 main_body:
   %c.1 = mul i32 %c, %d
@@ -147,6 +146,8 @@
   ret float %out.2
 }
 
+; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
+
 ; Check that WWM is triggered by the wwm intrinsic.
 ;
 ;CHECK-LABEL: {{^}}test_wwm1:
@@ -185,13 +186,17 @@
 ; Check that we don't leave WWM on for computations that don't require WWM,
 ; since that will lead clobbering things that aren't supposed to be clobbered
 ; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; WWM computations.
 ;
 ;CHECK-LABEL: {{^}}test_wwm3:
+;CHECK: %if
 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK: v_add_f32_e32
+;CHECK: %endif
 define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
 main_body:
   ; use mbcnt to make sure the branch is divergent
@@ -214,13 +219,17 @@
 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
 ; write could clobber disabled channels in the non-WWM one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; WWM computations.
 ;
 ;CHECK-LABEL: {{^}}test_wwm4:
+;CHECK: %if
 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK-NEXT: v_mov_b32_e32
+;CHECK: %endif
 define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
 main_body:
   ; use mbcnt to make sure the branch is divergent
@@ -276,6 +285,7 @@
 ;VI-CHECK: flat_load_dword
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
 define amdgpu_ps float @test_wwm6_then() {
 main_body:
   %src0 = load volatile float, float addrspace(1)* undef
@@ -309,6 +319,7 @@
 ;SI-CHECK: buffer_load_dword
 ;VI-CHECK: flat_load_dword
 ;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
 define amdgpu_ps float @test_wwm6_loop() {
 main_body:
   %src0 = load volatile float, float addrspace(1)* undef
@@ -332,14 +343,14 @@
 ; Check that @llvm.amdgcn.set.inactive disables WWM.
 ;
-;CHECK-LABEL: {{^}}test_set_inactive1:
+;CHECK-LABEL: {{^}}test_wwm_set_inactive1:
 ;CHECK: buffer_load_dword
 ;CHECK: s_not_b64 exec, exec
 ;CHECK: v_mov_b32_e32
 ;CHECK: s_not_b64 exec, exec
 ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
 ;CHECK: v_add_{{[iu]}}32_e32
-define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
+define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
 main_body:
   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
   %src.0 = bitcast float %src to i32
@@ -351,6 +362,208 @@
   ret void
 }
 
+; Check that Strict WQM is triggered by the strict_wqm intrinsic.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm1:
+;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Same as above, but with an integer type.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm2:
+;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_{{[iu]}}32_e32
+define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %src0.0 = bitcast float %src0 to i32
+  %src1.0 = bitcast float %src1 to i32
+  %out = add i32 %src0.0, %src1.0
+  %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
+  %out.1 = bitcast i32 %out.0 to float
+  ret float %out.1
+}
+
+; Check that we don't leave Strict WQM on for computations that don't require it,
+; since that will lead clobbering things that aren't supposed to be clobbered
+; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; WWM computations.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm3:
+;CHECK: %if
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  %out.1 = fadd float %src, %out.0
+  br label %endif
+
+endif:
+  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
+  ret float %out.2
+}
+
+; Check that Strict WQM writes aren't coalesced with non-strict writes, since
+; the Strict WQM write could clobber disabled channels in the non-strict one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; WWM computations.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm4:
+;CHECK: %if
+;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NEXT: v_mov_b32_e32
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Make sure the transition from Exact to Strict WQM then WQM works properly.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm5:
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: buffer_store_dword
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: s_wqm_b64 exec, exec
+define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %src1, %src1
+  %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
+  %out = fadd float %temp.0, %temp.0
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that Strict WQM is turned on correctly across basic block boundaries.
+; if..then..endif version
+;
+;CHECK-LABEL: {{^}}test_strict_wqm6_then:
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %if
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wqm6_then() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Check that Strict WQM is turned on correctly across basic block boundaries.
+; loop version
+;
+;CHECK-LABEL: {{^}}test_strict_wqm6_loop:
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %loop
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
+define amdgpu_ps float @test_strict_wqm6_loop() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  br label %loop
+
+loop:
+  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
+  %counter.1 = sub i32 %counter, 1
+  %cc = icmp ne i32 %counter.1, 0
+  br i1 %cc, label %loop, label %endloop
+
+endloop:
+  ret float %out.0
+}
+
 ; Check that enabling WQM anywhere enables WQM for the set.inactive source.
 ;
 ;CHECK-LABEL: {{^}}test_set_inactive2:
@@ -577,11 +790,11 @@
 ;CHECK: image_sample
 ;CHECK: buffer_store_dword
 ;CHECK: s_wqm_b64 exec, exec
-;CHECK: v_cmpx_
-;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: v_cmp_
+;CHECK: image_sample
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;CHECK: buffer_store_dword
-;CHECK: s_mov_b64 exec, [[SAVE]]
-;CHECK: image_sample
 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
@@ -612,9 +825,9 @@
 ; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
 ; CHECK: image_sample
-; CHECK: buffer_store_dword
 ; CHECK-NOT: wqm
-; CHECK: v_cmpx_
+; CHECK-DAG: buffer_store_dword
+; CHECK-DAG: v_cmp_
 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
@@ -646,19 +859,21 @@
 ; CHECK-NEXT: ; %entry
 ; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
 ; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_mov
+; CHECK: v_mov
+; CHECK: v_mov
+; CHECK: v_mov
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK: image_store
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
 ; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
-; CHECK: ; %body
+; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
 ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
-; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %loop
+; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
 ; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
-; CHECK: s_cbranch_vccz
-
-; CHECK: s_cbranch_vccnz [[LOOPHDR]]
+; CHECK: s_cbranch_vccz [[LOOPHDR]]
 ; CHECK: ; %break
 
 ; CHECK: ; return
@@ -825,6 +1040,381 @@
   ret float %r
 }
 
+; Check that WWM is triggered by the strict_wwm intrinsic.
+;
+;CHECK-LABEL: {{^}}test_strict_wwm1:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
+  ret float %out.0
+}
+
+; Same as above, but with an integer type.
+;
+;CHECK-LABEL: {{^}}test_strict_wwm2:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_{{[iu]}}32_e32
+define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %src0.0 = bitcast float %src0 to i32
+  %src1.0 = bitcast float %src1 to i32
+  %out = add i32 %src0.0, %src1.0
+  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
+  %out.1 = bitcast i32 %out.0 to float
+  ret float %out.1
+}
+
+; Check that we don't leave WWM on for computations that don't require WWM,
+; since that will lead clobbering things that aren't supposed to be clobbered
+; in cases like this.
+; We enforce this by checking that v_add gets emitted in the same block as
+; WWM computations.
+;
+;CHECK-LABEL: {{^}}test_strict_wwm3:
+;CHECK: %if
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
+  %out.1 = fadd float %src, %out.0
+  br label %endif
+
+endif:
+  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
+  ret float %out.2
+}
+
+; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
+; write could clobber disabled channels in the non-WWM one.
+; We enforce this by checking that v_mov gets emitted in the same block as
+; WWM computations.
+;
+;CHECK-LABEL: {{^}}test_strict_wwm4:
+;CHECK: %if
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NEXT: v_mov_b32_e32
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Make sure the transition from Exact to WWM then WQM works properly.
+;
+;CHECK-LABEL: {{^}}test_strict_wwm5:
+;CHECK: buffer_load_dword
+;CHECK: buffer_store_dword
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: s_wqm_b64 exec, exec
+define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %src1, %src1
+  %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
+  %out = fadd float %temp.0, %temp.0
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that WWM is turned on correctly across basic block boundaries.
+; if..then..endif version
+;
+;CHECK-LABEL: {{^}}test_strict_wwm6_then:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %if
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endif
+define amdgpu_ps float @test_strict_wwm6_then() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Check that WWM is turned on correctly across basic block boundaries.
+; loop version
+;
+;CHECK-LABEL: {{^}}test_strict_wwm6_loop:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %loop
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+;CHECK: %endloop
+define amdgpu_ps float @test_strict_wwm6_loop() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  br label %loop
+
+loop:
+  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
+  %counter.1 = sub i32 %counter, 1
+  %cc = icmp ne i32 %counter.1, 0
+  br i1 %cc, label %loop, label %endloop
+
+endloop:
+  ret float %out.0
+}
+
+; Check that @llvm.amdgcn.set.inactive disables WWM.
+;
+;CHECK-LABEL: {{^}}test_strict_wwm_set_inactive1:
+;CHECK: buffer_load_dword
+;CHECK: s_not_b64 exec, exec
+;CHECK: v_mov_b32_e32
+;CHECK: s_not_b64 exec, exec
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: v_add_{{[iu]}}32_e32
+define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
+main_body:
+  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  %src.0 = bitcast float %src to i32
+  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
+  %out = add i32 %src.1, %src.1
+  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
+  %out.1 = bitcast i32 %out.0 to float
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
+  ret void
+}
+
+; Check a case of a block being entirely WQM except for a bit of WWM.
+; There was a bug where it forgot to enter and leave WWM.
+;
+;CHECK-LABEL: {{^}}test_strict_wwm_within_wqm:
+;CHECK: %IF
+;CHECK: s_or_saveexec_b64 {{.*}}, -1
+;CHECK: ds_swizzle
+;
+define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
+main_body:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %cmp = icmp eq i32 %z, 0
+  br i1 %cmp, label %IF, label %ENDIF
+
+IF:
+  %dataf = extractelement <4 x float> %dtex, i32 0
+  %data1 = fptosi float %dataf to i32
+  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
+  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
+  %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
+  %data4f = sitofp i32 %data4 to float
+  br label %ENDIF
+
+ENDIF:
+  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
+  ret float %r
+}
+
+; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
+;
+;CHECK-LABEL: {{^}}test_strict_wqm_within_wqm:
+;CHECK: %IF
+;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: ds_swizzle
+;
+define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
+main_body:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %cmp = icmp eq i32 %z, 0
+  br i1 %cmp, label %IF, label %ENDIF
+
+IF:
+  %dataf = extractelement <4 x float> %dtex, i32 0
+  %data1 = fptosi float %dataf to i32
+  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
+  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
+  %data3f = sitofp i32 %data3 to float
+  br label %ENDIF
+
+ENDIF:
+  %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
+  ret float %r
+}
+
+;CHECK-LABEL: {{^}}test_strict_wqm_strict_wwm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: s_mov_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: v_add
+;CHECK: s_mov_b64 exec, [[ORIG3]]
+
+;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: image_sample
+
+define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
+  %temp3 = fadd float %temp2, %temp2
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
+  %temp5 = fadd float %temp3, %temp4
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test_strict_wwm_strict_wqm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: s_or_saveexec_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: v_add
+;CHECK: s_mov_b64 exec, [[ORIG3]]
+
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: image_sample
+define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
+  %temp3 = fadd float %temp2, %temp2
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
+  %temp5 = fadd float %temp3, %temp4
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test_wqm_strict_wqm_wqm:
+;CHECK: buffer_store_dword
+
+;CHECK: s_wqm_b64 exec, exec
+
+;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
+;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+
+;CHECK: image_sample
+
+define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
+main_body:
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
+  %temp = fadd float %reload, %reload
+  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  %temp2 = fadd float %tex, %tex
+  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
+  %temp4 = fadd float %temp2, %temp3
+  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
+  ret float %out
+}
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
@@ -838,11 +1428,16 @@
 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare void @llvm.amdgcn.kill(i1) #1
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
+declare float @llvm.amdgcn.strict.wwm.f32(float) #3
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
 declare float @llvm.amdgcn.wwm.f32(float) #3
 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare float @llvm.amdgcn.strict.wqm.f32(float) #3
+declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
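
For context on the deprecation noted in this change (llvm.amdgcn.wwm superseded by llvm.amdgcn.strict.wwm): the declares above show the two intrinsics share identical signatures, so migration is a rename at each call site. A minimal sketch, assuming a standalone .ll file; the function name @migrate_wwm_example is illustrative and not part of the diff:

; Before (deprecated):
;   %out = call float @llvm.amdgcn.wwm.f32(float %v)
; After: the strict variant keeps the same float -> float signature,
; so only the intrinsic name changes at the call site.
define amdgpu_ps float @migrate_wwm_example(float %v) {
main_body:
  %out = call float @llvm.amdgcn.strict.wwm.f32(float %v)
  ret float %out
}

declare float @llvm.amdgcn.strict.wwm.f32(float)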
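The RUN-line change at the top of the diff drops the unused SI/VI check prefixes, so both -mcpu targets are verified against one shared CHECK prefix. A minimal single-prefix test sketch in the same style; the function name and the expected v_add_f32_e32 instruction are assumptions for illustration, not taken from the diff:

; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s

;CHECK-LABEL: {{^}}single_prefix_example:
;CHECK: v_add_f32_e32
define amdgpu_ps float @single_prefix_example(float %a, float %b) {
main_body:
  %sum = fadd float %a, %b
  ret float %sum
}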