Mercurial > hg > CbC > CbC_llvm
diff llvm/test/CodeGen/AMDGPU/loop-prefetch.ll @ 221:79ff65ed7e25
LLVM12 Original
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 15 Jun 2021 19:15:29 +0900 |
parents | |
children | c4bab56944e8 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll Tue Jun 15 19:15:29 2021 +0900 @@ -0,0 +1,388 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s + +; GFX8-NOT: s_inst_prefetch +; GFX8-NOT: .palign 6 + +; GCN-LABEL: test_loop_64 +; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 +; GFX10-DIS-NEXT: {{^$}} +; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]: +; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>: +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc0 [[L1]] +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_64(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb2, %bb + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_128 +; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 +; GFX10-ASM-NEXT: .p2align 6 +; GFX10-DIS-NEXT: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L1:BB[0-9_]+]]: +; GFX10-DIS: <[[L1:BB[0-9_]+]]>: +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc0 [[L1]] +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_128(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb2, %bb + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_192 +; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 +; GFX10-NEXT: s_inst_prefetch 0x1 +; GFX10-ASM-NEXT: .p2align 6 +; GFX10-DIS-NEXT: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L1:BB[0-9_]+]]: +; GFX10-DIS: <[[L1:BB[0-9_]+]]>: +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc0 [[L1]] +; GFX10-NEXT: s_inst_prefetch 0x2 +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_192(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb2, %bb + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_256 +; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 +; GFX10-DIS-NEXT: {{^$}} +; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]: +; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>: +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc0 [[L1]] +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_256(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb2, %bb + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_prefetch_inner_outer +; GFX10: s_inst_prefetch 0x1 +; GFX10-ASM-NEXT: .p2align 6 +; GFX10-DIS-NEXT: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L1:BB[0-9_]+]]: +; GFX10-DIS: <[[L1:BB[0-9_]+]]>: +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: .p2align 6 +; GFX10-DIS: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L2:BB[0-9_]+]]: +; GFX10-DIS: <[[L2:BB[0-9_]+]]>: +; GFX10-NOT: s_inst_prefetch +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc{{[01]}} [[L2]] +; GFX10-NOT: s_inst_prefetch +; GFX10: s_cbranch_scc{{[01]}} [[L1]] +; GFX10-NEXT: s_inst_prefetch 0x2 +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_prefetch_inner_outer(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: + ret void + +bb2: + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + br label %bb3 + +bb3: + %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ] + %tmp5 = add nuw nsw i32 %tmp4, 1 + %tmp6 = icmp eq i32 %tmp5, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp6, label %bb4, label %bb3 + +bb4: + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_prefetch_inner_outer_noouter +; GFX10-NOT: .p2align 6 +; GFX10-NOT: s_nop +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L0:BB[0-9_]+]]: +; GFX10-DIS: <[[L0:BB[0-9_]+]]>: +; GFX10: s_inst_prefetch 0x1 +; GFX10-ASM-NEXT: .p2align 6 +; GFX10-DIS-NEXT: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L1:BB[0-9_]+]]: +; GFX10-DIS: <[[L1:BB[0-9_]+]]>: +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: .p2align 6 +; GFX10-DIS: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L2:BB[0-9_]+]]: +; GFX10-DIS: <[[L2:BB[0-9_]+]]>: +; GFX10-NOT: s_inst_prefetch +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc{{[01]}} [[L2]] +; GFX10-NOT: s_inst_prefetch +; GFX10: s_cbranch_scc{{[01]}} [[L1]] +; GFX10-NEXT: s_inst_prefetch 0x2 +; GFX10: s_cbranch_scc{{[01]}} [[L0]] +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_prefetch_inner_outer_noouter(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: + ret void + +bb2: + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb6 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + br label %bb3 + +bb3: + %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb5 ] + %tmp5 = add nuw nsw i32 %tmp4, 1 + %tmp6 = icmp eq i32 %tmp5, 1024 + br label %bb4 + +bb4: + %tmp7 = phi i32 [ 0, %bb3 ], [ %tmp8, %bb4 ] + %tmp8 = add nuw nsw i32 %tmp7, 1 + %tmp9 = icmp eq i32 %tmp8, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp9, label %bb5, label %bb4 + +bb5: + br i1 %tmp6, label %bb6, label %bb3 + +bb6: + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +declare void @llvm.amdgcn.s.sleep(i32)