annotate llvm/test/CodeGen/AMDGPU/lds-output-queue.ll @ 206:f17a3b42b08b

Added tag before-12 for changeset b7591485f4cd
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 07 Jun 2021 21:25:57 +0900
parents 1d019706d866
children 1f2b6ac9f198
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
150
anatofuz
parents:
diff changeset
1 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
anatofuz
parents:
diff changeset
2 ;
anatofuz
parents:
diff changeset
3 ; This test checks that the lds input queue will be empty at the end of
anatofuz
parents:
diff changeset
4 ; the ALU clause.
anatofuz
parents:
diff changeset
5
anatofuz
parents:
diff changeset
6 ; CHECK-LABEL: {{^}}lds_input_queue:
anatofuz
parents:
diff changeset
7 ; CHECK: LDS_READ_RET * OQAP
anatofuz
parents:
diff changeset
8 ; CHECK-NOT: ALU clause
anatofuz
parents:
diff changeset
9 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
anatofuz
parents:
diff changeset
10
anatofuz
parents:
diff changeset
11 @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
anatofuz
parents:
diff changeset
12
anatofuz
parents:
diff changeset
13 define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
anatofuz
parents:
diff changeset
14 entry:
anatofuz
parents:
diff changeset
15 %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
anatofuz
parents:
diff changeset
16 %1 = load i32, i32 addrspace(3)* %0
anatofuz
parents:
diff changeset
17 call void @llvm.r600.group.barrier()
anatofuz
parents:
diff changeset
18
anatofuz
parents:
diff changeset
19 ; This will start a new clause for the vertex fetch
anatofuz
parents:
diff changeset
20 %2 = load i32, i32 addrspace(1)* %in
anatofuz
parents:
diff changeset
21 %3 = add i32 %1, %2
anatofuz
parents:
diff changeset
22 store i32 %3, i32 addrspace(1)* %out
anatofuz
parents:
diff changeset
23 ret void
anatofuz
parents:
diff changeset
24 }
anatofuz
parents:
diff changeset
25
anatofuz
parents:
diff changeset
26 declare void @llvm.r600.group.barrier() nounwind convergent
anatofuz
parents:
diff changeset
27
anatofuz
parents:
diff changeset
28 ; The machine scheduler does not do proper alias analysis and assumes that
anatofuz
parents:
diff changeset
29 ; loads from global values (Note that a global value is different from a
anatofuz
parents:
diff changeset
30 ; value from global memory. A global value is a value that is declared
anatofuz
parents:
diff changeset
31 ; outside of a function, it can reside in any address space) alias with
anatofuz
parents:
diff changeset
32 ; all other loads.
anatofuz
parents:
diff changeset
33 ;
anatofuz
parents:
diff changeset
34 ; This is a problem for scheduling the reads from the local data share (lds).
anatofuz
parents:
diff changeset
35 ; These reads are implemented using two instructions. The first copies the
anatofuz
parents:
diff changeset
36 ; data from lds into the lds output queue, and the second moves the data from
anatofuz
parents:
diff changeset
37 ; the input queue into main memory. These two instructions don't have to be
anatofuz
parents:
diff changeset
38 ; scheduled one after the other, but they do need to be scheduled in the same
anatofuz
parents:
diff changeset
39 ; clause. The aliasing problem mentioned above causes problems when there is a
anatofuz
parents:
diff changeset
40 ; load from global memory which immediately follows a load from a global value that
anatofuz
parents:
diff changeset
41 ; has been declared in the local memory space:
anatofuz
parents:
diff changeset
42 ;
anatofuz
parents:
diff changeset
43 ; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
anatofuz
parents:
diff changeset
44 ; %1 = load i32, i32 addrspace(3)* %0
anatofuz
parents:
diff changeset
45 ; %2 = load i32, i32 addrspace(1)* %in
anatofuz
parents:
diff changeset
46 ;
anatofuz
parents:
diff changeset
47 ; The instruction selection phase will generate ISA that looks like this:
anatofuz
parents:
diff changeset
48 ; %oqap = LDS_READ_RET
anatofuz
parents:
diff changeset
49 ; %0 = MOV %oqap
anatofuz
parents:
diff changeset
50 ; %1 = VTX_READ_32
anatofuz
parents:
diff changeset
51 ; %2 = ADD_INT %1, %0
anatofuz
parents:
diff changeset
52 ;
anatofuz
parents:
diff changeset
53 ; The bottom scheduler will schedule the two ALU instructions first:
anatofuz
parents:
diff changeset
54 ;
anatofuz
parents:
diff changeset
55 ; UNSCHEDULED:
anatofuz
parents:
diff changeset
56 ; %oqap = LDS_READ_RET
anatofuz
parents:
diff changeset
57 ; %1 = VTX_READ_32
anatofuz
parents:
diff changeset
58 ;
anatofuz
parents:
diff changeset
59 ; SCHEDULED:
anatofuz
parents:
diff changeset
60 ;
anatofuz
parents:
diff changeset
61 ; %0 = MOV %oqap
anatofuz
parents:
diff changeset
62 ; %2 = ADD_INT %1, %0
anatofuz
parents:
diff changeset
63 ;
anatofuz
parents:
diff changeset
64 ; The lack of proper aliasing results in the local memory read (LDS_READ_RET)
anatofuz
parents:
diff changeset
65 ; to consider the global memory read (VTX_READ_32) as a chain dependency, so
anatofuz
parents:
diff changeset
66 ; the global memory read will always be scheduled first. This will give us a
anatofuz
parents:
diff changeset
67 ; final program which looks like this:
anatofuz
parents:
diff changeset
68 ;
anatofuz
parents:
diff changeset
69 ; Alu clause:
anatofuz
parents:
diff changeset
70 ; %oqap = LDS_READ_RET
anatofuz
parents:
diff changeset
71 ; VTX clause:
anatofuz
parents:
diff changeset
72 ; %1 = VTX_READ_32
anatofuz
parents:
diff changeset
73 ; Alu clause:
anatofuz
parents:
diff changeset
74 ; %0 = MOV %oqap
anatofuz
parents:
diff changeset
75 ; %2 = ADD_INT %1, %0
anatofuz
parents:
diff changeset
76 ;
anatofuz
parents:
diff changeset
77 ; This is an illegal program because the oqap def and use now occur in
anatofuz
parents:
diff changeset
78 ; different ALU clauses.
anatofuz
parents:
diff changeset
79 ;
anatofuz
parents:
diff changeset
80 ; This test checks this scenario and makes sure it doesn't result in an
anatofuz
parents:
diff changeset
81 ; illegal program. For now, we have fixed this issue by merging the
anatofuz
parents:
diff changeset
82 ; LDS_READ_RET and MOV together during instruction selection and then
anatofuz
parents:
diff changeset
83 ; expanding them after scheduling. Once the scheduler has better alias
anatofuz
parents:
diff changeset
84 ; analysis, we should be able to keep these instructions separate before
anatofuz
parents:
diff changeset
85 ; scheduling.
anatofuz
parents:
diff changeset
86 ;
anatofuz
parents:
diff changeset
87 ; CHECK-LABEL: {{^}}local_global_alias:
anatofuz
parents:
diff changeset
88 ; CHECK: LDS_READ_RET
anatofuz
parents:
diff changeset
89 ; CHECK-NOT: ALU clause
anatofuz
parents:
diff changeset
90 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
anatofuz
parents:
diff changeset
91 define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
anatofuz
parents:
diff changeset
92 entry:
anatofuz
parents:
diff changeset
93 %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
anatofuz
parents:
diff changeset
94 %1 = load i32, i32 addrspace(3)* %0
anatofuz
parents:
diff changeset
95 %2 = load i32, i32 addrspace(1)* %in
anatofuz
parents:
diff changeset
96 %3 = add i32 %2, %1
anatofuz
parents:
diff changeset
97 store i32 %3, i32 addrspace(1)* %out
anatofuz
parents:
diff changeset
98 ret void
anatofuz
parents:
diff changeset
99 }