Mercurial > hg > Members > tobaru > cbc > CbC_llvm
comparison test/CodeGen/AMDGPU/lds-output-queue.ll @ 95:afa8332a0e37
LLVM 3.8
author | Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 13 Oct 2015 17:48:58 +0900 |
parents | |
children | 1172e4bd9c6f |
comparison
equal
deleted
inserted
replaced
84:f3e34b893a5f | 95:afa8332a0e37 |
---|---|
1 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s | |
2 ; | |
3 ; This test checks that the lds input queue is empty at the end of | |
4 ; the ALU clause. | |
5 | |
6 ; CHECK-LABEL: {{^}}lds_input_queue: | |
7 ; CHECK: LDS_READ_RET * OQAP | |
8 ; CHECK-NOT: ALU clause | |
9 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP | |
10 | |
; Two-element i32 array in addrspace(3) with internal linkage and undef initial contents; both test functions load from it. | |
11 @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 | |
12 | |
; Loads the i32 at @local_mem[%index] (addrspace 3), calls the local barrier, then loads an | |
; i32 from %in (addrspace 1), adds the two values, and stores the sum to %out. | |
13 define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { | |
14 entry: | |
15 %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index | |
16 %1 = load i32, i32 addrspace(3)* %0 | |
17 call void @llvm.AMDGPU.barrier.local() | |
18 | |
19 ; This will start a new clause for the vertex fetch | |
20 %2 = load i32, i32 addrspace(1)* %in | |
21 %3 = add i32 %1, %2 | |
22 store i32 %3, i32 addrspace(1)* %out | |
23 ret void | |
24 } | |
25 | |
26 declare void @llvm.AMDGPU.barrier.local() | |
27 | |
28 ; The machine scheduler does not do proper alias analysis and assumes that | |
29 ; loads from global values (Note that a global value is different than a | |
30 ; value from global memory. A global value is a value that is declared | |
31 ; outside of a function, it can reside in any address space) alias with | |
32 ; all other loads. | |
33 ; | |
34 ; This is a problem for scheduling the reads from the local data share (lds). | |
35 ; These reads are implemented using two instructions. The first copies the | |
36 ; data from lds into the lds output queue, and the second moves the data from | |
37 ; the output queue into main memory. These two instructions don't have to be | |
38 ; scheduled one after the other, but they do need to be scheduled in the same | |
39 ; clause. The aliasing problem mentioned above causes problems when there is a | |
40 ; load from global memory which immediately follows a load from a global value that | |
41 ; has been declared in the local memory space: | |
42 ; | |
43 ; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index | |
44 ; %1 = load i32, i32 addrspace(3)* %0 | |
45 ; %2 = load i32, i32 addrspace(1)* %in | |
46 ; | |
47 ; The instruction selection phase will generate ISA that looks like this: | |
48 ; %OQAP = LDS_READ_RET | |
49 ; %vreg0 = MOV %OQAP | |
50 ; %vreg1 = VTX_READ_32 | |
51 ; %vreg2 = ADD_INT %vreg1, %vreg0 | |
52 ; | |
53 ; The bottom scheduler will schedule the two ALU instructions first: | |
54 ; | |
55 ; UNSCHEDULED: | |
56 ; %OQAP = LDS_READ_RET | |
57 ; %vreg1 = VTX_READ_32 | |
58 ; | |
59 ; SCHEDULED: | |
60 ; | |
61 ; %vreg0 = MOV %OQAP | |
62 ; %vreg2 = ADD_INT %vreg1, %vreg0 | |
63 ; | |
64 ; The lack of proper aliasing makes the scheduler assume a chain dependency | |
65 ; between the local memory read (LDS_READ_RET) and the global memory read | |
66 ; (VTX_READ_32), so the global memory read will always be scheduled first. | |
67 ; This will give us a final program which looks like this: | |
68 ; | |
69 ; Alu clause: | |
70 ; %OQAP = LDS_READ_RET | |
71 ; VTX clause: | |
72 ; %vreg1 = VTX_READ_32 | |
73 ; Alu clause: | |
74 ; %vreg0 = MOV %OQAP | |
75 ; %vreg2 = ADD_INT %vreg1, %vreg0 | |
76 ; | |
77 ; This is an illegal program because the OQAP def and use now occur in | |
78 ; different ALU clauses. | |
79 ; | |
80 ; This test checks this scenario and makes sure it doesn't result in an | |
81 ; illegal program. For now, we have fixed this issue by merging the | |
82 ; LDS_READ_RET and MOV together during instruction selection and then | |
83 ; expanding them after scheduling. Once the scheduler has better alias | |
84 ; analysis, we should be able to keep these instructions separate before | |
85 ; scheduling. | |
86 ; | |
87 ; CHECK-LABEL: {{^}}local_global_alias: | |
88 ; CHECK: LDS_READ_RET | |
89 ; CHECK-NOT: ALU clause | |
90 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP | |
; Loads the i32 at @local_mem[0] (addrspace 3) and then an i32 from %in (addrspace 1) with | |
; no barrier between the two loads, adds the values, and stores the sum to %out. | |
91 define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { | |
92 entry: | |
93 %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 | |
94 %1 = load i32, i32 addrspace(3)* %0 | |
95 %2 = load i32, i32 addrspace(1)* %in | |
96 %3 = add i32 %2, %1 | |
97 store i32 %3, i32 addrspace(1)* %out | |
98 ret void | |
99 } | |