comparison llvm/test/CodeGen/AMDGPU/lds-alignment.ll @ 221:79ff65ed7e25
LLVM12 Original
author   | Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date     | Tue, 15 Jun 2021 19:15:29 +0900
parents  | 1d019706d866
children | 1f2b6ac9f198
220:42394fc6a535 | 221:79ff65ed7e25
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s | 1 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 < %s | FileCheck -check-prefix=HSA %s |
2 | 2 |
3 @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 | 3 @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 |
4 @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 | 4 @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 |
5 | 5 |
6 @lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8 | 6 @lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8 |
44 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.1.bc, i32 38, i1 false) | 44 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.1.bc, i32 38, i1 false) |
45 | 45 |
46 ret void | 46 ret void |
47 } | 47 } |
48 | 48 |
49 ; 38 + (10 pad) + 38 | 49 ; 38 + (10 pad) + 38 (= 86) |
50 ; HSA-LABEL: {{^}}test_round_size_2_align_8: | 50 ; HSA-LABEL: {{^}}test_round_size_2_align_8: |
51 ; HSA: workgroup_group_segment_byte_size = 86 | 51 ; HSA: workgroup_group_segment_byte_size = 86 |
52 ; HSA: group_segment_alignment = 4 | 52 ; HSA: group_segment_alignment = 4 |
53 define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { | 53 define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
54 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* | 54 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
92 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 64 %lds.arg, i8 addrspace(1)* align 64 %in, i32 38, i1 false) | 92 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 64 %lds.arg, i8 addrspace(1)* align 64 %in, i32 38, i1 false) |
93 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 64 %out, i8 addrspace(3)* align 64 %lds.arg, i32 38, i1 false) | 93 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 64 %out, i8 addrspace(3)* align 64 %lds.arg, i32 38, i1 false) |
94 ret void | 94 ret void |
95 } | 95 } |
96 | 96 |
97 ; (7 * 8) + (39 * 4) = 212 | 97 ; (39 * 4) + (4 pad) + (7 * 8) = 216 |
98 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: | 98 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: |
99 ; HSA: workgroup_group_segment_byte_size = 212 | 99 ; HSA: workgroup_group_segment_byte_size = 216 |
100 ; HSA: group_segment_alignment = 4 | 100 ; HSA: group_segment_alignment = 4 |
101 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { | 101 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
102 %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* | 102 %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* |
103 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false) | 103 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false) |
104 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false) | 104 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false) |
123 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false) | 123 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false) |
124 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false) | 124 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false) |
125 | 125 |
126 ret void | 126 ret void |
127 } | 127 } |
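The new 216-byte figure in test_missing_alignment_size_2_order0 follows from placing the globals at increasing offsets, rounding each offset up to the global's ABI alignment first: [39 x i32] has ABI align 4, and the 7 * 8 term in the comment suggests a second [7 x i64] global with ABI align 8. Below is a minimal Python sketch of that layout rule; the second global and the allocator loop are assumptions for illustration, only the arithmetic comes from the updated comment.

    def align_to(offset, alignment):
        # Round offset up to the next multiple of alignment.
        return -(-offset // alignment) * alignment

    def segment_size(globals_in_order):
        # Place (size, align) pairs at increasing offsets, padding
        # each one to its alignment; return total bytes used.
        offset = 0
        for size, align in globals_in_order:
            offset = align_to(offset, align)
            offset += size
        return offset

    # [39 x i32] -> 156 bytes at ABI align 4, then the assumed
    # [7 x i64]  -> 56 bytes at ABI align 8.
    print(segment_size([(156, 4), (56, 8)]))  # 156 + 4 pad + 56 = 216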
128 ; Test how the size needed for padding changes based on when the | 128 |
129 ; global is encountered during lowering. There should be a consistent | 129 ; align 32, 16, 16 |
130 ; order to minimize padding waste. | 130 ; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134) |
131 ; | |
132 ; The way global addresses are lowered now, this is in inverse of | |
133 ; first use order which isn't great. | |
134 ; | |
135 ; This should be the optimal order for these globals. If sorted to | |
136 ; minimize padding, the minimum possible size is: align 32, align 8, | |
137 ; align 16 | |
138 | |
139 | |
140 ; align 32, 16, 8 | |
141 ; 38 + (10 pad) + 38 + (10 pad) + 38 = 134 | |
142 ; HSA-LABEL: {{^}}test_round_size_3_order0: | 131 ; HSA-LABEL: {{^}}test_round_size_3_order0: |
143 ; HSA: workgroup_group_segment_byte_size = 134 | 132 ; HSA: workgroup_group_segment_byte_size = 134 |
144 ; HSA: group_segment_alignment = 4 | 133 ; HSA: group_segment_alignment = 4 |
145 define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { | 134 define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
146 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* | 135 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
156 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) | 145 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
157 | 146 |
158 ret void | 147 ret void |
159 } | 148 } |
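The six test_round_size_3_orderN kernels permute the declaration order of three 38-byte LDS arrays. In the old revision each order produced a different segment size (134, 134, 150, 118, 142, 126, per the old check lines below); after this change every order checks for 134, and each comment now reads "align 32, 16, 16". A plausible reading is that the new lowering promotes the align-8 array's effective alignment to 16 and allocates globals by decreasing alignment rather than by encounter order. A hedged sketch of that reading follows; the sorting rule is an inference from the diff, not something the diff states.

    from itertools import permutations

    def align_to(offset, alignment):
        # Round offset up to the next multiple of alignment.
        return -(-offset // alignment) * alignment

    def lds_segment_size(globs):
        # Allocate most-aligned globals first so declaration order
        # no longer affects the total padding.
        offset = 0
        for size, align in sorted(globs, key=lambda g: g[1], reverse=True):
            offset = align_to(offset, align)
            offset += size
        return offset

    # Three 38-byte arrays with effective alignments 32, 16, 16.
    globs = [(38, 32), (38, 16), (38, 16)]
    assert all(lds_segment_size(list(p)) == 134 for p in permutations(globs))
    print(lds_segment_size(globs))  # 38 + 10 pad + 38 + 10 pad + 38 = 134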
160 | 149 |
161 ; align 32, 8, 16 | 150 ; align 32, 16, 16 |
162 ; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134 | 151 ; 38 (+ 10 pad) + 38 + (10 pad) + 38 ( = 134) |
163 ; HSA-LABEL: {{^}}test_round_size_3_order1: | 152 ; HSA-LABEL: {{^}}test_round_size_3_order1: |
164 ; HSA: workgroup_group_segment_byte_size = 134 | 153 ; HSA: workgroup_group_segment_byte_size = 134 |
165 ; HSA: group_segment_alignment = 4 | 154 ; HSA: group_segment_alignment = 4 |
166 define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { | 155 define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
167 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* | 156 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
177 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) | 166 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
178 | 167 |
179 ret void | 168 ret void |
180 } | 169 } |
181 | 170 |
182 ; align 16, 32, 8 | 171 ; align 32, 16, 16 |
183 ; 38 + (26 pad) + 38 + (10 pad) + 38 = 150 | 172 ; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134) |
184 ; HSA-LABEL: {{^}}test_round_size_3_order2: | 173 ; HSA-LABEL: {{^}}test_round_size_3_order2: |
185 ; HSA: workgroup_group_segment_byte_size = 150 | 174 ; HSA: workgroup_group_segment_byte_size = 134 |
186 ; HSA: group_segment_alignment = 4 | 175 ; HSA: group_segment_alignment = 4 |
187 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { | 176 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
188 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* | 177 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
189 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) | 178 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
190 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) | 179 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
198 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) | 187 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
199 | 188 |
200 ret void | 189 ret void |
201 } | 190 } |
202 | 191 |
203 ; align 16, 8, 32 | 192 ; align 32, 16, 16 |
204 ; 38 + (2 pad) + 38 + (2 pad) + 38 | 193 ; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134) |
205 ; HSA-LABEL: {{^}}test_round_size_3_order3: | 194 ; HSA-LABEL: {{^}}test_round_size_3_order3: |
206 ; HSA: workgroup_group_segment_byte_size = 118 | 195 ; HSA: workgroup_group_segment_byte_size = 134 |
207 ; HSA: group_segment_alignment = 4 | 196 ; HSA: group_segment_alignment = 4 |
208 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { | 197 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
209 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* | 198 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
210 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) | 199 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
211 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) | 200 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
219 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) | 208 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) |
220 | 209 |
221 ret void | 210 ret void |
222 } | 211 } |
223 | 212 |
224 ; align 8, 32, 16 | 213 ; align 32, 16, 16 |
225 ; 38 + (26 pad) + 38 + (2 pad) + 38 = 142 | 214 ; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134) |
226 ; HSA-LABEL: {{^}}test_round_size_3_order4: | 215 ; HSA-LABEL: {{^}}test_round_size_3_order4: |
227 ; HSA: workgroup_group_segment_byte_size = 142 | 216 ; HSA: workgroup_group_segment_byte_size = 134 |
228 ; HSA: group_segment_alignment = 4 | 217 ; HSA: group_segment_alignment = 4 |
229 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { | 218 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
230 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* | 219 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
231 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) | 220 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
232 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) | 221 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
240 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) | 229 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
241 | 230 |
242 ret void | 231 ret void |
243 } | 232 } |
244 | 233 |
245 ; align 8, 16, 32 | 234 ; align 32, 16, 16 |
246 ; 38 + (10 pad) + 38 + (2 pad) + 38 = 126 | 235 ; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134) |
247 ; HSA-LABEL: {{^}}test_round_size_3_order5: | 236 ; HSA-LABEL: {{^}}test_round_size_3_order5: |
248 ; HSA: workgroup_group_segment_byte_size = 126 | 237 ; HSA: workgroup_group_segment_byte_size = 134 |
249 ; HSA: group_segment_alignment = 4 | 238 ; HSA: group_segment_alignment = 4 |
250 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { | 239 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
251 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* | 240 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
252 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) | 241 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
253 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) | 242 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |