comparison llvm/test/CodeGen/AMDGPU/lds-alignment.ll @ 221:79ff65ed7e25

LLVM12 Original
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Tue, 15 Jun 2021 19:15:29 +0900
parents 1d019706d866
children 1f2b6ac9f198
comparison 220:42394fc6a535 → 221:79ff65ed7e25
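The change pins the RUN line to --amdhsa-code-object-version=2, drops the unused FUNC check prefix, and updates the expected workgroup_group_segment_byte_size values: every test_round_size_3_order* kernel is now described by the same comment, "align 32, 16, 16" with 38 + (10 pad) + 38 + (10 pad) + 38 = 134, so all six reference orders of the three 38-byte LDS arrays reserve 134 bytes, and test_missing_alignment_size_2_order0 grows from 212 to 216 bytes ((39 * 4) + (4 pad) + (7 * 8)). The sketch below only reproduces the padding sums quoted in those comments under a simple bump-allocator model that rounds each offset up to the next variable's alignment; the helper name lds_segment_size is invented for illustration and is not part of LLVM or of this test.

def lds_segment_size(vars_in_order):
    # vars_in_order: (size_in_bytes, align_in_bytes) tuples, in allocation order.
    offset = 0
    for size, align in vars_in_order:
        offset = (offset + align - 1) // align * align  # pad up to this variable's alignment
        offset += size
    return offset

# Layout described by the updated comments: three 38-byte arrays placed with
# effective alignments 32, 16, 16 -> 38 + 10 + 38 + 10 + 38 = 134.
print(lds_segment_size([(38, 32), (38, 16), (38, 16)]))  # 134

# The missing-alignment test: [39 x i32] (156 bytes, align 4) followed by an
# 8-byte-aligned 56-byte block -> 156 + 4 + 56 = 216.
print(lds_segment_size([(156, 4), (56, 8)]))  # 216

Under this model the total depends on the allocation order, which is why the old expectations ranged from 118 to 150 bytes per kernel; with the layout the new comments describe, the order in which a kernel happens to reference the globals no longer matters.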
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 < %s | FileCheck -check-prefix=HSA %s
 
 @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
 @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
 
 @lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
@@ -44,11 +44,11 @@
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.1.bc, i32 38, i1 false)
 
   ret void
 }
 
-; 38 + (10 pad) + 38
+; 38 + (10 pad) + 38 (= 86)
 ; HSA-LABEL: {{^}}test_round_size_2_align_8:
 ; HSA: workgroup_group_segment_byte_size = 86
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -92,13 +92,13 @@
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 64 %lds.arg, i8 addrspace(1)* align 64 %in, i32 38, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 64 %out, i8 addrspace(3)* align 64 %lds.arg, i32 38, i1 false)
   ret void
 }
 
-; (7 * 8) + (39 * 4) = 212
+; (39 * 4) + (4 pad) + (7 * 8) = 216
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
-; HSA: workgroup_group_segment_byte_size = 212
+; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false)
@@ -123,24 +123,13 @@
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false)
 
   ret void
 }
-; Test how the size needed for padding changes based on when the
-; global is encountered during lowering. There should be a consistent
-; order to minimize padding waste.
-;
-; The way global addresses are lowered now, this is in inverse of
-; first use order which isn't great.
-;
-; This should be the optimal order for these globals. If sorted to
-; minimize padding, the minimum possible size is: align 32, align 8,
-; align 16
-
-
-; align 32, 16, 8
-; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
+
+; align 32, 16, 16
+; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
 ; HSA-LABEL: {{^}}test_round_size_3_order0:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -156,12 +145,12 @@
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
 
   ret void
 }
 
-; align 32, 8, 16
-; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134
+; align 32, 16, 16
+; 38 (+ 10 pad) + 38 + (10 pad) + 38 ( = 134)
 ; HSA-LABEL: {{^}}test_round_size_3_order1:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -177,14 +166,14 @@
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
 
   ret void
 }
 
-; align 16, 32, 8
+; align 32, 16, 16
-; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
+; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
-; HSA: workgroup_group_segment_byte_size = 150
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
@@ -198,14 +187,14 @@
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
 
   ret void
 }
 
-; align 16, 8, 32
-; 38 + (2 pad) + 38 + (2 pad) + 38
+; align 32, 16, 16
+; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
-; HSA: workgroup_group_segment_byte_size = 118
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
@@ -219,14 +208,14 @@
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)
 
   ret void
 }
 
-; align 8, 32, 16
-; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
+; align 32, 16, 16
+; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134)
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
-; HSA: workgroup_group_segment_byte_size = 142
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
@@ -240,14 +229,14 @@
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
 
   ret void
 }
 
-; align 8, 16, 32
-; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
+; align 32, 16, 16
+; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134)
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
-; HSA: workgroup_group_segment_byte_size = 126
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)