; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 < %s | FileCheck -check-prefix=HSA %s

@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16

@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32

@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef

declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #0
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #0

; HSA-LABEL: {{^}}test_no_round_size_1:
; HSA: workgroup_group_segment_byte_size = 38
define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)
  ret void
}

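; With a single 38-byte object there is nothing to pad between objects,
; so the group segment size is just the object size (hence "no_round").
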
; There are two objects, so one requires padding to be correctly
; aligned after the other.

; (38 -> 48) + 38 = 86

; I don't think it is necessary to add padding after, since if there
; were a dynamically sized LDS kernel argument, the runtime should
; add the alignment padding if needed.

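; One layout consistent with the comment above and the check below:
;   @lds.align16.0: bytes [0, 38)
;   padding:        bytes [38, 48)   10 bytes to realign to 16
;   @lds.align16.1: bytes [48, 86)
; => workgroup_group_segment_byte_size = 86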
; HSA-LABEL: {{^}}test_round_size_2:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.1.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.1.bc, i32 38, i1 false)

  ret void
}

; 38 + (10 pad) + 38 = 86
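; Note (an inference from the 86-byte total, not documented behavior):
; although @lds.align8.0 only requires align 8, it appears to be placed
; at offset 48, i.e. padded to a 16-byte boundary just like the align-16
; case above.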
; HSA-LABEL: {{^}}test_round_size_2_align_8:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  ret void
}

; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
; HSA: workgroup_group_segment_byte_size = 38
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)

  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false)
  ret void
}

; HSA-LABEL: {{^}}test_round_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false)
  ret void
}

; FIXME: Parameter alignment not considered
; HSA-LABEL: {{^}}test_high_align_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 64 %lds.arg, i8 addrspace(1)* align 64 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 64 %out, i8 addrspace(3)* align 64 %lds.arg, i32 38, i1 false)
  ret void
}

; (39 * 4) + (4 pad) + (7 * 8) = 216
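; Without an explicit alignment on the globals, the ABI type alignment
; presumably applies (a sketch inferred from the arithmetic above, not
; from documented behavior; i32 -> align 4, i64 -> align 8):
;   @lds.missing.align.0: bytes [0, 156)    39 x i32
;   padding:              bytes [156, 160)  realign to 8
;   @lds.missing.align.1: bytes [160, 216)  7 x i64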
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
; HSA: workgroup_group_segment_byte_size = 216
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false)

  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false)

  ret void
}

; (39 * 4) + (4 pad) + (7 * 8) = 216
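; Note: the total matches order0, which suggests the layout does not
; simply follow the order of first use within the kernel.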
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
; HSA: workgroup_group_segment_byte_size = 216
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false)

  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false)

  ret void
}

; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
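; One layout consistent with these numbers (an inference from the
; checks; the align-8 object again appears to be padded to a 16-byte
; boundary):
;   @lds.align32.0: bytes [0, 38)
;   @lds.align16.0: bytes [48, 86)    after 10 bytes of padding
;   @lds.align8.0:  bytes [96, 134)   after 10 bytes of padding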
; HSA-LABEL: {{^}}test_round_size_3_order0:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  ret void
}

; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
; HSA-LABEL: {{^}}test_round_size_3_order1:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  ret void
}

; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
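; Note: every test_round_size_3_order* variant checks the same 134-byte
; total, which suggests the objects are laid out in a fixed order
; (e.g. sorted by decreasing alignment) rather than in first-use order.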
; HSA-LABEL: {{^}}test_round_size_3_order2:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  ret void
}

; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
; HSA-LABEL: {{^}}test_round_size_3_order3:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  ret void
}

; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
; HSA-LABEL: {{^}}test_round_size_3_order4:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  ret void
}

; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
; HSA-LABEL: {{^}}test_round_size_3_order5:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  ret void
}

attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind }
attributes #2 = { convergent nounwind }