; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

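; The checks below pin down the expected lowering of a group->flat cast
; (a sketch of the pattern, not a literal ISA listing):
;   hi = (src == -1) ? 0 : shared_aperture_base
;   lo = (src == -1) ? 0 : src
; On CI the aperture base is loaded from the queue pointer (hence
; enable_sgpr_queue_ptr = 1 above); on GFX9 it is read from the
; HW_REG_SH_MEM_BASES hardware register, so no queue pointer is needed.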
; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]

; At most 2 digits. Make sure src_shared_base is not counted as a high
; number SGPR.

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

173
|
43 ; Test handling inside a non-kernel
|
|
44 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
|
|
45 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
|
|
46 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
|
|
47 ; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
|
|
48 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
|
|
49 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
|
|
50
|
|
51 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
|
|
52 ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
|
|
53 ; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
|
|
54 ; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
|
|
55
|
|
56 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
|
|
57 ; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
|
|
58 ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
|
|
59 ; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
|
|
60
|
|
61 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
|
|
62 define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
|
|
63 %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
|
|
64 store volatile i32 7, i32* %stof
|
|
65 ret void
|
|
66 }

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]

; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base

; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
  %ld = load volatile i32, i32* %stof
  ret void
}

; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  %ld = load volatile i32, i32 addrspace(1)* %stof
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

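; Expected pattern for the flat->group direction (sketch, not a literal ISA
; listing): the 64-bit flat pointer is compared against 0 (flat null) and the
; result selects between -1 (the group null value) and the low 32 bits of the
; pointer:
;   dst = (src == 0) ? -1 : lo32(src)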
; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
  store volatile i32 0, i32 addrspace(5)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
  load volatile i32, i32 addrspace(4)* %ftos
  ret void
}

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.

; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: {{flat|global}}_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %local, label %global

local:
  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32*
  br label %end

global:
  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32*
  br label %end

end:
  %fptr = phi i32* [ %flat_local, %local ], [ %flat_global, %global ]
  store volatile i32 %x, i32* %fptr, align 4
;  %val = load i32, i32* %fptr, align 4
;  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Check for prologue initializing special SGPRs pointing to scratch.
; HSA-LABEL: {{^}}store_flat_scratch:
; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8

; GFX9: s_add_u32 flat_scratch_lo, s6, s9
; GFX9: s_addc_u32 flat_scratch_hi, s7, 0

; HSA: {{flat|global}}_store_dword
; HSA: s_barrier
; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4, addrspace(5)
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32 addrspace(5)* %alloca, i32 %x
  %fptr = addrspacecast i32 addrspace(5)* %pptr to i32*
  store volatile i32 %x, i32* %fptr
  ; Dummy call
  call void @llvm.amdgcn.s.barrier() #1
  %reload = load volatile i32, i32* %fptr, align 4
  store volatile i32 %reload, i32 addrspace(1)* %out, align 4
  ret void
}

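; Casting to the 32-bit constant address space (addrspace(6)) is expected to
; keep only the low half of the pointer; the checks below verify that the high
; word is rematerialized as 0 when the result is widened back to a 64-bit
; address for the scalar load (sketch): addr = zext32to64(lo32(ptr) + offset).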
; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
; GFX9: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}, [[PTRPTR]], 0x0{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}}
define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(i8 addrspace(4)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  %ptr = load volatile i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %ptr.ptr
  %addrspacecast = addrspacecast i8 addrspace(4)* %ptr to i8 addrspace(6)*
  %gep = getelementptr i8, i8 addrspace(6)* %addrspacecast, i32 %offset
  %ptr.cast = bitcast i8 addrspace(6)* %gep to i32 addrspace(6)*
  %load = load volatile i32, i32 addrspace(6)* %ptr.cast, align 4
  ret void
}

; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
; GFX9: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}, [[PTRPTR]], 0x0{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}}
define amdgpu_kernel void @use_global_to_constant32_addrspacecast(i8 addrspace(1)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  %ptr = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* %ptr.ptr
  %addrspacecast = addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(6)*
  %gep = getelementptr i8, i8 addrspace(6)* %addrspacecast, i32 %offset
  %ptr.cast = bitcast i8 addrspace(6)* %gep to i32 addrspace(6)*
  %load = load volatile i32, i32 addrspace(6)* %ptr.cast, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind readnone }