150
|
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
|
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
|
3
|
|
4 ; GCN-LABEL: ds_read32_combine_stride_400:
|
|
5 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
6 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
7
|
|
8 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
9 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
10 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
11
|
|
12 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
|
|
13 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
|
|
14 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
|
|
15
|
|
16 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
|
|
17 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
|
|
18 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
|
|
19 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
|
|
; Sums eight floats read from %arg at element indices 0, 100, ..., 700
; (400 bytes apart) and stores the total through %arg1.
; The checks preceding this function expect the eight loads to be merged
; pairwise into four ds_read2_b32 instructions, each using offset1:100.
define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  ; Element 0; the add with 0.0 seeds the running sum.
  %v0 = load float, float addrspace(3)* %arg, align 4
  %acc0 = fadd float %v0, 0.000000e+00

  %p100 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  %v100 = load float, float addrspace(3)* %p100, align 4
  %acc100 = fadd float %acc0, %v100

  %p200 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  %v200 = load float, float addrspace(3)* %p200, align 4
  %acc200 = fadd float %acc100, %v200

  %p300 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  %v300 = load float, float addrspace(3)* %p300, align 4
  %acc300 = fadd float %acc200, %v300

  %p400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  %v400 = load float, float addrspace(3)* %p400, align 4
  %acc400 = fadd float %acc300, %v400

  %p500 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  %v500 = load float, float addrspace(3)* %p500, align 4
  %acc500 = fadd float %acc400, %v500

  %p600 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  %v600 = load float, float addrspace(3)* %p600, align 4
  %acc600 = fadd float %acc500, %v600

  %p700 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  %v700 = load float, float addrspace(3)* %p700, align 4
  %acc700 = fadd float %acc600, %v700

  ; Storing the sum keeps every load live.
  store float %acc700, float* %arg1, align 4
  ret void
}
|
|
48
|
|
49 ; GCN-LABEL: ds_read32_combine_stride_400_back:
|
|
50 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
51 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
52
|
|
53 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
54 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
55 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
56
|
|
57 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
|
|
58 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
|
|
59 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
|
|
60
|
|
61 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
|
|
62 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
|
|
63 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
|
|
64 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
|
|
; Same eight float loads as the forward stride-400 read test, but issued in
; descending order: element indices 700, 600, ..., 100, 0. The values are
; summed and the total is stored through %arg1.
; The checks preceding this function expect the same four ds_read2_b32
; instructions with offset1:100 as for the ascending order.
define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  ; Highest element first; the add with 0.0 seeds the running sum.
  %p700 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  %v700 = load float, float addrspace(3)* %p700, align 4
  %acc700 = fadd float %v700, 0.000000e+00

  %p600 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  %v600 = load float, float addrspace(3)* %p600, align 4
  %acc600 = fadd float %acc700, %v600

  %p500 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  %v500 = load float, float addrspace(3)* %p500, align 4
  %acc500 = fadd float %acc600, %v500

  %p400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  %v400 = load float, float addrspace(3)* %p400, align 4
  %acc400 = fadd float %acc500, %v400

  %p300 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  %v300 = load float, float addrspace(3)* %p300, align 4
  %acc300 = fadd float %acc400, %v300

  %p200 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  %v200 = load float, float addrspace(3)* %p200, align 4
  %acc200 = fadd float %acc300, %v200

  %p100 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  %v100 = load float, float addrspace(3)* %p100, align 4
  %acc100 = fadd float %acc200, %v100

  ; Element 0 is read last, directly through %arg.
  %v0 = load float, float addrspace(3)* %arg, align 4
  %acc0 = fadd float %acc100, %v0

  store float %acc0, float* %arg1, align 4
  ret void
}
|
|
93
|
|
94 ; GCN-LABEL: ds_read32_combine_stride_8192:
|
|
95 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
96 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
97 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32
|
|
98 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96
|
|
99 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160
|
|
100 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224
|
|
; Sums eight floats read from %arg at element indices 0, 2048, ..., 14336
; (8192 bytes apart) and stores the total through %arg1.
; The checks preceding this function expect four ds_read2st64_b32
; instructions that all use the unmodified base register, with offset pairs
; (0, 32), (64, 96), (128, 160) and (192, 224).
define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  ; Element 0; the add with 0.0 seeds the running sum.
  %v0 = load float, float addrspace(3)* %arg, align 4
  %acc0 = fadd float %v0, 0.000000e+00

  %p2048 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
  %v2048 = load float, float addrspace(3)* %p2048, align 4
  %acc2048 = fadd float %acc0, %v2048

  %p4096 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
  %v4096 = load float, float addrspace(3)* %p4096, align 4
  %acc4096 = fadd float %acc2048, %v4096

  %p6144 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
  %v6144 = load float, float addrspace(3)* %p6144, align 4
  %acc6144 = fadd float %acc4096, %v6144

  %p8192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
  %v8192 = load float, float addrspace(3)* %p8192, align 4
  %acc8192 = fadd float %acc6144, %v8192

  %p10240 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
  %v10240 = load float, float addrspace(3)* %p10240, align 4
  %acc10240 = fadd float %acc8192, %v10240

  %p12288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
  %v12288 = load float, float addrspace(3)* %p12288, align 4
  %acc12288 = fadd float %acc10240, %v12288

  %p14336 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
  %v14336 = load float, float addrspace(3)* %p14336, align 4
  %acc14336 = fadd float %acc12288, %v14336

  store float %acc14336, float* %arg1, align 4
  ret void
}
|
|
129
|
|
130 ; GCN-LABEL: ds_read32_combine_stride_8192_shifted:
|
|
131 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
132 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
133
|
|
134 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
|
|
135 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
136 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
137
|
|
138 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
|
|
139 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
|
|
140 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
|
|
141
|
|
142 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32
|
|
143 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32
|
|
144 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32
|
|
; Sums six floats read from %arg at element indices 2, 2050, 4098, 6146,
; 8194 and 10242, i.e. an 8192-byte stride that starts 8 bytes past %arg,
; and stores the total through %arg1.
; The checks preceding this function expect three ds_read2st64_b32
; instructions with offset1:32, each addressed from its own adjusted base.
define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  ; First element is at index 2; the add with 0.0 seeds the running sum.
  %p2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
  %v2 = load float, float addrspace(3)* %p2, align 4
  %acc2 = fadd float %v2, 0.000000e+00

  %p2050 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2050
  %v2050 = load float, float addrspace(3)* %p2050, align 4
  %acc2050 = fadd float %acc2, %v2050

  %p4098 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4098
  %v4098 = load float, float addrspace(3)* %p4098, align 4
  %acc4098 = fadd float %acc2050, %v4098

  %p6146 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6146
  %v6146 = load float, float addrspace(3)* %p6146, align 4
  %acc6146 = fadd float %acc4098, %v6146

  %p8194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8194
  %v8194 = load float, float addrspace(3)* %p8194, align 4
  %acc8194 = fadd float %acc6146, %v8194

  %p10242 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10242
  %v10242 = load float, float addrspace(3)* %p10242, align 4
  %acc10242 = fadd float %acc8194, %v10242

  store float %acc10242, float* %arg1, align 4
  ret void
}
|
|
168
|
|
169 ; GCN-LABEL: ds_read64_combine_stride_400:
|
|
170 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
171 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
172
|
|
173 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
174 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
|
|
175
|
|
176 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
|
|
177 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150
|
|
178 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250
|
|
179 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50
|
|
; Sums eight doubles read from %arg at element indices 0, 50, ..., 350
; (400 bytes apart) and stores the total through %arg1.
; The checks preceding this function expect four ds_read2_b64 instructions:
; three on the unmodified base with offset pairs (0, 50), (100, 150) and
; (200, 250), plus one on a separately computed base with offset1:50.
define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
bb:
  ; Element 0; the add with 0.0 seeds the running sum.
  %v0 = load double, double addrspace(3)* %arg, align 8
  %acc0 = fadd double %v0, 0.000000e+00

  %p50 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
  %v50 = load double, double addrspace(3)* %p50, align 8
  %acc50 = fadd double %acc0, %v50

  %p100 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
  %v100 = load double, double addrspace(3)* %p100, align 8
  %acc100 = fadd double %acc50, %v100

  %p150 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
  %v150 = load double, double addrspace(3)* %p150, align 8
  %acc150 = fadd double %acc100, %v150

  %p200 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
  %v200 = load double, double addrspace(3)* %p200, align 8
  %acc200 = fadd double %acc150, %v200

  %p250 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
  %v250 = load double, double addrspace(3)* %p250, align 8
  %acc250 = fadd double %acc200, %v250

  %p300 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
  %v300 = load double, double addrspace(3)* %p300, align 8
  %acc300 = fadd double %acc250, %v300

  %p350 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
  %v350 = load double, double addrspace(3)* %p350, align 8
  %acc350 = fadd double %acc300, %v350

  store double %acc350, double* %arg1, align 8
  ret void
}
|
|
208
|
|
209 ; GCN-LABEL: ds_read64_combine_stride_8192_shifted:
|
|
210 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
211 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
212
|
|
213 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
|
|
214 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
215 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
216
|
|
217 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
|
|
218 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
|
|
219 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
|
|
220
|
|
221 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16
|
|
222 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16
|
|
223 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16
|
|
; Sums six doubles read from %arg at element indices 1, 1025, 2049, 3073,
; 4097 and 5121, i.e. an 8192-byte stride that starts 8 bytes past %arg,
; and stores the total through %arg1.
; The checks preceding this function expect three ds_read2st64_b64
; instructions with offset1:16, each addressed from its own adjusted base.
define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
bb:
  ; First element is at index 1; the add with 0.0 seeds the running sum.
  %p1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
  %v1 = load double, double addrspace(3)* %p1, align 8
  %acc1 = fadd double %v1, 0.000000e+00

  %p1025 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
  %v1025 = load double, double addrspace(3)* %p1025, align 8
  %acc1025 = fadd double %acc1, %v1025

  %p2049 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
  %v2049 = load double, double addrspace(3)* %p2049, align 8
  %acc2049 = fadd double %acc1025, %v2049

  %p3073 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
  %v3073 = load double, double addrspace(3)* %p3073, align 8
  %acc3073 = fadd double %acc2049, %v3073

  %p4097 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
  %v4097 = load double, double addrspace(3)* %p4097, align 8
  %acc4097 = fadd double %acc3073, %v4097

  %p5121 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
  %v5121 = load double, double addrspace(3)* %p5121, align 8
  %acc5121 = fadd double %acc4097, %v5121

  store double %acc5121, double* %arg1, align 8
  ret void
}
|
|
247
|
|
248 ; GCN-LABEL: ds_write32_combine_stride_400:
|
|
249 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
250 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
251
|
|
252 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
253 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
254 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
255
|
|
256 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
|
|
257 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
|
|
258 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
|
|
259
|
|
260 ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
|
|
261 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
|
|
262 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
|
|
263 ; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
|
|
; Stores the float constant 1.0 to %arg at element indices 0, 100, ..., 700
; (400 bytes apart), in ascending order.
; The checks preceding this function expect the eight stores to be merged
; pairwise into four ds_write2_b32 instructions, each using offset1:100.
define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) {
bb:
  ; Element 0 is written directly through %arg.
  store float 1.000000e+00, float addrspace(3)* %arg, align 4

  %p100 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  store float 1.000000e+00, float addrspace(3)* %p100, align 4

  %p200 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  store float 1.000000e+00, float addrspace(3)* %p200, align 4

  %p300 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  store float 1.000000e+00, float addrspace(3)* %p300, align 4

  %p400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  store float 1.000000e+00, float addrspace(3)* %p400, align 4

  %p500 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  store float 1.000000e+00, float addrspace(3)* %p500, align 4

  %p600 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  store float 1.000000e+00, float addrspace(3)* %p600, align 4

  %p700 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  store float 1.000000e+00, float addrspace(3)* %p700, align 4

  ret void
}
|
|
283
|
|
284 ; GCN-LABEL: ds_write32_combine_stride_400_back:
|
|
285 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
286 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
287
|
|
288 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
289 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
290 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
291
|
|
292 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
|
|
293 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
|
|
294 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
|
|
295
|
|
296 ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
|
|
297 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
|
|
298 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
|
|
299 ; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
|
|
; Stores the float constant 1.0 to %arg at element indices 700, 600, ...,
; 100, 0 (400 bytes apart), in descending order.
; The checks preceding this function expect the same four ds_write2_b32
; instructions with offset1:100 as for the ascending order.
define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) {
bb:
  %p700 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  store float 1.000000e+00, float addrspace(3)* %p700, align 4

  %p600 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  store float 1.000000e+00, float addrspace(3)* %p600, align 4

  %p500 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  store float 1.000000e+00, float addrspace(3)* %p500, align 4

  %p400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  store float 1.000000e+00, float addrspace(3)* %p400, align 4

  %p300 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  store float 1.000000e+00, float addrspace(3)* %p300, align 4

  %p200 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  store float 1.000000e+00, float addrspace(3)* %p200, align 4

  %p100 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  store float 1.000000e+00, float addrspace(3)* %p100, align 4

  ; Element 0 is written last, directly through %arg.
  store float 1.000000e+00, float addrspace(3)* %arg, align 4

  ret void
}
|
|
319
|
|
320 ; GCN-LABEL: ds_write32_combine_stride_8192:
|
|
321 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
322 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
323 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
|
|
324 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
|
|
325 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
|
|
326 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224
|
|
; Stores the float constant 1.0 to %arg at element indices 0, 2048, ...,
; 14336 (8192 bytes apart).
; The checks preceding this function expect four ds_write2st64_b32
; instructions that all use the unmodified base register, with offset pairs
; (0, 32), (64, 96), (128, 160) and (192, 224).
define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) {
bb:
  ; Element 0 is written directly through %arg.
  store float 1.000000e+00, float addrspace(3)* %arg, align 4

  %p2048 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
  store float 1.000000e+00, float addrspace(3)* %p2048, align 4

  %p4096 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
  store float 1.000000e+00, float addrspace(3)* %p4096, align 4

  %p6144 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
  store float 1.000000e+00, float addrspace(3)* %p6144, align 4

  %p8192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
  store float 1.000000e+00, float addrspace(3)* %p8192, align 4

  %p10240 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
  store float 1.000000e+00, float addrspace(3)* %p10240, align 4

  %p12288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
  store float 1.000000e+00, float addrspace(3)* %p12288, align 4

  %p14336 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
  store float 1.000000e+00, float addrspace(3)* %p14336, align 4

  ret void
}
|
|
346
|
|
347 ; GCN-LABEL: ds_write32_combine_stride_8192_shifted:
|
|
348 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
349 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
350
|
|
351 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
|
|
352 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
353 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
354
|
|
355 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
|
|
356 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
|
|
357 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]]
|
|
358
|
|
359 ; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
|
|
360 ; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
|
|
361 ; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
|
|
; Stores the float constant 1.0 to %arg at element indices 1, 2049, 4097,
; 6145, 8193 and 10241, i.e. an 8192-byte stride that starts 4 bytes past
; %arg.
; The checks preceding this function expect three ds_write2st64_b32
; instructions with offset1:32, each addressed from its own adjusted base.
define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) {
bb:
  %p1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
  store float 1.000000e+00, float addrspace(3)* %p1, align 4

  %p2049 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2049
  store float 1.000000e+00, float addrspace(3)* %p2049, align 4

  %p4097 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4097
  store float 1.000000e+00, float addrspace(3)* %p4097, align 4

  %p6145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6145
  store float 1.000000e+00, float addrspace(3)* %p6145, align 4

  %p8193 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8193
  store float 1.000000e+00, float addrspace(3)* %p8193, align 4

  %p10241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10241
  store float 1.000000e+00, float addrspace(3)* %p10241, align 4

  ret void
}
|
|
378
|
|
379 ; GCN-LABEL: ds_write64_combine_stride_400:
|
|
380 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
381 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
382
|
|
383 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
384 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
|
|
385
|
|
386 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
|
|
387 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150
|
|
388 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250
|
|
389 ; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
|
|
; Stores the double constant 1.0 to %arg at element indices 0, 50, ..., 350
; (400 bytes apart).
; The checks preceding this function expect four ds_write2_b64 instructions:
; three on the unmodified base with offset pairs (0, 50), (100, 150) and
; (200, 250), plus one on a separately computed base with offset1:50.
define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) {
bb:
  ; Element 0 is written directly through %arg.
  store double 1.000000e+00, double addrspace(3)* %arg, align 8

  %p50 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
  store double 1.000000e+00, double addrspace(3)* %p50, align 8

  %p100 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
  store double 1.000000e+00, double addrspace(3)* %p100, align 8

  %p150 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
  store double 1.000000e+00, double addrspace(3)* %p150, align 8

  %p200 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
  store double 1.000000e+00, double addrspace(3)* %p200, align 8

  %p250 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
  store double 1.000000e+00, double addrspace(3)* %p250, align 8

  %p300 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
  store double 1.000000e+00, double addrspace(3)* %p300, align 8

  %p350 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
  store double 1.000000e+00, double addrspace(3)* %p350, align 8

  ret void
}
|
|
409
|
|
410 ; GCN-LABEL: ds_write64_combine_stride_8192_shifted:
|
|
411 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
|
|
412 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
|
|
413
|
|
414 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
|
|
415 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
416 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
|
|
417
|
|
418 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
|
|
419 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
|
|
420 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
|
|
421
|
|
422 ; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
|
|
423 ; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
|
|
424 ; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
|
|
; Stores the double constant 1.0 to %arg at element indices 1, 1025, 2049,
; 3073, 4097 and 5121, i.e. an 8192-byte stride that starts 8 bytes past
; %arg.
; The checks preceding this function expect three ds_write2st64_b64
; instructions with offset1:16, each addressed from its own adjusted base.
define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) {
bb:
  %p1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
  store double 1.000000e+00, double addrspace(3)* %p1, align 8

  %p1025 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
  store double 1.000000e+00, double addrspace(3)* %p1025, align 8

  %p2049 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
  store double 1.000000e+00, double addrspace(3)* %p2049, align 8

  %p3073 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
  store double 1.000000e+00, double addrspace(3)* %p3073, align 8

  %p4097 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
  store double 1.000000e+00, double addrspace(3)* %p4097, align 8

  %p5121 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
  store double 1.000000e+00, double addrspace(3)* %p5121, align 8

  ret void
}
|