134
|
1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s
|
|
2 ; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s
|
|
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
|
|
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
|
|
5 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
|
6 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
121
|
7
|
120
|
8
|
121
|
9 ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
|
|
10 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
|
|
11 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
|
|
define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  ; Each workitem loads one float, adds 1.0 with the nnan flag, then applies
  ; maxnum against 2.0 followed by minnum against 4.0 and stores the result.
  ; The checks before this function expect one v_add_f32 feeding one v_med3_f32.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src.addr
  %x.plus1 = fadd nnan float %x, 1.0
  %lower = call float @llvm.maxnum.f32(float %x.plus1, float 2.0)
  %clamped = call float @llvm.minnum.f32(float %lower, float 4.0)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}
|
120
|
24
|
|
25 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32:
|
|
26 ; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
|
|
27
|
|
28 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
|
29 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
121
|
define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  ; Register/immediate/immediate form, minnum(maxnum(x, 2.0), 4.0), with the
  ; loaded value as the first operand of both intrinsics.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src.addr

  %lower = call float @llvm.maxnum.f32(float %x, float 2.0)
  %clamped = call float @llvm.minnum.f32(float %lower, float 4.0)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}
|
|
42
|
|
43 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute0_f32:
|
|
44 ; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
|
|
45
|
|
46 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
|
47 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
121
|
define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  ; Same computation as the plain r_i_i test, but the constant is the first
  ; operand of both maxnum and minnum.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src.addr

  %lower = call float @llvm.maxnum.f32(float 2.0, float %x)
  %clamped = call float @llvm.minnum.f32(float 4.0, float %lower)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}
|
|
60
|
|
61 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute1_f32:
|
|
62 ; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
|
|
63
|
|
64 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
|
65 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
121
|
define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  ; Mixed operand order. maxnum takes the loaded value first, while minnum
  ; takes its constant first.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src.addr

  %lower = call float @llvm.maxnum.f32(float %x, float 2.0)
  %clamped = call float @llvm.minnum.f32(float 4.0, float %lower)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}
|
|
78
|
|
79 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
|
|
80 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
|
81 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
121
|
define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  ; The constants are swapped relative to the other r_i_i tests, so maxnum is
  ; taken against 4.0 and minnum against 2.0. The checks before this function
  ; expect separate v_max_f32 and v_min_f32 instructions for every prefix.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src.addr

  %max.4 = call float @llvm.maxnum.f32(float %x, float 4.0)
  %min.2 = call float @llvm.minnum.f32(float %max.4, float 2.0)

  store float %min.2, float addrspace(1)* %dst.addr
  ret void
}
|
|
94
|
|
95
|
|
96 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
|
|
97 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
|
98 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
121
|
define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  ; The intermediate maxnum result has a second use, as it is stored after the
  ; final value. Both stores are volatile and target the same address. The
  ; checks before this function expect separate v_max_f32 and v_min_f32.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src.addr

  %lower = call float @llvm.maxnum.f32(float %x, float 2.0)
  %clamped = call float @llvm.minnum.f32(float %lower, float 4.0)

  store volatile float %clamped, float addrspace(1)* %dst.addr
  store volatile float %lower, float addrspace(1)* %dst.addr
  ret void
}
|
|
112
|
|
113 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
|
|
114 ; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
|
|
115 ; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
|
121
|
define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
  ; Double-precision version of the r_i_i computation. The checks before this
  ; function expect v_max_f64 followed by v_min_f64 rather than a med3.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr double, double addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr double, double addrspace(1)* %out, i32 %lane
  %x = load double, double addrspace(1)* %src.addr

  %lower = call double @llvm.maxnum.f64(double %x, double 2.0)
  %clamped = call double @llvm.minnum.f64(double %lower, double 4.0)

  store double %clamped, double addrspace(1)* %dst.addr
  ret void
}
|
|
128
|
|
129 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
|
|
130 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
|
121
|
define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  ; Same IR as the plain r_i_i test but under attribute group #2, which is
  ; defined outside this excerpt. Judging by the function name it presumably
  ; enables no-NaNs FP math (to be confirmed against the attribute list).
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src.addr

  %lower = call float @llvm.maxnum.f32(float %x, float 2.0)
  %clamped = call float @llvm.minnum.f32(float %lower, float 4.0)

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}
|
|
143
|
|
144 ; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32:
|
|
145 ; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
|
|
146
|
|
147 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
|
148 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
121
|
define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  ; Builds the max/min pair from fcmp + select instead of the maxnum/minnum
  ; intrinsics, using unordered comparisons.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load float, float addrspace(1)* %src.addr

  ; fmax_legacy, selects 2.0 when x is unordered-or-less-equal to 2.0
  %x.ule.2 = fcmp ule float %x, 2.0
  %lower = select i1 %x.ule.2, float 2.0, float %x

  ; fmin_legacy, selects 4.0 when the previous result is unordered-or-greater-equal to 4.0
  %lower.uge.4 = fcmp uge float %lower, 4.0
  %clamped = select i1 %lower.uge.4, float 4.0, float %lower

  store float %clamped, float addrspace(1)* %dst.addr
  ret void
}
|
|
166
|
121
|
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Median-of-three in the form max(min(x, y), min(max(x, y), z)) where x is
  ; the negation of the first loaded value (written as fsub from -0.0). The
  ; expected med3 carries the negation as a source modifier on operand 0.
  ; The three loads are volatile, which keeps them in program order.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %a.fneg = fsub float -0.0, %a
  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}
|
|
189
|
|
190 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod1:
|
|
191 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
192 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
193 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
194 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; max(min(x, y), min(max(x, y), z)) with y being the negated second load
  ; (fsub from -0.0). The check before this function expects the negation to
  ; appear as a modifier on the second med3 source.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %y.neg = fsub float -0.0, %y
  %lo = call float @llvm.minnum.f32(float %x, float %y.neg)
  %hi = call float @llvm.maxnum.f32(float %x, float %y.neg)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
212
|
|
213 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod2:
|
|
214 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
215 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
216 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
217 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; max(min(x, y), min(max(x, y), z)) with z being the negated third load
  ; (fsub from -0.0). The check before this function expects the negation to
  ; appear as a modifier on the third med3 source.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %z.neg = fsub float -0.0, %z
  %lo = call float @llvm.minnum.f32(float %x, float %y)
  %hi = call float @llvm.maxnum.f32(float %x, float %y)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z.neg)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
235
|
|
236 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod012:
|
|
237 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
238 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
239 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
240 ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]|
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Median-of-three with a different modifier on each input. The first is
  ; negated, the second goes through fabs, and the third is the negation of
  ; its fabs. The check before this function expects all three as med3 source
  ; modifiers.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr

  %x.neg = fsub float -0.0, %x
  %y.abs = call float @llvm.fabs.f32(float %y)
  %z.abs = call float @llvm.fabs.f32(float %z)
  %z.negabs = fsub float -0.0, %z.abs

  %lo = call float @llvm.minnum.f32(float %x.neg, float %y.abs)
  %hi = call float @llvm.maxnum.f32(float %x.neg, float %y.abs)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z.negabs)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)

  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
264
|
|
265 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_negabs012:
|
|
266 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
267 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
268 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
269 ; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]|
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Median-of-three where every input is the negation of its absolute value
  ; (fabs followed by fsub from -0.0). The check before this function expects
  ; the combined neg+abs modifier on all three med3 sources.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr

  %x.abs = call float @llvm.fabs.f32(float %x)
  %x.negabs = fsub float -0.0, %x.abs
  %y.abs = call float @llvm.fabs.f32(float %y)
  %y.negabs = fsub float -0.0, %y.abs
  %z.abs = call float @llvm.fabs.f32(float %z)
  %z.negabs = fsub float -0.0, %z.abs

  %lo = call float @llvm.minnum.f32(float %x.negabs, float %y.negabs)
  %hi = call float @llvm.maxnum.f32(float %x.negabs, float %y.negabs)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z.negabs)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)

  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
295
|
|
296 ; GCN-LABEL: {{^}}v_nnan_inputs_med3_f32_pat0:
|
|
297 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
298 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
299 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
300 ; GCN-DAG: v_add_f32_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
|
|
301 ; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
|
|
302 ; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
|
|
303 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
|
|
define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  ; Median-of-three whose inputs are each produced by an fadd carrying the
  ; nnan flag (adding 1.0, 2.0 and 4.0 respectively). The function itself uses
  ; attribute group #1 rather than #2. The checks before this function expect
  ; three v_add_f32 results feeding a single v_med3_f32.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr

  %x.sum = fadd nnan float %x, 1.0
  %y.sum = fadd nnan float %y, 2.0
  %z.sum = fadd nnan float %z, 4.0

  %lo = call float @llvm.minnum.f32(float %x.sum, float %y.sum)
  %hi = call float @llvm.maxnum.f32(float %x.sum, float %y.sum)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z.sum)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
325
|
|
326 ; 16 combinations
|
|
327
|
|
328 ; 0: max(min(x, y), min(max(x, y), z))
|
|
329 ; 1: max(min(x, y), min(max(y, x), z))
|
|
330 ; 2: max(min(x, y), min(z, max(x, y)))
|
|
331 ; 3: max(min(x, y), min(z, max(y, x)))
|
|
332 ; 4: max(min(y, x), min(max(x, y), z))
|
|
333 ; 5: max(min(y, x), min(max(y, x), z))
|
|
334 ; 6: max(min(y, x), min(z, max(x, y)))
|
|
335 ; 7: max(min(y, x), min(z, max(y, x)))
|
|
336 ;
|
|
337 ; + commute outermost max
|
|
338
|
|
339 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0:
|
|
340 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
341 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
342 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
343 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 0 of the table before this test, which is
  ; max(min(x, y), min(max(x, y), z)).
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %x, float %y)
  %hi = call float @llvm.maxnum.f32(float %x, float %y)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
360
|
|
361 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat1:
|
|
362 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
363 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
364 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
365 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 1 of the table before the pat0 test, which is
  ; max(min(x, y), min(max(y, x), z)). Only the maxnum operands are swapped.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %x, float %y)
  %hi = call float @llvm.maxnum.f32(float %y, float %x)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
382
|
|
383 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat2:
|
|
384 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
385 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
386 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
387 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 2 of the table before the pat0 test, which is
  ; max(min(x, y), min(z, max(x, y))). The inner minnum takes z first.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %x, float %y)
  %hi = call float @llvm.maxnum.f32(float %x, float %y)
  %hi.capped = call float @llvm.minnum.f32(float %z, float %hi)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
404
|
|
405 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat3:
|
|
406 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
407 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
408 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
409 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 3 of the table before the pat0 test, which is
  ; max(min(x, y), min(z, max(y, x))).
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %x, float %y)
  %hi = call float @llvm.maxnum.f32(float %y, float %x)
  %hi.capped = call float @llvm.minnum.f32(float %z, float %hi)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
426
|
|
427 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat4:
|
|
428 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
429 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
430 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
431 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 4 of the table before the pat0 test, which is
  ; max(min(y, x), min(max(x, y), z)) with x = %a, y = %b, z = %c.
  ; The previous body was identical to the pat7 test (combination 7), which
  ; left combination 4 without coverage.
  ; NOTE(review) the existing check line (operands B, A, C) is assumed to
  ; still hold, as the pat6 test uses the same min/max operand order and
  ; expects that result. Confirm by running the test.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}
|
|
448
|
|
449 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat5:
|
|
450 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
451 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
452 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
453 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 5 of the table before the pat0 test, which is
  ; max(min(y, x), min(max(y, x), z)).
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %y, float %x)
  %hi = call float @llvm.maxnum.f32(float %y, float %x)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
470
|
|
471 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat6:
|
|
472 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
473 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
474 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
475 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 6 of the table before the pat0 test, which is
  ; max(min(y, x), min(z, max(x, y))).
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %y, float %x)
  %hi = call float @llvm.maxnum.f32(float %x, float %y)
  %hi.capped = call float @llvm.minnum.f32(float %z, float %hi)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
492
|
|
493 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat7:
|
|
494 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
495 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
496 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
497 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 7 of the table before the pat0 test, which is
  ; max(min(y, x), min(z, max(y, x))).
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %y, float %x)
  %hi = call float @llvm.maxnum.f32(float %y, float %x)
  %hi.capped = call float @llvm.minnum.f32(float %z, float %hi)
  %median = call float @llvm.maxnum.f32(float %lo, float %hi.capped)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
514
|
|
515 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat8:
|
|
516 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
517 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
518 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
519 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 0 with the operands of the outermost maxnum commuted, giving
  ; max(min(max(x, y), z), min(x, y)).
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %x, float %y)
  %hi = call float @llvm.maxnum.f32(float %x, float %y)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z)
  %median = call float @llvm.maxnum.f32(float %hi.capped, float %lo)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
536
|
|
537 ; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat9:
|
|
538 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
|
539 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
|
540 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
|
|
541 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
|
|
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  ; Combination 1 with the operands of the outermost maxnum commuted, giving
  ; max(min(max(y, x), z), min(x, y)).
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %x.addr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %y.addr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %z.addr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %dst.addr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %x = load volatile float, float addrspace(1)* %x.addr
  %y = load volatile float, float addrspace(1)* %y.addr
  %z = load volatile float, float addrspace(1)* %z.addr
  %lo = call float @llvm.minnum.f32(float %x, float %y)
  %hi = call float @llvm.maxnum.f32(float %y, float %x)
  %hi.capped = call float @llvm.minnum.f32(float %hi, float %z)
  %median = call float @llvm.maxnum.f32(float %hi.capped, float %lo)
  store float %median, float addrspace(1)* %dst.addr
  ret void
}
|
|
558
|
|
; med3 permutation 10. The clamping min takes c as its first operand.
;   max(min(c, max(a, b)), min(a, b))
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat10:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %med = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
580
|
|
; med3 permutation 11. Inner max is (b, a) and the clamping min is (c, max).
;   max(min(c, max(b, a)), min(a, b))
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat11:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %med = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
602
|
|
; med3 permutation 12. Only the outer min(b, a) and the final max are commuted;
; the inner max keeps (a, b) and the clamping min keeps (max, c).
;   max(min(max(a, b), c), min(b, a))
; This is the one operand ordering of the pat8..pat15 family that no sibling
; test exercises. The expected operand order follows the inner max, as in the
; other permutations that use max(a, b).
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat12:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %b, float %a)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %med = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
624
|
|
; med3 permutation 13. Both the outer min and the inner max use (b, a).
;   max(min(max(b, a), c), min(b, a))
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat13:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %b, float %a)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %med = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
646
|
|
; med3 permutation 14. Outer min is (b, a), inner max is (a, b), and the
; clamping min takes c first.
;   max(min(c, max(a, b)), min(b, a))
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat14:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %b, float %a)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %med = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
668
|
|
; med3 permutation 15. Every commutable operand pair is swapped.
;   max(min(c, max(b, a)), min(b, a))
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat15:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %b, float %a)
  %hi = call float @llvm.maxnum.f32(float %b, float %a)
  %hi.clamp = call float @llvm.minnum.f32(float %c, float %hi)
  %med = call float @llvm.maxnum.f32(float %hi.clamp, float %lo)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
690
|
|
691 ; ---------------------------------------------------------------------
|
|
692 ; Negative patterns
|
|
693 ; ---------------------------------------------------------------------
|
|
694
|
|
; Negative test. The min(a, b) result has a second use (a volatile store to an
; undef pointer) in addition to feeding the final max, and the kernel uses
; attribute group #1 ("no-nans-fp-math"="false"). The expected output keeps
; two separate min and two separate max instructions.
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0:
; GCN-DAG: v_min_f32
; GCN-DAG: v_max_f32
; GCN: v_min_f32
; GCN: v_max_f32
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  ; Extra use of the min result.
  store volatile float %lo, float addrspace(1)* undef
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %med = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
717
|
|
; Negative test. The max(a, b) result has a second use (a volatile store to an
; undef pointer) in addition to feeding the clamping min, and the kernel uses
; attribute group #1 ("no-nans-fp-math"="false"), so the min/max chain is
; expected to stay as separate instructions rather than a single v_med3_f32.
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1:
; GCN-NOT: v_med3_f32
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  ; Extra use of the max result.
  store volatile float %hi, float addrspace(1)* undef
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %med = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
736
|
|
; Negative test. The min(max(a, b), c) result has a second use (a volatile
; store to an undef pointer) in addition to feeding the final max, and the
; kernel uses attribute group #1 ("no-nans-fp-math"="false"), so the chain is
; expected to stay as separate instructions rather than a single v_med3_f32.
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2:
; GCN-NOT: v_med3_f32
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  ; Extra use of the clamped value.
  store volatile float %hi.clamp, float addrspace(1)* undef
  %med = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
755
|
|
756
|
|
; Negative test. Plain max(min(a, b), min(max(a, b), c)) on loaded values with
; no nnan flags anywhere and attribute group #1 ("no-nans-fp-math"="false").
; Nothing establishes that the inputs are NaN-free, so v_med3_f32 is expected
; not to be selected.
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0:
; GCN-NOT: v_med3_f32
define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %lo = call float @llvm.minnum.f32(float %a, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %med = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
774
|
|
; Negative test. The b and c inputs come from fadd instructions carrying the
; nnan flag, but the fadd producing the a input has no nnan flag. With one
; input not marked NaN-free, v_med3_f32 is expected not to be selected.
; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0:
; GCN-NOT: v_med3_f32
define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr

  ; The first fadd deliberately omits nnan.
  %a.add = fadd float %a, 1.0
  %b.add = fadd nnan float %b, 2.0
  %c.add = fadd nnan float %c, 4.0

  %lo = call float @llvm.minnum.f32(float %a.add, float %b.add)
  %hi = call float @llvm.maxnum.f32(float %a.add, float %b.add)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c.add)
  %med = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
797
|
|
; Negative test. The a and c inputs come from fadd instructions carrying the
; nnan flag, but the fadd producing the b input has no nnan flag. With one
; input not marked NaN-free, v_med3_f32 is expected not to be selected.
; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0:
; GCN-NOT: v_med3_f32
define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr

  ; The second fadd deliberately omits nnan.
  %a.add = fadd nnan float %a, 1.0
  %b.add = fadd float %b, 2.0
  %c.add = fadd nnan float %c, 4.0

  %lo = call float @llvm.minnum.f32(float %a.add, float %b.add)
  %hi = call float @llvm.maxnum.f32(float %a.add, float %b.add)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c.add)
  %med = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
820
|
|
; Negative test. The a and b inputs come from fadd instructions carrying the
; nnan flag, but the fadd producing the c input has no nnan flag. With one
; input not marked NaN-free, v_med3_f32 is expected not to be selected.
; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0:
; GCN-NOT: v_med3_f32
define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr

  ; The third fadd deliberately omits nnan.
  %a.add = fadd nnan float %a, 1.0
  %b.add = fadd nnan float %b, 2.0
  %c.add = fadd float %c, 4.0

  %lo = call float @llvm.minnum.f32(float %a.add, float %b.add)
  %hi = call float @llvm.maxnum.f32(float %a.add, float %b.add)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c.add)
  %med = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
843
|
|
; Negative test. The min uses the negated a (fsub -0.0, a) while the max uses
; the plain a, so the two do not share the same first operand. The expected
; output keeps two min and two max instructions.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_min_f32
; GCN-DAG: v_max_f32
; GCN-DAG: v_min_f32
; GCN-DAG: v_max_f32
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %a.neg = fsub float -0.0, %a
  ; Only the min sees the negated value.
  %lo = call float @llvm.minnum.f32(float %a.neg, float %b)
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  %med = call float @llvm.maxnum.f32(float %lo, float %hi.clamp)
  store float %med, float addrspace(1)* %out.ptr
  ret void
}
|
|
869
|
|
; A simple min and max is not sufficient
; Only min(max(a, b), c) is computed here, with no outer max against
; min(a, b), so the expected output is a plain v_max followed by a v_min.
; GCN-LABEL: {{^}}v_test_global_nnans_min_max_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]]
; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr float, float addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr float, float addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr float, float addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr float, float addrspace(1)* %out, i32 %lane
  %a = load volatile float, float addrspace(1)* %a.ptr
  %b = load volatile float, float addrspace(1)* %b.ptr
  %c = load volatile float, float addrspace(1)* %c.ptr
  %hi = call float @llvm.maxnum.f32(float %a, float %b)
  %hi.clamp = call float @llvm.minnum.f32(float %hi, float %c)
  store float %hi.clamp, float addrspace(1)* %out.ptr
  ret void
}
|
|
891
|
|
; Half-precision clamp of an nnan fadd result between the constants 2.0 and
; 4.0, written as min(max(a + 1.0, 2.0), 4.0). The three expectation groups
; below cover promotion to f32, separate f16 max/min, and a single v_med3_f16.
; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16:
; SI: v_cvt_f32_f16
; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
; SI: v_cvt_f16_f32

; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0
; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0
; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0

; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0
; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr half, half addrspace(1)* %aptr, i32 %lane
  %out.ptr = getelementptr half, half addrspace(1)* %out, i32 %lane
  %a = load half, half addrspace(1)* %a.ptr
  ; The nnan flag on the fadd is the only no-NaN information in this kernel.
  %a.inc = fadd nnan half %a, 1.0
  %floor = call half @llvm.maxnum.f16(half %a.inc, half 2.0)
  %clamped = call half @llvm.minnum.f16(half %floor, half 4.0)

  store half %clamped, half addrspace(1)* %out.ptr
  ret void
}
|
|
916
|
|
; Half-precision version of the three-input median pattern. All three inputs
; are produced by fadd instructions carrying the nnan flag and are combined as
;   max(min(a, b), min(max(a, b), c))
; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[C:v[0-9]+]]

; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_add_f32_e32
; SI: v_add_f32_e32
; SI: v_add_f32_e32
; SI: v_med3_f32
; SI: v_cvt_f16_f32_e32


; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]

; VI-DAG: v_min_f16
; VI-DAG: v_max_f16
; VI: v_min_f16
; VI: v_max_f16

; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %a.ptr = getelementptr half, half addrspace(1)* %aptr, i32 %lane
  %b.ptr = getelementptr half, half addrspace(1)* %bptr, i32 %lane
  %c.ptr = getelementptr half, half addrspace(1)* %cptr, i32 %lane
  %out.ptr = getelementptr half, half addrspace(1)* %out, i32 %lane
  %a = load volatile half, half addrspace(1)* %a.ptr
  %b = load volatile half, half addrspace(1)* %b.ptr
  %c = load volatile half, half addrspace(1)* %c.ptr

  ; Each input gets the nnan flag from its own fadd.
  %a.add = fadd nnan half %a, 1.0
  %b.add = fadd nnan half %b, 2.0
  %c.add = fadd nnan half %c, 4.0

  %lo = call half @llvm.minnum.f16(half %a.add, half %b.add)
  %hi = call half @llvm.maxnum.f16(half %a.add, half %b.add)
  %hi.clamp = call half @llvm.minnum.f16(half %hi, half %c.add)
  %med = call half @llvm.maxnum.f16(half %lo, half %hi.clamp)
  store half %med, half addrspace(1)* %out.ptr
  ret void
}
|
|
962
|
|
; Intrinsics referenced by the kernels in this file. All of them share
; attribute group #0.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.minnum.f64(double, double) #0
declare double @llvm.maxnum.f64(double, double) #0
declare half @llvm.fabs.f16(half) #0
declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0

; #0 is used on the intrinsic declarations above.
; #1 is used by kernels compiled with "no-nans-fp-math"="false".
; #2 is used by kernels compiled with "no-nans-fp-math"="true".
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
|