; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s

declare i1 @llvm.amdgcn.class.f32(float, i32) #1
declare i1 @llvm.amdgcn.class.f64(double, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
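
; The tests below check selection of llvm.amdgcn.class.f32/f64 to
; v_cmp_class_f32/f64, covering fabs/fneg source modifiers, inline-immediate
; and literal class-mask operands, and folding of combined class tests.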

; SI-LABEL: {{^}}test_class_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
  %a.fabs = call float @llvm.fabs.f32(float %a) #1
  %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fneg_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
  %a.fneg = fsub float -0.0, %a
  %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
  %a.fabs = call float @llvm.fabs.f32(float %a) #1
  %a.fneg.fabs = fsub float -0.0, %a.fabs
  %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_1_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}}
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_64_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}}
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; Set all 10 bits of mask
; SI-LABEL: {{^}}test_class_full_mask_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_9bit_mask_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}v_test_class_full_mask_f32:
; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f32_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32:
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f32(float 1.0, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; FIXME: Why isn't this using a literal constant operand?
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32:
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
; SI-DAG: s_mov_b32 [[VK:s[0-9]+]], 0x44800000
; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f32(float 1024.0, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
  %a.fabs = call double @llvm.fabs.f64(double %a) #1
  %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fneg_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
  %a.fneg = fsub double -0.0, %a
  %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
  %a.fabs = call double @llvm.fabs.f64(double %a) #1
  %a.fneg.fabs = fsub double -0.0, %a.fabs
  %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_1_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_64_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; Set all 9 bits of mask
; SI-LABEL: {{^}}test_class_full_mask_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x i32], double %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}v_test_class_full_mask_f64:
; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %in

  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64:
; XSI: v_cmp_class_f64_e32 vcc, 1.0,
; SI: v_cmp_class_f64_e32 vcc,
; SI: s_endpgm
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.in

  %result = call i1 @llvm.amdgcn.class.f64(double 1024.0, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 3) #1
  %or = or i1 %class0, %class1

  %sext = sext i1 %or to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
  %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
  %or.0 = or i1 %class0, %class1
  %or.1 = or i1 %or.0, %class2

  %sext = sext i1 %or.1 to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
; SI-NOT: v_cmp_class
; SI: s_movk_i32 [[MASK:s[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
  %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
  %class3 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
  %class4 = call i1 @llvm.amdgcn.class.f32(float %a, i32 16) #1
  %class5 = call i1 @llvm.amdgcn.class.f32(float %a, i32 32) #1
  %class6 = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
  %class7 = call i1 @llvm.amdgcn.class.f32(float %a, i32 128) #1
  %class8 = call i1 @llvm.amdgcn.class.f32(float %a, i32 256) #1
  %class9 = call i1 @llvm.amdgcn.class.f32(float %a, i32 512) #1
  %or.0 = or i1 %class0, %class1
  %or.1 = or i1 %or.0, %class2
  %or.2 = or i1 %or.1, %class3
  %or.3 = or i1 %or.2, %class4
  %or.4 = or i1 %or.3, %class5
  %or.5 = or i1 %or.4, %class6
  %or.6 = or i1 %or.5, %class7
  %or.7 = or i1 %or.6, %class8
  %or.8 = or i1 %or.7, %class9
  %sext = sext i1 %or.8 to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or_class_f32_1:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
  %or = or i1 %class0, %class1

  %sext = sext i1 %or to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_or_class_f32_2:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
  %or = or i1 %class0, %class1

  %sext = sext i1 %or to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
; SI: s_or_b64
; SI: s_endpgm
define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep.in

  %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
  %class1 = call i1 @llvm.amdgcn.class.f32(float %b, i32 8) #1
  %or = or i1 %class0, %class1

  %sext = sext i1 %or to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_0_f32:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_class_0_f64:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
  %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; FIXME: Why is the extension still here?
; SI-LABEL: {{^}}test_class_undef_f32:
; SI-NOT: v_cmp_class
; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1,
; SI: buffer_store_dword
define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
  %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
  %sext = sext i1 %result to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}test_fold_and_ord:
; SI: s_waitcnt
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_ord(float %a) {
  %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
  %ord = fcmp ord float %a, %a
  %and = and i1 %ord, %class
  ret i1 %and
}

; SI-LABEL: {{^}}test_fold_and_unord:
; SI: s_waitcnt
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_unord(float %a) {
  %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
  %ord = fcmp uno float %a, %a
  %and = and i1 %ord, %class
  ret i1 %and
}

; SI-LABEL: {{^}}test_fold_and_ord_multi_use:
; SI: v_cmp_class
; SI-NOT: v_cmp_class
; SI: v_cmp_o
; SI: s_and_b64
define i1 @test_fold_and_ord_multi_use(float %a) {
  %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
  store volatile i1 %class, i1 addrspace(1)* undef
  %ord = fcmp ord float %a, %a
  %and = and i1 %ord, %class
  ret i1 %and
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }