; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9

define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v0, s2, s2
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
; GCN-LABEL: s_cvt_pkrtz_undef_undef:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v5, s5
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, 1.0
; VI-NEXT:    flat_store_dword v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, 1.0
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, 1.0, v0
; VI-NEXT:    flat_store_dword v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, 1.0, v0
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v5, s5
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, v1
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, -v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v5, s5
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, -v1
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, -v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v5, s5
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, -v1
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v5, s5
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %neg.fabs.a = fsub float -0.0, %fabs.a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Intrinsic declarations and function attributes referenced by the kernels above.
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1


attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }