; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-LABEL: v_saddsat_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7f, v0
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffffff80, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT:    v_min_i16_e32 v0, 0x7f, v0
; GFX8-NEXT:    v_max_i16_e32 v0, 0xff80, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i8:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}
define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-LABEL: v_saddsat_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
  ret i16 %result
}
define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-LABEL: v_saddsat_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v0, v1
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v0, v1
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
  ret i32 %result
}
define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v4, v3, v2
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v4
; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v1
; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v2i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ret <2 x i16> %result
}
define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v3i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    v_min_i32_e32 v2, 0x7fff, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_i32_e32 v3, 0xffff8000, v2
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v3i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v3i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
  ret <3 x i16> %result
}
define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v7
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT:    v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT:    v_min_i32_e32 v2, 0x7fff, v2
; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT:    v_max_i32_e32 v2, 0xffff8000, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 15, v6
; GFX8-NEXT:    v_xor_b32_e32 v4, 0xffff8000, v4
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v2
; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX8-NEXT:    v_add_u16_e32 v5, v4, v2
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 15, v5
; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v3
; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v4i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}
define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-LABEL: v_saddsat_v2i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v0, v2
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v1, v3
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v0, v2
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v1, v3
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %result
}
define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_saddsat_i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_saddsat_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_saddsat_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
  ret i64 %result
}
; Declarations of the saturating-signed-add intrinsics exercised above.
declare i8 @llvm.sadd.sat.i8(i8, i8) #0
declare i16 @llvm.sadd.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare i32 @llvm.sadd.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare i64 @llvm.sadd.sat.i64(i64, i64) #0