comparison: test/CodeGen/ARM/vuzp.ll @ 121:803732b1fca8

LLVM 5.0

author:   kono
date:     Fri, 27 Oct 2017 17:07:41 +0900
parents:  afa8332a0e37
children: (none)
120:1172e4bd9c6f (old, left column) | 121:803732b1fca8 (new, right column)
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | |
1 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s | 2 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s |
2 | 3 |
3 define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { | 4 define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { |
4 ; CHECK-LABEL: vuzpi8: | 5 ; CHECK-LABEL: vuzpi8: |
5 ; CHECK: @ BB#0: | 6 ; CHECK: @ BB#0: |
6 ; CHECK-NEXT: vldr d16, [r1] | 7 ; CHECK-NEXT: vldr d16, [r1] |
7 ; CHECK-NEXT: vldr d17, [r0] | 8 ; CHECK-NEXT: vldr d17, [r0] |
8 ; CHECK-NEXT: vuzp.8 d17, d16 | 9 ; CHECK-NEXT: vuzp.8 d17, d16 |
9 ; CHECK-NEXT: vadd.i8 d16, d17, d16 | 10 ; CHECK-NEXT: vmul.i8 d16, d17, d16 |
10 ; CHECK-NEXT: vmov r0, r1, d16 | 11 ; CHECK-NEXT: vmov r0, r1, d16 |
11 ; CHECK-NEXT: mov pc, lr | 12 ; CHECK-NEXT: mov pc, lr |
12 %tmp1 = load <8 x i8>, <8 x i8>* %A | 13 %tmp1 = load <8 x i8>, <8 x i8>* %A |
13 %tmp2 = load <8 x i8>, <8 x i8>* %B | 14 %tmp2 = load <8 x i8>, <8 x i8>* %B |
14 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> | 15 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> |
15 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> | 16 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> |
16 %tmp5 = add <8 x i8> %tmp3, %tmp4 | 17 %tmp5 = mul <8 x i8> %tmp3, %tmp4 |
17 ret <8 x i8> %tmp5 | 18 ret <8 x i8> %tmp5 |
18 } | 19 } |
19 | 20 |
20 define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { | 21 define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { |
21 ; CHECK-LABEL: vuzpi8_Qres: | 22 ; CHECK-LABEL: vuzpi8_Qres: |
22 ; CHECK: @ BB#0: | 23 ; CHECK: @ BB#0: |
23 ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1] | 24 ; CHECK-NEXT: vldr d17, [r1] |
24 ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0] | 25 ; CHECK-NEXT: vldr d16, [r0] |
25 ; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]] | 26 ; CHECK-NEXT: vuzp.8 d16, d17 |
26 ; CHECK-NEXT: vmov r0, r1, [[LDR0]] | 27 ; CHECK-NEXT: vmov r0, r1, d16 |
27 ; CHECK-NEXT: vmov r2, r3, [[LDR1]] | 28 ; CHECK-NEXT: vmov r2, r3, d17 |
28 ; CHECK-NEXT: mov pc, lr | 29 ; CHECK-NEXT: mov pc, lr |
29 %tmp1 = load <8 x i8>, <8 x i8>* %A | 30 %tmp1 = load <8 x i8>, <8 x i8>* %A |
30 %tmp2 = load <8 x i8>, <8 x i8>* %B | 31 %tmp2 = load <8 x i8>, <8 x i8>* %B |
31 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> | 32 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> |
32 ret <16 x i8> %tmp3 | 33 ret <16 x i8> %tmp3 |
36 ; CHECK-LABEL: vuzpi16: | 37 ; CHECK-LABEL: vuzpi16: |
37 ; CHECK: @ BB#0: | 38 ; CHECK: @ BB#0: |
38 ; CHECK-NEXT: vldr d16, [r1] | 39 ; CHECK-NEXT: vldr d16, [r1] |
39 ; CHECK-NEXT: vldr d17, [r0] | 40 ; CHECK-NEXT: vldr d17, [r0] |
40 ; CHECK-NEXT: vuzp.16 d17, d16 | 41 ; CHECK-NEXT: vuzp.16 d17, d16 |
41 ; CHECK-NEXT: vadd.i16 d16, d17, d16 | 42 ; CHECK-NEXT: vmul.i16 d16, d17, d16 |
42 ; CHECK-NEXT: vmov r0, r1, d16 | 43 ; CHECK-NEXT: vmov r0, r1, d16 |
43 ; CHECK-NEXT: mov pc, lr | 44 ; CHECK-NEXT: mov pc, lr |
44 %tmp1 = load <4 x i16>, <4 x i16>* %A | 45 %tmp1 = load <4 x i16>, <4 x i16>* %A |
45 %tmp2 = load <4 x i16>, <4 x i16>* %B | 46 %tmp2 = load <4 x i16>, <4 x i16>* %B |
46 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6> | 47 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6> |
47 %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7> | 48 %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7> |
48 %tmp5 = add <4 x i16> %tmp3, %tmp4 | 49 %tmp5 = mul <4 x i16> %tmp3, %tmp4 |
49 ret <4 x i16> %tmp5 | 50 ret <4 x i16> %tmp5 |
50 } | 51 } |
51 | 52 |
52 define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind { | 53 define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind { |
53 ; CHECK-LABEL: vuzpi16_Qres: | 54 ; CHECK-LABEL: vuzpi16_Qres: |
54 ; CHECK: @ BB#0: | 55 ; CHECK: @ BB#0: |
55 ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1] | 56 ; CHECK-NEXT: vldr d17, [r1] |
56 ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0] | 57 ; CHECK-NEXT: vldr d16, [r0] |
57 ; CHECK-NEXT: vuzp.16 [[LDR0]], [[LDR1]] | 58 ; CHECK-NEXT: vuzp.16 d16, d17 |
58 ; CHECK-NEXT: vmov r0, r1, [[LDR0]] | 59 ; CHECK-NEXT: vmov r0, r1, d16 |
59 ; CHECK-NEXT: vmov r2, r3, [[LDR1]] | 60 ; CHECK-NEXT: vmov r2, r3, d17 |
60 ; CHECK-NEXT: mov pc, lr | 61 ; CHECK-NEXT: mov pc, lr |
61 %tmp1 = load <4 x i16>, <4 x i16>* %A | 62 %tmp1 = load <4 x i16>, <4 x i16>* %A |
62 %tmp2 = load <4 x i16>, <4 x i16>* %B | 63 %tmp2 = load <4 x i16>, <4 x i16>* %B |
63 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> | 64 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> |
64 ret <8 x i16> %tmp3 | 65 ret <8 x i16> %tmp3 |
204 ; CHECK-LABEL: vuzpi8_undef: | 205 ; CHECK-LABEL: vuzpi8_undef: |
205 ; CHECK: @ BB#0: | 206 ; CHECK: @ BB#0: |
206 ; CHECK-NEXT: vldr d16, [r1] | 207 ; CHECK-NEXT: vldr d16, [r1] |
207 ; CHECK-NEXT: vldr d17, [r0] | 208 ; CHECK-NEXT: vldr d17, [r0] |
208 ; CHECK-NEXT: vuzp.8 d17, d16 | 209 ; CHECK-NEXT: vuzp.8 d17, d16 |
209 ; CHECK-NEXT: vadd.i8 d16, d17, d16 | 210 ; CHECK-NEXT: vmul.i8 d16, d17, d16 |
210 ; CHECK-NEXT: vmov r0, r1, d16 | 211 ; CHECK-NEXT: vmov r0, r1, d16 |
211 ; CHECK-NEXT: mov pc, lr | 212 ; CHECK-NEXT: mov pc, lr |
212 %tmp1 = load <8 x i8>, <8 x i8>* %A | 213 %tmp1 = load <8 x i8>, <8 x i8>* %A |
213 %tmp2 = load <8 x i8>, <8 x i8>* %B | 214 %tmp2 = load <8 x i8>, <8 x i8>* %B |
214 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14> | 215 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14> |
215 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15> | 216 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15> |
216 %tmp5 = add <8 x i8> %tmp3, %tmp4 | 217 %tmp5 = mul <8 x i8> %tmp3, %tmp4 |
217 ret <8 x i8> %tmp5 | 218 ret <8 x i8> %tmp5 |
218 } | 219 } |
219 | 220 |
220 define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { | 221 define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind { |
221 ; CHECK-LABEL: vuzpi8_undef_Qres: | 222 ; CHECK-LABEL: vuzpi8_undef_Qres: |
222 ; CHECK: @ BB#0: | 223 ; CHECK: @ BB#0: |
223 ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1] | 224 ; CHECK-NEXT: vldr d17, [r1] |
224 ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0] | 225 ; CHECK-NEXT: vldr d16, [r0] |
225 ; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]] | 226 ; CHECK-NEXT: vuzp.8 d16, d17 |
226 ; CHECK-NEXT: vmov r0, r1, [[LDR0]] | 227 ; CHECK-NEXT: vmov r0, r1, d16 |
227 ; CHECK-NEXT: vmov r2, r3, [[LDR1]] | 228 ; CHECK-NEXT: vmov r2, r3, d17 |
228 ; CHECK-NEXT: mov pc, lr | 229 ; CHECK-NEXT: mov pc, lr |
229 %tmp1 = load <8 x i8>, <8 x i8>* %A | 230 %tmp1 = load <8 x i8>, <8 x i8>* %A |
230 %tmp2 = load <8 x i8>, <8 x i8>* %B | 231 %tmp2 = load <8 x i8>, <8 x i8>* %B |
231 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15> | 232 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15> |
232 ret <16 x i8> %tmp3 | 233 ret <16 x i8> %tmp3 |
264 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15> | 265 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15> |
265 ret <16 x i16> %tmp3 | 266 ret <16 x i16> %tmp3 |
266 } | 267 } |
267 | 268 |
268 define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) { | 269 define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) { |
270 ; CHECK-LABEL: vuzp_lower_shufflemask_undef: | |
271 ; CHECK: @ BB#0: @ %entry | |
272 ; CHECK-NEXT: vldr d17, [r1] | |
273 ; CHECK-NEXT: vldr d16, [r0] | |
274 ; CHECK-NEXT: vorr q9, q8, q8 | |
275 ; CHECK-NEXT: vuzp.16 q8, q9 | |
276 ; CHECK-NEXT: vmov r0, r1, d18 | |
277 ; CHECK-NEXT: vmov r2, r3, d19 | |
278 ; CHECK-NEXT: mov pc, lr | |
269 entry: | 279 entry: |
270 ; CHECK-LABEL: vuzp_lower_shufflemask_undef | |
271 ; CHECK: vuzp | |
272 %tmp1 = load <4 x i16>, <4 x i16>* %A | 280 %tmp1 = load <4 x i16>, <4 x i16>* %A |
273 %tmp2 = load <4 x i16>, <4 x i16>* %B | 281 %tmp2 = load <4 x i16>, <4 x i16>* %B |
274 %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7> | 282 %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7> |
275 ret <8 x i16> %0 | 283 ret <8 x i16> %0 |
276 } | 284 } |
277 | 285 |
278 define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) { | 286 define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) { |
287 ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed: | |
288 ; CHECK: @ BB#0: @ %entry | |
289 ; CHECK-NEXT: vldr d17, [r1] | |
290 ; CHECK-NEXT: vldr d16, [r0] | |
291 ; CHECK-NEXT: vdup.32 q9, d16[0] | |
292 ; CHECK-NEXT: vuzp.32 q8, q9 | |
293 ; CHECK-NEXT: vext.32 q8, q9, q9, #2 | |
294 ; CHECK-NEXT: vmov r0, r1, d16 | |
295 ; CHECK-NEXT: vmov r2, r3, d17 | |
296 ; CHECK-NEXT: mov pc, lr | |
279 entry: | 297 entry: |
280 ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed | |
281 ; CHECK-NOT: vtrn | |
282 ; CHECK: vuzp | |
283 %tmp1 = load <2 x i32>, <2 x i32>* %A | 298 %tmp1 = load <2 x i32>, <2 x i32>* %A |
284 %tmp2 = load <2 x i32>, <2 x i32>* %B | 299 %tmp2 = load <2 x i32>, <2 x i32>* %B |
285 %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3> | 300 %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3> |
286 ret <4 x i32> %0 | 301 ret <4 x i32> %0 |
287 } | 302 } |
288 | 303 |
289 define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) { | 304 define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) { |
305 ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn: | |
306 ; CHECK: @ BB#0: @ %entry | |
307 ; CHECK-NEXT: vldr d17, [r1] | |
308 ; CHECK-NEXT: vldr d16, [r0] | |
309 ; CHECK-NEXT: vrev64.32 q9, q8 | |
310 ; CHECK-NEXT: vuzp.32 q8, q9 | |
311 ; CHECK-NEXT: vst1.64 {d18, d19}, [r2] | |
312 ; CHECK-NEXT: mov pc, lr | |
290 entry: | 313 entry: |
291 ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn | |
292 ; CHECK-NOT: vtrn | |
293 ; CHECK: vuzp | |
294 %tmp1 = load <2 x i32>, <2 x i32>* %A | 314 %tmp1 = load <2 x i32>, <2 x i32>* %A |
295 %tmp2 = load <2 x i32>, <2 x i32>* %B | 315 %tmp2 = load <2 x i32>, <2 x i32>* %B |
296 %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2> | 316 %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2> |
297 store <4 x i32> %0, <4 x i32>* %C | 317 store <4 x i32> %0, <4 x i32>* %C |
298 ret void | 318 ret void |
299 } | 319 } |
300 | 320 |
301 define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) { | 321 define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) { |
302 ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8. | 322 ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8. |
303 ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to | 323 ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to |
304 ; truncate from i32 to i16 and one vuzp to perform the final truncation for i8. | 324 ; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation for i8. |
305 ; CHECK-LABEL: vuzp_trunc | 325 ; CHECK-LABEL: cmpsel_trunc: |
306 ; CHECK: vmovn.i32 | 326 ; CHECK: @ BB#0: |
307 ; CHECK: vmovn.i32 | 327 ; CHECK-NEXT: add r12, sp, #16 |
308 ; CHECK: vuzp | 328 ; CHECK-NEXT: vld1.64 {d16, d17}, [r12] |
309 ; CHECK: vbsl | 329 ; CHECK-NEXT: mov r12, sp |
330 ; CHECK-NEXT: vld1.64 {d18, d19}, [r12] | |
331 ; CHECK-NEXT: add r12, sp, #48 | |
332 ; CHECK-NEXT: vld1.64 {d20, d21}, [r12] | |
333 ; CHECK-NEXT: add r12, sp, #32 | |
334 ; CHECK-NEXT: vcgt.u32 q8, q10, q8 | |
335 ; CHECK-NEXT: vld1.64 {d20, d21}, [r12] | |
336 ; CHECK-NEXT: vcgt.u32 q9, q10, q9 | |
337 ; CHECK-NEXT: vmov d20, r2, r3 | |
338 ; CHECK-NEXT: vmovn.i32 d17, q8 | |
339 ; CHECK-NEXT: vmovn.i32 d16, q9 | |
340 ; CHECK-NEXT: vmov d18, r0, r1 | |
341 ; CHECK-NEXT: vmovn.i16 d16, q8 | |
342 ; CHECK-NEXT: vbsl d16, d18, d20 | |
343 ; CHECK-NEXT: vmov r0, r1, d16 | |
344 ; CHECK-NEXT: mov pc, lr | |
310 %c = icmp ult <8 x i32> %cmp0, %cmp1 | 345 %c = icmp ult <8 x i32> %cmp0, %cmp1 |
311 %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1 | 346 %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1 |
312 ret <8 x i8> %res | 347 ret <8 x i8> %res |
313 } | 348 } |
314 | 349 |
315 ; Shuffle the result from the compare with a <4 x i8>. | 350 ; Shuffle the result from the compare with a <4 x i8>. |
316 ; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able | 351 ; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able |
317 ; to perform the vuzp and get the vbsl mask. | 352 ; to perform the vuzp and get the vbsl mask. |
318 define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1, | 353 define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1, |
354 ; CHECK-LABEL: vuzp_trunc_and_shuffle: | |
355 ; CHECK: @ BB#0: | |
356 ; CHECK-NEXT: .save {r11, lr} | |
357 ; CHECK-NEXT: push {r11, lr} | |
358 ; CHECK-NEXT: add r12, sp, #8 | |
359 ; CHECK-NEXT: add lr, sp, #24 | |
360 ; CHECK-NEXT: vld1.64 {d16, d17}, [r12] | |
361 ; CHECK-NEXT: ldr r12, [sp, #40] | |
362 ; CHECK-NEXT: vld1.64 {d18, d19}, [lr] | |
363 ; CHECK-NEXT: vcgt.u32 q8, q9, q8 | |
364 ; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32] | |
365 ; CHECK-NEXT: vmov.i8 d19, #0x7 | |
366 ; CHECK-NEXT: vmovl.u8 q10, d18 | |
367 ; CHECK-NEXT: vmovn.i32 d16, q8 | |
368 ; CHECK-NEXT: vneg.s8 d17, d19 | |
369 ; CHECK-NEXT: vmov d18, r2, r3 | |
370 ; CHECK-NEXT: vuzp.8 d16, d20 | |
371 ; CHECK-NEXT: vshl.i8 d16, d16, #7 | |
372 ; CHECK-NEXT: vshl.s8 d16, d16, d17 | |
373 ; CHECK-NEXT: vmov d17, r0, r1 | |
374 ; CHECK-NEXT: vbsl d16, d17, d18 | |
375 ; CHECK-NEXT: vmov r0, r1, d16 | |
376 ; CHECK-NEXT: pop {r11, lr} | |
377 ; CHECK-NEXT: mov pc, lr | |
319 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { | 378 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { |
320 ; CHECK-LABEL: vuzp_trunc_and_shuffle | |
321 ; CHECK: vmovl | |
322 ; CHECK: vuzp | |
323 ; CHECK: vbsl | |
324 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 | 379 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 |
325 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> | 380 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> |
326 %c0 = icmp ult <4 x i32> %cmp0, %cmp1 | 381 %c0 = icmp ult <4 x i32> %cmp0, %cmp1 |
327 %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> | 382 %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
328 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 | 383 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 |
330 } | 385 } |
331 | 386 |
332 ; Use an undef value for the <4 x i8> that is being shuffled with the compare result. | 387 ; Use an undef value for the <4 x i8> that is being shuffled with the compare result. |
333 ; This produces a build_vector with some of the operands undefs. | 388 ; This produces a build_vector with some of the operands undefs. |
334 define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1, | 389 define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1, |
390 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right: | |
391 ; CHECK: @ BB#0: | |
392 ; CHECK-NEXT: mov r12, sp | |
393 ; CHECK-NEXT: vld1.64 {d16, d17}, [r12] | |
394 ; CHECK-NEXT: add r12, sp, #16 | |
395 ; CHECK-NEXT: vld1.64 {d18, d19}, [r12] | |
396 ; CHECK-NEXT: vcgt.u32 q8, q9, q8 | |
397 ; CHECK-NEXT: vmov.i8 d18, #0x7 | |
398 ; CHECK-NEXT: vmovn.i32 d16, q8 | |
399 ; CHECK-NEXT: vuzp.8 d16, d17 | |
400 ; CHECK-NEXT: vneg.s8 d17, d18 | |
401 ; CHECK-NEXT: vshl.i8 d16, d16, #7 | |
402 ; CHECK-NEXT: vmov d18, r2, r3 | |
403 ; CHECK-NEXT: vshl.s8 d16, d16, d17 | |
404 ; CHECK-NEXT: vmov d17, r0, r1 | |
405 ; CHECK-NEXT: vbsl d16, d17, d18 | |
406 ; CHECK-NEXT: vmov r0, r1, d16 | |
407 ; CHECK-NEXT: mov pc, lr | |
335 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { | 408 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { |
336 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right | |
337 ; CHECK: vuzp | |
338 ; CHECK: vbsl | |
339 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 | 409 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 |
340 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> | 410 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> |
341 %c0 = icmp ult <4 x i32> %cmp0, %cmp1 | 411 %c0 = icmp ult <4 x i32> %cmp0, %cmp1 |
342 %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> | 412 %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
343 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 | 413 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 |
344 ret <8 x i8> %rv | 414 ret <8 x i8> %rv |
345 } | 415 } |
346 | 416 |
347 define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1, | 417 define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1, |
418 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left: | |
419 ; CHECK: @ BB#0: | |
420 ; CHECK-NEXT: mov r12, sp | |
421 ; CHECK-NEXT: vld1.64 {d16, d17}, [r12] | |
422 ; CHECK-NEXT: add r12, sp, #16 | |
423 ; CHECK-NEXT: vld1.64 {d18, d19}, [r12] | |
424 ; CHECK-NEXT: vcgt.u32 q8, q9, q8 | |
425 ; CHECK-NEXT: vldr d18, .LCPI22_0 | |
426 ; CHECK-NEXT: vmov.i8 d19, #0x7 | |
427 ; CHECK-NEXT: vmovn.i32 d16, q8 | |
428 ; CHECK-NEXT: vtbl.8 d16, {d16}, d18 | |
429 ; CHECK-NEXT: vneg.s8 d17, d19 | |
430 ; CHECK-NEXT: vmov d18, r2, r3 | |
431 ; CHECK-NEXT: vshl.i8 d16, d16, #7 | |
432 ; CHECK-NEXT: vshl.s8 d16, d16, d17 | |
433 ; CHECK-NEXT: vmov d17, r0, r1 | |
434 ; CHECK-NEXT: vbsl d16, d17, d18 | |
435 ; CHECK-NEXT: vmov r0, r1, d16 | |
436 ; CHECK-NEXT: mov pc, lr | |
437 ; CHECK-NEXT: .p2align 3 | |
438 ; CHECK-NEXT: @ BB#1: | |
439 ; CHECK-NEXT: .LCPI22_0: | |
440 ; CHECK-NEXT: .byte 255 @ 0xff | |
441 ; CHECK-NEXT: .byte 255 @ 0xff | |
442 ; CHECK-NEXT: .byte 255 @ 0xff | |
443 ; CHECK-NEXT: .byte 255 @ 0xff | |
444 ; CHECK-NEXT: .byte 0 @ 0x0 | |
445 ; CHECK-NEXT: .byte 2 @ 0x2 | |
446 ; CHECK-NEXT: .byte 4 @ 0x4 | |
447 ; CHECK-NEXT: .byte 6 @ 0x6 | |
348 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { | 448 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) { |
349 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left | |
350 ; CHECK: vuzp | |
351 ; CHECK: vbsl | |
352 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 | 449 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4 |
353 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> | 450 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1> |
354 %c0 = icmp ult <4 x i32> %cmp0, %cmp1 | 451 %c0 = icmp ult <4 x i32> %cmp0, %cmp1 |
355 %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> | 452 %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
356 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 | 453 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1 |
358 } | 455 } |
359 | 456 |
360 ; We're using large data types here, and we have to fill with undef values until we | 457 ; We're using large data types here, and we have to fill with undef values until we |
361 ; get some vector size that we can represent. | 458 ; get some vector size that we can represent. |
362 define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1, | 459 define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1, |
460 ; CHECK-LABEL: vuzp_wide_type: | |
461 ; CHECK: @ BB#0: | |
462 ; CHECK-NEXT: .save {r4, r10, r11, lr} | |
463 ; CHECK-NEXT: push {r4, r10, r11, lr} | |
464 ; CHECK-NEXT: .setfp r11, sp, #8 | |
465 ; CHECK-NEXT: add r11, sp, #8 | |
466 ; CHECK-NEXT: bic sp, sp, #15 | |
467 ; CHECK-NEXT: add r12, r11, #32 | |
468 ; CHECK-NEXT: add lr, r11, #60 | |
469 ; CHECK-NEXT: vld1.32 {d17[0]}, [r12:32] | |
470 ; CHECK-NEXT: add r12, r11, #24 | |
471 ; CHECK-NEXT: vld1.32 {d22[0]}, [lr:32] | |
472 ; CHECK-NEXT: add lr, r11, #36 | |
473 ; CHECK-NEXT: vld1.32 {d16[0]}, [r12:32] | |
474 ; CHECK-NEXT: add r12, r11, #52 | |
475 ; CHECK-NEXT: vld1.32 {d19[0]}, [r12:32] | |
476 ; CHECK-NEXT: add r12, r11, #44 | |
477 ; CHECK-NEXT: vld1.32 {d17[1]}, [lr:32] | |
478 ; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32] | |
479 ; CHECK-NEXT: add r12, r11, #40 | |
480 ; CHECK-NEXT: vld1.32 {d20[0]}, [r12:32] | |
481 ; CHECK-NEXT: ldr r12, [r11, #64] | |
482 ; CHECK-NEXT: vcgt.u32 q10, q11, q10 | |
483 ; CHECK-NEXT: ldr r4, [r12] | |
484 ; CHECK-NEXT: vmov.32 d25[0], r4 | |
485 ; CHECK-NEXT: add r4, r11, #28 | |
486 ; CHECK-NEXT: vld1.32 {d16[1]}, [r4:32] | |
487 ; CHECK-NEXT: add r4, r11, #56 | |
488 ; CHECK-NEXT: vld1.32 {d19[1]}, [r4:32] | |
489 ; CHECK-NEXT: add r4, r11, #48 | |
490 ; CHECK-NEXT: vmov.u8 lr, d25[3] | |
491 ; CHECK-NEXT: vld1.32 {d18[1]}, [r4:32] | |
492 ; CHECK-NEXT: add r4, r12, #4 | |
493 ; CHECK-NEXT: vcgt.u32 q8, q9, q8 | |
494 ; CHECK-NEXT: vmovn.i32 d19, q10 | |
495 ; CHECK-NEXT: vldr d20, .LCPI23_0 | |
496 ; CHECK-NEXT: vmov.i8 d18, #0x7 | |
497 ; CHECK-NEXT: vmovn.i32 d16, q8 | |
498 ; CHECK-NEXT: vneg.s8 d17, d18 | |
499 ; CHECK-NEXT: vuzp.8 d16, d19 | |
500 ; CHECK-NEXT: vmov.i8 q9, #0x7 | |
501 ; CHECK-NEXT: vshl.i8 d16, d16, #7 | |
502 ; CHECK-NEXT: vneg.s8 q9, q9 | |
503 ; CHECK-NEXT: vshl.s8 d24, d16, d17 | |
504 ; CHECK-NEXT: vmov.8 d17[0], lr | |
505 ; CHECK-NEXT: vtbl.8 d16, {d24, d25}, d20 | |
506 ; CHECK-NEXT: vld1.8 {d17[1]}, [r4] | |
507 ; CHECK-NEXT: add r4, r11, #8 | |
508 ; CHECK-NEXT: vshl.i8 q8, q8, #7 | |
509 ; CHECK-NEXT: vld1.64 {d20, d21}, [r4] | |
510 ; CHECK-NEXT: vshl.s8 q8, q8, q9 | |
511 ; CHECK-NEXT: vmov d19, r2, r3 | |
512 ; CHECK-NEXT: vmov d18, r0, r1 | |
513 ; CHECK-NEXT: vbsl q8, q9, q10 | |
514 ; CHECK-NEXT: vmov r0, r1, d16 | |
515 ; CHECK-NEXT: vmov r2, r3, d17 | |
516 ; CHECK-NEXT: sub sp, r11, #8 | |
517 ; CHECK-NEXT: pop {r4, r10, r11, lr} | |
518 ; CHECK-NEXT: mov pc, lr | |
519 ; CHECK-NEXT: .p2align 3 | |
520 ; CHECK-NEXT: @ BB#1: | |
521 ; CHECK-NEXT: .LCPI23_0: | |
522 ; CHECK-NEXT: .byte 0 @ 0x0 | |
523 ; CHECK-NEXT: .byte 1 @ 0x1 | |
524 ; CHECK-NEXT: .byte 2 @ 0x2 | |
525 ; CHECK-NEXT: .byte 3 @ 0x3 | |
526 ; CHECK-NEXT: .byte 4 @ 0x4 | |
527 ; CHECK-NEXT: .byte 8 @ 0x8 | |
528 ; CHECK-NEXT: .byte 9 @ 0x9 | |
529 ; CHECK-NEXT: .byte 10 @ 0xa | |
363 <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) { | 530 <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) { |
364 ; CHECK-LABEL: vuzp_wide_type | |
365 ; CHECK: vbsl | |
366 %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4 | 531 %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4 |
367 %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1> | 532 %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1> |
368 %c0 = icmp ult <5 x i32> %cmp0, %cmp1 | 533 %c0 = icmp ult <5 x i32> %cmp0, %cmp1 |
369 %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> | 534 %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> |
370 %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1 | 535 %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1 |
371 ret <10 x i8> %rv | 536 ret <10 x i8> %rv |
372 } | 537 } |
538 | |
539 %struct.uint8x8x2_t = type { [2 x <8 x i8>] } | |
540 define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 { | |
541 ; CHECK-LABEL: vuzp_extract_subvector: | |
542 ; CHECK: @ BB#0: | |
543 ; CHECK-NEXT: vmov d17, r2, r3 | |
544 ; CHECK-NEXT: vmov d16, r0, r1 | |
545 ; CHECK-NEXT: vorr d18, d17, d17 | |
546 ; CHECK-NEXT: vuzp.8 d16, d18 | |
547 ; CHECK-NEXT: vmov r0, r1, d16 | |
548 ; CHECK-NEXT: vmov r2, r3, d18 | |
549 ; CHECK-NEXT: mov pc, lr | |
550 | |
551 %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> | |
552 %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> | |
553 %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0 | |
554 %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1 | |
555 ret %struct.uint8x8x2_t %.fca.0.1.insert | |
556 } |
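
For context on how the right-hand column was produced: the test is driven by the RUN line at the top of the file, and the NOTE added in 121:803732b1fca8 records that the CHECK/CHECK-NEXT bodies were autogenerated by utils/update_llc_test_checks.py rather than written by hand. A minimal sketch of running and regenerating the test, assuming a checkout rooted at the repository top with llc and FileCheck from a local LLVM build on PATH (the paths and the PATH assumption are not part of this changeset):

  # Mirror the RUN line: compile the IR to ARM NEON assembly and check it
  # against the CHECK/CHECK-NEXT directives embedded in the same file.
  llc -mtriple=arm-eabi -mattr=+neon test/CodeGen/ARM/vuzp.ll -o - \
    | FileCheck test/CodeGen/ARM/vuzp.ll

  # Regenerate the autogenerated CHECK-NEXT blocks after editing the IR;
  # the emitted assembly (and thus the directives) depends on the local llc.
  utils/update_llc_test_checks.py test/CodeGen/ARM/vuzp.ll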