comparison: test/CodeGen/ARM/vuzp.ll @ 121:803732b1fca8
LLVM 5.0
author:   kono
date:     Fri, 27 Oct 2017 17:07:41 +0900
parents:  afa8332a0e37
children: (none)

diff -r 1172e4bd9c6f -r 803732b1fca8 test/CodeGen/ARM/vuzp.ll
--- a/test/CodeGen/ARM/vuzp.ll
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -1,32 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
 
 define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8:
 ; CHECK: @ BB#0:
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.8 d17, d16
-; CHECK-NEXT: vadd.i8 d16, d17, d16
+; CHECK-NEXT: vmul.i8 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-%tmp5 = add <8 x i8> %tmp3, %tmp4
+%tmp5 = mul <8 x i8> %tmp3, %tmp4
 ret <8 x i8> %tmp5
 }
 
 define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
-; CHECK-NEXT: vmov r0, r1, [[LDR0]]
-; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vuzp.8 d16, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ret <16 x i8> %tmp3
@@ -36,29 +37,29 @@
 ; CHECK-LABEL: vuzpi16:
 ; CHECK: @ BB#0:
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.16 d17, d16
-; CHECK-NEXT: vadd.i16 d16, d17, d16
+; CHECK-NEXT: vmul.i16 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-%tmp5 = add <4 x i16> %tmp3, %tmp4
+%tmp5 = mul <4 x i16> %tmp3, %tmp4
 ret <4 x i16> %tmp5
 }
 
 define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vuzpi16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT: vuzp.16 [[LDR0]], [[LDR1]]
-; CHECK-NEXT: vmov r0, r1, [[LDR0]]
-; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vuzp.16 d16, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ret <8 x i16> %tmp3
@@ -204,29 +205,29 @@
 ; CHECK-LABEL: vuzpi8_undef:
 ; CHECK: @ BB#0:
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d17, [r0]
 ; CHECK-NEXT: vuzp.8 d17, d16
-; CHECK-NEXT: vadd.i8 d16, d17, d16
+; CHECK-NEXT: vmul.i8 d16, d17, d16
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
-%tmp5 = add <8 x i8> %tmp3, %tmp4
+%tmp5 = mul <8 x i8> %tmp3, %tmp4
 ret <8 x i8> %tmp5
 }
 
 define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
-; CHECK-NEXT: vmov r0, r1, [[LDR0]]
-; CHECK-NEXT: vmov r2, r3, [[LDR1]]
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vuzp.8 d16, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
 ret <16 x i8> %tmp3
@@ -264,109 +265,292 @@
 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
 ret <16 x i16> %tmp3
 }
 
 define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
+; CHECK-LABEL: vuzp_lower_shufflemask_undef:
+; CHECK: @ BB#0: @ %entry
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vorr q9, q8, q8
+; CHECK-NEXT: vuzp.16 q8, q9
+; CHECK-NEXT: vmov r0, r1, d18
+; CHECK-NEXT: vmov r2, r3, d19
+; CHECK-NEXT: mov pc, lr
 entry:
-; CHECK-LABEL: vuzp_lower_shufflemask_undef
-; CHECK: vuzp
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
 ret <8 x i16> %0
 }
 
 define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
+; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
+; CHECK: @ BB#0: @ %entry
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vdup.32 q9, d16[0]
+; CHECK-NEXT: vuzp.32 q8, q9
+; CHECK-NEXT: vext.32 q8, q9, q9, #2
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
 entry:
-; CHECK-LABEL: vuzp_lower_shufflemask_zeroed
-; CHECK-NOT: vtrn
-; CHECK: vuzp
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
 ret <4 x i32> %0
 }
 
 define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
+; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
+; CHECK: @ BB#0: @ %entry
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vrev64.32 q9, q8
+; CHECK-NEXT: vuzp.32 q8, q9
+; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
+; CHECK-NEXT: mov pc, lr
 entry:
-; CHECK-LABEL: vuzp_rev_shufflemask_vtrn
-; CHECK-NOT: vtrn
-; CHECK: vuzp
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
 store <4 x i32> %0, <4 x i32>* %C
 ret void
 }
 
-define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
+define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
 ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
 ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
-; truncate from i32 to i16 and one vuzp to perform the final truncation for i8.
-; CHECK-LABEL: vuzp_trunc
-; CHECK: vmovn.i32
-; CHECK: vmovn.i32
-; CHECK: vuzp
-; CHECK: vbsl
+; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation for i8.
+; CHECK-LABEL: cmpsel_trunc:
+; CHECK: @ BB#0:
+; CHECK-NEXT: add r12, sp, #16
+; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT: mov r12, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT: add r12, sp, #48
+; CHECK-NEXT: vld1.64 {d20, d21}, [r12]
+; CHECK-NEXT: add r12, sp, #32
+; CHECK-NEXT: vcgt.u32 q8, q10, q8
+; CHECK-NEXT: vld1.64 {d20, d21}, [r12]
+; CHECK-NEXT: vcgt.u32 q9, q10, q9
+; CHECK-NEXT: vmov d20, r2, r3
+; CHECK-NEXT: vmovn.i32 d17, q8
+; CHECK-NEXT: vmovn.i32 d16, q9
+; CHECK-NEXT: vmov d18, r0, r1
+; CHECK-NEXT: vmovn.i16 d16, q8
+; CHECK-NEXT: vbsl d16, d18, d20
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
 %c = icmp ult <8 x i32> %cmp0, %cmp1
 %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
 ret <8 x i8> %res
 }
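The comment inside cmpsel_trunc is the fact this test pins down: it is the <8 x i1> select mask, not the byte data, that must be narrowed, and each vmovn halves the element width (i32 to i16, then i16 to i8). Below is a minimal IR sketch of the same pattern at half the width; the function name is hypothetical, it is not part of this changeset, and the exact narrowing sequence llc picks at this width may differ from the one checked above.

; Hypothetical example, runnable with the same RUN line as this file: a
; narrower compare-and-select in which the <4 x i1> compare result must
; again be truncated before it can serve as a byte-wide vbsl mask.
define <4 x i8> @cmpsel_trunc_narrow_sketch(<4 x i8> %in0, <4 x i8> %in1, <4 x i32> %cmp0, <4 x i32> %cmp1) {
  %c = icmp ult <4 x i32> %cmp0, %cmp1
  %res = select <4 x i1> %c, <4 x i8> %in0, <4 x i8> %in1
  ret <4 x i8> %res
}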
 
 ; Shuffle the result from the compare with a <4 x i8>.
 ; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
 ; to perform the vuzp and get the vbsl mask.
 define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
+; CHECK-LABEL: vuzp_trunc_and_shuffle:
+; CHECK: @ BB#0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: add r12, sp, #8
+; CHECK-NEXT: add lr, sp, #24
+; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT: ldr r12, [sp, #40]
+; CHECK-NEXT: vld1.64 {d18, d19}, [lr]
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32]
+; CHECK-NEXT: vmov.i8 d19, #0x7
+; CHECK-NEXT: vmovl.u8 q10, d18
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vneg.s8 d17, d19
+; CHECK-NEXT: vmov d18, r2, r3
+; CHECK-NEXT: vuzp.8 d16, d20
+; CHECK-NEXT: vshl.i8 d16, d16, #7
+; CHECK-NEXT: vshl.s8 d16, d16, d17
+; CHECK-NEXT: vmov d17, r0, r1
+; CHECK-NEXT: vbsl d16, d17, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: pop {r11, lr}
+; CHECK-NEXT: mov pc, lr
 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
-; CHECK-LABEL: vuzp_trunc_and_shuffle
-; CHECK: vmovl
-; CHECK: vuzp
-; CHECK: vbsl
 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
 %c0 = icmp ult <4 x i32> %cmp0, %cmp1
 %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
 ret <8 x i8> %rv
 }
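The widening that the comment above vuzp_trunc_and_shuffle describes shows up in the checks as the single-lane vld1.32 followed by vmovl.u8. Below is a stripped-down sketch of just that step; the function name is hypothetical and not part of the changeset. <4 x i8> has no legal NEON register form, so the four bytes are loaded as one 32-bit lane and zero-extended to <4 x i16> before they can be shuffled against the compare result.

; Hypothetical example isolating the extend: the zext here would be expected
; to lower to the vld1.32 {d..[0]} plus vmovl.u8 pair seen in the checks above.
define <4 x i16> @extend_cmp2_sketch(<4 x i8>* %cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8>* %cmp2_ptr, align 4
  %w = zext <4 x i8> %cmp2_load to <4 x i16>
  ret <4 x i16> %w
}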
 
 ; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
 ; This produces a build_vector with some of the operands undefs.
 define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
+; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
+; CHECK: @ BB#0:
+; CHECK-NEXT: mov r12, sp
+; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT: add r12, sp, #16
+; CHECK-NEXT: vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vmov.i8 d18, #0x7
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vuzp.8 d16, d17
+; CHECK-NEXT: vneg.s8 d17, d18
+; CHECK-NEXT: vshl.i8 d16, d16, #7
+; CHECK-NEXT: vmov d18, r2, r3
+; CHECK-NEXT: vshl.s8 d16, d16, d17
+; CHECK-NEXT: vmov d17, r0, r1
+; CHECK-NEXT: vbsl d16, d17, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
-; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right
-; CHECK: vuzp
-; CHECK: vbsl
 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
 %c0 = icmp ult <4 x i32> %cmp0, %cmp1
 %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
 ret <8 x i8> %rv
 }
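The undef_right test above and the undef_left test below differ only in which shufflevector operand is undef. A stripped-down sketch of the construct, with a hypothetical function name that is not part of the changeset: concatenating a defined <4 x i8> with undef yields a build_vector whose high lanes are undef, which is why the vuzp.8 above can use its second register without initializing it.

; Hypothetical example: the high half of the result carries no defined lanes,
; so the backend is free to leave the second vuzp operand unwritten.
define <8 x i8> @undef_concat_sketch(<4 x i8> %lo) {
  %v = shufflevector <4 x i8> %lo, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %v
}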
 
 define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
+; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
+; CHECK: @ BB#0:
+; CHECK-NEXT: mov r12, sp
+; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT: add r12, sp, #16
+; CHECK-NEXT: vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vldr d18, .LCPI22_0
+; CHECK-NEXT: vmov.i8 d19, #0x7
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vtbl.8 d16, {d16}, d18
+; CHECK-NEXT: vneg.s8 d17, d19
+; CHECK-NEXT: vmov d18, r2, r3
+; CHECK-NEXT: vshl.i8 d16, d16, #7
+; CHECK-NEXT: vshl.s8 d16, d16, d17
+; CHECK-NEXT: vmov d17, r0, r1
+; CHECK-NEXT: vbsl d16, d17, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: .p2align 3
+; CHECK-NEXT: @ BB#1:
+; CHECK-NEXT: .LCPI22_0:
+; CHECK-NEXT: .byte 255 @ 0xff
+; CHECK-NEXT: .byte 255 @ 0xff
+; CHECK-NEXT: .byte 255 @ 0xff
+; CHECK-NEXT: .byte 255 @ 0xff
+; CHECK-NEXT: .byte 0 @ 0x0
+; CHECK-NEXT: .byte 2 @ 0x2
+; CHECK-NEXT: .byte 4 @ 0x4
+; CHECK-NEXT: .byte 6 @ 0x6
 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
-; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left
-; CHECK: vuzp
-; CHECK: vbsl
 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
 %c0 = icmp ult <4 x i32> %cmp0, %cmp1
 %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
 ret <8 x i8> %rv
 }
 
 ; We're using large data types here, and we have to fill with undef values until we
 ; get some vector size that we can represent.
 define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
+; CHECK-LABEL: vuzp_wide_type:
+; CHECK: @ BB#0:
+; CHECK-NEXT: .save {r4, r10, r11, lr}
+; CHECK-NEXT: push {r4, r10, r11, lr}
+; CHECK-NEXT: .setfp r11, sp, #8
+; CHECK-NEXT: add r11, sp, #8
+; CHECK-NEXT: bic sp, sp, #15
+; CHECK-NEXT: add r12, r11, #32
+; CHECK-NEXT: add lr, r11, #60
+; CHECK-NEXT: vld1.32 {d17[0]}, [r12:32]
+; CHECK-NEXT: add r12, r11, #24
+; CHECK-NEXT: vld1.32 {d22[0]}, [lr:32]
+; CHECK-NEXT: add lr, r11, #36
+; CHECK-NEXT: vld1.32 {d16[0]}, [r12:32]
+; CHECK-NEXT: add r12, r11, #52
+; CHECK-NEXT: vld1.32 {d19[0]}, [r12:32]
+; CHECK-NEXT: add r12, r11, #44
+; CHECK-NEXT: vld1.32 {d17[1]}, [lr:32]
+; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32]
+; CHECK-NEXT: add r12, r11, #40
+; CHECK-NEXT: vld1.32 {d20[0]}, [r12:32]
+; CHECK-NEXT: ldr r12, [r11, #64]
+; CHECK-NEXT: vcgt.u32 q10, q11, q10
+; CHECK-NEXT: ldr r4, [r12]
+; CHECK-NEXT: vmov.32 d25[0], r4
+; CHECK-NEXT: add r4, r11, #28
+; CHECK-NEXT: vld1.32 {d16[1]}, [r4:32]
+; CHECK-NEXT: add r4, r11, #56
+; CHECK-NEXT: vld1.32 {d19[1]}, [r4:32]
+; CHECK-NEXT: add r4, r11, #48
+; CHECK-NEXT: vmov.u8 lr, d25[3]
+; CHECK-NEXT: vld1.32 {d18[1]}, [r4:32]
+; CHECK-NEXT: add r4, r12, #4
+; CHECK-NEXT: vcgt.u32 q8, q9, q8
+; CHECK-NEXT: vmovn.i32 d19, q10
+; CHECK-NEXT: vldr d20, .LCPI23_0
+; CHECK-NEXT: vmov.i8 d18, #0x7
+; CHECK-NEXT: vmovn.i32 d16, q8
+; CHECK-NEXT: vneg.s8 d17, d18
+; CHECK-NEXT: vuzp.8 d16, d19
+; CHECK-NEXT: vmov.i8 q9, #0x7
+; CHECK-NEXT: vshl.i8 d16, d16, #7
+; CHECK-NEXT: vneg.s8 q9, q9
+; CHECK-NEXT: vshl.s8 d24, d16, d17
+; CHECK-NEXT: vmov.8 d17[0], lr
+; CHECK-NEXT: vtbl.8 d16, {d24, d25}, d20
+; CHECK-NEXT: vld1.8 {d17[1]}, [r4]
+; CHECK-NEXT: add r4, r11, #8
+; CHECK-NEXT: vshl.i8 q8, q8, #7
+; CHECK-NEXT: vld1.64 {d20, d21}, [r4]
+; CHECK-NEXT: vshl.s8 q8, q8, q9
+; CHECK-NEXT: vmov d19, r2, r3
+; CHECK-NEXT: vmov d18, r0, r1
+; CHECK-NEXT: vbsl q8, q9, q10
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: sub sp, r11, #8
+; CHECK-NEXT: pop {r4, r10, r11, lr}
+; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: .p2align 3
+; CHECK-NEXT: @ BB#1:
+; CHECK-NEXT: .LCPI23_0:
+; CHECK-NEXT: .byte 0 @ 0x0
+; CHECK-NEXT: .byte 1 @ 0x1
+; CHECK-NEXT: .byte 2 @ 0x2
+; CHECK-NEXT: .byte 3 @ 0x3
+; CHECK-NEXT: .byte 4 @ 0x4
+; CHECK-NEXT: .byte 8 @ 0x8
+; CHECK-NEXT: .byte 9 @ 0x9
+; CHECK-NEXT: .byte 10 @ 0xa
 <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
-; CHECK-LABEL: vuzp_wide_type
-; CHECK: vbsl
 %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
 %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
 %c0 = icmp ult <5 x i32> %cmp0, %cmp1
 %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
 %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
 ret <10 x i8> %rv
 }
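The comment above vuzp_wide_type is about type legalization: <10 x i8> and <5 x i32> are not legal NEON types, so the backend widens them to the next legal size and fills the extra lanes with undef. Below is a sketch of that widening in isolation, with a hypothetical function name that is not part of the changeset.

; Hypothetical example: a <10 x i8> value is only representable once padded
; out to <16 x i8>, with lanes 10-15 left undef.
define <16 x i8> @pad_to_legal_sketch(<10 x i8> %v) {
  %w = shufflevector <10 x i8> %v, <10 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %w
}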
+
+%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
+define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
+; CHECK-LABEL: vuzp_extract_subvector:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vorr d18, d17, d17
+; CHECK-NEXT: vuzp.8 d16, d18
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d18
+; CHECK-NEXT: mov pc, lr
+
+%vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+%vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+%.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+%.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+ret %struct.uint8x8x2_t %.fca.0.1.insert
+}