comparison test/CodeGen/X86/vector-shuffle-128-v16.ll @ 95:afa8332a0e37 LLVM3.8

LLVM 3.8
author Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
date Tue, 13 Oct 2015 17:48:58 +0900
parents 60c9769439b8
children 7d135dc70f03
comparison
equal deleted inserted replaced
84:f3e34b893a5f 95:afa8332a0e37
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 6
7 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 7 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
8 target triple = "x86_64-unknown-unknown" 8 target triple = "x86_64-unknown-unknown"
9 9
10 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { 10 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
245 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 245 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
246 ret <16 x i8> %shuffle 246 ret <16 x i8> %shuffle
247 } 247 }
248 248
249 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) { 249 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
250 ; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: 250 ; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
251 ; SSE: # BB#0: 251 ; SSE2: # BB#0:
252 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 252 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
253 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 253 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
254 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 254 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
255 ; SSE-NEXT: movdqa %xmm1, %xmm0 255 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
256 ; SSE-NEXT: retq 256 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
257 ; SSE2-NEXT: pand %xmm2, %xmm1
258 ; SSE2-NEXT: pandn %xmm0, %xmm2
259 ; SSE2-NEXT: por %xmm1, %xmm2
260 ; SSE2-NEXT: movdqa %xmm2, %xmm0
261 ; SSE2-NEXT: retq
262 ;
263 ; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
264 ; SSSE3: # BB#0:
265 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
266 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
267 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
268 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
269 ; SSSE3-NEXT: retq
270 ;
271 ; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
272 ; SSE41: # BB#0:
273 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
274 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
275 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
276 ; SSE41-NEXT: movdqa %xmm1, %xmm0
277 ; SSE41-NEXT: retq
257 ; 278 ;
258 ; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: 279 ; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
259 ; AVX1: # BB#0: 280 ; AVX1: # BB#0:
260 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 281 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
261 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 282 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
316 ; SSE2-NEXT: packuswb %xmm1, %xmm0 337 ; SSE2-NEXT: packuswb %xmm1, %xmm0
317 ; SSE2-NEXT: retq 338 ; SSE2-NEXT: retq
318 ; 339 ;
319 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 340 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
320 ; SSSE3: # BB#0: 341 ; SSSE3: # BB#0:
321 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] 342 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
322 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero 343 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
323 ; SSSE3-NEXT: por %xmm1, %xmm0
324 ; SSSE3-NEXT: retq 344 ; SSSE3-NEXT: retq
325 ; 345 ;
326 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 346 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
327 ; SSE41: # BB#0: 347 ; SSE41: # BB#0:
328 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] 348 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
329 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero 349 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
330 ; SSE41-NEXT: por %xmm1, %xmm0
331 ; SSE41-NEXT: retq 350 ; SSE41-NEXT: retq
332 ; 351 ;
333 ; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 352 ; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
334 ; AVX: # BB#0: 353 ; AVX: # BB#0:
335 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] 354 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
336 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero 355 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
337 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
338 ; AVX-NEXT: retq 356 ; AVX-NEXT: retq
339 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20> 357 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
340 ret <16 x i8> %shuffle 358 ret <16 x i8> %shuffle
341 } 359 }
342 360
343 define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { 361 define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
344 ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 362 ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
345 ; SSE2: # BB#0: 363 ; SSE2: # BB#0:
346 ; SSE2-NEXT: pxor %xmm2, %xmm2 364 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
347 ; SSE2-NEXT: movdqa %xmm1, %xmm3 365 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
348 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 366 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
349 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 367 ; SSE2-NEXT: pxor %xmm1, %xmm1
350 ; SSE2-NEXT: movdqa %xmm0, %xmm4 368 ; SSE2-NEXT: movdqa %xmm0, %xmm2
351 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] 369 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
352 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 370 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
353 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] 371 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
354 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 372 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
355 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 373 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
356 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 374 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
357 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 375 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
358 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 376 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
359 ; SSE2-NEXT: packuswb %xmm3, %xmm1 377 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
360 ; SSE2-NEXT: movdqa %xmm1, %xmm0 378 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
379 ; SSE2-NEXT: packuswb %xmm3, %xmm0
361 ; SSE2-NEXT: retq 380 ; SSE2-NEXT: retq
362 ; 381 ;
363 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 382 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
364 ; SSSE3: # BB#0: 383 ; SSSE3: # BB#0:
365 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] 384 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
366 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero 385 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
367 ; SSSE3-NEXT: por %xmm1, %xmm0 386 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
368 ; SSSE3-NEXT: retq 387 ; SSSE3-NEXT: retq
369 ; 388 ;
370 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 389 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
371 ; SSE41: # BB#0: 390 ; SSE41: # BB#0:
372 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] 391 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
373 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero 392 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
374 ; SSE41-NEXT: por %xmm1, %xmm0 393 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
375 ; SSE41-NEXT: retq 394 ; SSE41-NEXT: retq
376 ; 395 ;
377 ; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 396 ; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
378 ; AVX: # BB#0: 397 ; AVX: # BB#0:
379 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] 398 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
380 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero 399 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
381 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 400 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
382 ; AVX-NEXT: retq 401 ; AVX-NEXT: retq
383 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20> 402 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
384 ret <16 x i8> %shuffle 403 ret <16 x i8> %shuffle
385 } 404 }
386 405
387 define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) { 406 define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
388 ; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 407 ; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
389 ; SSE2: # BB#0: 408 ; SSE2: # BB#0:
390 ; SSE2-NEXT: pxor %xmm2, %xmm2 409 ; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
391 ; SSE2-NEXT: movdqa %xmm1, %xmm3 410 ; SSE2-NEXT: andps %xmm2, %xmm0
392 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 411 ; SSE2-NEXT: andnps %xmm1, %xmm2
393 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 412 ; SSE2-NEXT: orps %xmm2, %xmm0
394 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
395 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
396 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
397 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
398 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
399 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
400 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
401 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
402 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
403 ; SSE2-NEXT: packuswb %xmm0, %xmm1
404 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
405 ; SSE2-NEXT: packuswb %xmm0, %xmm0
406 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
407 ; SSE2-NEXT: retq 413 ; SSE2-NEXT: retq
408 ; 414 ;
409 ; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 415 ; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
410 ; SSSE3: # BB#0: 416 ; SSSE3: # BB#0:
411 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 417 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
413 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 419 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
414 ; SSSE3-NEXT: retq 420 ; SSSE3-NEXT: retq
415 ; 421 ;
416 ; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 422 ; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
417 ; SSE41: # BB#0: 423 ; SSE41: # BB#0:
418 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 424 ; SSE41-NEXT: movdqa %xmm0, %xmm2
419 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 425 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
420 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 426 ; SSE41-NEXT: pblendvb %xmm2, %xmm1
427 ; SSE41-NEXT: movdqa %xmm1, %xmm0
421 ; SSE41-NEXT: retq 428 ; SSE41-NEXT: retq
422 ; 429 ;
423 ; AVX-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 430 ; AVX-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
424 ; AVX: # BB#0: 431 ; AVX: # BB#0:
425 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 432 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
426 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 433 ; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
427 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
428 ; AVX-NEXT: retq 434 ; AVX-NEXT: retq
429 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 435 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
430 ret <16 x i8> %shuffle 436 ret <16 x i8> %shuffle
431 } 437 }
432 438
433 define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) { 439 define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
434 ; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 440 ; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
435 ; SSE2: # BB#0: 441 ; SSE2: # BB#0:
436 ; SSE2-NEXT: pxor %xmm2, %xmm2 442 ; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
437 ; SSE2-NEXT: movdqa %xmm0, %xmm3 443 ; SSE2-NEXT: andps %xmm2, %xmm0
438 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 444 ; SSE2-NEXT: andnps %xmm1, %xmm2
439 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 445 ; SSE2-NEXT: orps %xmm2, %xmm0
440 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
441 ; SSE2-NEXT: movdqa %xmm0, %xmm4
442 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
443 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
444 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
445 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
446 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,0,3,1,4,5,6,7]
447 ; SSE2-NEXT: packuswb %xmm0, %xmm3
448 ; SSE2-NEXT: movdqa %xmm1, %xmm4
449 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
450 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
451 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7]
452 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
453 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
454 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7]
455 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
456 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
457 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,3,1,4,5,6,7]
458 ; SSE2-NEXT: packuswb %xmm0, %xmm1
459 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
460 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
461 ; SSE2-NEXT: packuswb %xmm0, %xmm0
462 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
463 ; SSE2-NEXT: retq 446 ; SSE2-NEXT: retq
464 ; 447 ;
465 ; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 448 ; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
466 ; SSSE3: # BB#0: 449 ; SSSE3: # BB#0:
467 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] 450 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
468 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 451 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
469 ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] 452 ; SSSE3-NEXT: por %xmm1, %xmm0
470 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
471 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
472 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
473 ; SSSE3-NEXT: retq 453 ; SSSE3-NEXT: retq
474 ; 454 ;
475 ; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 455 ; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
476 ; SSE41: # BB#0: 456 ; SSE41: # BB#0:
477 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
478 ; SSE41-NEXT: movdqa %xmm0, %xmm2 457 ; SSE41-NEXT: movdqa %xmm0, %xmm2
479 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] 458 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
480 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 459 ; SSE41-NEXT: pblendvb %xmm2, %xmm1
481 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 460 ; SSE41-NEXT: movdqa %xmm1, %xmm0
482 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
483 ; SSE41-NEXT: retq 461 ; SSE41-NEXT: retq
484 ; 462 ;
485 ; AVX-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 463 ; AVX-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
486 ; AVX: # BB#0: 464 ; AVX: # BB#0:
487 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] 465 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
488 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] 466 ; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
489 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
490 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
491 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
492 ; AVX-NEXT: retq 467 ; AVX-NEXT: retq
493 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31> 468 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
469 ret <16 x i8> %shuffle
470 }
471
472 define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) {
473 ; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
474 ; SSE: # BB#0:
475 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0
476 ; SSE-NEXT: retq
477 ;
478 ; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
479 ; AVX: # BB#0:
480 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
481 ; AVX-NEXT: retq
482 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
494 ret <16 x i8> %shuffle 483 ret <16 x i8> %shuffle
495 } 484 }
496 485
497 define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) { 486 define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
498 ; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: 487 ; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
499 ; SSE2: # BB#0: 488 ; SSE2: # BB#0:
500 ; SSE2-NEXT: pxor %xmm2, %xmm2 489 ; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
501 ; SSE2-NEXT: movdqa %xmm1, %xmm3 490 ; SSE2-NEXT: andps %xmm2, %xmm0
502 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 491 ; SSE2-NEXT: andnps %xmm1, %xmm2
503 ; SSE2-NEXT: movdqa %xmm0, %xmm4 492 ; SSE2-NEXT: orps %xmm2, %xmm0
504 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
505 ; SSE2-NEXT: movdqa %xmm4, %xmm5
506 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
507 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,1,2,3,4,5,6,7]
508 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
509 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
510 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,2,3,4,5,6,7]
511 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
512 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
513 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
514 ; SSE2-NEXT: movdqa %xmm0, %xmm2
515 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
516 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7]
517 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
518 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
519 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7]
520 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
521 ; SSE2-NEXT: packuswb %xmm4, %xmm0
522 ; SSE2-NEXT: retq 493 ; SSE2-NEXT: retq
523 ; 494 ;
524 ; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: 495 ; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
525 ; SSSE3: # BB#0: 496 ; SSSE3: # BB#0:
526 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15] 497 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
546 } 517 }
547 518
548 define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) { 519 define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
549 ; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 520 ; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
550 ; SSE2: # BB#0: 521 ; SSE2: # BB#0:
551 ; SSE2-NEXT: pxor %xmm2, %xmm2 522 ; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
552 ; SSE2-NEXT: movdqa %xmm1, %xmm3 523 ; SSE2-NEXT: andps %xmm2, %xmm1
553 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 524 ; SSE2-NEXT: andnps %xmm0, %xmm2
554 ; SSE2-NEXT: movdqa %xmm1, %xmm4 525 ; SSE2-NEXT: orps %xmm1, %xmm2
555 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 526 ; SSE2-NEXT: movaps %xmm2, %xmm0
556 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
557 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,1]
558 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
559 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
560 ; SSE2-NEXT: movdqa %xmm0, %xmm4
561 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
562 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
563 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
564 ; SSE2-NEXT: movdqa %xmm0, %xmm5
565 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
566 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7]
567 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
568 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,0,4,5,6,7]
569 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
570 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
571 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
572 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
573 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7]
574 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
575 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
576 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
577 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
578 ; SSE2-NEXT: packuswb %xmm0, %xmm3
579 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
580 ; SSE2-NEXT: pand %xmm2, %xmm1
581 ; SSE2-NEXT: pand %xmm2, %xmm0
582 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
583 ; SSE2-NEXT: movdqa %xmm1, %xmm2
584 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
585 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
586 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
587 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
588 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
589 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
590 ; SSE2-NEXT: packuswb %xmm0, %xmm2
591 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
592 ; SSE2-NEXT: movdqa %xmm2, %xmm0
593 ; SSE2-NEXT: retq 527 ; SSE2-NEXT: retq
594 ; 528 ;
595 ; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 529 ; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
596 ; SSSE3: # BB#0: 530 ; SSSE3: # BB#0:
597 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 531 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
598 ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,3],zero,zero,xmm2[9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u] 532 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
599 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
600 ; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[5,7],zero,xmm3[11,13,15,u,u,u,u,u,u,u,u]
601 ; SSSE3-NEXT: por %xmm2, %xmm3
602 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,6],zero,xmm0[10],zero,zero,xmm0[u,u,u,u,u,u,u,u]
603 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2],zero,zero,xmm1[8],zero,xmm1[12,14,u,u,u,u,u,u,u,u]
604 ; SSSE3-NEXT: por %xmm1, %xmm0 533 ; SSSE3-NEXT: por %xmm1, %xmm0
605 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
606 ; SSSE3-NEXT: retq 534 ; SSSE3-NEXT: retq
607 ; 535 ;
608 ; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 536 ; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
609 ; SSE41: # BB#0: 537 ; SSE41: # BB#0:
610 ; SSE41-NEXT: movdqa %xmm1, %xmm2 538 ; SSE41-NEXT: movdqa %xmm0, %xmm2
611 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,3],zero,zero,xmm2[9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u] 539 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
612 ; SSE41-NEXT: movdqa %xmm0, %xmm3 540 ; SSE41-NEXT: pblendvb %xmm1, %xmm2
613 ; SSE41-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[5,7],zero,xmm3[11,13,15,u,u,u,u,u,u,u,u] 541 ; SSE41-NEXT: movdqa %xmm2, %xmm0
614 ; SSE41-NEXT: por %xmm2, %xmm3
615 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,6],zero,xmm0[10],zero,zero,xmm0[u,u,u,u,u,u,u,u]
616 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2],zero,zero,xmm1[8],zero,xmm1[12,14,u,u,u,u,u,u,u,u]
617 ; SSE41-NEXT: por %xmm1, %xmm0
618 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
619 ; SSE41-NEXT: retq 542 ; SSE41-NEXT: retq
620 ; 543 ;
621 ; AVX-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 544 ; AVX-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
622 ; AVX: # BB#0: 545 ; AVX: # BB#0:
623 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[1,3],zero,zero,xmm1[9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 546 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
624 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[5,7],zero,xmm0[11,13,15,u,u,u,u,u,u,u,u] 547 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
625 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
626 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,6],zero,xmm0[10],zero,zero,xmm0[u,u,u,u,u,u,u,u]
627 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2],zero,zero,xmm1[8],zero,xmm1[12,14,u,u,u,u,u,u,u,u]
628 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
629 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
630 ; AVX-NEXT: retq 548 ; AVX-NEXT: retq
631 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15> 549 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
632 ret <16 x i8> %shuffle 550 ret <16 x i8> %shuffle
633 } 551 }
634 552
705 623
706 define <16 x i8> @PR20540(<8 x i8> %a) { 624 define <16 x i8> @PR20540(<8 x i8> %a) {
707 ; SSE2-LABEL: PR20540: 625 ; SSE2-LABEL: PR20540:
708 ; SSE2: # BB#0: 626 ; SSE2: # BB#0:
709 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 627 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
710 ; SSE2-NEXT: pxor %xmm1, %xmm1 628 ; SSE2-NEXT: packuswb %xmm0, %xmm0
711 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 629 ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
712 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
713 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
714 ; SSE2-NEXT: packuswb %xmm1, %xmm0
715 ; SSE2-NEXT: retq 630 ; SSE2-NEXT: retq
716 ; 631 ;
717 ; SSSE3-LABEL: PR20540: 632 ; SSSE3-LABEL: PR20540:
718 ; SSSE3: # BB#0: 633 ; SSSE3: # BB#0:
719 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero 634 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
731 %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> 646 %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
732 ret <16 x i8> %shuffle 647 ret <16 x i8> %shuffle
733 } 648 }
734 649
735 define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { 650 define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
736 ; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 651 ; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
737 ; SSE2: # BB#0: 652 ; SSE: # BB#0:
738 ; SSE2-NEXT: movzbl %dil, %eax 653 ; SSE-NEXT: movzbl %dil, %eax
739 ; SSE2-NEXT: movd %eax, %xmm0 654 ; SSE-NEXT: movd %eax, %xmm0
740 ; SSE2-NEXT: retq 655 ; SSE-NEXT: retq
741 ;
742 ; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
743 ; SSSE3: # BB#0:
744 ; SSSE3-NEXT: movd %edi, %xmm0
745 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
746 ; SSSE3-NEXT: retq
747 ;
748 ; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
749 ; SSE41: # BB#0:
750 ; SSE41-NEXT: movd %edi, %xmm0
751 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
752 ; SSE41-NEXT: retq
753 ; 656 ;
754 ; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 657 ; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
755 ; AVX: # BB#0: 658 ; AVX: # BB#0:
756 ; AVX-NEXT: vmovd %edi, %xmm0 659 ; AVX-NEXT: movzbl %dil, %eax
757 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 660 ; AVX-NEXT: vmovd %eax, %xmm0
758 ; AVX-NEXT: retq 661 ; AVX-NEXT: retq
759 %a = insertelement <16 x i8> undef, i8 %i, i32 0 662 %a = insertelement <16 x i8> undef, i8 %i, i32 0
760 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 663 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
761 ret <16 x i8> %shuffle 664 ret <16 x i8> %shuffle
762 } 665 }
763 666
764 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { 667 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
765 ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 668 ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
766 ; SSE2: # BB#0: 669 ; SSE2: # BB#0:
767 ; SSE2-NEXT: movzbl %dil, %eax 670 ; SSE2-NEXT: shll $8, %edi
768 ; SSE2-NEXT: movd %eax, %xmm0 671 ; SSE2-NEXT: pxor %xmm0, %xmm0
769 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10] 672 ; SSE2-NEXT: pinsrw $2, %edi, %xmm0
770 ; SSE2-NEXT: retq 673 ; SSE2-NEXT: retq
771 ; 674 ;
772 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 675 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
773 ; SSSE3: # BB#0: 676 ; SSSE3: # BB#0:
774 ; SSSE3-NEXT: movd %edi, %xmm0 677 ; SSSE3-NEXT: shll $8, %edi
775 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 678 ; SSSE3-NEXT: pxor %xmm0, %xmm0
679 ; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
776 ; SSSE3-NEXT: retq 680 ; SSSE3-NEXT: retq
777 ; 681 ;
778 ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 682 ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
779 ; SSE41: # BB#0: 683 ; SSE41: # BB#0:
780 ; SSE41-NEXT: movd %edi, %xmm0 684 ; SSE41-NEXT: pxor %xmm0, %xmm0
781 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 685 ; SSE41-NEXT: pinsrb $5, %edi, %xmm0
782 ; SSE41-NEXT: retq 686 ; SSE41-NEXT: retq
783 ; 687 ;
784 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 688 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
785 ; AVX: # BB#0: 689 ; AVX: # BB#0:
786 ; AVX-NEXT: vmovd %edi, %xmm0 690 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
787 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 691 ; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
788 ; AVX-NEXT: retq 692 ; AVX-NEXT: retq
789 %a = insertelement <16 x i8> undef, i8 %i, i32 0 693 %a = insertelement <16 x i8> undef, i8 %i, i32 0
790 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 694 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
791 ret <16 x i8> %shuffle 695 ret <16 x i8> %shuffle
792 } 696 }
793 697
794 define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { 698 define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
795 ; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: 699 ; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
796 ; SSE: # BB#0: 700 ; SSE2: # BB#0:
797 ; SSE-NEXT: movd %edi, %xmm0 701 ; SSE2-NEXT: shll $8, %edi
798 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 702 ; SSE2-NEXT: pxor %xmm0, %xmm0
799 ; SSE-NEXT: retq 703 ; SSE2-NEXT: pinsrw $7, %edi, %xmm0
704 ; SSE2-NEXT: retq
705 ;
706 ; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
707 ; SSSE3: # BB#0:
708 ; SSSE3-NEXT: shll $8, %edi
709 ; SSSE3-NEXT: pxor %xmm0, %xmm0
710 ; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
711 ; SSSE3-NEXT: retq
712 ;
713 ; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
714 ; SSE41: # BB#0:
715 ; SSE41-NEXT: pxor %xmm0, %xmm0
716 ; SSE41-NEXT: pinsrb $15, %edi, %xmm0
717 ; SSE41-NEXT: retq
800 ; 718 ;
801 ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: 719 ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
802 ; AVX: # BB#0: 720 ; AVX: # BB#0:
803 ; AVX-NEXT: vmovd %edi, %xmm0 721 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
804 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 722 ; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
805 ; AVX-NEXT: retq 723 ; AVX-NEXT: retq
806 %a = insertelement <16 x i8> undef, i8 %i, i32 0 724 %a = insertelement <16 x i8> undef, i8 %i, i32 0
807 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16> 725 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
808 ret <16 x i8> %shuffle 726 ret <16 x i8> %shuffle
809 } 727 }
810 728
811 define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { 729 define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
812 ; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 730 ; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
813 ; SSE2: # BB#0: 731 ; SSE2: # BB#0:
814 ; SSE2-NEXT: movzbl %dil, %eax 732 ; SSE2-NEXT: movzbl %dil, %eax
815 ; SSE2-NEXT: movd %eax, %xmm0 733 ; SSE2-NEXT: pxor %xmm0, %xmm0
816 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 734 ; SSE2-NEXT: pinsrw $1, %eax, %xmm0
817 ; SSE2-NEXT: retq 735 ; SSE2-NEXT: retq
818 ; 736 ;
819 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 737 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
820 ; SSSE3: # BB#0: 738 ; SSSE3: # BB#0:
821 ; SSSE3-NEXT: movd %edi, %xmm0 739 ; SSSE3-NEXT: movzbl %dil, %eax
822 ; SSSE3-NEXT: pslld $24, %xmm0 740 ; SSSE3-NEXT: pxor %xmm0, %xmm0
823 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 741 ; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
824 ; SSSE3-NEXT: retq 742 ; SSSE3-NEXT: retq
825 ; 743 ;
826 ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 744 ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
827 ; SSE41: # BB#0: 745 ; SSE41: # BB#0:
828 ; SSE41-NEXT: movd %edi, %xmm0 746 ; SSE41-NEXT: pxor %xmm0, %xmm0
829 ; SSE41-NEXT: pslld $24, %xmm0 747 ; SSE41-NEXT: pinsrb $2, %edi, %xmm0
830 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
831 ; SSE41-NEXT: retq 748 ; SSE41-NEXT: retq
832 ; 749 ;
833 ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 750 ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
834 ; AVX: # BB#0: 751 ; AVX: # BB#0:
835 ; AVX-NEXT: vmovd %edi, %xmm0 752 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
836 ; AVX-NEXT: vpslld $24, %xmm0, %xmm0 753 ; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
837 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
838 ; AVX-NEXT: retq 754 ; AVX-NEXT: retq
839 %a = insertelement <16 x i8> undef, i8 %i, i32 3 755 %a = insertelement <16 x i8> undef, i8 %i, i32 3
840 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 756 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
841 ret <16 x i8> %shuffle 757 ret <16 x i8> %shuffle
842 } 758 }
1192 ; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1108 ; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1193 ; SSE2: # BB#0: # %entry 1109 ; SSE2: # BB#0: # %entry
1194 ; SSE2-NEXT: pxor %xmm2, %xmm2 1110 ; SSE2-NEXT: pxor %xmm2, %xmm2
1195 ; SSE2-NEXT: movdqa %xmm0, %xmm3 1111 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1196 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1112 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1197 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] 1113 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
1114 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
1115 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
1116 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
1198 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1117 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1199 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] 1118 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
1200 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 1119 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
1201 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] 1120 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
1202 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1121 ; SSE2-NEXT: pand %xmm5, %xmm2
1203 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 1122 ; SSE2-NEXT: pandn %xmm4, %xmm5
1204 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1123 ; SSE2-NEXT: por %xmm2, %xmm5
1205 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7] 1124 ; SSE2-NEXT: psrlq $16, %xmm3
1206 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,2,3,4,5,6,7] 1125 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1207 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] 1126 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3]
1208 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 1127 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1209 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 1128 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
1210 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] 1129 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
1211 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1130 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1212 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,1,4,5,6,7] 1131 ; SSE2-NEXT: packuswb %xmm5, %xmm2
1213 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] 1132 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1214 ; SSE2-NEXT: packuswb %xmm0, %xmm4 1133 ; SSE2-NEXT: pand %xmm0, %xmm2
1215 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] 1134 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1216 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 1135 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
1217 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 1136 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,7]
1218 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 1137 ; SSE2-NEXT: pandn %xmm1, %xmm0
1219 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1138 ; SSE2-NEXT: por %xmm2, %xmm0
1220 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1221 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1222 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7]
1223 ; SSE2-NEXT: packuswb %xmm0, %xmm2
1224 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1225 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
1226 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1227 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1228 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1229 ; SSE2-NEXT: retq 1139 ; SSE2-NEXT: retq
1230 ; 1140 ;
1231 ; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1141 ; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1232 ; SSSE3: # BB#0: # %entry 1142 ; SSSE3: # BB#0: # %entry
1233 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 1143 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1234 ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u] 1144 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1235 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7] 1145 ; SSSE3-NEXT: por %xmm1, %xmm0
1236 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1237 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
1238 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1239 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1240 ; SSSE3-NEXT: retq 1146 ; SSSE3-NEXT: retq
1241 ; 1147 ;
1242 ; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1148 ; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1243 ; SSE41: # BB#0: # %entry 1149 ; SSE41: # BB#0: # %entry
1244 ; SSE41-NEXT: movdqa %xmm0, %xmm2 1150 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1245 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u] 1151 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1246 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7] 1152 ; SSE41-NEXT: por %xmm1, %xmm0
1247 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1248 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
1249 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1250 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1251 ; SSE41-NEXT: retq 1153 ; SSE41-NEXT: retq
1252 ; 1154 ;
1253 ; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1155 ; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1254 ; AVX: # BB#0: # %entry 1156 ; AVX: # BB#0: # %entry
1255 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u] 1157 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1256 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7] 1158 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1257 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1159 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1258 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
1259 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1260 ; AVX-NEXT: retq 1160 ; AVX-NEXT: retq
1261 entry: 1161 entry:
1262 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0> 1162 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
1263 1163
1264 ret <16 x i8> %shuffle 1164 ret <16 x i8> %shuffle
1410 ; SSE2-NEXT: packuswb %xmm1, %xmm0 1310 ; SSE2-NEXT: packuswb %xmm1, %xmm0
1411 ; SSE2-NEXT: retq 1311 ; SSE2-NEXT: retq
1412 ; 1312 ;
1413 ; SSSE3-LABEL: PR12412: 1313 ; SSSE3-LABEL: PR12412:
1414 ; SSSE3: # BB#0: # %entry 1314 ; SSSE3: # BB#0: # %entry
1415 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,2,4,6,8,10,12,14] 1315 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1416 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero 1316 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
1417 ; SSSE3-NEXT: por %xmm1, %xmm0 1317 ; SSSE3-NEXT: pshufb %xmm2, %xmm0
1318 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1418 ; SSSE3-NEXT: retq 1319 ; SSSE3-NEXT: retq
1419 ; 1320 ;
1420 ; SSE41-LABEL: PR12412: 1321 ; SSE41-LABEL: PR12412:
1421 ; SSE41: # BB#0: # %entry 1322 ; SSE41: # BB#0: # %entry
1422 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,2,4,6,8,10,12,14] 1323 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1423 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero 1324 ; SSE41-NEXT: pshufb %xmm2, %xmm1
1424 ; SSE41-NEXT: por %xmm1, %xmm0 1325 ; SSE41-NEXT: pshufb %xmm2, %xmm0
1326 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1425 ; SSE41-NEXT: retq 1327 ; SSE41-NEXT: retq
1426 ; 1328 ;
1427 ; AVX-LABEL: PR12412: 1329 ; AVX-LABEL: PR12412:
1428 ; AVX: # BB#0: # %entry 1330 ; AVX: # BB#0: # %entry
1429 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,2,4,6,8,10,12,14] 1331 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1430 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero 1332 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1431 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1333 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1334 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1432 ; AVX-NEXT: retq 1335 ; AVX-NEXT: retq
1433 entry: 1336 entry:
1434 %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 1337 %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1435 ret <16 x i8> %0 1338 ret <16 x i8> %0
1436 } 1339 }
1340
1341 define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
1342 ; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
1343 ; SSE: # BB#0:
1344 ; SSE-NEXT: psrld $8, %xmm0
1345 ; SSE-NEXT: retq
1346 ;
1347 ; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
1348 ; AVX: # BB#0:
1349 ; AVX-NEXT: vpsrld $8, %xmm0, %xmm0
1350 ; AVX-NEXT: retq
1351 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
1352 ret <16 x i8> %shuffle
1353 }
1354
1355 define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
1356 ; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
1357 ; SSE: # BB#0:
1358 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1359 ; SSE-NEXT: retq
1360 ;
1361 ; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
1362 ; AVX: # BB#0:
1363 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1364 ; AVX-NEXT: retq
1365 %shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
1366 %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
1367 %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1368 %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
1369 %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
1370 %bitcast8 = bitcast <8 x i16> %shuffle16 to <16 x i8>
1371 ret <16 x i8> %bitcast8
1372 }
1373
1374 define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
1375 ; SSE2-LABEL: insert_dup_mem_v16i8_i32:
1376 ; SSE2: # BB#0:
1377 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1378 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1379 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1380 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1381 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1382 ; SSE2-NEXT: retq
1383 ;
1384 ; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
1385 ; SSSE3: # BB#0:
1386 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1387 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1388 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1389 ; SSSE3-NEXT: retq
1390 ;
1391 ; SSE41-LABEL: insert_dup_mem_v16i8_i32:
1392 ; SSE41: # BB#0:
1393 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1394 ; SSE41-NEXT: pxor %xmm1, %xmm1
1395 ; SSE41-NEXT: pshufb %xmm1, %xmm0
1396 ; SSE41-NEXT: retq
1397 ;
1398 ; AVX1-LABEL: insert_dup_mem_v16i8_i32:
1399 ; AVX1: # BB#0:
1400 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1401 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1402 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1403 ; AVX1-NEXT: retq
1404 ;
1405 ; AVX2-LABEL: insert_dup_mem_v16i8_i32:
1406 ; AVX2: # BB#0:
1407 ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
1408 ; AVX2-NEXT: retq
1409 %tmp = load i32, i32* %ptr, align 4
1410 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1411 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1412 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
1413 ret <16 x i8> %tmp3
1414 }
1415
1416 define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
1417 ; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
1418 ; SSE2: # BB#0:
1419 ; SSE2-NEXT: movsbl (%rdi), %eax
1420 ; SSE2-NEXT: movd %eax, %xmm0
1421 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1422 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1423 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1424 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1425 ; SSE2-NEXT: retq
1426 ;
1427 ; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
1428 ; SSSE3: # BB#0:
1429 ; SSSE3-NEXT: movsbl (%rdi), %eax
1430 ; SSSE3-NEXT: movd %eax, %xmm0
1431 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1432 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1433 ; SSSE3-NEXT: retq
1434 ;
1435 ; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
1436 ; SSE41: # BB#0:
1437 ; SSE41-NEXT: movsbl (%rdi), %eax
1438 ; SSE41-NEXT: movd %eax, %xmm0
1439 ; SSE41-NEXT: pxor %xmm1, %xmm1
1440 ; SSE41-NEXT: pshufb %xmm1, %xmm0
1441 ; SSE41-NEXT: retq
1442 ;
1443 ; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
1444 ; AVX1: # BB#0:
1445 ; AVX1-NEXT: movsbl (%rdi), %eax
1446 ; AVX1-NEXT: vmovd %eax, %xmm0
1447 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1448 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1449 ; AVX1-NEXT: retq
1450 ;
1451 ; AVX2-LABEL: insert_dup_mem_v16i8_sext_i8:
1452 ; AVX2: # BB#0:
1453 ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
1454 ; AVX2-NEXT: retq
1455 %tmp = load i8, i8* %ptr, align 1
1456 %tmp1 = sext i8 %tmp to i32
1457 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1458 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1459 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
1460 ret <16 x i8> %tmp4
1461 }