comparison test/CodeGen/X86/unaligned-32-byte-memops.ll @ 95:afa8332a0e37 LLVM3.8

author Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
date Tue, 13 Oct 2015 17:48:58 +0900
parents 60c9769439b8
children 7d135dc70f03
comparison of 84:f3e34b893a5f with 95:afa8332a0e37
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll  (rev 84:f3e34b893a5f)
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll  (rev 95:afa8332a0e37)
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
-
-; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
-; because that is slower than two 16-byte loads.
-; Other AVX-capable chips don't have that problem.
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=AVXSLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=AVXFAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+
+; Don't generate an unaligned 32-byte load on this test if that is slower than two 16-byte loads.

 define <8 x float> @load32bytes(<8 x float>* %Ap) {
-; CHECK-LABEL: load32bytes
-
-; SANDYB: vmovaps
-; SANDYB: vinsertf128
-; SANDYB: retq
-
-; BTVER2: vmovups
-; BTVER2: retq
-
-; HASWELL: vmovups
-; HASWELL: retq
-
-  %A = load <8 x float>* %Ap, align 16
+; AVXSLOW-LABEL: load32bytes:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVXSLOW-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: load32bytes:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vmovups (%rdi), %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: load32bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: retq
+  %A = load <8 x float>, <8 x float>* %Ap, align 16
   ret <8 x float> %A
 }

-; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
-; because that is slowerthan two 16-byte stores.
-; Other AVX-capable chips don't have that problem.
+; Don't generate an unaligned 32-byte store on this test if that is slower than two 16-byte loads.

 define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
-; CHECK-LABEL: store32bytes
-
-; SANDYB: vextractf128
-; SANDYB: vmovaps
-; SANDYB: retq
-
-; BTVER2: vmovups
-; BTVER2: retq
-
-; HASWELL: vmovups
-; HASWELL: retq
-
+; AVXSLOW-LABEL: store32bytes:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi)
+; AVXSLOW-NEXT: vmovaps %xmm0, (%rdi)
+; AVXSLOW-NEXT: vzeroupper
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: store32bytes:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vmovups %ymm0, (%rdi)
+; AVXFAST-NEXT: vzeroupper
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: store32bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovups %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
   store <8 x float> %A, <8 x float>* %P, align 16
   ret void
 }

-; Merge two consecutive 16-byte subvector loads into a single 32-byte load
-; if it's faster.
-
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
-; CHECK-LABEL: combine_16_byte_loads
-
-; SANDYB: vmovups
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: retq
-
-; BTVER2: vmovups
-; BTVER2-NEXT: retq
-
-; HASWELL: vmovups
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
-  %v1 = load <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
-  ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
-; CHECK-LABEL: combine_16_byte_loads_swap
-
-; SANDYB: vmovups
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: retq
-
-; BTVER2: vmovups
-; BTVER2-NEXT: retq
-
-; HASWELL: vmovups
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
-  %v1 = load <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
-  ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
+; Merge two consecutive 16-byte subvector loads into a single 32-byte load if it's faster.
+
 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
-; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
-
-; SANDYB: vmovups
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: retq
-
-; BTVER2: vmovups
-; BTVER2-NEXT: retq
-
-; HASWELL: vmovups
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
-  %v1 = load <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>* %ptr2, align 1
+; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vmovups 48(%rdi), %xmm0
+; AVXSLOW-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vmovups 48(%rdi), %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: combine_16_byte_loads_no_intrinsic:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovups 48(%rdi), %ymm0
+; AVX2-NEXT: retq
+  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
+  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %v3
 }

-; Swap the order of the shufflevector operands to ensure that the
-; pattern still matches.
+; If the first load is 32-byte aligned, then the loads should be merged in all cases.
+
+define <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) {
+; AVXSLOW-LABEL: combine_16_byte_loads_aligned:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vmovaps 48(%rdi), %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: combine_16_byte_loads_aligned:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vmovaps 48(%rdi), %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: combine_16_byte_loads_aligned:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps 48(%rdi), %ymm0
+; AVX2-NEXT: retq
+  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>, <4 x float>* %ptr1, align 32
+  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %v3
+}
+
+; Swap the order of the shufflevector operands to ensure that the pattern still matches.
+
 define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
-; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
-
-; SANDYB: vmovups
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: retq
-
-; BTVER2: vmovups
-; BTVER2-NEXT: retq
-
-; HASWELL: vmovups
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
-  %v1 = load <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>* %ptr2, align 1
+; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic_swap:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vmovups 64(%rdi), %xmm0
+; AVXSLOW-NEXT: vinsertf128 $1, 80(%rdi), %ymm0, %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic_swap:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vmovups 64(%rdi), %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: combine_16_byte_loads_no_intrinsic_swap:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovups 64(%rdi), %ymm0
+; AVX2-NEXT: retq
+  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
+  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
+  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
+  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x float> %v3
 }

 ; Check each element type other than float to make sure it is handled correctly.
 ; Use the loaded values with an 'add' to make sure we're using the correct load type.
-; Even though BtVer2 has fast 32-byte loads, we should not generate those for
-; 256-bit integer vectors because BtVer2 doesn't have AVX2.
+; Don't generate 32-byte loads for integer ops unless we have AVX2.

 define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
-; CHECK-LABEL: combine_16_byte_loads_i64
-
-; SANDYB: vextractf128
-; SANDYB-NEXT: vpaddq
-; SANDYB-NEXT: vpaddq
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: retq
-
-; BTVER2: vextractf128
-; BTVER2-NEXT: vpaddq
-; BTVER2-NEXT: vpaddq
-; BTVER2-NEXT: vinsertf128
-; BTVER2-NEXT: retq
-
-; HASWELL-NOT: vextract
-; HASWELL: vpaddq
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
-  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
-  %v1 = load <2 x i64>* %ptr1, align 1
-  %v2 = load <2 x i64>* %ptr2, align 1
+; AVXSLOW-LABEL: combine_16_byte_loads_i64:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVXSLOW-NEXT: vpaddq 96(%rdi), %xmm1, %xmm1
+; AVXSLOW-NEXT: vpaddq 80(%rdi), %xmm0, %xmm0
+; AVXSLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: combine_16_byte_loads_i64:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVXFAST-NEXT: vpaddq 96(%rdi), %xmm1, %xmm1
+; AVXFAST-NEXT: vpaddq 80(%rdi), %xmm0, %xmm0
+; AVXFAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: combine_16_byte_loads_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq 80(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+  %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
+  %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6
+  %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1
+  %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = add <4 x i64> %v3, %x
   ret <4 x i64> %v4
 }

 define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
-; CHECK-LABEL: combine_16_byte_loads_i32
-
-; SANDYB: vextractf128
-; SANDYB-NEXT: vpaddd
-; SANDYB-NEXT: vpaddd
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: retq
-
-; BTVER2: vextractf128
-; BTVER2-NEXT: vpaddd
-; BTVER2-NEXT: vpaddd
-; BTVER2-NEXT: vinsertf128
-; BTVER2-NEXT: retq
-
-; HASWELL-NOT: vextract
-; HASWELL: vpaddd
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
-  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
-  %v1 = load <4 x i32>* %ptr1, align 1
-  %v2 = load <4 x i32>* %ptr2, align 1
+; AVXSLOW-LABEL: combine_16_byte_loads_i32:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVXSLOW-NEXT: vpaddd 112(%rdi), %xmm1, %xmm1
+; AVXSLOW-NEXT: vpaddd 96(%rdi), %xmm0, %xmm0
+; AVXSLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: combine_16_byte_loads_i32:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVXFAST-NEXT: vpaddd 112(%rdi), %xmm1, %xmm1
+; AVXFAST-NEXT: vpaddd 96(%rdi), %xmm0, %xmm0
+; AVXFAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: combine_16_byte_loads_i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd 96(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+  %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
+  %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7
+  %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1
+  %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = add <8 x i32> %v3, %x
   ret <8 x i32> %v4
 }

 define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
-; CHECK-LABEL: combine_16_byte_loads_i16
-
-; SANDYB: vextractf128
-; SANDYB-NEXT: vpaddw
-; SANDYB-NEXT: vpaddw
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: retq
-
-; BTVER2: vextractf128
-; BTVER2-NEXT: vpaddw
-; BTVER2-NEXT: vpaddw
-; BTVER2-NEXT: vinsertf128
-; BTVER2-NEXT: retq
-
-; HASWELL-NOT: vextract
-; HASWELL: vpaddw
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
-  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
-  %v1 = load <8 x i16>* %ptr1, align 1
-  %v2 = load <8 x i16>* %ptr2, align 1
+; AVXSLOW-LABEL: combine_16_byte_loads_i16:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVXSLOW-NEXT: vpaddw 128(%rdi), %xmm1, %xmm1
+; AVXSLOW-NEXT: vpaddw 112(%rdi), %xmm0, %xmm0
+; AVXSLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: combine_16_byte_loads_i16:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVXFAST-NEXT: vpaddw 128(%rdi), %xmm1, %xmm1
+; AVXFAST-NEXT: vpaddw 112(%rdi), %xmm0, %xmm0
+; AVXFAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: combine_16_byte_loads_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw 112(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+  %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
+  %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8
+  %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1
+  %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %v4 = add <16 x i16> %v3, %x
   ret <16 x i16> %v4
 }

 define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
-; CHECK-LABEL: combine_16_byte_loads_i8
-
-; SANDYB: vextractf128
-; SANDYB-NEXT: vpaddb
-; SANDYB-NEXT: vpaddb
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: retq
-
-; BTVER2: vextractf128
-; BTVER2-NEXT: vpaddb
-; BTVER2-NEXT: vpaddb
-; BTVER2-NEXT: vinsertf128
-; BTVER2-NEXT: retq
-
-; HASWELL-NOT: vextract
-; HASWELL: vpaddb
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
-  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
-  %v1 = load <16 x i8>* %ptr1, align 1
-  %v2 = load <16 x i8>* %ptr2, align 1
+; AVXSLOW-LABEL: combine_16_byte_loads_i8:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVXSLOW-NEXT: vpaddb 144(%rdi), %xmm1, %xmm1
+; AVXSLOW-NEXT: vpaddb 128(%rdi), %xmm0, %xmm0
+; AVXSLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: combine_16_byte_loads_i8:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVXFAST-NEXT: vpaddb 144(%rdi), %xmm1, %xmm1
+; AVXFAST-NEXT: vpaddb 128(%rdi), %xmm0, %xmm0
+; AVXFAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: combine_16_byte_loads_i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb 128(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+  %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
+  %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9
+  %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1
+  %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %v4 = add <32 x i8> %v3, %x
   ret <32 x i8> %v4
 }

 define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
-; CHECK-LABEL: combine_16_byte_loads_double
-
-; SANDYB: vmovupd
-; SANDYB-NEXT: vinsertf128
-; SANDYB-NEXT: vaddpd
-; SANDYB-NEXT: retq
-
-; BTVER2-NOT: vinsertf128
-; BTVER2: vaddpd
-; BTVER2-NEXT: retq
-
-; HASWELL-NOT: vinsertf128
-; HASWELL: vaddpd
-; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
-  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
-  %v1 = load <2 x double>* %ptr1, align 1
-  %v2 = load <2 x double>* %ptr2, align 1
+; AVXSLOW-LABEL: combine_16_byte_loads_double:
+; AVXSLOW: # BB#0:
+; AVXSLOW-NEXT: vmovupd 144(%rdi), %xmm1
+; AVXSLOW-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
+; AVXSLOW-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVXSLOW-NEXT: retq
+;
+; AVXFAST-LABEL: combine_16_byte_loads_double:
+; AVXFAST: # BB#0:
+; AVXFAST-NEXT: vaddpd 144(%rdi), %ymm0, %ymm0
+; AVXFAST-NEXT: retq
+;
+; AVX2-LABEL: combine_16_byte_loads_double:
+; AVX2: # BB#0:
+; AVX2-NEXT: vaddpd 144(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
+  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10
+  %v1 = load <2 x double>, <2 x double>* %ptr1, align 1
+  %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = fadd <4 x double> %v3, %x
   ret <4 x double> %v4
 }
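
For readers coming from C, here is a small sketch (not part of the test; function names are invented for illustration) of the two source-level patterns the combine_16_byte_loads* IR above models: gluing two unaligned 16-byte loads together with an insertf128-style intrinsic versus issuing one unaligned 32-byte load. Assuming a clang or gcc build with -mavx, the first function should lower to roughly the split-load IR shown above, and whether the backend then keeps the 128-bit pair or merges it into a single vmovups of a ymm register is what the +slow-unaligned-mem-32 / -slow-unaligned-mem-32 RUN lines exercise; the old vinsertf128 intrinsic tests were presumably dropped because the frontend no longer emits that IR intrinsic for this pattern.

#include <immintrin.h>

/* Explicit-intrinsics version: two unaligned 16-byte loads concatenated
   into one 256-bit value. Hypothetical example, not taken from the test. */
__m256 load32_via_halves(const float *p) {
  __m128 lo = _mm_loadu_ps(p);             /* bytes 0..15 of the source  */
  __m128 hi = _mm_loadu_ps(p + 4);         /* bytes 16..31               */
  __m256 v  = _mm256_castps128_ps256(lo);  /* widen the low half         */
  return _mm256_insertf128_ps(v, hi, 1);   /* insert the high half       */
}

/* Single unaligned 32-byte load, the form the backend should prefer when
   unaligned 32-byte accesses are fast on the target. */
__m256 load32_direct(const float *p) {
  return _mm256_loadu_ps(p);
}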