; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s

define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big to be
; the result of sign-extending i8 (i.e. outside [-128, 127]).
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK: movz
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big to be
; the result of zero-extending i8 (i.e. greater than 255).
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK: movz
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}

define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
; If one operand has a zero-extend and the other a sign-extend, smull
; cannot be used. (A splat of 255 is only expressible as a zero-extension
; of i8, which conflicts with the sign-extended %vec operand.)
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %1 = sext <8 x i8> %vec to <8 x i16>
  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = extractelement <8 x i16> %2, i32 0
  ret i16 %3
}
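
; Complementary sketch (an editorial addition, not from the original test:
; the function name and CHECK lines are assumptions rather than verified
; llc output). This is the same conflict with the roles reversed: the
; variable operand is zero-extended, while a splat of -12 is only
; expressible as a sign-extension of i8, so the extends disagree and a
; plain mul is expected instead of umull.
define i16 @umullWithInconsistentExtensions(<8 x i8> %vec) {
; CHECK-LABEL: umullWithInconsistentExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %1 = zext <8 x i8> %vec to <8 x i16>
  %2 = mul <8 x i16> %1, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  %3 = extractelement <8 x i16> %2, i32 0
  ret i16 %3
}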

define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute:
; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
  ; Splat the low byte of %mul across all eight lanes.
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  ; Load 16 bytes and split them into a high and a low <8 x i8> half.
  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  ; (high + low) * splat: the multiply should distribute into umull + umlal.
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  %14 = bitcast i16* %dst to i8*
  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
  ret void
}
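
; The core pattern @distribute exercises, in isolation (an illustrative
; note, not an additional test): the backend is expected to distribute the
; multiply over the add,
;   (zext %hi + zext %lo) * zext %splat
; turning one half into a umull and folding the other half into a umlal
; accumulation, so no full-width 8h vector multiply is emitted.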

declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly

declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind