150
|
1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
|
|
2 *
|
|
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
4 * See https://llvm.org/LICENSE.txt for license information.
|
|
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
6 *
|
|
7 *===-----------------------------------------------------------------------===
|
|
8 */
|
|
9
|
|
10 #ifndef __IMMINTRIN_H
|
|
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
|
|
12 #endif
|
|
13
|
|
14 #ifndef __AVX2INTRIN_H
|
|
15 #define __AVX2INTRIN_H
|
|
16
|
|
17 /* Define the default attributes for the functions in this file. */
|
|
18 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
|
|
19 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
|
|
20
|
|
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/* Computes eight sums of absolute differences of unsigned bytes per 128-bit
   lane, with byte-group selection controlled by the immediate M. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                     (__v32qi)(__m256i)(Y), (int)(M))
|
|
25
|
|
/* Absolute value of each signed 8-bit element of __a. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi8(__m256i __a)
{
  return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
}

/* Absolute value of each signed 16-bit element of __a. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi16(__m256i __a)
{
  return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
}

/* Absolute value of each signed 32-bit element of __a. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi32(__m256i __a)
{
  return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
}
|
|
43
|
|
/* Converts 16-bit elements of __a and __b to 8-bit elements using signed
   saturation, interleaving results per 128-bit lane. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
}

/* Converts 32-bit elements of __a and __b to 16-bit elements using signed
   saturation, interleaving results per 128-bit lane. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packs_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
}

/* Converts 16-bit elements of __a and __b to 8-bit elements using unsigned
   saturation, interleaving results per 128-bit lane. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packus_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
}

/* Converts 32-bit elements of __V1 and __V2 to 16-bit elements using unsigned
   saturation, interleaving results per 128-bit lane. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packus_epi32(__m256i __V1, __m256i __V2)
{
  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
}
|
|
67
|
|
/* Adds corresponding 8-bit elements of __a and __b (wrapping overflow;
   unsigned vector types are used so the addition itself is never UB). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)((__v32qu)__a + (__v32qu)__b);
}

/* Adds corresponding 16-bit elements of __a and __b (wrapping overflow). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hu)__a + (__v16hu)__b);
}

/* Adds corresponding 32-bit elements of __a and __b (wrapping overflow). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)((__v8su)__a + (__v8su)__b);
}

/* Adds corresponding 64-bit elements of __a and __b (wrapping overflow). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4du)__a + (__v4du)__b);
}
|
|
91
|
|
/* Adds corresponding signed 8-bit elements with signed saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
}

/* Adds corresponding signed 16-bit elements with signed saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
}

/* Adds corresponding unsigned 8-bit elements with unsigned saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
}

/* Adds corresponding unsigned 16-bit elements with unsigned saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
}
|
|
115
|
|
/* Concatenates corresponding 128-bit lanes of (a) and (b), shifts the
   concatenation right by (n) bytes, and returns the low 16 bytes of each
   lane.  (n) must be an integer constant expression. */
#define _mm256_alignr_epi8(a, b, n) \
  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                     (__v32qi)(__m256i)(b), (n))
|
|
119
|
|
/* Bitwise AND of the 256-bit values __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_and_si256(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4du)__a & (__v4du)__b);
}

/* Bitwise AND of __b with the ones' complement of __a (~__a & __b). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_andnot_si256(__m256i __a, __m256i __b)
{
  return (__m256i)(~(__v4du)__a & (__v4du)__b);
}
|
|
131
|
|
/* Rounded average of corresponding unsigned 8-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
}

/* Rounded average of corresponding unsigned 16-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
}
|
|
143
|
|
/* Selects each byte from __V2 where the corresponding mask byte in __M has
   its high bit set, otherwise from __V1. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
{
  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
                                             (__v32qi)__M);
}
|
|
150
|
|
/* Blends 16-bit elements of (V1) and (V2) under control of the immediate
   mask (M); the 8-bit mask is applied identically to both 128-bit lanes. */
#define _mm256_blend_epi16(V1, V2, M) \
  (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                     (__v16hi)(__m256i)(V2), (int)(M))
|
|
154
|
|
/* Compares 8-bit elements for equality; each result element is all-ones on
   equality, zero otherwise. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)((__v32qi)__a == (__v32qi)__b);
}

/* Compares 16-bit elements for equality (all-ones / zero per element). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hi)__a == (__v16hi)__b);
}

/* Compares 32-bit elements for equality (all-ones / zero per element). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)((__v8si)__a == (__v8si)__b);
}

/* Compares 64-bit elements for equality (all-ones / zero per element). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4di)__a == (__v4di)__b);
}
|
|
178
|
|
/* Signed greater-than compare of 8-bit elements (all-ones / zero per
   element). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
{
  /* This function always performs a signed comparison, but __v32qi is a char
     which may be signed or unsigned, so use __v32qs. */
  return (__m256i)((__v32qs)__a > (__v32qs)__b);
}

/* Signed greater-than compare of 16-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hi)__a > (__v16hi)__b);
}

/* Signed greater-than compare of 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)((__v8si)__a > (__v8si)__b);
}

/* Signed greater-than compare of 64-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4di)__a > (__v4di)__b);
}
|
|
204
|
|
/* Horizontally adds adjacent pairs of 16-bit elements within each source,
   per 128-bit lane. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
}

/* Horizontally adds adjacent pairs of 32-bit elements, per 128-bit lane. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
}

/* Horizontal add of adjacent 16-bit pairs with signed saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadds_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
}

/* Horizontally subtracts adjacent pairs of 16-bit elements, per lane. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsub_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
}

/* Horizontally subtracts adjacent pairs of 32-bit elements, per lane. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsub_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
}

/* Horizontal subtract of adjacent 16-bit pairs with signed saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsubs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
}
|
|
240
|
|
/* Multiplies unsigned bytes of __a by signed bytes of __b and adds adjacent
   product pairs into signed 16-bit results, with signed saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maddubs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
}

/* Multiplies corresponding signed 16-bit elements and adds adjacent product
   pairs into signed 32-bit results. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
}
|
|
252
|
|
/* Elementwise maximum of signed 8-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
}

/* Elementwise maximum of signed 16-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
}

/* Elementwise maximum of signed 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
}

/* Elementwise maximum of unsigned 8-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
}

/* Elementwise maximum of unsigned 16-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
}

/* Elementwise maximum of unsigned 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
}
|
|
288
|
|
/* Elementwise minimum of signed 8-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
}

/* Elementwise minimum of signed 16-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
}

/* Elementwise minimum of signed 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
}

/* Elementwise minimum of unsigned 8-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
}

/* Elementwise minimum of unsigned 16-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
}

/* Elementwise minimum of unsigned 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
}
|
|
324
|
|
/* Returns a 32-bit mask built from the most significant bit of each byte of
   __a (byte i -> bit i of the result). */
static __inline__ int __DEFAULT_FN_ATTRS256
_mm256_movemask_epi8(__m256i __a)
{
  return __builtin_ia32_pmovmskb256((__v32qi)__a);
}
|
|
330
|
|
/* Sign-extends the sixteen 8-bit elements of __V to 16-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi16(__m128i __V)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
}

/* Sign-extends the low eight 8-bit elements of __V to 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi32(__m128i __V)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
}

/* Sign-extends the low four 8-bit elements of __V to 64-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi8_epi64(__m128i __V)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
}

/* Sign-extends the eight 16-bit elements of __V to 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_epi32(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
}

/* Sign-extends the low four 16-bit elements of __V to 64-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
}

/* Sign-extends the four 32-bit elements of __V to 64-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepi32_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
}
|
|
372
|
|
/* Zero-extends the sixteen 8-bit elements of __V to 16-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi16(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
}

/* Zero-extends the low eight 8-bit elements of __V to 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi32(__m128i __V)
{
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
}

/* Zero-extends the low four 8-bit elements of __V to 64-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu8_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
}

/* Zero-extends the eight 16-bit elements of __V to 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_epi32(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
}

/* Zero-extends the low four 16-bit elements of __V to 64-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
}

/* Zero-extends the four 32-bit elements of __V to 64-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtepu32_epi64(__m128i __V)
{
  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
}
|
|
408
|
|
/* Multiplies the even-indexed signed 32-bit elements of __a and __b,
   producing four signed 64-bit products. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mul_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
}

/* Multiplies signed 16-bit elements, rounds, and returns the high 16 bits
   of each scaled product (fixed-point "round and scale" multiply). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
}

/* Multiplies unsigned 16-bit elements and returns the high 16 bits of each
   32-bit product. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mulhi_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
}

/* Multiplies signed 16-bit elements and returns the high 16 bits of each
   32-bit product. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mulhi_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
}

/* Multiplies 16-bit elements and keeps the low 16 bits of each product. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mullo_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hu)__a * (__v16hu)__b);
}

/* Multiplies 32-bit elements and keeps the low 32 bits of each product. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mullo_epi32 (__m256i __a, __m256i __b)
{
  return (__m256i)((__v8su)__a * (__v8su)__b);
}

/* Multiplies the even-indexed unsigned 32-bit elements of __a and __b,
   producing four unsigned 64-bit products. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mul_epu32(__m256i __a, __m256i __b)
{
  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
}
|
|
450
|
|
/* Bitwise OR of the 256-bit values __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_or_si256(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4du)__a | (__v4du)__b);
}
|
|
456
|
|
/* Sums absolute differences of unsigned bytes; each group of eight bytes
   yields one 16-bit sum stored in the low bits of a 64-bit element. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sad_epu8(__m256i __a, __m256i __b)
{
  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
}
|
|
462
|
|
/* Shuffles bytes of __a within each 128-bit lane using the low bits of each
   control byte in __b; a set high bit in the control byte yields zero. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shuffle_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
}
|
|
468
|
|
/* Permutes 32-bit elements within each 128-bit lane of (a) per the 2-bit
   selectors of the immediate (imm). */
#define _mm256_shuffle_epi32(a, imm) \
  (__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))

/* Permutes the upper four 16-bit elements of each 128-bit lane of (a) per
   (imm); the lower four elements are passed through. */
#define _mm256_shufflehi_epi16(a, imm) \
  (__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))

/* Permutes the lower four 16-bit elements of each 128-bit lane of (a) per
   (imm); the upper four elements are passed through. */
#define _mm256_shufflelo_epi16(a, imm) \
  (__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))
|
|
477
|
|
/* For each 8-bit element: negates __a's element if __b's is negative,
   zeroes it if __b's is zero, passes it through otherwise. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
}

/* Sign/zero/pass-through of 16-bit elements of __a under control of __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
}

/* Sign/zero/pass-through of 32-bit elements of __a under control of __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
}
|
|
495
|
|
/* Shifts each 128-bit lane of (a) left by (imm) bytes, shifting in zeros. */
#define _mm256_slli_si256(a, imm) \
  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))

/* Alias of _mm256_slli_si256: per-lane byte shift left by (imm). */
#define _mm256_bslli_epi128(a, imm) \
  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
|
|
501
|
|
/* Shifts 16-bit elements left by the immediate-style count __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi16(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
}

/* Shifts 16-bit elements left by the count in the low 64 bits of __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sll_epi16(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
}

/* Shifts 32-bit elements left by __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi32(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
}

/* Shifts 32-bit elements left by the count in the low 64 bits of __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sll_epi32(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
}

/* Shifts 64-bit elements left by __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi64(__m256i __a, int __count)
{
  return __builtin_ia32_psllqi256((__v4di)__a, __count);
}

/* Shifts 64-bit elements left by the count in the low 64 bits of __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sll_epi64(__m256i __a, __m128i __count)
{
  return __builtin_ia32_psllq256((__v4di)__a, __count);
}
|
|
537
|
|
/* Arithmetic (sign-filling) right shift of 16-bit elements by __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srai_epi16(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
}

/* Arithmetic right shift of 16-bit elements by the count in the low 64 bits
   of __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sra_epi16(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
}

/* Arithmetic right shift of 32-bit elements by __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srai_epi32(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
}

/* Arithmetic right shift of 32-bit elements by the count in the low 64 bits
   of __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sra_epi32(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
}
|
|
561
|
|
/* Shifts each 128-bit lane of (a) right by (imm) bytes, shifting in zeros. */
#define _mm256_srli_si256(a, imm) \
  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))

/* Alias of _mm256_srli_si256: per-lane byte shift right by (imm). */
#define _mm256_bsrli_epi128(a, imm) \
  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
|
|
567
|
|
/* Logical (zero-filling) right shift of 16-bit elements by __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi16(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
}

/* Logical right shift of 16-bit elements by the count in the low 64 bits of
   __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srl_epi16(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
}

/* Logical right shift of 32-bit elements by __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi32(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
}

/* Logical right shift of 32-bit elements by the count in the low 64 bits of
   __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srl_epi32(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
}

/* Logical right shift of 64-bit elements by __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi64(__m256i __a, int __count)
{
  return __builtin_ia32_psrlqi256((__v4di)__a, __count);
}

/* Logical right shift of 64-bit elements by the count in the low 64 bits of
   __count. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srl_epi64(__m256i __a, __m128i __count)
{
  return __builtin_ia32_psrlq256((__v4di)__a, __count);
}
|
|
603
|
|
/* Subtracts corresponding 8-bit elements of __b from __a (wrapping). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sub_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)((__v32qu)__a - (__v32qu)__b);
}

/* Subtracts corresponding 16-bit elements of __b from __a (wrapping). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sub_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hu)__a - (__v16hu)__b);
}

/* Subtracts corresponding 32-bit elements of __b from __a (wrapping). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sub_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)((__v8su)__a - (__v8su)__b);
}

/* Subtracts corresponding 64-bit elements of __b from __a (wrapping). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sub_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4du)__a - (__v4du)__b);
}
|
|
627
|
|
/* Subtracts corresponding signed 8-bit elements with signed saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
}

/* Subtracts corresponding signed 16-bit elements with signed saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
}

/* Subtracts unsigned 8-bit elements with unsigned saturation (floors at
   zero). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
}

/* Subtracts unsigned 16-bit elements with unsigned saturation. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
}
|
|
651
|
|
/* Interleaves the high-half bytes of each 128-bit lane of __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
}

/* Interleaves the high-half 16-bit elements of each lane of __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

/* Interleaves the high-half 32-bit elements of each lane of __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
}

/* Interleaves the high 64-bit element of each lane of __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
}
|
|
675
|
|
/* Interleaves the low-half bytes of each 128-bit lane of __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
}

/* Interleaves the low-half 16-bit elements of each lane of __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
}

/* Interleaves the low-half 32-bit elements of each lane of __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
}

/* Interleaves the low 64-bit element of each lane of __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
}
|
|
699
|
|
/* Bitwise XOR of the 256-bit values __a and __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_xor_si256(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4du)__a ^ (__v4du)__b);
}
|
|
705
|
|
/* Non-temporal (cache-bypass hint) load of 256 bits from __V.
   __V must be 32-byte aligned (enforced by the aligned typedef below). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_stream_load_si256(__m256i const *__V)
{
  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
  return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
}
|
|
712
|
|
/* Broadcasts the low single-precision element of __X to all four lanes of a
   128-bit result. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_broadcastss_ps(__m128 __X)
{
  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
}

/* Broadcasts the low double-precision element of __a to both lanes of a
   128-bit result. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_broadcastsd_pd(__m128d __a)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}
|
|
724
|
|
/* Broadcasts the low single-precision element of __X to all eight lanes of a
   256-bit result. */
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_broadcastss_ps(__m128 __X)
{
  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
}

/* Broadcasts the low double-precision element of __X to all four lanes of a
   256-bit result. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_broadcastsd_pd(__m128d __X)
{
  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
}
|
|
736
|
|
/* Duplicates the 128-bit value __X into both halves of a 256-bit result. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastsi128_si256(__m128i __X)
{
  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
}

/* Legacy alias for _mm256_broadcastsi128_si256. */
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
|
|
744
|
150
|
/* Blends 32-bit elements of the 128-bit values (V1) and (V2) under control
   of the 4-bit immediate mask (M). */
#define _mm_blend_epi32(V1, V2, M) \
  (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                     (__v4si)(__m128i)(V2), (int)(M))

/* Blends 32-bit elements of the 256-bit values (V1) and (V2) under control
   of the 8-bit immediate mask (M). */
#define _mm256_blend_epi32(V1, V2, M) \
  (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                     (__v8si)(__m256i)(V2), (int)(M))
|
|
752
|
|
/* Broadcasts the low byte of __X to all 32 byte positions. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastb_epi8(__m128i __X)
{
  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

/* Broadcasts the low 16-bit element of __X to all 16 positions. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastw_epi16(__m128i __X)
{
  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

/* Broadcasts the low 32-bit element of __X to all 8 positions. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastd_epi32(__m128i __X)
{
  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
}

/* Broadcasts the low 64-bit element of __X to all 4 positions. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastq_epi64(__m128i __X)
{
  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
}
|
|
776
|
|
/* Broadcasts the low byte of __X to all 16 byte positions of a 128-bit
   result. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastb_epi8(__m128i __X)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

/* Broadcasts the low 16-bit element of __X to all 8 positions. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastw_epi16(__m128i __X)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
}


/* Broadcasts the low 32-bit element of __X to all 4 positions. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastd_epi32(__m128i __X)
{
  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
}

/* Broadcasts the low 64-bit element of __X to both positions. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastq_epi64(__m128i __X)
{
  return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
}
|
|
801
|
|
/* Permutes the eight 32-bit elements of __a across the full 256-bit vector
   using the index in the low bits of each element of __b. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
}
|
|
807
|
|
/* Permutes the four double-precision elements of (V) across the full vector
   per the 2-bit selectors of the immediate (M). */
#define _mm256_permute4x64_pd(V, M) \
  (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))
|
|
810
|
|
/* Permutes the eight single-precision elements of __a across the full
   vector using the indices in __b. */
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
{
  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
}
|
|
816
|
|
/* Permutes the four 64-bit elements of (V) across the full vector per the
   2-bit selectors of the immediate (M). */
#define _mm256_permute4x64_epi64(V, M) \
  (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))
|
|
819
|
|
/* Selects/zeros 128-bit halves from (V1) and (V2) per the immediate (M). */
#define _mm256_permute2x128_si256(V1, V2, M) \
  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))
|
|
822
|
|
/* Extracts the lower (M=0) or upper (M=1) 128-bit half of (V). */
#define _mm256_extracti128_si256(V, M) \
  (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))

/* Returns (V1) with its lower (M=0) or upper (M=1) 128-bit half replaced by
   (V2). */
#define _mm256_inserti128_si256(V1, V2, M) \
  (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                        (__v2di)(__m128i)(V2), (int)(M))
|
|
829
|
|
/* Loads 32-bit elements from __X where the corresponding mask element in
   __M has its high bit set; other result elements are zero.  Masked-off
   memory locations are not accessed. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskload_epi32(int const *__X, __m256i __M)
{
  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
}

/* Masked load of 64-bit elements from __X under mask __M (256-bit). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskload_epi64(long long const *__X, __m256i __M)
{
  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
}

/* Masked load of 32-bit elements from __X under mask __M (128-bit). */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskload_epi32(int const *__X, __m128i __M)
{
  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
}

/* Masked load of 64-bit elements from __X under mask __M (128-bit). */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskload_epi64(long long const *__X, __m128i __M)
{
  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
}
|
|
853
|
|
/* Stores 32-bit elements of __Y to __X where the corresponding mask element
   in __M has its high bit set; masked-off locations are left untouched. */
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
{
  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

/* Masked store of 64-bit elements of __Y to __X under mask __M (256-bit). */
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
{
  __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

/* Masked store of 32-bit elements of __Y to __X under mask __M (128-bit). */
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
{
  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

/* Masked store of 64-bit elements of __Y to __X under mask __M (128-bit). */
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
{
  __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
}
|
|
877
|
|
878 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
879 _mm256_sllv_epi32(__m256i __X, __m256i __Y)
|
|
880 {
|
|
881 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
|
|
882 }
|
|
883
|
|
884 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
885 _mm_sllv_epi32(__m128i __X, __m128i __Y)
|
|
886 {
|
|
887 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
|
|
888 }
|
|
889
|
|
890 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
891 _mm256_sllv_epi64(__m256i __X, __m256i __Y)
|
|
892 {
|
|
893 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
|
|
894 }
|
|
895
|
|
896 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
897 _mm_sllv_epi64(__m128i __X, __m128i __Y)
|
|
898 {
|
|
899 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
|
|
900 }
|
|
901
|
|
902 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
903 _mm256_srav_epi32(__m256i __X, __m256i __Y)
|
|
904 {
|
|
905 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
|
|
906 }
|
|
907
|
|
908 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
909 _mm_srav_epi32(__m128i __X, __m128i __Y)
|
|
910 {
|
|
911 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
|
|
912 }
|
|
913
|
|
914 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
915 _mm256_srlv_epi32(__m256i __X, __m256i __Y)
|
|
916 {
|
|
917 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
|
|
918 }
|
|
919
|
|
920 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
921 _mm_srlv_epi32(__m128i __X, __m128i __Y)
|
|
922 {
|
|
923 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
|
|
924 }
|
|
925
|
|
926 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
927 _mm256_srlv_epi64(__m256i __X, __m256i __Y)
|
|
928 {
|
|
929 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
|
|
930 }
|
|
931
|
|
932 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
933 _mm_srlv_epi64(__m128i __X, __m128i __Y)
|
|
934 {
|
|
935 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
|
|
936 }
|
|
937
|
|
/* Gathers two double-precision elements from memory at m, addressed by the
   32-bit indices in i scaled by s, merging with a under the element mask
   "mask". s must be a compile-time constant.
   Fix: the source operand was cast "(__v2df)(__m128i)(a)", routing the
   double vector through an integer vector type; every sibling gather macro
   casts the source through its matching FP type, so use (__m128d). */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                     (double const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2df)(__m128d)(mask), (s))
|
|
943
|
|
/* Masked gathers, floating-point forms: each macro loads elements from
   memory at m, addressed by the indices in i scaled by s, merging with the
   pass-through source a under the per-element mask "mask". s must be a
   compile-time constant. */

/* Four doubles gathered via 32-bit indices. */
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                        (double const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4df)(__m256d)(mask), (s))

/* Two doubles gathered via 64-bit indices. */
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                     (double const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2df)(__m128d)(mask), (s))

/* Four doubles gathered via 64-bit indices. */
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                        (double const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4df)(__m256d)(mask), (s))

/* Four floats gathered via 32-bit indices. */
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                    (float const *)(m), \
                                    (__v4si)(__m128i)(i), \
                                    (__v4sf)(__m128)(mask), (s))

/* Eight floats gathered via 32-bit indices. */
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                       (float const *)(m), \
                                       (__v8si)(__m256i)(i), \
                                       (__v8sf)(__m256)(mask), (s))

/* Four floats gathered via 64-bit indices. */
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                    (float const *)(m), \
                                    (__v2di)(__m128i)(i), \
                                    (__v4sf)(__m128)(mask), (s))

/* Four floats gathered via four 64-bit indices; note the 128-bit result
   even though the index vector is 256-bit. */
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                       (float const *)(m), \
                                       (__v4di)(__m256i)(i), \
                                       (__v4sf)(__m128)(mask), (s))
|
|
985
|
|
/* Masked gathers, integer forms: each macro loads elements from memory at
   m, addressed by the indices in i scaled by s, merging with the
   pass-through source a under the per-element mask "mask". s must be a
   compile-time constant. */

/* Four 32-bit elements gathered via 32-bit indices. */
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                    (int const *)(m), \
                                    (__v4si)(__m128i)(i), \
                                    (__v4si)(__m128i)(mask), (s))

/* Eight 32-bit elements gathered via 32-bit indices. */
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                       (int const *)(m), \
                                       (__v8si)(__m256i)(i), \
                                       (__v8si)(__m256i)(mask), (s))

/* Four 32-bit elements gathered via 64-bit indices (upper two lanes of the
   128-bit result come from only two indices). */
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                    (int const *)(m), \
                                    (__v2di)(__m128i)(i), \
                                    (__v4si)(__m128i)(mask), (s))

/* Four 32-bit elements gathered via four 64-bit indices; note the 128-bit
   result even though the index vector is 256-bit. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                       (int const *)(m), \
                                       (__v4di)(__m256i)(i), \
                                       (__v4si)(__m128i)(mask), (s))

/* Two 64-bit elements gathered via 32-bit indices. */
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                    (long long const *)(m), \
                                    (__v4si)(__m128i)(i), \
                                    (__v2di)(__m128i)(mask), (s))

/* Four 64-bit elements gathered via 32-bit indices. */
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                       (long long const *)(m), \
                                       (__v4si)(__m128i)(i), \
                                       (__v4di)(__m256i)(mask), (s))

/* Two 64-bit elements gathered via 64-bit indices. */
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                    (long long const *)(m), \
                                    (__v2di)(__m128i)(i), \
                                    (__v2di)(__m128i)(mask), (s))

/* Four 64-bit elements gathered via 64-bit indices. */
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                       (long long const *)(m), \
                                       (__v4di)(__m256i)(i), \
                                       (__v4di)(__m256i)(mask), (s))
|
|
1033
|
|
/* Unmasked gathers, floating-point forms: implemented in terms of the
   masked builtins with an undefined pass-through source and an all-ones
   mask (0.0 == 0.0 compares true in every lane, producing all set bits),
   so every element is gathered. s must be a compile-time constant. */

#define _mm_i32gather_pd(m, i, s) \
  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                     (double const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                          _mm_setzero_pd()), \
                                     (s))

#define _mm256_i32gather_pd(m, i, s) \
  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                        (double const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                              _mm256_setzero_pd(), \
                                                              _CMP_EQ_OQ), \
                                        (s))

#define _mm_i64gather_pd(m, i, s) \
  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                     (double const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                          _mm_setzero_pd()), \
                                     (s))

#define _mm256_i64gather_pd(m, i, s) \
  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
                                        (double const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                              _mm256_setzero_pd(), \
                                                              _CMP_EQ_OQ), \
                                        (s))

#define _mm_i32gather_ps(m, i, s) \
  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
                                    (float const *)(m), \
                                    (__v4si)(__m128i)(i), \
                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                         _mm_setzero_ps()), \
                                    (s))

#define _mm256_i32gather_ps(m, i, s) \
  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
                                       (float const *)(m), \
                                       (__v8si)(__m256i)(i), \
                                       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
                                                             _mm256_setzero_ps(), \
                                                             _CMP_EQ_OQ), \
                                       (s))

#define _mm_i64gather_ps(m, i, s) \
  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
                                    (float const *)(m), \
                                    (__v2di)(__m128i)(i), \
                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                         _mm_setzero_ps()), \
                                    (s))

/* 128-bit result and mask here: four floats gathered via four 64-bit
   indices. */
#define _mm256_i64gather_ps(m, i, s) \
  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                       (float const *)(m), \
                                       (__v4di)(__m256i)(i), \
                                       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                            _mm_setzero_ps()), \
                                       (s))
|
|
1100
|
|
/* Unmasked gathers, integer forms: implemented in terms of the masked
   builtins with an undefined pass-through source and a set1(-1) all-ones
   mask, so every element is gathered. s must be a compile-time constant. */

#define _mm_i32gather_epi32(m, i, s) \
  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
                                    (int const *)(m), (__v4si)(__m128i)(i), \
                                    (__v4si)_mm_set1_epi32(-1), (s))

#define _mm256_i32gather_epi32(m, i, s) \
  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
                                       (int const *)(m), (__v8si)(__m256i)(i), \
                                       (__v8si)_mm256_set1_epi32(-1), (s))

#define _mm_i64gather_epi32(m, i, s) \
  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
                                    (int const *)(m), (__v2di)(__m128i)(i), \
                                    (__v4si)_mm_set1_epi32(-1), (s))

/* 128-bit result here: four 32-bit elements gathered via four 64-bit
   indices. */
#define _mm256_i64gather_epi32(m, i, s) \
  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
                                       (int const *)(m), (__v4di)(__m256i)(i), \
                                       (__v4si)_mm_set1_epi32(-1), (s))

#define _mm_i32gather_epi64(m, i, s) \
  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
                                    (long long const *)(m), \
                                    (__v4si)(__m128i)(i), \
                                    (__v2di)_mm_set1_epi64x(-1), (s))

#define _mm256_i32gather_epi64(m, i, s) \
  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
                                       (long long const *)(m), \
                                       (__v4si)(__m128i)(i), \
                                       (__v4di)_mm256_set1_epi64x(-1), (s))

#define _mm_i64gather_epi64(m, i, s) \
  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
                                    (long long const *)(m), \
                                    (__v2di)(__m128i)(i), \
                                    (__v2di)_mm_set1_epi64x(-1), (s))

#define _mm256_i64gather_epi64(m, i, s) \
  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
                                       (long long const *)(m), \
                                       (__v4di)(__m256i)(i), \
                                       (__v4di)_mm256_set1_epi64x(-1), (s))
|
|
1144
|
|
1145 #undef __DEFAULT_FN_ATTRS256
|
|
1146 #undef __DEFAULT_FN_ATTRS128
|
|
1147
|
|
1148 #endif /* __AVX2INTRIN_H */
|