150
|
1 /*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
|
|
2 *
|
|
3 *
|
|
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
5 * See https://llvm.org/LICENSE.txt for license information.
|
|
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7 *
|
|
8 *===-----------------------------------------------------------------------===
|
|
9 */
|
|
10 #ifndef __IMMINTRIN_H
|
|
11 #error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
|
|
12 #endif
|
|
13
|
|
14 #ifndef __AVX512VLVNNIINTRIN_H
|
|
15 #define __AVX512VLVNNIINTRIN_H
|
|
16
|
|
/* Define the default attributes for the functions in this file: always
 * inlined, no debug info, compiled for the avx512vl+avx512vnni target
 * features, with a minimum vector width matching the operand size. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
|
|
20
|
221
|
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \param S
///    A 256-bit vector of [8 x i32] holding the accumulators.
/// \param A
///    A 256-bit vector whose bytes are read as unsigned 8-bit integers.
/// \param B
///    A 256-bit vector whose bytes are read as signed 8-bit integers.
/// \returns A 256-bit vector of [8 x i32] holding the accumulated sums.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
|
150
|
40
|
221
|
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \param S
///    A 256-bit vector of [8 x i32] holding the accumulators.
/// \param A
///    A 256-bit vector whose bytes are read as unsigned 8-bit integers.
/// \param B
///    A 256-bit vector whose bytes are read as signed 8-bit integers.
/// \returns A 256-bit vector of [8 x i32] holding the saturated sums.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
|
221
|
60
|
|
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \param S
///    A 256-bit vector of [8 x i32] holding the accumulators.
/// \param A
///    A 256-bit vector whose words are read as signed 16-bit integers.
/// \param B
///    A 256-bit vector whose words are read as signed 16-bit integers.
/// \returns A 256-bit vector of [8 x i32] holding the accumulated sums.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
|
221
|
78
|
|
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S using signed saturation, and store the packed 32-bit
/// results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \param S
///    A 256-bit vector of [8 x i32] holding the accumulators.
/// \param A
///    A 256-bit vector whose words are read as signed 16-bit integers.
/// \param B
///    A 256-bit vector whose words are read as signed 16-bit integers.
/// \returns A 256-bit vector of [8 x i32] holding the saturated sums.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
|
221
|
96
|
|
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \param S
///    A 128-bit vector of [4 x i32] holding the accumulators.
/// \param A
///    A 128-bit vector whose bytes are read as unsigned 8-bit integers.
/// \param B
///    A 128-bit vector whose bytes are read as signed 8-bit integers.
/// \returns A 128-bit vector of [4 x i32] holding the accumulated sums.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
|
221
|
116
|
|
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \param S
///    A 128-bit vector of [4 x i32] holding the accumulators.
/// \param A
///    A 128-bit vector whose bytes are read as unsigned 8-bit integers.
/// \param B
///    A 128-bit vector whose bytes are read as signed 8-bit integers.
/// \returns A 128-bit vector of [4 x i32] holding the saturated sums.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
|
221
|
136
|
|
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \param S
///    A 128-bit vector of [4 x i32] holding the accumulators.
/// \param A
///    A 128-bit vector whose words are read as signed 16-bit integers.
/// \param B
///    A 128-bit vector whose words are read as signed 16-bit integers.
/// \returns A 128-bit vector of [4 x i32] holding the accumulated sums.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
|
221
|
154
|
|
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S using signed saturation, and store the packed 32-bit
/// results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \param S
///    A 128-bit vector of [4 x i32] holding the accumulators.
/// \param A
///    A 128-bit vector whose words are read as signed 16-bit integers.
/// \param B
///    A 128-bit vector whose words are read as signed 16-bit integers.
/// \returns A 128-bit vector of [4 x i32] holding the saturated sums.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
|
150
|
172
|
|
173 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
174 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
|
175 {
|
|
176 return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
177 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
|
|
178 (__v8si)__S);
|
|
179 }
|
|
180
|
|
181 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
182 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
|
183 {
|
|
184 return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
185 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
|
|
186 (__v8si)_mm256_setzero_si256());
|
|
187 }
|
|
188
|
|
189 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
190 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
|
191 {
|
|
192 return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
193 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
|
|
194 (__v8si)__S);
|
|
195 }
|
|
196
|
|
197 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
198 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
|
199 {
|
|
200 return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
201 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
|
|
202 (__v8si)_mm256_setzero_si256());
|
|
203 }
|
|
204
|
|
205 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
206 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
|
207 {
|
|
208 return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
209 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
|
|
210 (__v8si)__S);
|
|
211 }
|
|
212
|
|
213 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
214 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
|
215 {
|
|
216 return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
217 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
|
|
218 (__v8si)_mm256_setzero_si256());
|
|
219 }
|
|
220
|
|
221 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
222 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
|
223 {
|
|
224 return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
225 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
|
|
226 (__v8si)__S);
|
|
227 }
|
|
228
|
|
229 static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
|
230 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
|
231 {
|
|
232 return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
233 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
|
|
234 (__v8si)_mm256_setzero_si256());
|
|
235 }
|
|
236
|
|
237 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
238 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
|
239 {
|
|
240 return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
241 (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
|
|
242 (__v4si)__S);
|
|
243 }
|
|
244
|
|
245 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
246 _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
|
247 {
|
|
248 return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
249 (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
|
|
250 (__v4si)_mm_setzero_si128());
|
|
251 }
|
|
252
|
|
253 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
254 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
|
255 {
|
|
256 return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
257 (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
|
|
258 (__v4si)__S);
|
|
259 }
|
|
260
|
|
261 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
262 _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
|
263 {
|
|
264 return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
265 (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
|
|
266 (__v4si)_mm_setzero_si128());
|
|
267 }
|
|
268
|
|
269 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
270 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
|
271 {
|
|
272 return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
273 (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
|
|
274 (__v4si)__S);
|
|
275 }
|
|
276
|
|
277 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
278 _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
|
279 {
|
|
280 return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
281 (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
|
|
282 (__v4si)_mm_setzero_si128());
|
|
283 }
|
|
284
|
|
285 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
286 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
|
287 {
|
|
288 return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
289 (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
|
|
290 (__v4si)__S);
|
|
291 }
|
|
292
|
|
293 static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
|
294 _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
|
295 {
|
|
296 return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
297 (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
|
|
298 (__v4si)_mm_setzero_si128());
|
|
299 }
|
|
300
|
|
/* The attribute helper macros are private to this header; remove them so
 * they do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
|
|
303
|
|
304 #endif
|