annotate clang/lib/Headers/avx512vlvnniintrin.h @ 236:c4bab56944e8 llvm-original

LLVM 16
author kono
date Wed, 09 Nov 2022 17:45:10 +0900
parents 79ff65ed7e25
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
150
anatofuz
parents:
diff changeset
1 /*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
anatofuz
parents:
diff changeset
2 *
anatofuz
parents:
diff changeset
3 *
anatofuz
parents:
diff changeset
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
anatofuz
parents:
diff changeset
5 * See https://llvm.org/LICENSE.txt for license information.
anatofuz
parents:
diff changeset
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
anatofuz
parents:
diff changeset
7 *
anatofuz
parents:
diff changeset
8 *===-----------------------------------------------------------------------===
anatofuz
parents:
diff changeset
9 */
anatofuz
parents:
diff changeset
10 #ifndef __IMMINTRIN_H
anatofuz
parents:
diff changeset
11 #error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
anatofuz
parents:
diff changeset
12 #endif
anatofuz
parents:
diff changeset
13
anatofuz
parents:
diff changeset
14 #ifndef __AVX512VLVNNIINTRIN_H
anatofuz
parents:
diff changeset
15 #define __AVX512VLVNNIINTRIN_H
anatofuz
parents:
diff changeset
16
anatofuz
parents:
diff changeset
17 /* Define the default attributes for the functions in this file. */
anatofuz
parents:
diff changeset
18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
anatofuz
parents:
diff changeset
19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
anatofuz
parents:
diff changeset
20
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
21 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
22 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
23 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
24 /// in \a S, and store the packed 32-bit results in DST.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
25 ///
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
26 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
27 ///
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
28 /// \code{.operation}
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
29 /// FOR j := 0 to 7
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
30 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
31 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
32 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
33 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
34 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
35 /// ENDFOR
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
36 /// DST[MAX:256] := 0
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
37 /// \endcode
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
38 #define _mm256_dpbusd_epi32(S, A, B) \
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
39 ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
150
anatofuz
parents:
diff changeset
40
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
41 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
42 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
43 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
44 /// in \a S using signed saturation, and store the packed 32-bit results in DST.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
45 ///
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
46 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
47 ///
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
48 /// \code{.operation}
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
49 /// FOR j := 0 to 7
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
50 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
51 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
52 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
53 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
54 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
55 /// ENDFOR
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
56 /// DST[MAX:256] := 0
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
57 /// \endcode
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
58 #define _mm256_dpbusds_epi32(S, A, B) \
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
59 ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
60
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
61 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
62 /// corresponding signed 16-bit integers in \a B, producing 2 intermediate signed 32-bit
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
63 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
64 /// and store the packed 32-bit results in DST.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
65 ///
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
66 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
67 ///
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
68 /// \code{.operation}
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
69 /// FOR j := 0 to 7
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
70 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
71 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
72 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
73 /// ENDFOR
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
74 /// DST[MAX:256] := 0
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
75 /// \endcode
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
76 #define _mm256_dpwssd_epi32(S, A, B) \
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
77 ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
78
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
79 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
80 /// corresponding signed 16-bit integers in \a B, producing 2 intermediate signed 32-bit
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
81 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
82 /// using signed saturation, and store the packed 32-bit results in DST.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
83 ///
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
84 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
85 ///
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
86 /// \code{.operation}
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
87 /// FOR j := 0 to 7
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
88 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
89 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
90 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
91 /// ENDFOR
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
92 /// DST[MAX:256] := 0
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
93 /// \endcode
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
94 #define _mm256_dpwssds_epi32(S, A, B) \
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
95 ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
96
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
97 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
98 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
99 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
100 /// in \a S, and store the packed 32-bit results in DST.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
101 ///
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
102 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
103 ///
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
104 /// \code{.operation}
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
105 /// FOR j := 0 to 3
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
106 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
107 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
108 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
109 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
110 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
111 /// ENDFOR
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
112 /// DST[MAX:128] := 0
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
113 /// \endcode
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
114 #define _mm_dpbusd_epi32(S, A, B) \
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
115 ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
116
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
117 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
118 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
119 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
120 /// in \a S using signed saturation, and store the packed 32-bit results in DST.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
121 ///
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
122 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
123 ///
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
124 /// \code{.operation}
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
125 /// FOR j := 0 to 3
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
126 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
127 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
128 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
129 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
130 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
131 /// ENDFOR
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
132 /// DST[MAX:128] := 0
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
133 /// \endcode
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
134 #define _mm_dpbusds_epi32(S, A, B) \
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
135 ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
136
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
138 /// corresponding signed 16-bit integers in \a B, producing 2 intermediate signed 32-bit
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
139 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
140 /// and store the packed 32-bit results in DST.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
141 ///
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
142 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
143 ///
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
144 /// \code{.operation}
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
145 /// FOR j := 0 to 3
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
146 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
147 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
148 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
149 /// ENDFOR
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
150 /// DST[MAX:128] := 0
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
151 /// \endcode
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
152 #define _mm_dpwssd_epi32(S, A, B) \
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
153 ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
154
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
155 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
156 /// corresponding signed 16-bit integers in \a B, producing 2 intermediate signed 32-bit
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
157 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
158 /// using signed saturation, and store the packed 32-bit results in DST.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
159 ///
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
160 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
161 ///
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
162 /// \code{.operation}
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
163 /// FOR j := 0 to 3
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
164 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
165 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
166 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
167 /// ENDFOR
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
168 /// DST[MAX:128] := 0
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
169 /// \endcode
221
79ff65ed7e25 LLVM12 Original
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 150
diff changeset
170 #define _mm_dpwssds_epi32(S, A, B) \
236
c4bab56944e8 LLVM 16
kono
parents: 221
diff changeset
171 ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
150
anatofuz
parents:
diff changeset
172
anatofuz
parents:
diff changeset
173 static __inline__ __m256i __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
174 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
anatofuz
parents:
diff changeset
175 {
anatofuz
parents:
diff changeset
176 return (__m256i)__builtin_ia32_selectd_256(__U,
anatofuz
parents:
diff changeset
177 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
178 (__v8si)__S);
anatofuz
parents:
diff changeset
179 }
anatofuz
parents:
diff changeset
180
anatofuz
parents:
diff changeset
181 static __inline__ __m256i __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
182 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
anatofuz
parents:
diff changeset
183 {
anatofuz
parents:
diff changeset
184 return (__m256i)__builtin_ia32_selectd_256(__U,
anatofuz
parents:
diff changeset
185 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
186 (__v8si)_mm256_setzero_si256());
anatofuz
parents:
diff changeset
187 }
anatofuz
parents:
diff changeset
188
anatofuz
parents:
diff changeset
189 static __inline__ __m256i __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
190 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
anatofuz
parents:
diff changeset
191 {
anatofuz
parents:
diff changeset
192 return (__m256i)__builtin_ia32_selectd_256(__U,
anatofuz
parents:
diff changeset
193 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
194 (__v8si)__S);
anatofuz
parents:
diff changeset
195 }
anatofuz
parents:
diff changeset
196
anatofuz
parents:
diff changeset
197 static __inline__ __m256i __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
198 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
anatofuz
parents:
diff changeset
199 {
anatofuz
parents:
diff changeset
200 return (__m256i)__builtin_ia32_selectd_256(__U,
anatofuz
parents:
diff changeset
201 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
202 (__v8si)_mm256_setzero_si256());
anatofuz
parents:
diff changeset
203 }
anatofuz
parents:
diff changeset
204
anatofuz
parents:
diff changeset
205 static __inline__ __m256i __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
206 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
anatofuz
parents:
diff changeset
207 {
anatofuz
parents:
diff changeset
208 return (__m256i)__builtin_ia32_selectd_256(__U,
anatofuz
parents:
diff changeset
209 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
210 (__v8si)__S);
anatofuz
parents:
diff changeset
211 }
anatofuz
parents:
diff changeset
212
anatofuz
parents:
diff changeset
213 static __inline__ __m256i __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
214 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
anatofuz
parents:
diff changeset
215 {
anatofuz
parents:
diff changeset
216 return (__m256i)__builtin_ia32_selectd_256(__U,
anatofuz
parents:
diff changeset
217 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
218 (__v8si)_mm256_setzero_si256());
anatofuz
parents:
diff changeset
219 }
anatofuz
parents:
diff changeset
220
anatofuz
parents:
diff changeset
221 static __inline__ __m256i __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
222 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
anatofuz
parents:
diff changeset
223 {
anatofuz
parents:
diff changeset
224 return (__m256i)__builtin_ia32_selectd_256(__U,
anatofuz
parents:
diff changeset
225 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
226 (__v8si)__S);
anatofuz
parents:
diff changeset
227 }
anatofuz
parents:
diff changeset
228
anatofuz
parents:
diff changeset
229 static __inline__ __m256i __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
230 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
anatofuz
parents:
diff changeset
231 {
anatofuz
parents:
diff changeset
232 return (__m256i)__builtin_ia32_selectd_256(__U,
anatofuz
parents:
diff changeset
233 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
234 (__v8si)_mm256_setzero_si256());
anatofuz
parents:
diff changeset
235 }
anatofuz
parents:
diff changeset
236
anatofuz
parents:
diff changeset
237 static __inline__ __m128i __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
238 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
anatofuz
parents:
diff changeset
239 {
anatofuz
parents:
diff changeset
240 return (__m128i)__builtin_ia32_selectd_128(__U,
anatofuz
parents:
diff changeset
241 (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
242 (__v4si)__S);
anatofuz
parents:
diff changeset
243 }
anatofuz
parents:
diff changeset
244
anatofuz
parents:
diff changeset
245 static __inline__ __m128i __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
246 _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
anatofuz
parents:
diff changeset
247 {
anatofuz
parents:
diff changeset
248 return (__m128i)__builtin_ia32_selectd_128(__U,
anatofuz
parents:
diff changeset
249 (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
250 (__v4si)_mm_setzero_si128());
anatofuz
parents:
diff changeset
251 }
anatofuz
parents:
diff changeset
252
anatofuz
parents:
diff changeset
253 static __inline__ __m128i __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
254 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
anatofuz
parents:
diff changeset
255 {
anatofuz
parents:
diff changeset
256 return (__m128i)__builtin_ia32_selectd_128(__U,
anatofuz
parents:
diff changeset
257 (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
258 (__v4si)__S);
anatofuz
parents:
diff changeset
259 }
anatofuz
parents:
diff changeset
260
anatofuz
parents:
diff changeset
261 static __inline__ __m128i __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
262 _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
anatofuz
parents:
diff changeset
263 {
anatofuz
parents:
diff changeset
264 return (__m128i)__builtin_ia32_selectd_128(__U,
anatofuz
parents:
diff changeset
265 (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
266 (__v4si)_mm_setzero_si128());
anatofuz
parents:
diff changeset
267 }
anatofuz
parents:
diff changeset
268
anatofuz
parents:
diff changeset
269 static __inline__ __m128i __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
270 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
anatofuz
parents:
diff changeset
271 {
anatofuz
parents:
diff changeset
272 return (__m128i)__builtin_ia32_selectd_128(__U,
anatofuz
parents:
diff changeset
273 (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
274 (__v4si)__S);
anatofuz
parents:
diff changeset
275 }
anatofuz
parents:
diff changeset
276
anatofuz
parents:
diff changeset
277 static __inline__ __m128i __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
278 _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
anatofuz
parents:
diff changeset
279 {
anatofuz
parents:
diff changeset
280 return (__m128i)__builtin_ia32_selectd_128(__U,
anatofuz
parents:
diff changeset
281 (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
282 (__v4si)_mm_setzero_si128());
anatofuz
parents:
diff changeset
283 }
anatofuz
parents:
diff changeset
284
anatofuz
parents:
diff changeset
285 static __inline__ __m128i __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
286 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
anatofuz
parents:
diff changeset
287 {
anatofuz
parents:
diff changeset
288 return (__m128i)__builtin_ia32_selectd_128(__U,
anatofuz
parents:
diff changeset
289 (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
290 (__v4si)__S);
anatofuz
parents:
diff changeset
291 }
anatofuz
parents:
diff changeset
292
anatofuz
parents:
diff changeset
293 static __inline__ __m128i __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
294 _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
anatofuz
parents:
diff changeset
295 {
anatofuz
parents:
diff changeset
296 return (__m128i)__builtin_ia32_selectd_128(__U,
anatofuz
parents:
diff changeset
297 (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
anatofuz
parents:
diff changeset
298 (__v4si)_mm_setzero_si128());
anatofuz
parents:
diff changeset
299 }
anatofuz
parents:
diff changeset
300
anatofuz
parents:
diff changeset
301 #undef __DEFAULT_FN_ATTRS128
anatofuz
parents:
diff changeset
302 #undef __DEFAULT_FN_ATTRS256
anatofuz
parents:
diff changeset
303
anatofuz
parents:
diff changeset
304 #endif