Mercurial > hg > CbC > CbC_llvm
comparison clang/lib/Headers/avx512vlvnniintrin.h @ 236:c4bab56944e8 llvm-original
LLVM 16
author | kono |
---|---|
date | Wed, 09 Nov 2022 17:45:10 +0900 |
parents | 79ff65ed7e25 |
children |
comparison
equal
deleted
inserted
replaced
232:70dce7da266c | 236:c4bab56944e8 |
---|---|
23 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer | 23 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer |
24 /// in \a S, and store the packed 32-bit results in DST. | 24 /// in \a S, and store the packed 32-bit results in DST. |
25 /// | 25 /// |
26 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. | 26 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. |
27 /// | 27 /// |
28 /// \operation | 28 /// \code{.operation} |
29 /// FOR j := 0 to 7 | 29 /// FOR j := 0 to 7 |
30 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) | 30 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) |
31 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) | 31 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) |
32 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) | 32 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) |
33 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) | 33 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) |
34 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 | 34 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 |
35 /// ENDFOR | 35 /// ENDFOR |
36 /// DST[MAX:256] := 0 | 36 /// DST[MAX:256] := 0 |
37 /// \endoperation | 37 /// \endcode |
38 #define _mm256_dpbusd_epi32(S, A, B) \ | 38 #define _mm256_dpbusd_epi32(S, A, B) \ |
39 (__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) | 39 ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) |
40 | 40 |
41 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with | 41 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with |
42 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed | 42 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed |
43 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer | 43 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer |
44 /// in \a S using signed saturation, and store the packed 32-bit results in DST. | 44 /// in \a S using signed saturation, and store the packed 32-bit results in DST. |
45 /// | 45 /// |
46 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. | 46 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. |
47 /// | 47 /// |
48 /// \operation | 48 /// \code{.operation} |
49 /// FOR j := 0 to 7 | 49 /// FOR j := 0 to 7 |
50 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) | 50 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) |
51 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) | 51 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) |
52 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) | 52 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) |
53 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) | 53 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) |
54 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) | 54 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) |
55 /// ENDFOR | 55 /// ENDFOR |
56 /// DST[MAX:256] := 0 | 56 /// DST[MAX:256] := 0 |
57 /// \endoperation | 57 /// \endcode |
58 #define _mm256_dpbusds_epi32(S, A, B) \ | 58 #define _mm256_dpbusds_epi32(S, A, B) \ |
59 (__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) | 59 ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) |
60 | 60 |
61 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with | 61 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with |
62 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit | 62 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit |
63 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, | 63 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, |
64 /// and store the packed 32-bit results in DST. | 64 /// and store the packed 32-bit results in DST. |
65 /// | 65 /// |
66 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. | 66 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. |
67 /// | 67 /// |
68 /// \operation | 68 /// \code{.operation} |
69 /// FOR j := 0 to 7 | 69 /// FOR j := 0 to 7 |
70 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) | 70 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) |
71 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) | 71 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) |
72 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 | 72 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 |
73 /// ENDFOR | 73 /// ENDFOR |
74 /// DST[MAX:256] := 0 | 74 /// DST[MAX:256] := 0 |
75 /// \endoperation | 75 /// \endcode |
76 #define _mm256_dpwssd_epi32(S, A, B) \ | 76 #define _mm256_dpwssd_epi32(S, A, B) \ |
77 (__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) | 77 ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) |
78 | 78 |
79 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with | 79 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with |
80 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit | 80 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit |
81 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S | 81 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S |
82 /// using signed saturation, and store the packed 32-bit results in DST. | 82 /// using signed saturation, and store the packed 32-bit results in DST. |
83 /// | 83 /// |
84 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. | 84 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. |
85 /// | 85 /// |
86 /// \operation | 86 /// \code{.operation} |
87 /// FOR j := 0 to 7 | 87 /// FOR j := 0 to 7 |
88 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) | 88 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) |
89 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) | 89 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) |
90 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) | 90 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) |
91 /// ENDFOR | 91 /// ENDFOR |
92 /// DST[MAX:256] := 0 | 92 /// DST[MAX:256] := 0 |
93 /// \endoperation | 93 /// \endcode |
94 #define _mm256_dpwssds_epi32(S, A, B) \ | 94 #define _mm256_dpwssds_epi32(S, A, B) \ |
95 (__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) | 95 ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) |
96 | 96 |
97 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with | 97 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with |
98 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed | 98 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed |
99 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer | 99 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer |
100 /// in \a S, and store the packed 32-bit results in DST. | 100 /// in \a S, and store the packed 32-bit results in DST. |
101 /// | 101 /// |
102 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. | 102 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. |
103 /// | 103 /// |
104 /// \operation | 104 /// \code{.operation} |
105 /// FOR j := 0 to 3 | 105 /// FOR j := 0 to 3 |
106 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) | 106 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) |
107 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) | 107 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) |
108 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) | 108 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) |
109 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) | 109 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) |
110 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 | 110 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 |
111 /// ENDFOR | 111 /// ENDFOR |
112 /// DST[MAX:128] := 0 | 112 /// DST[MAX:128] := 0 |
113 /// \endoperation | 113 /// \endcode |
114 #define _mm_dpbusd_epi32(S, A, B) \ | 114 #define _mm_dpbusd_epi32(S, A, B) \ |
115 (__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) | 115 ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) |
116 | 116 |
117 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with | 117 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with |
118 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed | 118 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed |
119 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer | 119 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer |
120 /// in \a S using signed saturation, and store the packed 32-bit results in DST. | 120 /// in \a S using signed saturation, and store the packed 32-bit results in DST. |
121 /// | 121 /// |
122 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. | 122 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. |
123 /// | 123 /// |
124 /// \operation | 124 /// \code{.operation} |
125 /// FOR j := 0 to 3 | 125 /// FOR j := 0 to 3 |
126 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) | 126 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) |
127 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) | 127 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) |
128 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) | 128 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) |
129 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) | 129 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) |
130 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) | 130 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) |
131 /// ENDFOR | 131 /// ENDFOR |
132 /// DST[MAX:128] := 0 | 132 /// DST[MAX:128] := 0 |
133 /// \endoperation | 133 /// \endcode |
134 #define _mm_dpbusds_epi32(S, A, B) \ | 134 #define _mm_dpbusds_epi32(S, A, B) \ |
135 (__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) | 135 ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) |
136 | 136 |
137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with | 137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with |
138 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit | 138 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit |
139 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, | 139 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, |
140 /// and store the packed 32-bit results in DST. | 140 /// and store the packed 32-bit results in DST. |
141 /// | 141 /// |
142 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. | 142 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. |
143 /// | 143 /// |
144 /// \operation | 144 /// \code{.operation} |
145 /// FOR j := 0 to 3 | 145 /// FOR j := 0 to 3 |
146 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) | 146 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) |
147 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) | 147 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) |
148 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 | 148 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 |
149 /// ENDFOR | 149 /// ENDFOR |
150 /// DST[MAX:128] := 0 | 150 /// DST[MAX:128] := 0 |
151 /// \endoperation | 151 /// \endcode |
152 #define _mm_dpwssd_epi32(S, A, B) \ | 152 #define _mm_dpwssd_epi32(S, A, B) \ |
153 (__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) | 153 ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) |
154 | 154 |
155 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with | 155 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with |
156 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit | 156 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit |
157 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S | 157 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S |
158 /// using signed saturation, and store the packed 32-bit results in DST. | 158 /// using signed saturation, and store the packed 32-bit results in DST. |
159 /// | 159 /// |
160 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. | 160 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. |
161 /// | 161 /// |
162 /// \operation | 162 /// \code{.operation} |
163 /// FOR j := 0 to 3 | 163 /// FOR j := 0 to 3 |
164 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) | 164 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) |
165 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) | 165 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) |
166 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) | 166 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) |
167 /// ENDFOR | 167 /// ENDFOR |
168 /// DST[MAX:128] := 0 | 168 /// DST[MAX:128] := 0 |
169 /// \endoperation | 169 /// \endcode |
170 #define _mm_dpwssds_epi32(S, A, B) \ | 170 #define _mm_dpwssds_epi32(S, A, B) \ |
171 (__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) | 171 ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) |
172 | 172 |
173 static __inline__ __m256i __DEFAULT_FN_ATTRS256 | 173 static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
174 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) | 174 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) |
175 { | 175 { |
176 return (__m256i)__builtin_ia32_selectd_256(__U, | 176 return (__m256i)__builtin_ia32_selectd_256(__U, |