1// Simd x86 specific implementations -*- C++ -*-
2
3// Copyright (C) 2020-2023 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
26#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
27
28#if __cplusplus >= 201703L
29
30#if !_GLIBCXX_SIMD_X86INTRIN
31#error \
32 "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
33#endif
34
35_GLIBCXX_SIMD_BEGIN_NAMESPACE
36
37// __to_masktype {{{
38// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
39// __vector_type_t.
40template <typename _Tp, size_t _Np>
41 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
42 __to_masktype(_SimdWrapper<_Tp, _Np> __x)
43 { return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(__x._M_data); }
44
45template <typename _TV,
46 typename _TVT
47 = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
48 typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
49 _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
50 __to_masktype(_TV __x)
51 { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
52
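// A minimal usage sketch (illustration only): __to_masktype changes the
// element type to the equally-sized integer without touching the bits, e.g.
//   __vector_type_t<float, 4> __v = {1.f, 0.f, -1.f, 2.f};
//   __vector_type_t<int, 4>   __m = __to_masktype(__v); // same 128 bits,
//                                                       // viewed as int lanes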
53// }}}
54// __interleave128_lo {{{
55template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
56 typename _Trait = _VectorTraits<_Tp>>
57 _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
58 __interleave128_lo(const _Ap& __av, const _Bp& __bv)
59 {
60 const _Tp __a(__av);
61 const _Tp __b(__bv);
62 if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
63 return _Tp{__a[0], __b[0]};
64 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
65 return _Tp{__a[0], __b[0], __a[1], __b[1]};
66 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
67 return _Tp{__a[0], __b[0], __a[1], __b[1],
68 __a[2], __b[2], __a[3], __b[3]};
69 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
70 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
71 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
72 __a[6], __b[6], __a[7], __b[7]};
73 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
74 return _Tp{__a[0], __b[0], __a[2], __b[2]};
75 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
76 return _Tp{__a[0], __b[0], __a[1], __b[1],
77 __a[4], __b[4], __a[5], __b[5]};
78 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
79 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
80 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
81 __a[10], __b[10], __a[11], __b[11]};
82 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
83 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
84 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
85 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
86 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
87 __a[22], __b[22], __a[23], __b[23]};
88 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
89 return _Tp{__a[0], __b[0], __a[2], __b[2],
90 __a[4], __b[4], __a[6], __b[6]};
91 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
92 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
93 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
94 __a[12], __b[12], __a[13], __b[13]};
95 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
96 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
97 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
98 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
99 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
100 __a[26], __b[26], __a[27], __b[27]};
101 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
102 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
103 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
104 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
105 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
106 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
107 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
108 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
109 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
110 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
111 __b[55]};
112 else
113 __assert_unreachable<_Tp>();
114 }
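// Worked example (sketch): for two 32-byte float vectors __a = {a0, ..., a7}
// and __b = {b0, ..., b7} the sizeof(_Tp) == 32, _S_full_size == 8 branch
// returns {a0, b0, a1, b1, a4, b4, a5, b5}, i.e. the low half of each 128-bit
// lane is interleaved, matching what unpacklo does per lane.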
115
116// }}}
117// __is_zero{{{
118template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
119 _GLIBCXX_SIMD_INTRINSIC constexpr bool
120 __is_zero(_Tp __a)
121 {
122 if (!__builtin_is_constant_evaluated())
123 {
124 if constexpr (__have_avx)
125 {
126 if constexpr (_TVT::template _S_is<float, 8>)
127 return _mm256_testz_ps(__a, __a);
128 else if constexpr (_TVT::template _S_is<double, 4>)
129 return _mm256_testz_pd(__a, __a);
130 else if constexpr (sizeof(_Tp) == 32)
131 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
132 else if constexpr (_TVT::template _S_is<float>)
133 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
134 else if constexpr (_TVT::template _S_is<double, 2>)
135 return _mm_testz_pd(__a, __a);
136 else
137 return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
138 }
139 else if constexpr (__have_sse4_1)
140 return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
141 __intrin_bitcast<__m128i>(__a));
142 }
143 else if constexpr (sizeof(_Tp) <= 8)
144 return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
145 else
146 {
147 const auto __b = __vector_bitcast<_LLong>(__a);
148 if constexpr (sizeof(__b) == 16)
149 return (__b[0] | __b[1]) == 0;
150 else if constexpr (sizeof(__b) == 32)
151 return __is_zero(__lo128(__b) | __hi128(__b));
152 else if constexpr (sizeof(__b) == 64)
153 return __is_zero(__lo256(__b) | __hi256(__b));
154 else
155 __assert_unreachable<_Tp>();
156 }
157 }
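// Sketch of the constant-evaluation fallback: a 32-byte vector is viewed as
// long long lanes, its two 128-bit halves are ORed, and the 16-byte case then
// checks (__b[0] | __b[1]) == 0 -- no intrinsics, hence usable in constexpr.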
158
159// }}}
160// __movemask{{{
161template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
162 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
163 __movemask(_Tp __a)
164 {
165 if constexpr (sizeof(_Tp) == 32)
166 {
167 if constexpr (_TVT::template _S_is<float>)
168 return _mm256_movemask_ps(__to_intrin(__a));
169 else if constexpr (_TVT::template _S_is<double>)
170 return _mm256_movemask_pd(__to_intrin(__a));
171 else
172 return _mm256_movemask_epi8(__to_intrin(__a));
173 }
174 else if constexpr (_TVT::template _S_is<float>)
175 return _mm_movemask_ps(__to_intrin(__a));
176 else if constexpr (_TVT::template _S_is<double>)
177 return _mm_movemask_pd(__to_intrin(__a));
178 else
179 return _mm_movemask_epi8(__to_intrin(__a));
180 }
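// Example (sketch): __movemask collects the most significant (sign) bit of
// every element -- of every byte in the integer case -- into an int, so a
// 4-float compare result whose first two lanes are all-ones yields 0b0011.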
181
182// }}}
183// __testz{{{
184template <typename _TI, typename _TVT = _VectorTraits<_TI>>
185 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
186 __testz(_TI __a, _TI __b)
187 {
188 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
189 _TVT::_S_full_size>>);
190 if (!__builtin_is_constant_evaluated())
191 {
192 if constexpr (sizeof(_TI) == 32)
193 {
194 if constexpr (_TVT::template _S_is<float>)
195 return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
196 else if constexpr (_TVT::template _S_is<double>)
197 return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
198 else
199 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
200 }
201 else if constexpr (_TVT::template _S_is<float> && __have_avx)
202 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
203 else if constexpr (_TVT::template _S_is<double> && __have_avx)
204 return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
205 else if constexpr (__have_sse4_1)
206 return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
207 __intrin_bitcast<__m128i>(__to_intrin(__b)));
208 else
209 return __movemask(0 == __and(__a, __b)) != 0;
210 }
211 else
212 return __is_zero(__and(__a, __b));
213 }
214
215// }}}
216// __testc{{{
217// requires SSE4.1 or above
218template <typename _TI, typename _TVT = _VectorTraits<_TI>>
219 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
220 __testc(_TI __a, _TI __b)
221 {
222 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
223 _TVT::_S_full_size>>);
224 if (__builtin_is_constant_evaluated())
225 return __is_zero(__andnot(__a, __b));
226
227 if constexpr (sizeof(_TI) == 32)
228 {
229 if constexpr (_TVT::template _S_is<float>)
230 return _mm256_testc_ps(__a, __b);
231 else if constexpr (_TVT::template _S_is<double>)
232 return _mm256_testc_pd(__a, __b);
233 else
234 return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
235 }
236 else if constexpr (_TVT::template _S_is<float> && __have_avx)
237 return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
238 else if constexpr (_TVT::template _S_is<double> && __have_avx)
239 return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
240 else
241 {
242 static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
243 return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
244 __intrin_bitcast<__m128i>(__to_intrin(__b)));
245 }
246 }
247
248// }}}
249// __testnzc{{{
250template <typename _TI, typename _TVT = _VectorTraits<_TI>>
251 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
252 __testnzc(_TI __a, _TI __b)
253 {
254 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
255 _TVT::_S_full_size>>);
256 if (!__builtin_is_constant_evaluated())
257 {
258 if constexpr (sizeof(_TI) == 32)
259 {
260 if constexpr (_TVT::template _S_is<float>)
261 return _mm256_testnzc_ps(__a, __b);
262 else if constexpr (_TVT::template _S_is<double>)
263 return _mm256_testnzc_pd(__a, __b);
264 else
265 return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
266 }
267 else if constexpr (_TVT::template _S_is<float> && __have_avx)
268 return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
269 else if constexpr (_TVT::template _S_is<double> && __have_avx)
270 return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
271 else if constexpr (__have_sse4_1)
272 return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
273 __intrin_bitcast<__m128i>(__to_intrin(__b)));
274 else
275 return __movemask(0 == __and(__a, __b)) == 0
276 && __movemask(0 == __andnot(__a, __b)) == 0;
277 }
278 else
279 return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
280 }
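// Summary of the three ptest-style predicates, as reflected by their
// constant-evaluation fallbacks above:
//   __testz(a, b)   != 0 iff (a & b) == 0   (ZF of ptest)
//   __testc(a, b)   != 0 iff (~a & b) == 0  (CF of ptest)
//   __testnzc(a, b) != 0 iff neither of the above holds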
281
282// }}}
283// __xzyw{{{
284// shuffles the complete vector, swapping the inner two quarters. Often useful
285// on AVX for fixing up a shuffle result.
286template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
287 _GLIBCXX_SIMD_INTRINSIC _Tp
288 __xzyw(_Tp __a)
289 {
290 if constexpr (sizeof(_Tp) == 16)
291 {
292 const auto __x = __vector_bitcast<conditional_t<
293 is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
294 return reinterpret_cast<_Tp>(
295 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
296 }
297 else if constexpr (sizeof(_Tp) == 32)
298 {
299 const auto __x = __vector_bitcast<conditional_t<
300 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
301 return reinterpret_cast<_Tp>(
302 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
303 }
304 else if constexpr (sizeof(_Tp) == 64)
305 {
306 const auto __x = __vector_bitcast<conditional_t<
307 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
308 return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
309 __x[5], __x[2], __x[3],
310 __x[6], __x[7]});
311 }
312 else
313 __assert_unreachable<_Tp>();
314 }
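// Worked example (sketch): viewing a 32-byte vector as four 64-bit quarters
// {Q0, Q1, Q2, Q3}, __xzyw returns {Q0, Q2, Q1, Q3}. This undoes the per-lane
// interleaving that 256-bit pack/unpack instructions produce (see the use in
// _S_store_bool_array below).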
315
316// }}}
317// __maskload_epi32{{{
318template <typename _Tp>
319 _GLIBCXX_SIMD_INTRINSIC auto
320 __maskload_epi32(const int* __ptr, _Tp __k)
321 {
322 if constexpr (sizeof(__k) == 16)
323 return _mm_maskload_epi32(__ptr, __k);
324 else
325 return _mm256_maskload_epi32(__ptr, __k);
326 }
327
328// }}}
329// __maskload_epi64{{{
330template <typename _Tp>
331 _GLIBCXX_SIMD_INTRINSIC auto
332 __maskload_epi64(const _LLong* __ptr, _Tp __k)
333 {
334 if constexpr (sizeof(__k) == 16)
335 return _mm_maskload_epi64(__ptr, __k);
336 else
337 return _mm256_maskload_epi64(__ptr, __k);
338 }
339
340// }}}
341// __maskload_ps{{{
342template <typename _Tp>
343 _GLIBCXX_SIMD_INTRINSIC auto
344 __maskload_ps(const float* __ptr, _Tp __k)
345 {
346 if constexpr (sizeof(__k) == 16)
347 return _mm_maskload_ps(__ptr, __k);
348 else
349 return _mm256_maskload_ps(__ptr, __k);
350 }
351
352// }}}
353// __maskload_pd{{{
354template <typename _Tp>
355 _GLIBCXX_SIMD_INTRINSIC auto
356 __maskload_pd(const double* __ptr, _Tp __k)
357 {
358 if constexpr (sizeof(__k) == 16)
359 return _mm_maskload_pd(__ptr, __k);
360 else
361 return _mm256_maskload_pd(__ptr, __k);
362 }
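// Usage sketch for the __maskload_* helpers above: they wrap the AVX/AVX2
// maskload intrinsics, which load only the elements whose mask element has
// its most significant bit set and zero all others. E.g. a __maskload_ps mask
// whose first two 32-bit lanes are -1 and whose rest are 0 loads only the
// first two floats; the remaining lanes read as 0.0f.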
363
364// }}}
365
366#ifdef __clang__
367template <size_t _Np, typename _Tp, typename _Kp>
368 _GLIBCXX_SIMD_INTRINSIC constexpr auto
369 __movm(_Kp __k) noexcept
370 {
371 static_assert(is_unsigned_v<_Kp>);
372 if constexpr (sizeof(_Tp) == 1 && __have_avx512bw)
373 {
374 if constexpr (_Np <= 16 && __have_avx512vl)
375 return __builtin_ia32_cvtmask2b128(__k);
376 else if constexpr (_Np <= 32 && __have_avx512vl)
377 return __builtin_ia32_cvtmask2b256(__k);
378 else
379 return __builtin_ia32_cvtmask2b512(__k);
380 }
381 else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw)
382 {
383 if constexpr (_Np <= 8 && __have_avx512vl)
384 return __builtin_ia32_cvtmask2w128(__k);
385 else if constexpr (_Np <= 16 && __have_avx512vl)
386 return __builtin_ia32_cvtmask2w256(__k);
387 else
388 return __builtin_ia32_cvtmask2w512(__k);
389 }
390 else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq)
391 {
392 if constexpr (_Np <= 4 && __have_avx512vl)
393 return __builtin_ia32_cvtmask2d128(__k);
394 else if constexpr (_Np <= 8 && __have_avx512vl)
395 return __builtin_ia32_cvtmask2d256(__k);
396 else
397 return __builtin_ia32_cvtmask2d512(__k);
398 }
399 else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq)
400 {
401 if constexpr (_Np <= 2 && __have_avx512vl)
402 return __builtin_ia32_cvtmask2q128(__k);
403 else if constexpr (_Np <= 4 && __have_avx512vl)
404 return __builtin_ia32_cvtmask2q256(__k);
405 else
406 return __builtin_ia32_cvtmask2q512(__k);
407 }
408 else
409 __assert_unreachable<_Tp>();
410 }
411#endif // __clang__
412
413#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
414#include "simd_x86_conversions.h"
415#endif
416
417// ISA & type detection {{{
418template <typename _Tp, size_t _Np>
419 constexpr bool
420 __is_sse_ps()
421 {
422 return __have_sse
423 && is_same_v<_Tp,
424 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
425 }
426
427template <typename _Tp, size_t _Np>
428 constexpr bool
429 __is_sse_pd()
430 {
431 return __have_sse2
432 && is_same_v<_Tp,
433 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
434 }
435
436template <typename _Tp, size_t _Np>
437 constexpr bool
438 __is_avx_ps()
439 {
440 return __have_avx
441 && is_same_v<_Tp,
442 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
443 }
444
445template <typename _Tp, size_t _Np>
446 constexpr bool
447 __is_avx_pd()
448 {
449 return __have_avx
450 && is_same_v<_Tp,
451 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
452 }
453
454template <typename _Tp, size_t _Np>
455 constexpr bool
456 __is_avx512_ps()
457 {
458 return __have_avx512f
459 && is_same_v<_Tp,
460 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
461 }
462
463template <typename _Tp, size_t _Np>
464 constexpr bool
465 __is_avx512_pd()
466 {
467 return __have_avx512f
468 && is_same_v<_Tp,
469 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
470 }
471
472// }}}
473struct _MaskImplX86Mixin;
474
475// _CommonImplX86 {{{
476struct _CommonImplX86 : _CommonImplBuiltin
477{
478#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
479 // _S_converts_via_decomposition {{{
480 template <typename _From, typename _To, size_t _ToSize>
481 static constexpr bool
482 _S_converts_via_decomposition()
483 {
484 if constexpr (is_integral_v<
485 _From> && is_integral_v<_To> && sizeof(_From) == 8
486 && _ToSize == 16)
487 return (sizeof(_To) == 2 && !__have_ssse3)
488 || (sizeof(_To) == 1 && !__have_avx512f);
489 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
490 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
491 && !__have_avx512dq)
492 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
493 && _ToSize == 16);
494 else if constexpr (
495 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8
496 && !__have_avx512dq)
497 return (sizeof(_To) == 4 && _ToSize == 16)
498 || (sizeof(_To) == 8 && _ToSize < 64);
499 else
500 return false;
501 }
502
503 template <typename _From, typename _To, size_t _ToSize>
504 static inline constexpr bool __converts_via_decomposition_v
505 = _S_converts_via_decomposition<_From, _To, _ToSize>();
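 // Example (sketch): __converts_via_decomposition_v<double, long long, 16>
 // is true without AVX512DQ, because a packed double -> int64 conversion
 // only exists with AVX512DQ; the conversion then decomposes into scalar
 // conversions.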
506
507 // }}}
508#endif
509 // _S_store {{{
510 using _CommonImplBuiltin::_S_store;
511
512 template <typename _Tp, size_t _Np>
513 _GLIBCXX_SIMD_INTRINSIC static void
514 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
515 {
516 constexpr size_t _Bytes = _Np * sizeof(_Tp);
517
518 if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
519 {
520 const auto __v = __to_intrin(__x);
521
522 if constexpr (_Bytes & 1)
523 {
524 if constexpr (_Bytes < 16)
525 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
526 __intrin_bitcast<__m128i>(__v));
527 else if constexpr (_Bytes < 32)
528 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
529 __intrin_bitcast<__m256i>(__v));
530 else
531 _mm512_mask_storeu_epi8(__addr,
532 0xffffffffffffffffull >> (64 - _Bytes),
533 __intrin_bitcast<__m512i>(__v));
534 }
535 else if constexpr (_Bytes & 2)
536 {
537 if constexpr (_Bytes < 16)
538 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
539 __intrin_bitcast<__m128i>(__v));
540 else if constexpr (_Bytes < 32)
541 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
542 __intrin_bitcast<__m256i>(__v));
543 else
544 _mm512_mask_storeu_epi16(__addr,
545 0xffffffffull >> (32 - _Bytes / 2),
546 __intrin_bitcast<__m512i>(__v));
547 }
548 else if constexpr (_Bytes & 4)
549 {
550 if constexpr (_Bytes < 16)
551 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
552 __intrin_bitcast<__m128i>(__v));
553 else if constexpr (_Bytes < 32)
554 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
555 __intrin_bitcast<__m256i>(__v));
556 else
557 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
558 __intrin_bitcast<__m512i>(__v));
559 }
560 else
561 {
562 static_assert(
563 _Bytes > 16,
564 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
565 "- 1)) != 0 is impossible");
566 if constexpr (_Bytes < 32)
567 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
568 __intrin_bitcast<__m256i>(__v));
569 else
570 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
571 __intrin_bitcast<__m512i>(__v));
572 }
573 }
574 else
575 _CommonImplBuiltin::_S_store(__x, __addr);
576 }
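 // Worked example (sketch): storing a 3-element float vector (_Bytes == 12)
 // with AVX512BW+VL takes the `_Bytes & 4` branch and issues
 //   _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - 3), ...)  // mask 0b0111
 // i.e. one masked store instead of splitting into smaller stores.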
577
578 // }}}
579 // _S_store_bool_array(_BitMask) {{{
580 template <size_t _Np, bool _Sanitized>
581 _GLIBCXX_SIMD_INTRINSIC static constexpr void
582 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
583 {
584 if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
585 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
586 [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
587 if constexpr (_Np <= 16)
588 return _mm_movm_epi8(__x._M_to_bits());
589 else if constexpr (_Np <= 32)
590 return _mm256_movm_epi8(__x._M_to_bits());
591 else if constexpr (_Np <= 64)
592 return _mm512_movm_epi8(__x._M_to_bits());
593 else
594 __assert_unreachable<_SizeConstant<_Np>>();
595 }()),
596 __mem);
597 else if constexpr (__have_bmi2)
598 {
599 if constexpr (_Np <= 4)
600 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
601 else
602 __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
603 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
604 constexpr size_t __offset = __i * sizeof(size_t);
605 constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
606 if constexpr (__todo == 1)
607 __mem[__offset] = __x[__offset];
608 else
609 {
610 const auto __bools =
611#ifdef __x86_64__
612 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
613 0x0101010101010101ULL);
614#else // __x86_64__
615 _pdep_u32(
616 __x.template _M_extract<__offset>()._M_to_bits(),
617 0x01010101U);
618#endif // __x86_64__
619 _S_store<__todo>(__bools, __mem + __offset);
620 }
621 });
622 }
623 else if constexpr (__have_sse2 && _Np > 7)
624 __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
625 constexpr int __offset = __i * 16;
626 constexpr int __todo = std::min(16, int(_Np) - __offset);
627 const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
628 __vector_type16_t<_UChar> __bools;
629 if constexpr (__have_avx512f)
630 {
631 auto __as32bits
632 = _mm512_maskz_mov_epi32(__bits, __to_intrin(
633 __vector_broadcast<16>(1)));
634 auto __as16bits
635 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
636 __todo > 8 ? __hi256(__as32bits)
637 : __m256i()));
638 __bools = __vector_bitcast<_UChar>(
639 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
640 }
641 else
642 {
643 using _V = __vector_type_t<_UChar, 16>;
644 auto __tmp = _mm_cvtsi32_si128(__bits);
645 __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
646 __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
647 __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
648 _V __tmp2 = reinterpret_cast<_V>(__tmp);
649 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
650 1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
651 __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
652 }
653 _S_store<__todo>(__bools, __mem + __offset);
654 });
655 else
656 _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
657 }
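 // Worked example (sketch) for the BMI2 path with _Np == 4: _pdep_u32 spreads
 // each mask bit into its own byte, e.g.
 //   _pdep_u32(0b1011, 0x01010101U) == 0x01000101
 // which stores the bytes {1, 1, 0, 1} -- exactly the bool values.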
658
659 // }}}
660 // _S_blend_avx512 {{{
661 // Returns: __k ? __b : __a
662 // TODO: reverse __a and __b to match COND_EXPR
663 // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask
664 // __k
665 template <typename _Kp, typename _TV>
666 _GLIBCXX_SIMD_INTRINSIC static _TV
667 _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
668 {
669 static_assert(__is_vector_type_v<_TV>);
670 using _Tp = typename _VectorTraits<_TV>::value_type;
671 static_assert(sizeof(_TV) >= 16);
672 static_assert(sizeof(_Tp) <= 8);
673#ifdef __clang__
674 return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a;
675#else
676 using _IntT
677 = conditional_t<(sizeof(_Tp) > 2),
678 conditional_t<sizeof(_Tp) == 4, int, long long>,
679 conditional_t<sizeof(_Tp) == 1, char, short>>;
680 [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
681 [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
682 if constexpr (sizeof(_TV) == 64)
683 {
684 if constexpr (sizeof(_Tp) == 1)
685 return reinterpret_cast<_TV>(
686 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
687 else if constexpr (sizeof(_Tp) == 2)
688 return reinterpret_cast<_TV>(
689 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
690 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
691 return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
692 else if constexpr (sizeof(_Tp) == 4)
693 return reinterpret_cast<_TV>(
694 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
695 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
696 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
697 else if constexpr (sizeof(_Tp) == 8)
698 return reinterpret_cast<_TV>(
699 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
700 }
701 else if constexpr (sizeof(_TV) == 32)
702 {
703 if constexpr (sizeof(_Tp) == 1)
704 return reinterpret_cast<_TV>(
705 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
706 else if constexpr (sizeof(_Tp) == 2)
707 return reinterpret_cast<_TV>(
708 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
709 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
710 return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
711 else if constexpr (sizeof(_Tp) == 4)
712 return reinterpret_cast<_TV>(
713 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
714 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
715 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
716 else if constexpr (sizeof(_Tp) == 8)
717 return reinterpret_cast<_TV>(
718 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
719 }
720 else if constexpr (sizeof(_TV) == 16)
721 {
722 if constexpr (sizeof(_Tp) == 1)
723 return reinterpret_cast<_TV>(
724 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
725 else if constexpr (sizeof(_Tp) == 2)
726 return reinterpret_cast<_TV>(
727 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
728 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
729 return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
730 else if constexpr (sizeof(_Tp) == 4)
731 return reinterpret_cast<_TV>(
732 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
733 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
734 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
735 else if constexpr (sizeof(_Tp) == 8)
736 return reinterpret_cast<_TV>(
737 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
738 }
739#endif
740 }
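 // Example (sketch): for a 16-float vector, _S_blend_avx512(0b0101, __a, __b)
 // yields {__b[0], __a[1], __b[2], __a[3], __a[4], ...}; set bits in __k pick
 // the element from __b, clear bits keep __a.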
741
742 // }}}
743 // _S_blend_intrin {{{
744 // Returns: __k ? __b : __a
745 // TODO: reverse __a and __b to match COND_EXPR
746 // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
747 // Bytes wide
748 template <typename _Tp>
749 _GLIBCXX_SIMD_INTRINSIC static _Tp
750 _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
751 {
752 static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
753 constexpr struct
754 {
755 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
756 __m128 __k) const noexcept
757 {
758 return __builtin_ia32_blendvps(__a, __b, __k);
759 }
760 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
761 __m128d __k) const noexcept
762 {
763 return __builtin_ia32_blendvpd(__a, __b, __k);
764 }
765 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
766 __m128i __k) const noexcept
767 {
768 return reinterpret_cast<__m128i>(
769 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
770 reinterpret_cast<__v16qi>(__b),
771 reinterpret_cast<__v16qi>(__k)));
772 }
773 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
774 __m256 __k) const noexcept
775 {
776 return __builtin_ia32_blendvps256(__a, __b, __k);
777 }
778 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
779 __m256d __k) const noexcept
780 {
781 return __builtin_ia32_blendvpd256(__a, __b, __k);
782 }
783 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
784 __m256i __k) const noexcept
785 {
786 if constexpr (__have_avx2)
787 return reinterpret_cast<__m256i>(
788 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
789 reinterpret_cast<__v32qi>(__b),
790 reinterpret_cast<__v32qi>(__k)));
791 else
792 return reinterpret_cast<__m256i>(
793 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
794 reinterpret_cast<__v8sf>(__b),
795 reinterpret_cast<__v8sf>(__k)));
796 }
797 } __eval;
798 return __eval(__a, __b, __k);
799 }
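 // Note (sketch): the blendv/pblendvb intrinsics used above select on the
 // most significant bit of each element of __k (of each byte in the integer
 // variant), so __k is expected to carry all-bits-set/all-bits-clear
 // elements, e.g. the result of a vector comparison.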
800
801 // }}}
802 // _S_blend {{{
803 // Returns: __k ? __at1 : __at0
804 // TODO: reverse __at0 and __at1 to match COND_EXPR
805 template <typename _Tp, size_t _Np>
806 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
807 _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
808 _SimdWrapper<_Tp, _Np> __at1)
809 {
810 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
811 if (__k._M_is_constprop() && __at0._M_is_constprop()
812 && __at1._M_is_constprop())
813 return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
814 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
815 return __k[__i] ? __at1[__i] : __at0[__i];
816 });
817 else if constexpr (sizeof(__at0) == 64
818 || (__have_avx512vl && sizeof(__at0) >= 16))
819 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
820 else
821 {
822 static_assert((__have_avx512vl && sizeof(__at0) < 16)
823 || !__have_avx512vl);
824 constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
825 return __vector_bitcast<_Tp, _Np>(
826 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
827 __vector_bitcast<_Tp, __size>(__at1)));
828 }
829 }
830
831 template <typename _Tp, size_t _Np>
832 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
833 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
834 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
835 {
836 const auto __kk = __wrapper_bitcast<_Tp>(__k);
837 if (__builtin_is_constant_evaluated()
838 || (__kk._M_is_constprop() && __at0._M_is_constprop()
839 && __at1._M_is_constprop()))
840 {
841 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
842 if (__r._M_is_constprop())
843 return __r;
844 }
845 if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
846 && (sizeof(_Tp) >= 4 || __have_avx512bw))
847 // convert to bitmask and call overload above
848 return _S_blend(
849 _SimdWrapper<bool, _Np>(
850 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
851 ._M_to_bits()),
852 __at0, __at1);
853 else
854 {
855 // Since GCC does not assume __k to be a mask, using the builtin
856 // conditional operator introduces an extra compare against 0 before
857 // blending. So we call the intrinsic here instead.
858 if constexpr (__have_sse4_1)
859 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
860 __to_intrin(__at1));
861 else
862 return __or(__andnot(__kk, __at0), __and(__kk, __at1));
863 }
864 }
865
866 // }}}
867};
868
869// }}}
870// _SimdImplX86 {{{
871template <typename _Abi, typename>
872 struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
873 {
874 using _Base = _SimdImplBuiltin<_Abi>;
875
876 template <typename _Tp>
877 using _MaskMember = typename _Base::template _MaskMember<_Tp>;
878
879 template <typename _Tp>
880 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
881
882 template <typename _Tp>
883 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
884
885 template <typename _Tp>
886 static constexpr size_t _S_max_store_size
887 = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
888 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32
889 : 16;
890
891 using _MaskImpl = typename _Abi::_MaskImpl;
892
893 // _S_masked_load {{{
894 template <typename _Tp, size_t _Np, typename _Up>
895 static inline _SimdWrapper<_Tp, _Np>
896 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
897 const _Up* __mem) noexcept
898 {
899 static_assert(_Np == _S_size<_Tp>);
900 if constexpr (is_same_v<_Tp, _Up> || // no conversion
901 (sizeof(_Tp) == sizeof(_Up)
902 && is_integral_v<
903 _Tp> == is_integral_v<_Up>) // conversion via bit
904 // reinterpretation
905 )
906 {
907 [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
908 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
909 && sizeof(_Tp) == 1)
910 {
911 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
912 if constexpr (sizeof(__intrin) == 16)
913 __merge = __vector_bitcast<_Tp, _Np>(
914 _mm_mask_loadu_epi8(__intrin, __kk, __mem));
915 else if constexpr (sizeof(__merge) == 32)
916 __merge = __vector_bitcast<_Tp, _Np>(
917 _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
918 else if constexpr (sizeof(__merge) == 64)
919 __merge = __vector_bitcast<_Tp, _Np>(
920 _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
921 else
922 __assert_unreachable<_Tp>();
923 }
924 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
925 && sizeof(_Tp) == 2)
926 {
927 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
928 if constexpr (sizeof(__intrin) == 16)
929 __merge = __vector_bitcast<_Tp, _Np>(
930 _mm_mask_loadu_epi16(__intrin, __kk, __mem));
931 else if constexpr (sizeof(__intrin) == 32)
932 __merge = __vector_bitcast<_Tp, _Np>(
933 _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
934 else if constexpr (sizeof(__intrin) == 64)
935 __merge = __vector_bitcast<_Tp, _Np>(
936 _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
937 else
938 __assert_unreachable<_Tp>();
939 }
940 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
941 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
942 {
943 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
944 if constexpr (sizeof(__intrin) == 16)
945 __merge = __vector_bitcast<_Tp, _Np>(
946 _mm_mask_loadu_epi32(__intrin, __kk, __mem));
947 else if constexpr (sizeof(__intrin) == 32)
948 __merge = __vector_bitcast<_Tp, _Np>(
949 _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
950 else if constexpr (sizeof(__intrin) == 64)
951 __merge = __vector_bitcast<_Tp, _Np>(
952 _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
953 else
954 __assert_unreachable<_Tp>();
955 }
956 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
957 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
958 {
959 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
960 if constexpr (sizeof(__intrin) == 16)
961 __merge = __vector_bitcast<_Tp, _Np>(
962 _mm_mask_loadu_ps(__intrin, __kk, __mem));
963 else if constexpr (sizeof(__intrin) == 32)
964 __merge = __vector_bitcast<_Tp, _Np>(
965 _mm256_mask_loadu_ps(__intrin, __kk, __mem));
966 else if constexpr (sizeof(__intrin) == 64)
967 __merge = __vector_bitcast<_Tp, _Np>(
968 _mm512_mask_loadu_ps(__intrin, __kk, __mem));
969 else
970 __assert_unreachable<_Tp>();
971 }
972 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
973 && is_integral_v<_Up>)
974 {
975 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
976 __merge
977 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
978 __vector_bitcast<_Tp, _Np>(
979 __maskload_epi32(reinterpret_cast<const int*>(__mem),
980 __to_intrin(__k))));
981 }
982 else if constexpr (__have_avx && sizeof(_Tp) == 4)
983 {
984 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
985 __merge
986 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
987 __vector_bitcast<_Tp, _Np>(
988 __maskload_ps(reinterpret_cast<const float*>(__mem),
989 __to_intrin(__k))));
990 }
991 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
992 && sizeof(_Tp) == 8 && is_integral_v<_Up>)
993 {
994 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
995 if constexpr (sizeof(__intrin) == 16)
996 __merge = __vector_bitcast<_Tp, _Np>(
997 _mm_mask_loadu_epi64(__intrin, __kk, __mem));
998 else if constexpr (sizeof(__intrin) == 32)
999 __merge = __vector_bitcast<_Tp, _Np>(
1000 _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
1001 else if constexpr (sizeof(__intrin) == 64)
1002 __merge = __vector_bitcast<_Tp, _Np>(
1003 _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
1004 else
1005 __assert_unreachable<_Tp>();
1006 }
1007 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
1008 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
1009 {
1010 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1011 if constexpr (sizeof(__intrin) == 16)
1012 __merge = __vector_bitcast<_Tp, _Np>(
1013 _mm_mask_loadu_pd(__intrin, __kk, __mem));
1014 else if constexpr (sizeof(__intrin) == 32)
1015 __merge = __vector_bitcast<_Tp, _Np>(
1016 _mm256_mask_loadu_pd(__intrin, __kk, __mem));
1017 else if constexpr (sizeof(__intrin) == 64)
1018 __merge = __vector_bitcast<_Tp, _Np>(
1019 _mm512_mask_loadu_pd(__intrin, __kk, __mem));
1020 else
1021 __assert_unreachable<_Tp>();
1022 }
1023 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1024 && is_integral_v<_Up>)
1025 {
1026 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
1027 __merge
1028 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
1029 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
1030 reinterpret_cast<const _LLong*>(__mem),
1031 __to_intrin(__k))));
1032 }
1033 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1034 {
1035 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
1036 __merge
1037 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
1038 __vector_bitcast<_Tp, _Np>(
1039 __maskload_pd(reinterpret_cast<const double*>(__mem),
1040 __to_intrin(__k))));
1041 }
1042 else
1043 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1044 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1045 __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1046 });
1047 }
1048 /* Very uncertain that the following improves anything. Needs
1049 benchmarking
1050 * before it's activated.
1051 else if constexpr (sizeof(_Up) <= 8 && // no long double
1052 !__converts_via_decomposition_v<
1053 _Up, _Tp,
1054 sizeof(__merge)> // conversion via decomposition
1055 // is better handled via the
1056 // bit_iteration fallback below
1057 )
1058 {
1059 // TODO: copy pattern from _S_masked_store, which doesn't resort to
1060 // fixed_size
1061 using _Ap = simd_abi::deduce_t<_Up, _Np>;
1062 using _ATraits = _SimdTraits<_Up, _Ap>;
1063 using _AImpl = typename _ATraits::_SimdImpl;
1064 typename _ATraits::_SimdMember __uncvted{};
1065 typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template
1066 _S_convert<_Up>(__k);
1067 __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
1068 _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
1069 _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
1070 }
1071 */
1072 else
1073 __merge = _Base::_S_masked_load(__merge, __k, __mem);
1074 return __merge;
1075 }
1076
1077 // }}}
1078 // _S_masked_store_nocvt {{{
1079 template <typename _Tp, size_t _Np>
1080 _GLIBCXX_SIMD_INTRINSIC static void
1081 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper<bool, _Np> __k)
1082 {
1083 [[maybe_unused]] const auto __vi = __to_intrin(__v);
1084 if constexpr (sizeof(__vi) == 64)
1085 {
1086 static_assert(sizeof(__v) == 64 && __have_avx512f);
1087 if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
1088 _mm512_mask_storeu_epi8(__mem, __k, __vi);
1089 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
1090 _mm512_mask_storeu_epi16(__mem, __k, __vi);
1091 else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
1092 {
1093 if constexpr (is_integral_v<_Tp>)
1094 _mm512_mask_storeu_epi32(__mem, __k, __vi);
1095 else
1096 _mm512_mask_storeu_ps(__mem, __k, __vi);
1097 }
1098 else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
1099 {
1100 if constexpr (is_integral_v<_Tp>)
1101 _mm512_mask_storeu_epi64(__mem, __k, __vi);
1102 else
1103 _mm512_mask_storeu_pd(__mem, __k, __vi);
1104 }
1105#if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32
1106 // with Skylake-AVX512, __have_avx512bw is true
1107 else if constexpr (__have_sse2)
1108 {
1109 using _M = __vector_type_t<_Tp, _Np>;
1110 using _MVT = _VectorTraits<_M>;
1111 _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
1112 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
1113 reinterpret_cast<char*>(__mem));
1114 _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
1115 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1116 __k._M_data >> 1 * _MVT::_S_full_size)),
1117 reinterpret_cast<char*>(__mem) + 1 * 16);
1118 _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
1119 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1120 __k._M_data >> 2 * _MVT::_S_full_size)),
1121 reinterpret_cast<char*>(__mem) + 2 * 16);
1122 if constexpr (_Np > 48 / sizeof(_Tp))
1123 _mm_maskmoveu_si128(
1124 __auto_bitcast(__extract<3, 4>(__v._M_data)),
1125 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1126 __k._M_data >> 3 * _MVT::_S_full_size)),
1127 reinterpret_cast<char*>(__mem) + 3 * 16);
1128 }
1129#endif
1130 else
1131 __assert_unreachable<_Tp>();
1132 }
1133 else if constexpr (sizeof(__vi) == 32)
1134 {
1135 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1136 _mm256_mask_storeu_epi8(__mem, __k, __vi);
1137 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1138 _mm256_mask_storeu_epi16(__mem, __k, __vi);
1139 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1140 {
1141 if constexpr (is_integral_v<_Tp>)
1142 _mm256_mask_storeu_epi32(__mem, __k, __vi);
1143 else
1144 _mm256_mask_storeu_ps(__mem, __k, __vi);
1145 }
1146 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1147 {
1148 if constexpr (is_integral_v<_Tp>)
1149 _mm256_mask_storeu_epi64(__mem, __k, __vi);
1150 else
1151 _mm256_mask_storeu_pd(__mem, __k, __vi);
1152 }
1153 else if constexpr (__have_avx512f
1154 && (sizeof(_Tp) >= 4 || __have_avx512bw))
1155 {
1156 // use a 512-bit maskstore, using zero-extension of the bitmask
1157 _S_masked_store_nocvt(
1158 _SimdWrapper64<_Tp>(
1159 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
1160 __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1161 }
1162 else
1163 _S_masked_store_nocvt(__v, __mem,
1164 _MaskImpl::template _S_to_maskvector<
1165 __int_for_sizeof_t<_Tp>, _Np>(__k));
1166 }
1167 else if constexpr (sizeof(__vi) == 16)
1168 {
1169 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1170 _mm_mask_storeu_epi8(__mem, __k, __vi);
1171 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1172 _mm_mask_storeu_epi16(__mem, __k, __vi);
1173 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1174 {
1175 if constexpr (is_integral_v<_Tp>)
1176 _mm_mask_storeu_epi32(__mem, __k, __vi);
1177 else
1178 _mm_mask_storeu_ps(__mem, __k, __vi);
1179 }
1180 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1181 {
1182 if constexpr (is_integral_v<_Tp>)
1183 _mm_mask_storeu_epi64(__mem, __k, __vi);
1184 else
1185 _mm_mask_storeu_pd(__mem, __k, __vi);
1186 }
1187 else if constexpr (__have_avx512f
1188 && (sizeof(_Tp) >= 4 || __have_avx512bw))
1189 {
1190 // use a 512-bit maskstore, using zero-extension of the bitmask
1191 _S_masked_store_nocvt(
1192 _SimdWrapper64<_Tp>(
1193 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
1194 __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1195 }
1196 else
1197 _S_masked_store_nocvt(__v, __mem,
1198 _MaskImpl::template _S_to_maskvector<
1199 __int_for_sizeof_t<_Tp>, _Np>(__k));
1200 }
1201 else
1202 __assert_unreachable<_Tp>();
1203 }
1204
1205 template <typename _Tp, size_t _Np>
1206 _GLIBCXX_SIMD_INTRINSIC static void
1207 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1208 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
1209 {
1210 if constexpr (sizeof(__v) <= 16)
1211 {
1212 [[maybe_unused]] const auto __vi
1213 = __intrin_bitcast<__m128i>(__as_vector(__v));
1214 [[maybe_unused]] const auto __ki
1215 = __intrin_bitcast<__m128i>(__as_vector(__k));
1216 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1217 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
1218 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1219 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
1220 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1221 && is_integral_v<_Tp>)
1222 _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1223 else if constexpr (__have_avx && sizeof(_Tp) == 4)
1224 _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1225 __vector_bitcast<float>(__vi));
1226 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1227 && is_integral_v<_Tp>)
1228 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
1229 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1230 _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1231 __vector_bitcast<double>(__vi));
1232 else if constexpr (__have_sse2)
1233 _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<char*>(__mem));
1234 }
1235 else if constexpr (sizeof(__v) == 32)
1236 {
1237 [[maybe_unused]] const auto __vi
1238 = __intrin_bitcast<__m256i>(__as_vector(__v));
1239 [[maybe_unused]] const auto __ki
1240 = __intrin_bitcast<__m256i>(__as_vector(__k));
1241 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1242 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
1243 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1244 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
1245 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1246 && is_integral_v<_Tp>)
1247 _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1248 else if constexpr (sizeof(_Tp) == 4)
1249 _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1250 __vector_bitcast<float>(__v));
1251 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1252 && is_integral_v<_Tp>)
1253 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
1254 __vi);
1255 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1256 _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1257 __vector_bitcast<double>(__v));
1258 else if constexpr (__have_sse2)
1259 {
1260 _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
1261 reinterpret_cast<char*>(__mem));
1262 _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
1263 reinterpret_cast<char*>(__mem) + 16);
1264 }
1265 }
1266 else
1267 __assert_unreachable<_Tp>();
1268 }
1269
1270 // }}}
1271 // _S_masked_store {{{
1272 template <typename _Tp, size_t _Np, typename _Up>
1273 _GLIBCXX_SIMD_INTRINSIC static void
1274 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
1275 const _MaskMember<_Tp> __k) noexcept
1276 {
1277 if constexpr (is_integral_v<
1278 _Tp> && is_integral_v<_Up> && sizeof(_Tp) > sizeof(_Up)
1279 && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw)
1280 && (sizeof(__v) == 64 || __have_avx512vl))
1281 { // truncating store
1282 const auto __vi = __to_intrin(__v);
1283 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1284 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1285 && sizeof(__vi) == 64)
1286 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1287 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1288 && sizeof(__vi) == 32)
1289 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1290 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1291 && sizeof(__vi) == 16)
1292 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1293 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1294 && sizeof(__vi) == 64)
1295 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1296 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1297 && sizeof(__vi) == 32)
1298 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1299 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1300 && sizeof(__vi) == 16)
1301 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1302 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1303 && sizeof(__vi) == 64)
1304 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1305 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1306 && sizeof(__vi) == 32)
1307 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1308 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1309 && sizeof(__vi) == 16)
1310 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1311 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1312 && sizeof(__vi) == 64)
1313 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1314 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1315 && sizeof(__vi) == 32)
1316 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1317 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1318 && sizeof(__vi) == 16)
1319 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1320 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1321 && sizeof(__vi) == 64)
1322 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1323 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1324 && sizeof(__vi) == 32)
1325 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1326 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1327 && sizeof(__vi) == 16)
1328 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1329 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1330 && sizeof(__vi) == 64)
1331 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1332 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1333 && sizeof(__vi) == 32)
1334 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1335 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1336 && sizeof(__vi) == 16)
1337 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1338 else
1339 __assert_unreachable<_Tp>();
1340 }
1341 else
1342 _Base::_S_masked_store(__v, __mem, __k);
1343 }
1344
1345 // }}}
1346 // _S_multiplies {{{
1347 template <typename _V, typename _VVT = _VectorTraits<_V>>
1348 _GLIBCXX_SIMD_INTRINSIC static constexpr _V
1349 _S_multiplies(_V __x, _V __y)
1350 {
1351 using _Tp = typename _VVT::value_type;
1352 if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
1353 || __y._M_is_constprop())
1354 return __as_vector(__x) * __as_vector(__y);
1355 else if constexpr (sizeof(_Tp) == 1)
1356 {
1357 if constexpr (sizeof(_V) == 2)
1358 {
1359 const auto __xs = reinterpret_cast<short>(__x._M_data);
1360 const auto __ys = reinterpret_cast<short>(__y._M_data);
1361 return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
1362 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
1363 }
1364 else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
1365 {
1366 const auto __xi = reinterpret_cast<int>(__x._M_data);
1367 const auto __yi = reinterpret_cast<int>(__y._M_data);
1368 return reinterpret_cast<__vector_type_t<_Tp, 3>>(
1369 ((__xi * __yi) & 0xff)
1370 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1371 | ((__xi >> 16) * (__yi & 0xff0000)));
1372 }
1373 else if constexpr (sizeof(_V) == 4)
1374 {
1375 const auto __xi = reinterpret_cast<int>(__x._M_data);
1376 const auto __yi = reinterpret_cast<int>(__y._M_data);
1377 return reinterpret_cast<__vector_type_t<_Tp, 4>>(
1378 ((__xi * __yi) & 0xff)
1379 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1380 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
1381 | ((__xi >> 24) * (__yi & 0xff000000u)));
1382 }
1383 else if constexpr (sizeof(_V) == 8 && __have_avx2
1384 && is_signed_v<_Tp>)
1385 return __convert<typename _VVT::type>(
1386 __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
1387 * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
1388 else if constexpr (sizeof(_V) == 8 && __have_avx2
1389 && is_unsigned_v<_Tp>)
1390 return __convert<typename _VVT::type>(
1391 __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
1392 * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
1393 else
1394 {
1395 // codegen of `x*y` is suboptimal (as of GCC 9.0.1)
1396 constexpr size_t __full_size = _VVT::_S_full_size;
1397 constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
1398 using _ShortW = _SimdWrapper<short, _Np>;
1399 const _ShortW __even = __vector_bitcast<short, _Np>(__x)
1400 * __vector_bitcast<short, _Np>(__y);
1401 _ShortW __high_byte = _ShortW()._M_data - 256;
1402 //[&]() { asm("" : "+x"(__high_byte._M_data)); }();
1403 const _ShortW __odd
1404 = (__vector_bitcast<short, _Np>(__x) >> 8)
1405 * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
1406 if constexpr (__have_avx512bw && sizeof(_V) > 2)
1407 return _CommonImplX86::_S_blend_avx512(
1408 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
1409 __vector_bitcast<_Tp>(__odd));
1410 else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
1411 return _CommonImplX86::_S_blend_intrin(__to_intrin(
1412 __high_byte),
1413 __to_intrin(__even),
1414 __to_intrin(__odd));
1415 else
1416 return __to_intrin(
1417 __or(__andnot(__high_byte, __even), __odd));
1418 }
1419 }
1420 else
1421 return _Base::_S_multiplies(__x, __y);
1422 }
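 // Worked example (sketch) for the 1-byte fallback above, with __x starting
 // {2, 3, ...} and __y starting {10, 20, ...}: the 16-bit lanes of __even
 // hold 2*10 == 20 in their low byte (the high byte contains cross-product
 // bits), __odd holds 3*20 == 60 shifted into the high byte. Blending with
 // the 0xaaaa... bitmask (or the 0xff00 byte pattern of __high_byte)
 // reassembles {20, 60, ...}.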
1423
1424 // }}}
1425 // _S_divides {{{
1426#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
1427 template <typename _Tp, size_t _Np>
1428 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1429 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1430 {
1431 if (!__builtin_is_constant_evaluated()
1432 && !__builtin_constant_p(__y._M_data))
1433 if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
1434 { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
1435 // Note that using floating-point division is likely to raise the
1436 // *Inexact* exception flag and thus appears like an invalid
1437 // "as-if" transformation. However, C++ doesn't specify how the
1438 // fpenv can be observed and points to C. C says that function
1439 // calls are assumed to potentially raise fp exceptions, unless
1440 // documented otherwise. Consequently, operator/, which is a
1441 // function call, may raise fp exceptions.
1442 /*const struct _CsrGuard
1443 {
1444 const unsigned _M_data = _mm_getcsr();
1445 _CsrGuard()
1446 {
1447 _mm_setcsr(0x9f80); // turn off FP exceptions and
1448 flush-to-zero
1449 }
1450 ~_CsrGuard() { _mm_setcsr(_M_data); }
1451 } __csr;*/
1452 using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
1453 constexpr size_t __n_intermediate
1454 = std::min(_Np, (__have_avx512f ? 64
1455 : __have_avx ? 32
1456 : 16)
1457 / sizeof(_Float));
1458 using _FloatV = __vector_type_t<_Float, __n_intermediate>;
1459 constexpr size_t __n_floatv
1460 = __div_roundup(_Np, __n_intermediate);
1461 using _R = __vector_type_t<_Tp, _Np>;
1462 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
1463 const auto __yf = __convert_all<_FloatV, __n_floatv>(
1464 _Abi::__make_padding_nonzero(__as_vector(__y)));
1465 return __call_with_n_evaluations<__n_floatv>(
1466 [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1467 return __vector_convert<_R>(__quotients...);
1468 },
1469 [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1470 -> _SimdWrapper<_Float, __n_intermediate>
1471 {
1472#if __RECIPROCAL_MATH__
1473 // If -freciprocal-math is active, using the `/` operator is
1474 // incorrect because it may be translated to an imprecise
1475 // multiplication with reciprocal. We need to use inline
1476 // assembly to force a real division.
1477 _FloatV __r;
1478 if constexpr (__have_avx) // -mno-sse2avx is irrelevant
1479 // because once -mavx is given, GCC
1480 // emits VEX encoded vdivp[sd]
1481 {
1482 if constexpr (sizeof(_Tp) == 4)
1483 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
1484 : "=x"(__r)
1485 : "x"(__xf[__i]), "x"(__yf[__i]));
1486 else
1487 asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
1488 : "=x"(__r)
1489 : "x"(__xf[__i]), "x"(__yf[__i]));
1490 }
1491 else
1492 {
1493 __r = __xf[__i];
1494 if constexpr (sizeof(_Tp) == 4)
1495 asm("divpd\t{%1, %0|%0, %1}"
1496 : "=x"(__r)
1497 : "x"(__yf[__i]));
1498 else
1499 asm("divps\t{%1, %0|%0, %1}"
1500 : "=x"(__r)
1501 : "x"(__yf[__i]));
1502 }
1503 return __r;
1504#else
1505 return __xf[__i] / __yf[__i];
1506#endif
1507 });
1508 }
1509 /* 64-bit int division is potentially optimizable via double division if
1510 * the value in __x is small enough and the conversion between
1511 * int<->double is efficient enough:
1512 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
1513 sizeof(_Tp) == 8)
1514 {
1515 if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1516 {
1517 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
1518 0xffe0'0000'0000'0000ull}))
1519 {
1520 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
1521 }
1522 }
1523 }
1524 */
1525 return _Base::_S_divides(__x, __y);
1526 }
1527#endif // _GLIBCXX_SIMD_WORKAROUND_PR90993
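 // Sketch of why the float/double detour above is valid: every value of an
 // integral type of at most 16 bits (32 bits for the double case) is exactly
 // representable in the chosen floating-point type, and the truncated
 // quotient then matches integer division. __make_padding_nonzero keeps the
 // unused padding lanes of __y from dividing by zero.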
1528
1529 // }}}
1530 // _S_modulus {{{
1531 template <typename _Tp, size_t _Np>
1532 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1533 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1534 {
1535 if (__builtin_is_constant_evaluated()
1536 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
1537 return _Base::_S_modulus(__x, __y);
1538 else
1539 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
1540 }
1541
1542 // }}}
1543 // _S_bit_shift_left {{{
1544 // Notes on UB. C++2a [expr.shift] says:
1545 // -1- [...] The operands shall be of integral or unscoped enumeration type
1546 // and integral promotions are performed. The type of the result is that
1547 // of the promoted left operand. The behavior is undefined if the right
1548 // operand is negative, or greater than or equal to the width of the
1549 // promoted left operand.
1550 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo
1551 // 2^N, where N is the width of the type of the result.
1552 //
1553 // C++17 [expr.shift] says:
1554 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
1555 // bits are zero-filled. If E1 has an unsigned type, the value of the
1556 // result is E1 × 2^E2 , reduced modulo one more than the maximum value
1557 // representable in the result type. Otherwise, if E1 has a signed type
1558 // and non-negative value, and E1 × 2^E2 is representable in the
1559 // corresponding unsigned type of the result type, then that value,
1560 // converted to the result type, is the resulting value; otherwise, the
1561 // behavior is undefined.
1562 //
1563 // Consequences:
1564 // With C++2a signed and unsigned types have the same UB
1565 // characteristics:
1566 // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
1567 //
1568 // With C++17 there's little room for optimizations because the standard
1569 // requires all shifts to happen on promoted integrals (i.e. int). Thus,
1570 // short and char shifts must assume shifts affect bits of neighboring
1571 // values.
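  /* Illustrative scalar example of the promotion rules described above (not
     part of the implementation, the helper name is made up): an element-wise
     shift of a small integer type is the promoted shift truncated back to the
     element type.

       unsigned char
       __shl_lane(unsigned char __x, int __y)
       {
         // the shift happens on the promoted int and is truncated back;
         // well-defined for 0 <= __y < 32 under C++20; under C++17,
         // __x << __y must additionally be representable in int
         return static_cast<unsigned char>(static_cast<int>(__x) << __y);
       }

     For __y in [8, 32) the truncated result is 0, and bits shifted past bit 7
     must not leak into neighboring lanes, which is what the byte-shift code
     below has to emulate with wider vector shifts plus masking.  */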
1572 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1573 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1574 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1575 _S_bit_shift_left(_Tp __xx, int __y)
1576 {
1577 using _V = typename _TVT::type;
1578 using _Up = typename _TVT::value_type;
1579 _V __x = __xx;
1580 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1581 if (__builtin_is_constant_evaluated())
1582 return __x << __y;
1583#if __cplusplus > 201703
1584 // after C++17, signed shifts have no UB, and behave just like unsigned
1585 // shifts
1586 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>)
1587 return __vector_bitcast<_Up>(
1588 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1589 __y));
1590#endif
1591 else if constexpr (sizeof(_Up) == 1)
1592 {
1593 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894)
1594 if (__builtin_constant_p(__y))
1595 {
1596 if (__y == 0)
1597 return __x;
1598 else if (__y == 1)
1599 return __x + __x;
1600 else if (__y == 2)
1601 {
1602 __x = __x + __x;
1603 return __x + __x;
1604 }
1605 else if (__y > 2 && __y < 8)
1606 {
1607 if constexpr (sizeof(__x) > sizeof(unsigned))
1608 {
1609 const _UChar __mask = 0xff << __y; // precomputed vector
1610 return __vector_bitcast<_Up>(
1611 __vector_bitcast<_UChar>(
1612 __vector_bitcast<unsigned>(__x) << __y)
1613 & __mask);
1614 }
1615 else
1616 {
1617 const unsigned __mask
1618 = (0xff & (0xff << __y)) * 0x01010101u;
1619 return reinterpret_cast<_V>(
1620 static_cast<__int_for_sizeof_t<_V>>(
1621 unsigned(
1622 reinterpret_cast<__int_for_sizeof_t<_V>>(__x)
1623 << __y)
1624 & __mask));
1625 }
1626 }
1627 else if (__y >= 8 && __y < 32)
1628 return _V();
1629 else
1630 __builtin_unreachable();
1631 }
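              /* Illustrative scalar sketch of the constant-shift trick above
                 (not part of the implementation, the helper name is made up):
                 shift four packed bytes in one 32-bit shift, then clear the
                 bits that crossed a byte boundary with the replicated
                 per-byte mask (0xff << __y) & 0xff:

                   #include <cstdint>

                   std::uint32_t
                   __shl4_bytes(std::uint32_t __four_bytes, int __y) // 0 < __y < 8
                   {
                     const std::uint32_t __mask
                       = (0xffu & (0xffu << __y)) * 0x01010101u;
                     return (__four_bytes << __y) & __mask;
                   }
              */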
1632 // general strategy in the following: use an sllv instead of sll
1633 // instruction, because it's 2 to 4 times faster:
1634 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16)
1635 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8(
1636 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix),
1637 _mm256_set1_epi16(__y))));
1638 else if constexpr (__have_avx512bw && sizeof(__x) == 32)
1639 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1640 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix),
1641 _mm512_set1_epi16(__y))));
1642 else if constexpr (__have_avx512bw && sizeof(__x) == 64)
1643 {
1644 const auto __shift = _mm512_set1_epi16(__y);
1645 return __vector_bitcast<_Up>(
1646 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1647 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)),
1648 _mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1649 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift))));
1650 }
1651 else if constexpr (__have_avx2 && sizeof(__x) == 32)
1652 {
1653#if 1
1654 const auto __shift = _mm_cvtsi32_si128(__y);
1655 auto __k
1656 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
1657 __k |= _mm256_srli_epi16(__k, 8);
1658 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift)
1659 & __k);
1660#else
1661 const _Up __k = 0xff << __y;
1662 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y)
1663 & __k;
1664#endif
1665 }
1666 else
1667 {
1668 const auto __shift = _mm_cvtsi32_si128(__y);
1669 auto __k
1670 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
1671 __k |= _mm_srli_epi16(__k, 8);
1672 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k);
1673 }
1674 }
1675 return __x << __y;
1676 }
1677
1678 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1679 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1680 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y)
1681 {
1682 using _V = typename _TVT::type;
1683 using _Up = typename _TVT::value_type;
1684 _V __x = __xx;
1685 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1686 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1687 if (__builtin_is_constant_evaluated())
1688 return __x << __y;
1689#if __cplusplus > 201703
1690 // after C++17, signed shifts have no UB, and behave just like unsigned
1691 // shifts
1692 else if constexpr (is_signed_v<_Up>)
1693 return __vector_bitcast<_Up>(
1694 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1695 __vector_bitcast<make_unsigned_t<_Up>>(__y)));
1696#endif
1697 else if constexpr (sizeof(_Up) == 1)
1698 {
1699 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1700 return __vector_bitcast<_Up>(__concat(
1701 _mm512_cvtepi16_epi8(
1702 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)),
1703 _mm512_cvtepu8_epi16(__lo256(__iy)))),
1704 _mm512_cvtepi16_epi8(
1705 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)),
1706 _mm512_cvtepu8_epi16(__hi256(__iy))))));
1707 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1708 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1709 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix),
1710 _mm512_cvtepu8_epi16(__iy))));
1711 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl)
1712 return __intrin_bitcast<_V>(
1713 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix),
1714 _mm_cvtepu8_epi16(__iy))));
1715 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1716 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1717 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix),
1718 _mm256_cvtepu8_epi16(__iy))));
1719 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1720 return __intrin_bitcast<_V>(
1721 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1722 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)),
1723 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy))))));
1724 else if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1725 {
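              // Variable per-lane byte shift via a blend ladder (pblendvb
              // selects on the sign bit of each mask byte):
              // - __y << 5 moves bit 2 of each shift count into its byte's
              //   sign bit, so the first blend applies "<< 4" where that bit
              //   is set;
              // - doubling __mask exposes bit 1 (apply "<< 2") and then
              //   bit 0 (apply "<< 1");
              // - the 0xf0/0xfc masks keep the 16-bit shifts from leaking
              //   bits into the neighboring byte;
              // - finally, lanes with __y > 7 are zeroed.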
1726 auto __mask
1727 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5);
1728 auto __x4
1729 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1730 __x4 &= char(0xf0);
1731 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1732 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4)));
1733 __mask += __mask;
1734 auto __x2
1735 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1736 __x2 &= char(0xfc);
1737 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1738 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2)));
1739 __mask += __mask;
1740 auto __x1 = __x + __x;
1741 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1742 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1)));
1743 return __x
1744 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1745 }
1746 else if constexpr (sizeof(__x) == 16)
1747 {
1748 auto __mask
1749 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5);
1750 auto __x4
1751 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1752 __x4 &= char(0xf0);
1753 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x;
1754 __mask += __mask;
1755 auto __x2
1756 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1757 __x2 &= char(0xfc);
1758 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x;
1759 __mask += __mask;
1760 auto __x1 = __x + __x;
1761 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x;
1762 return __x
1763 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1764 }
1765 else
1766 return __x << __y;
1767 }
1768 else if constexpr (sizeof(_Up) == 2)
1769 {
1770 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1771 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy));
1772 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl)
1773 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy));
1774 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1775 return __vector_bitcast<_Up>(
1776 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix),
1777 _mm512_castsi256_si512(__iy))));
1778 else if constexpr (sizeof __ix == 32 && __have_avx2)
1779 {
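              // Emulate per-lane 16-bit shifts with the 32-bit vpsllvd: shift
              // once by the even lanes' counts (their low 16 bits come out
              // right) and once, with the low halves cleared, by the odd
              // lanes' counts, then re-interleave the two results with a
              // 0xaa blend.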
1780 const auto __ux = __vector_bitcast<unsigned>(__x);
1781 const auto __uy = __vector_bitcast<unsigned>(__y);
1782 return __vector_bitcast<_Up>(_mm256_blend_epi16(
1783 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1784 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1785 }
1786 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1787 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy));
1788 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1789 return __intrin_bitcast<_V>(
1790 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix),
1791 _mm512_castsi128_si512(__iy))));
1792 else if constexpr (sizeof __ix == 16 && __have_avx2)
1793 {
1794 const auto __ux = __vector_bitcast<unsigned>(__ix);
1795 const auto __uy = __vector_bitcast<unsigned>(__iy);
1796 return __intrin_bitcast<_V>(_mm_blend_epi16(
1797 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1798 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1799 }
1800 else if constexpr (sizeof __ix == 16)
1801 {
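              // Build the per-lane multiplier 2^__y by adding 127
              // (= 0x3f8 >> 3) to each 16-bit count, moving it into the
              // exponent field of a float (<< 23), converting that float to
              // int, and recombining the low/high 16-bit halves; the 16-bit
              // multiplication below then performs the left shift.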
1802 using _Float4 = __vector_type_t<float, 4>;
1803 using _Int4 = __vector_type_t<int, 4>;
1804 using _UInt4 = __vector_type_t<unsigned, 4>;
1805 const _UInt4 __yu
1806 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3)));
1807 return __x
1808 * __intrin_bitcast<_V>(
1809 __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1810 reinterpret_cast<_Float4>(__yu << 23)))
1811 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1812 reinterpret_cast<_Float4>((__yu >> 16) << 23)))
1813 << 16));
1814 }
1815 else
1816 __assert_unreachable<_Tp>();
1817 }
1818 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16
1819 && !__have_avx2)
 1820 // latency is suboptimal, but throughput is at full speed
1821 return __intrin_bitcast<_V>(
1822 __vector_bitcast<unsigned>(__ix)
1823 * __vector_convert<__vector_type16_t<int>>(
1824 _SimdWrapper<float, 4>(__vector_bitcast<float>(
1825 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000))));
1826 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16
1827 && !__have_avx2)
1828 {
1829 const auto __lo = _mm_sll_epi64(__ix, __iy);
1830 const auto __hi
1831 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy));
1832 if constexpr (__have_sse4_1)
1833 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0));
1834 else
1835 return __vector_bitcast<_Up>(
1836 _mm_move_sd(__vector_bitcast<double>(__hi),
1837 __vector_bitcast<double>(__lo)));
1838 }
1839 else
1840 return __x << __y;
1841 }
1842#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
1843
1844 // }}}
1845 // _S_bit_shift_right {{{
1846#ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1847 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1848 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1849 _S_bit_shift_right(_Tp __xx, int __y)
1850 {
1851 using _V = typename _TVT::type;
1852 using _Up = typename _TVT::value_type;
1853 _V __x = __xx;
1854 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1855 if (__builtin_is_constant_evaluated())
1856 return __x >> __y;
 1857 else if (__builtin_constant_p(__y)
 1858 && is_unsigned_v<_Up>
 1859 && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1860 return _V();
1861 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{
1862 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y)
1863 & _Up(0xff >> __y);
1864 //}}}
1865 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{
1866 return __intrin_bitcast<_V>(
1867 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix)
1868 >> (__y + 8))
1869 << 8)
1870 | (__vector_bitcast<_UShort>(
1871 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8)
1872 >> __y)
1873 >> 8));
1874 //}}}
1875 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected
1876 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{
1877 {
1878 if (__y > 32)
1879 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32)
1880 & _Up(0xffff'ffff'0000'0000ull))
1881 | __vector_bitcast<_Up>(
1882 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix)
1883 >> 32)
1884 >> (__y - 32));
1885 else
1886 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix)
1887 >> __y)
1888 | __vector_bitcast<_Up>(
1889 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll)
1890 >> __y);
1891 }
1892 //}}}
1893 else
1894 return __x >> __y;
1895 }
1896
1897 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1898 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1899 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y)
1900 {
1901 using _V = typename _TVT::type;
1902 using _Up = typename _TVT::value_type;
1903 _V __x = __xx;
1904 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1905 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1906 if (__builtin_is_constant_evaluated()
1907 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
1908 return __x >> __y;
1909 else if constexpr (sizeof(_Up) == 1) //{{{
1910 {
1911 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl)
1912 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8(
1913 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix),
1914 _mm_cvtepi8_epi16(__iy))
1915 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix),
1916 _mm_cvtepu8_epi16(__iy))));
 1917 else if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1918 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1919 is_signed_v<_Up>
1920 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix),
1921 _mm256_cvtepi8_epi16(__iy))
1922 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix),
1923 _mm256_cvtepu8_epi16(__iy))));
1924 else if constexpr (sizeof(__x) == 32 && __have_avx512bw)
1925 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1926 is_signed_v<_Up>
1927 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix),
1928 _mm512_cvtepi8_epi16(__iy))
1929 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix),
1930 _mm512_cvtepu8_epi16(__iy))));
1931 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>)
1932 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1933 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1934 0x5555'5555'5555'5555ull,
1935 _mm512_srav_epi16(
1936 _mm512_slli_epi16(__ix, 8),
1937 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy,
1938 _mm512_set1_epi16(8)))));
1939 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>)
1940 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1941 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1942 0x5555'5555'5555'5555ull,
1943 _mm512_srlv_epi16(
1944 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix),
1945 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy))));
1946 /* This has better throughput but higher latency than the impl below
1947 else if constexpr (__have_avx2 && sizeof(__x) == 16 &&
1948 is_unsigned_v<_Up>)
1949 {
1950 const auto __shorts = __to_intrin(_S_bit_shift_right(
1951 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)),
1952 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy))));
1953 return __vector_bitcast<_Up>(
1954 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts)));
1955 }
1956 */
1957 else if constexpr (__have_avx2 && sizeof(__x) > 8)
1958 // the following uses vpsr[al]vd, which requires AVX2
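              // Only 32-bit variable shifts exist, so handle each byte
              // position separately: move the byte to the top of its 32-bit
              // lane, shift the lane by that byte's count (taken from the
              // corresponding byte of __y), keep the top byte of the result,
              // shift it back into place, and OR the four partial results.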
1959 if constexpr (is_signed_v<_Up>)
1960 {
1961 const auto r3 = __vector_bitcast<_UInt>(
1962 (__vector_bitcast<int>(__x)
1963 >> (__vector_bitcast<_UInt>(__y) >> 24)))
1964 & 0xff000000u;
1965 const auto r2
1966 = __vector_bitcast<_UInt>(
1967 ((__vector_bitcast<int>(__x) << 8)
1968 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)))
1969 & 0xff000000u;
1970 const auto r1
1971 = __vector_bitcast<_UInt>(
1972 ((__vector_bitcast<int>(__x) << 16)
1973 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)))
1974 & 0xff000000u;
1975 const auto r0 = __vector_bitcast<_UInt>(
1976 (__vector_bitcast<int>(__x) << 24)
1977 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24));
1978 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1979 | (r0 >> 24));
1980 }
1981 else
1982 {
1983 const auto r3 = (__vector_bitcast<_UInt>(__x)
1984 >> (__vector_bitcast<_UInt>(__y) >> 24))
1985 & 0xff000000u;
1986 const auto r2
1987 = ((__vector_bitcast<_UInt>(__x) << 8)
1988 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))
1989 & 0xff000000u;
1990 const auto r1
1991 = ((__vector_bitcast<_UInt>(__x) << 16)
1992 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))
1993 & 0xff000000u;
1994 const auto r0
1995 = (__vector_bitcast<_UInt>(__x) << 24)
1996 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24);
1997 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1998 | (r0 >> 24));
1999 }
2000 else if constexpr (__have_sse4_1
2001 && is_unsigned_v<_Up> && sizeof(__x) > 2)
2002 {
2003 auto __x128 = __vector_bitcast<_Up>(__ix);
2004 auto __mask
2005 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5);
2006 auto __x4 = __vector_bitcast<_Up>(
2007 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f));
2008 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2009 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4)));
2010 __mask += __mask;
2011 auto __x2 = __vector_bitcast<_Up>(
2012 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f));
2013 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2014 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2)));
2015 __mask += __mask;
2016 auto __x1 = __vector_bitcast<_Up>(
2017 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f));
2018 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2019 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1)));
2020 return __intrin_bitcast<_V>(
2021 __x128
2022 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2023 == 0)); // y > 7 nulls the result
2024 }
2025 else if constexpr (__have_sse4_1
2026 && is_signed_v<_Up> && sizeof(__x) > 2)
2027 {
2028 auto __mask = __vector_bitcast<_UChar>(
2029 __vector_bitcast<_UShort>(__iy) << 5);
2030 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2031 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
2032 };
2033 auto __xh = __vector_bitcast<short>(__ix);
2034 auto __xl = __vector_bitcast<short>(__ix) << 8;
2035 auto __xh4 = __xh >> 4;
2036 auto __xl4 = __xl >> 4;
2037 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2038 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4)));
2039 __xl = __vector_bitcast<short>(
2040 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2041 __to_intrin(__xl4)));
2042 __mask += __mask;
2043 auto __xh2 = __xh >> 2;
2044 auto __xl2 = __xl >> 2;
2045 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2046 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2)));
2047 __xl = __vector_bitcast<short>(
2048 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2049 __to_intrin(__xl2)));
2050 __mask += __mask;
2051 auto __xh1 = __xh >> 1;
2052 auto __xl1 = __xl >> 1;
2053 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2054 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1)));
2055 __xl = __vector_bitcast<short>(
2056 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2057 __to_intrin(__xl1)));
2058 return __intrin_bitcast<_V>(
2059 (__vector_bitcast<_Up>((__xh & short(0xff00)))
2060 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2061 >> 8))
2062 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2063 == 0)); // y > 7 nulls the result
2064 }
2065 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2
2066 {
2067 auto __mask
2068 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5);
2069 auto __x4 = __vector_bitcast<_Up>(
2070 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f));
2071 __x = __mask > 0x7f ? __x4 : __x;
2072 __mask += __mask;
2073 auto __x2 = __vector_bitcast<_Up>(
2074 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f));
2075 __x = __mask > 0x7f ? __x2 : __x;
2076 __mask += __mask;
2077 auto __x1 = __vector_bitcast<_Up>(
2078 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f));
2079 __x = __mask > 0x7f ? __x1 : __x;
2080 return __x
2081 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2082 }
2083 else if constexpr (sizeof(__x) > 2) // signed SSE2
2084 {
2085 static_assert(is_signed_v<_Up>);
2086 auto __maskh = __vector_bitcast<_UShort>(__y) << 5;
2087 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8);
2088 auto __xh = __vector_bitcast<short>(__x);
2089 auto __xl = __vector_bitcast<short>(__x) << 8;
2090 auto __xh4 = __xh >> 4;
2091 auto __xl4 = __xl >> 4;
2092 __xh = __maskh > 0x7fff ? __xh4 : __xh;
2093 __xl = __maskl > 0x7fff ? __xl4 : __xl;
2094 __maskh += __maskh;
2095 __maskl += __maskl;
2096 auto __xh2 = __xh >> 2;
2097 auto __xl2 = __xl >> 2;
2098 __xh = __maskh > 0x7fff ? __xh2 : __xh;
2099 __xl = __maskl > 0x7fff ? __xl2 : __xl;
2100 __maskh += __maskh;
2101 __maskl += __maskl;
2102 auto __xh1 = __xh >> 1;
2103 auto __xl1 = __xl >> 1;
2104 __xh = __maskh > 0x7fff ? __xh1 : __xh;
2105 __xl = __maskl > 0x7fff ? __xl1 : __xl;
2106 __x = __vector_bitcast<_Up>((__xh & short(0xff00)))
2107 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2108 >> 8);
2109 return __x
2110 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2111 }
2112 else
2113 return __x >> __y;
2114 } //}}}
2115 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
2116 {
2117 [[maybe_unused]] auto __blend_0xaa
2118 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2119 if constexpr (sizeof(__a) == 16)
2120 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2121 0xaa);
2122 else if constexpr (sizeof(__a) == 32)
2123 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2124 0xaa);
2125 else if constexpr (sizeof(__a) == 64)
2126 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
2127 __to_intrin(__b));
2128 else
2129 __assert_unreachable<decltype(__a)>();
2130 };
2131 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
2132 return __intrin_bitcast<_V>(is_signed_v<_Up>
2133 ? _mm_srav_epi16(__ix, __iy)
2134 : _mm_srlv_epi16(__ix, __iy));
2135 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32)
2136 return __vector_bitcast<_Up>(is_signed_v<_Up>
2137 ? _mm256_srav_epi16(__ix, __iy)
2138 : _mm256_srlv_epi16(__ix, __iy));
2139 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64)
2140 return __vector_bitcast<_Up>(is_signed_v<_Up>
2141 ? _mm512_srav_epi16(__ix, __iy)
2142 : _mm512_srlv_epi16(__ix, __iy));
2143 else if constexpr (__have_avx2 && is_signed_v<_Up>)
2144 return __intrin_bitcast<_V>(
2145 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16)
2146 >> (__vector_bitcast<int>(__iy) & 0xffffu))
2147 >> 16,
2148 __vector_bitcast<int>(__ix)
2149 >> (__vector_bitcast<int>(__iy) >> 16)));
2150 else if constexpr (__have_avx2 && is_unsigned_v<_Up>)
2151 return __intrin_bitcast<_V>(
2152 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu)
2153 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu),
2154 __vector_bitcast<_UInt>(__ix)
2155 >> (__vector_bitcast<_UInt>(__iy) >> 16)));
2156 else if constexpr (__have_sse4_1)
2157 {
2158 auto __mask = __vector_bitcast<_UShort>(__iy);
2159 auto __x128 = __vector_bitcast<_Up>(__ix);
2160 //__mask *= 0x0808;
2161 __mask = (__mask << 3) | (__mask << 11);
2162 // do __x128 = 0 where __y[4] is set
2163 __x128 = __vector_bitcast<_Up>(
2164 _mm_blendv_epi8(__to_intrin(__x128), __m128i(),
2165 __to_intrin(__mask)));
2166 // do __x128 =>> 8 where __y[3] is set
2167 __x128 = __vector_bitcast<_Up>(
2168 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8),
2169 __to_intrin(__mask += __mask)));
2170 // do __x128 =>> 4 where __y[2] is set
2171 __x128 = __vector_bitcast<_Up>(
2172 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4),
2173 __to_intrin(__mask += __mask)));
2174 // do __x128 =>> 2 where __y[1] is set
2175 __x128 = __vector_bitcast<_Up>(
2176 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2),
2177 __to_intrin(__mask += __mask)));
2178 // do __x128 =>> 1 where __y[0] is set
2179 return __intrin_bitcast<_V>(
2180 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1),
2181 __to_intrin(__mask + __mask)));
2182 }
2183 else
2184 {
2185 auto __k = __vector_bitcast<_UShort>(__iy) << 11;
2186 auto __x128 = __vector_bitcast<_Up>(__ix);
2187 auto __mask
2188 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2189 return __vector_bitcast<short>(__kk) < 0;
2190 };
2191 // do __x128 = 0 where __y[4] is set
2192 __x128 = __mask(__k) ? decltype(__x128)() : __x128;
2193 // do __x128 =>> 8 where __y[3] is set
2194 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128;
2195 // do __x128 =>> 4 where __y[2] is set
2196 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128;
2197 // do __x128 =>> 2 where __y[1] is set
2198 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128;
2199 // do __x128 =>> 1 where __y[0] is set
2200 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1
2201 : __x128);
2202 }
2203 } //}}}
2204 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{
2205 {
2206 if constexpr (is_unsigned_v<_Up>)
2207 {
2208 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31
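              /* Illustrative scalar sketch of this trick (not part of the
                 implementation, the helper name is made up): build 2^(31-y)
                 by subtracting y from the exponent of 2^31 and use a widening
                 32x32->64 multiply, which is exact:

                   #include <cstdint>
                   #include <cstring>

                   std::uint32_t
                   __shr_via_mul(std::uint32_t __x, unsigned __y) // __y < 32
                   {
                     const std::uint32_t __bits
                       = 0x4f00'0000u - (__y << 23);      // float 2^(31-__y)
                     float __f;
                     std::memcpy(&__f, &__bits, sizeof __f);
                     const auto __factor = static_cast<std::uint64_t>(__f);
                     return static_cast<std::uint32_t>((__x * __factor) >> 31);
                   }
              */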
2209 const __m128 __factor_f = reinterpret_cast<__m128>(
2210 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23));
2211 const __m128i __factor
2212 = __builtin_constant_p(__factor_f)
2213 ? __to_intrin(
2214 __make_vector<unsigned>(__factor_f[0], __factor_f[1],
2215 __factor_f[2], __factor_f[3]))
2216 : _mm_cvttps_epi32(__factor_f);
2217 const auto __r02
2218 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31);
2219 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4),
2220 _mm_srli_si128(__factor, 4));
2221 if constexpr (__have_sse4_1)
2222 return __intrin_bitcast<_V>(
2223 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33));
2224 else
2225 return __intrin_bitcast<_V>(
2226 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4));
2227 }
2228 else
2229 {
2230 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2231 if constexpr (is_signed_v<_Up>)
2232 return _mm_sra_epi32(__a, __b);
2233 else
2234 return _mm_srl_epi32(__a, __b);
2235 };
2236 const auto __r0
2237 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i()));
2238 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32));
2239 const auto __r2
2240 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i()));
2241 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12));
2242 if constexpr (__have_sse4_1)
2243 return __intrin_bitcast<_V>(
2244 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3),
2245 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0));
2246 else
2247 return __intrin_bitcast<_V>(_mm_unpacklo_epi64(
2248 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)),
2249 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4))));
2250 }
2251 } //}}}
2252 else
2253 return __x >> __y;
2254 }
2255#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
2256
2257 // }}}
2258 // compares {{{
2259 // _S_equal_to {{{
2260 template <typename _Tp, size_t _Np>
2261 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2262 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2263 {
2264 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2265 {
2266 if (__builtin_is_constant_evaluated()
2267 || (__x._M_is_constprop() && __y._M_is_constprop()))
2268 return _MaskImpl::_S_to_bits(
2269 __as_wrapper<_Np>(__x._M_data == __y._M_data));
2270
2271 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2272 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2273 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2274 if constexpr (is_floating_point_v<_Tp>)
2275 {
2276 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2277 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2278 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2279 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2280 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2281 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2282 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2283 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2284 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2285 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2286 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2287 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2288 else
2289 __assert_unreachable<_Tp>();
2290 }
2291 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2292 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2293 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2294 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2295 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2296 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2297 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2298 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2299 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2300 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2301 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2302 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2303 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2304 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2306 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2307 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2308 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2309 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2310 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2311 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2312 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2313 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2314 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2315 else
2316 __assert_unreachable<_Tp>();
2317 } // }}}
2318 else if (__builtin_is_constant_evaluated())
2319 return _Base::_S_equal_to(__x, __y);
2320 else if constexpr (sizeof(__x) == 8) // {{{
2321 {
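          // 8-byte vectors are widened to 16 bytes for the comparison; only
          // the low 8 bytes of the resulting mask are copied back.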
2322 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2323 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2324 _MaskMember<_Tp> __r64;
2325 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2326 return __r64;
2327 } // }}}
2328 else
2329 return _Base::_S_equal_to(__x, __y);
2330 }
2331
2332 // }}}
2333 // _S_not_equal_to {{{
2334 template <typename _Tp, size_t _Np>
2335 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2336 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2337 {
2338 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2339 {
2340 if (__builtin_is_constant_evaluated()
2341 || (__x._M_is_constprop() && __y._M_is_constprop()))
2342 return _MaskImpl::_S_to_bits(
2343 __as_wrapper<_Np>(__x._M_data != __y._M_data));
2344
2345 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2346 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2347 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2348 if constexpr (is_floating_point_v<_Tp>)
2349 {
2350 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2351 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2352 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2353 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2354 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2355 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2356 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2357 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2358 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2359 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2360 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2361 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2362 else
2363 __assert_unreachable<_Tp>();
2364 }
2365 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2366 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2367 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2368 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2369 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2370 return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2371 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2372 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2373 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2374 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2375 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2376 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2377 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2378 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2379 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2380 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2381 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2382 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2383 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2384 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2385 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2386 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2387 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2388 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2389 else
2390 __assert_unreachable<_Tp>();
2391 } // }}}
2392 else if (__builtin_is_constant_evaluated())
2393 return _Base::_S_not_equal_to(__x, __y);
2394 else if constexpr (sizeof(__x) == 8)
2395 {
2396 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2397 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2398 _MaskMember<_Tp> __r64;
2399 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2400 return __r64;
2401 }
2402 else
2403 return _Base::_S_not_equal_to(__x, __y);
2404 }
2405
2406 // }}}
2407 // _S_less {{{
2408 template <typename _Tp, size_t _Np>
2409 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2410 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2411 {
2412 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2413 {
2414 if (__builtin_is_constant_evaluated()
2415 || (__x._M_is_constprop() && __y._M_is_constprop()))
2416 return _MaskImpl::_S_to_bits(
2417 __as_wrapper<_Np>(__x._M_data < __y._M_data));
2418
2419 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2420 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2421 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2422 if constexpr (sizeof(__xi) == 64)
2423 {
2424 if constexpr (is_same_v<_Tp, float>)
2425 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2426 else if constexpr (is_same_v<_Tp, double>)
2427 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2428 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2429 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2430 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2431 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2432 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2433 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2435 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2436 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2437 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2438 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2439 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2440 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2441 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2443 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2444 else
2445 __assert_unreachable<_Tp>();
2446 }
2447 else if constexpr (sizeof(__xi) == 32)
2448 {
2449 if constexpr (is_same_v<_Tp, float>)
2450 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2451 else if constexpr (is_same_v<_Tp, double>)
2452 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2453 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2454 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2455 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2456 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2457 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2458 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2459 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2460 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2461 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2462 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2463 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2464 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2465 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2466 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2467 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2468 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2469 else
2470 __assert_unreachable<_Tp>();
2471 }
2472 else if constexpr (sizeof(__xi) == 16)
2473 {
2474 if constexpr (is_same_v<_Tp, float>)
2475 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2476 else if constexpr (is_same_v<_Tp, double>)
2477 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2478 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2479 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2480 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2481 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2482 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2483 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2484 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2485 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2486 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2487 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2488 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2489 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2490 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2491 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2492 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2493 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2494 else
2495 __assert_unreachable<_Tp>();
2496 }
2497 else
2498 __assert_unreachable<_Tp>();
2499 } // }}}
2500 else if (__builtin_is_constant_evaluated())
2501 return _Base::_S_less(__x, __y);
2502 else if constexpr (sizeof(__x) == 8)
2503 {
2504 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2505 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2506 _MaskMember<_Tp> __r64;
2507 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2508 return __r64;
2509 }
2510 else
2511 return _Base::_S_less(__x, __y);
2512 }
2513
2514 // }}}
2515 // _S_less_equal {{{
2516 template <typename _Tp, size_t _Np>
2517 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2518 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2519 {
2520 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2521 {
2522 if (__builtin_is_constant_evaluated()
2523 || (__x._M_is_constprop() && __y._M_is_constprop()))
2524 return _MaskImpl::_S_to_bits(
2525 __as_wrapper<_Np>(__x._M_data <= __y._M_data));
2526
2527 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2528 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2529 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2530 if constexpr (sizeof(__xi) == 64)
2531 {
2532 if constexpr (is_same_v<_Tp, float>)
2533 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2534 else if constexpr (is_same_v<_Tp, double>)
2535 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2536 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2537 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi);
2538 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2539 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi);
2540 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2541 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi);
2542 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2543 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi);
2544 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2545 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi);
2546 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2547 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi);
2548 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2549 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi);
2550 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2551 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi);
2552 else
2553 __assert_unreachable<_Tp>();
2554 }
2555 else if constexpr (sizeof(__xi) == 32)
2556 {
2557 if constexpr (is_same_v<_Tp, float>)
2558 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2559 else if constexpr (is_same_v<_Tp, double>)
2560 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2561 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2562 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi);
2563 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2564 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi);
2565 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2566 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi);
2567 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2568 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi);
2569 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2570 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi);
2571 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2572 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi);
2573 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2574 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi);
2575 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2576 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi);
2577 else
2578 __assert_unreachable<_Tp>();
2579 }
2580 else if constexpr (sizeof(__xi) == 16)
2581 {
2582 if constexpr (is_same_v<_Tp, float>)
2583 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2584 else if constexpr (is_same_v<_Tp, double>)
2585 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2586 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2587 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi);
2588 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2589 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi);
2590 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2591 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi);
2592 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2593 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi);
2594 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2595 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi);
2596 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2597 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi);
2598 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2599 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
2600 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2601 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
2602 else
2603 __assert_unreachable<_Tp>();
2604 }
2605 else
2606 __assert_unreachable<_Tp>();
2607 } // }}}
2608 else if (__builtin_is_constant_evaluated())
2609 return _Base::_S_less_equal(__x, __y);
2610 else if constexpr (sizeof(__x) == 8)
2611 {
2612 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2613 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2614 _MaskMember<_Tp> __r64;
2615 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2616 return __r64;
2617 }
2618 else
2619 return _Base::_S_less_equal(__x, __y);
2620 }
2621
2622 // }}} }}}
2623 // negation {{{
2624 template <typename _Tp, size_t _Np>
2625 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2626 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
2627 {
2628 if constexpr (__is_avx512_abi<_Abi>())
2629 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
2630 else
2631 return _Base::_S_negate(__x);
2632 }
2633
2634 // }}}
2635 // math {{{
2636 using _Base::_S_abs;
2637
2638 // _S_sqrt {{{
2639 template <typename _Tp, size_t _Np>
2640 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2641 _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
2642 {
2643 if constexpr (__is_sse_ps<_Tp, _Np>())
2644 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
2645 else if constexpr (__is_sse_pd<_Tp, _Np>())
2646 return _mm_sqrt_pd(__x);
2647 else if constexpr (__is_avx_ps<_Tp, _Np>())
2648 return _mm256_sqrt_ps(__x);
2649 else if constexpr (__is_avx_pd<_Tp, _Np>())
2650 return _mm256_sqrt_pd(__x);
2651 else if constexpr (__is_avx512_ps<_Tp, _Np>())
2652 return _mm512_sqrt_ps(__x);
2653 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2654 return _mm512_sqrt_pd(__x);
2655 else
2656 __assert_unreachable<_Tp>();
2657 }
2658
2659 // }}}
2660 // _S_ldexp {{{
2661 template <typename _Tp, size_t _Np>
2662 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2663 _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
2664 __fixed_size_storage_t<int, _Np> __exp)
2665 {
2666 if constexpr (sizeof(__x) == 64 || __have_avx512vl)
2667 {
2668 const auto __xi = __to_intrin(__x);
2669 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
2670 __cvt;
2671 const auto __expi = __to_intrin(__cvt(__exp));
2672 using _Up = __bool_storage_member_type_t<_Np>;
2673 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up();
2674 if constexpr (sizeof(__xi) == 16)
2675 {
2676 if constexpr (sizeof(_Tp) == 8)
2677 return _mm_maskz_scalef_pd(__k1, __xi, __expi);
2678 else
2679 return _mm_maskz_scalef_ps(__k1, __xi, __expi);
2680 }
2681 else if constexpr (sizeof(__xi) == 32)
2682 {
2683 if constexpr (sizeof(_Tp) == 8)
2684 return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
2685 else
2686 return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
2687 }
2688 else
2689 {
2690 static_assert(sizeof(__xi) == 64);
2691 if constexpr (sizeof(_Tp) == 8)
2692 return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
2693 else
2694 return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
2695 }
2696 }
2697 else
2698 return _Base::_S_ldexp(__x, __exp);
2699 }
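    /* The AVX512 path above relies on vscalefp[sd], which computes
       x * 2^floor(e) with the exponents given as a floating-point vector and
       a single final rounding, i.e. the per-lane equivalent of std::ldexp.
       Minimal scalar sketch, not part of the implementation (the helper name
       is made up):

         #include <cmath>

         double
         __scalef_lane(double __x, double __e) // __e holds a converted int
         { return std::ldexp(__x, static_cast<int>(__e)); }

       The zero-masking with __k1 merely clears the padding lanes of partial
       simd types.  */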
2700
2701 // }}}
2702 // _S_trunc {{{
2703 template <typename _Tp, size_t _Np>
2704 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2705 _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2706 {
2707 if constexpr (__is_avx512_ps<_Tp, _Np>())
2708 return _mm512_roundscale_ps(__x, 0x0b);
2709 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2710 return _mm512_roundscale_pd(__x, 0x0b);
2711 else if constexpr (__is_avx_ps<_Tp, _Np>())
2712 return _mm256_round_ps(__x, 0xb);
2713 else if constexpr (__is_avx_pd<_Tp, _Np>())
2714 return _mm256_round_pd(__x, 0xb);
2715 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2716 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
2717 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2718 return _mm_round_pd(__x, 0xb);
2719 else if constexpr (__is_sse_ps<_Tp, _Np>())
2720 {
2721 auto __truncated
2722 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
2723 const auto __no_fractional_values
2724 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
2725 & 0x7f800000u)
2726 < 0x4b000000; // the exponent is so large that no mantissa bits
2727 // signify fractional values (0x3f8 + 23*8 =
2728 // 0x4b0)
2729 return __no_fractional_values ? __truncated : __to_intrin(__x);
2730 }
2731 else
2732 return _Base::_S_trunc(__x);
2733 }
2734
2735 // }}}
2736 // _S_round {{{
2737 template <typename _Tp, size_t _Np>
2738 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2739 _S_round(_SimdWrapper<_Tp, _Np> __x)
2740 {
2741 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
2742 // from zero as required by std::round. Therefore this function is more
2743 // complicated.
2744 using _V = __vector_type_t<_Tp, _Np>;
2745 _V __truncated;
2746 if constexpr (__is_avx512_ps<_Tp, _Np>())
2747 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
2748 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2749 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
2750 else if constexpr (__is_avx_ps<_Tp, _Np>())
2751 __truncated = _mm256_round_ps(__x._M_data,
2752 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2753 else if constexpr (__is_avx_pd<_Tp, _Np>())
2754 __truncated = _mm256_round_pd(__x._M_data,
2755 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2756 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2757 __truncated = __auto_bitcast(
2758 _mm_round_ps(__to_intrin(__x),
2759 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
2760 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2761 __truncated
2762 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2763 else if constexpr (__is_sse_ps<_Tp, _Np>())
2764 __truncated = __auto_bitcast(
2765 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
2766 else
2767 return _Base::_S_round(__x);
2768
2769 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
2770 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0
2771
2772 const _V __rounded
2773 = __truncated
2774 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
2775 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
2776 : _V());
2777 if constexpr (__have_sse4_1)
2778 return __rounded;
2779 else // adjust for missing range in cvttps_epi32
2780 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded
2781 : __x._M_data;
2782 }
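    /* Illustrative scalar sketch of the rounding scheme above (not part of
       the implementation, the helper name is made up): truncate toward zero,
       then add +-1 when at least half was discarded, which is std::round's
       ties-away-from-zero behaviour:

         #include <cmath>

         float
         __round_away(float __x)
         {
           const float __t = std::trunc(__x);
           return std::fabs(__x - __t) >= .5f
                    ? __t + std::copysign(1.f, __x) : __t;
         }

       Without SSE4.1 the truncation goes through cvttps2dq, which cannot
       handle |x| >= 2^31; since every float with |x| >= 2^23 is already
       integral, such inputs are simply passed through unchanged.  */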
2783
2784 // }}}
2785 // _S_nearbyint {{{
2786 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2787 _GLIBCXX_SIMD_INTRINSIC static _Tp
2788 _S_nearbyint(_Tp __x) noexcept
2789 {
2790 if constexpr (_TVT::template _S_is<float, 16>)
2791 return _mm512_roundscale_ps(__x, 0x0c);
2792 else if constexpr (_TVT::template _S_is<double, 8>)
2793 return _mm512_roundscale_pd(__x, 0x0c);
2794 else if constexpr (_TVT::template _S_is<float, 8>)
2795 return _mm256_round_ps(__x,
2796 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2797 else if constexpr (_TVT::template _S_is<double, 4>)
2798 return _mm256_round_pd(__x,
2799 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2800 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2801 return _mm_round_ps(__x,
2802 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2803 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2804 return _mm_round_pd(__x,
2805 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2806 else
2807 return _Base::_S_nearbyint(__x);
2808 }
2809
2810 // }}}
2811 // _S_rint {{{
2812 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2813 _GLIBCXX_SIMD_INTRINSIC static _Tp
2814 _S_rint(_Tp __x) noexcept
2815 {
2816 if constexpr (_TVT::template _S_is<float, 16>)
2817 return _mm512_roundscale_ps(__x, 0x04);
2818 else if constexpr (_TVT::template _S_is<double, 8>)
2819 return _mm512_roundscale_pd(__x, 0x04);
2820 else if constexpr (_TVT::template _S_is<float, 8>)
2821 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2822 else if constexpr (_TVT::template _S_is<double, 4>)
2823 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2824 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2825 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2826 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2827 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2828 else
2829 return _Base::_S_rint(__x);
2830 }
2831
2832 // }}}
2833 // _S_floor {{{
2834 template <typename _Tp, size_t _Np>
2835 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2836 _S_floor(_SimdWrapper<_Tp, _Np> __x)
2837 {
2838 if constexpr (__is_avx512_ps<_Tp, _Np>())
2839 return _mm512_roundscale_ps(__x, 0x09);
2840 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2841 return _mm512_roundscale_pd(__x, 0x09);
2842 else if constexpr (__is_avx_ps<_Tp, _Np>())
2843 return _mm256_round_ps(__x, 0x9);
2844 else if constexpr (__is_avx_pd<_Tp, _Np>())
2845 return _mm256_round_pd(__x, 0x9);
2846 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2847 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9));
2848 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2849 return _mm_round_pd(__x, 0x9);
2850 else
2851 return _Base::_S_floor(__x);
2852 }
2853
2854 // }}}
2855 // _S_ceil {{{
2856 template <typename _Tp, size_t _Np>
2857 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2858 _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2859 {
2860 if constexpr (__is_avx512_ps<_Tp, _Np>())
2861 return _mm512_roundscale_ps(__x, 0x0a);
2862 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2863 return _mm512_roundscale_pd(__x, 0x0a);
2864 else if constexpr (__is_avx_ps<_Tp, _Np>())
2865 return _mm256_round_ps(__x, 0xa);
2866 else if constexpr (__is_avx_pd<_Tp, _Np>())
2867 return _mm256_round_pd(__x, 0xa);
2868 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2869 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa));
2870 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2871 return _mm_round_pd(__x, 0xa);
2872 else
2873 return _Base::_S_ceil(__x);
2874 }
2875
2876 // }}}
2877 // _S_signbit {{{
2878 template <typename _Tp, size_t _Np>
2879 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2880 _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2881 {
2882 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2883 {
2884 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
2885 return _mm512_movepi32_mask(
2886 __intrin_bitcast<__m512i>(__x._M_data));
2887 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
2888 return _mm512_movepi64_mask(
2889 __intrin_bitcast<__m512i>(__x._M_data));
2890 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
2891 return _mm256_movepi32_mask(
2892 __intrin_bitcast<__m256i>(__x._M_data));
2893 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
2894 return _mm256_movepi64_mask(
2895 __intrin_bitcast<__m256i>(__x._M_data));
2896 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4)
2897 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data));
2898 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8)
2899 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data));
2900 }
2901 else if constexpr (__is_avx512_abi<_Abi>())
2902 {
2903 const auto __xi = __to_intrin(__x);
2904 [[maybe_unused]] constexpr auto __k1
2905 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2906 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2907 return _mm_movemask_ps(__xi);
2908 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2909 return _mm_movemask_pd(__xi);
2910 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2911 return _mm256_movemask_ps(__xi);
2912 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2913 return _mm256_movemask_pd(__xi);
2914 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2915 return _mm512_mask_cmplt_epi32_mask(
2916 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2917 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2918 return _mm512_mask_cmplt_epi64_mask(
2919 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2920 else
2921 __assert_unreachable<_Tp>();
2922 }
2923 else
2924 return _Base::_S_signbit(__x);
2925 /*{
2926 using _I = __int_for_sizeof_t<_Tp>;
2927 if constexpr (sizeof(__x) == 64)
2928 return _S_less(__vector_bitcast<_I>(__x), _I());
2929 else
2930 {
2931 const auto __xx = __vector_bitcast<_I>(__x._M_data);
2932 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>;
2933 if constexpr ((sizeof(_Tp) == 4 &&
2934 (__have_avx2 || sizeof(__x) == 16)) ||
2935 __have_avx512vl)
2936 {
2937 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>);
2938 }
2939 else if constexpr ((__have_avx2 ||
2940 (__have_ssse3 && sizeof(__x) == 16)))
2941 {
2942 return __vector_bitcast<_Tp>((__xx & __signmask) ==
2943 __signmask);
2944 }
2945 else
2946 { // SSE2/3 or AVX (w/o AVX2)
2947 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1);
2948 return __vector_bitcast<_Tp>(
2949 __vector_bitcast<_Tp>(
2950 (__xx & __signmask) |
2951 __vector_bitcast<_I>(__one)) // -1 or 1
2952 != __one);
2953 }
2954 }
2955 }*/
2956 }
2957
2958 // }}}
2959 // _S_isnonzerovalue_mask {{{
2960 // (isnormal | issubnormal) == (!isinf & !isnan & !iszero)
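// Note on the non-AVX512DQ fallback below: __a = __x * inf is NaN exactly
// when __x is zero or NaN, and __b = __x * 0 is NaN exactly when __x is
// infinite or NaN. An _CMP_ORD_Q comparison of __a with __b is therefore
// true precisely for finite, nonzero values, e.g.
//   __x == 0   -> __a = NaN          -> unordered -> false
//   __x == inf -> __b = NaN          -> unordered -> false
//   __x == 1   -> __a = inf, __b = 0 -> ordered   -> true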
2961 template <typename _Tp>
2962 _GLIBCXX_SIMD_INTRINSIC static auto
2963 _S_isnonzerovalue_mask(_Tp __x)
2964 {
2965 using _Traits = _VectorTraits<_Tp>;
2966 if constexpr (__have_avx512dq_vl)
2967 {
2968 if constexpr (_Traits::template _S_is<float, 2>
2969 || _Traits::template _S_is<float, 4>)
2970 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f));
2971 else if constexpr (_Traits::template _S_is<float, 8>)
2972 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f));
2973 else if constexpr (_Traits::template _S_is<float, 16>)
2974 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f));
2975 else if constexpr (_Traits::template _S_is<double, 2>)
2976 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f));
2977 else if constexpr (_Traits::template _S_is<double, 4>)
2978 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f));
2979 else if constexpr (_Traits::template _S_is<double, 8>)
2980 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f));
2981 else
2982 __assert_unreachable<_Tp>();
2983 }
2984 else
2985 {
2986 using _Up = typename _Traits::value_type;
2987 constexpr size_t _Np = _Traits::_S_full_size;
2988 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0
2989 const auto __b = __x * _Up(); // NaN if __x == inf
2990 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>())
2991 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b),
2992 _CMP_ORD_Q);
2993 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>())
2994 return __mmask8(0xf
2995 & _mm512_cmp_ps_mask(__auto_bitcast(__a),
2996 __auto_bitcast(__b),
2997 _CMP_ORD_Q));
2998 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>())
2999 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3000 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>())
3001 return __mmask8(0x3
3002 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
3003 __auto_bitcast(__b),
3004 _CMP_ORD_Q));
3005 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>())
3006 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
3007 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>())
3008 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a),
3009 __auto_bitcast(__b),
3010 _CMP_ORD_Q));
3011 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>())
3012 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3013 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>())
3014 return __mmask8(0xf
3015 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
3016 __auto_bitcast(__b),
3017 _CMP_ORD_Q));
3018 else if constexpr (__is_avx512_ps<_Up, _Np>())
3019 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
3020 else if constexpr (__is_avx512_pd<_Up, _Np>())
3021 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3022 else
3023 __assert_unreachable<_Tp>();
3024 }
3025 }
3026
3027 // }}}
3028 // _S_isfinite {{{
3029 template <typename _Tp, size_t _Np>
3030 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3031 _S_isfinite(_SimdWrapper<_Tp, _Np> __x)
3032 {
3033 static_assert(is_floating_point_v<_Tp>);
3034#if !__FINITE_MATH_ONLY__
3035 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3036 {
3037 const auto __xi = __to_intrin(__x);
3038 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3039 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3040 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3041 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3042 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3043 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3044 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3045 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3046 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3047 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3048 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3049 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3050 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3051 }
3052 else if constexpr (__is_avx512_abi<_Abi>())
3053 {
3054 // if all exponent bits are set, __x is either inf or NaN
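// Worked example for float: __inf has the bit pattern 0x7f80'0000, so
// __x & __inf isolates the exponent field (sign and mantissa bits of
// __inf are zero). The result equals __inf only for inf/NaN inputs; every
// finite value yields a smaller non-negative integer, so the signed
// less-than below is the isfinite mask.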
3055 using _I = __int_for_sizeof_t<_Tp>;
3056 const auto __inf = __vector_bitcast<_I>(
3057 __vector_broadcast<_Np>(__infinity_v<_Tp>));
3058 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf);
3059 }
3060 else
3061#endif
3062 return _Base::_S_isfinite(__x);
3063 }
3064
3065 // }}}
3066 // _S_isinf {{{
3067 template <typename _Tp, size_t _Np>
3068 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3069 _S_isinf(_SimdWrapper<_Tp, _Np> __x)
3070 {
3071#if !__FINITE_MATH_ONLY__
3072 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3073 {
3074 const auto __xi = __to_intrin(__x);
3075 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3076 return _mm512_fpclass_ps_mask(__xi, 0x18);
3077 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3078 return _mm512_fpclass_pd_mask(__xi, 0x18);
3079 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3080 return _mm256_fpclass_ps_mask(__xi, 0x18);
3081 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3082 return _mm256_fpclass_pd_mask(__xi, 0x18);
3083 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3084 return _mm_fpclass_ps_mask(__xi, 0x18);
3085 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3086 return _mm_fpclass_pd_mask(__xi, 0x18);
3087 else
3088 __assert_unreachable<_Tp>();
3089 }
3090 else if constexpr (__have_avx512dq_vl)
3091 {
3092 if constexpr (__is_sse_pd<_Tp, _Np>())
3093 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18));
3094 else if constexpr (__is_avx_pd<_Tp, _Np>())
3095 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18));
3096 else if constexpr (__is_sse_ps<_Tp, _Np>())
3097 return _mm_movm_epi32(
3098 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18));
3099 else if constexpr (__is_avx_ps<_Tp, _Np>())
3100 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18));
3101 else
3102 __assert_unreachable<_Tp>();
3103 }
3104 else
3105#endif
3106 return _Base::_S_isinf(__x);
3107 }
3108
3109 // }}}
3110 // _S_isnormal {{{
3111 template <typename _Tp, size_t _Np>
3112 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3113 _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
3114 {
3115#if __FINITE_MATH_ONLY__
3116 [[maybe_unused]] constexpr int __mode = 0x26;
3117#else
3118 [[maybe_unused]] constexpr int __mode = 0xbf;
3119#endif
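// Note: per the AVX-512 fpclass immediate encoding (0x01 QNaN, 0x02 +0,
// 0x04 -0, 0x08 +inf, 0x10 -inf, 0x20 denormal, 0x40 negative, 0x80 SNaN),
// 0xbf selects everything that is not normal (ignoring the sign bit 0x40);
// under __FINITE_MATH_ONLY__ inf/NaN cannot occur, so 0x26 only needs to
// reject zeros and denormals.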
3120 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3121 {
3122 const auto __xi = __to_intrin(__x);
3123 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3124 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3125 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode);
3126 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3127 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode);
3128 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3129 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode);
3130 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3131 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode);
3132 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3133 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode);
3134 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3135 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode);
3136 else
3137 __assert_unreachable<_Tp>();
3138 }
3139 else if constexpr (__have_avx512dq)
3140 {
3141 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
3142 return _mm_movm_epi32(
3143 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
3144 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>())
3145 return _mm256_movm_epi32(
3146 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
3147 else if constexpr (__is_avx512_ps<_Tp, _Np>())
3148 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode));
3149 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>())
3150 return _mm_movm_epi64(
3151 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
3152 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>())
3153 return _mm256_movm_epi64(
3154 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
3155 else if constexpr (__is_avx512_pd<_Tp, _Np>())
3156 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode));
3157 else
3158 __assert_unreachable<_Tp>();
3159 }
3160 else if constexpr (__is_avx512_abi<_Abi>())
3161 {
3162 using _I = __int_for_sizeof_t<_Tp>;
3163 const auto absn = __vector_bitcast<_I>(_S_abs(__x));
3164 const auto minn = __vector_bitcast<_I>(
3165 __vector_broadcast<_Np>(__norm_min_v<_Tp>));
3166#if __FINITE_MATH_ONLY__
3167 return _S_less_equal<_I, _Np>(minn, absn);
3168#else
3169 const auto infn
3170 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
3171 return __and(_S_less_equal<_I, _Np>(minn, absn),
3172 _S_less<_I, _Np>(absn, infn));
3173#endif
3174 }
3175 else
3176 return _Base::_S_isnormal(__x);
3177 }
3178
3179 // }}}
3180 // _S_isnan {{{
3181 template <typename _Tp, size_t _Np>
3182 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3183 _S_isnan(_SimdWrapper<_Tp, _Np> __x)
3184 { return _S_isunordered(__x, __x); }
3185
3186 // }}}
3187 // _S_isunordered {{{
3188 template <typename _Tp, size_t _Np>
3189 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3190 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
3191 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
3192 {
3193#if __FINITE_MATH_ONLY__
3194 return {}; // false
3195#else
3196 const auto __xi = __to_intrin(__x);
3197 const auto __yi = __to_intrin(__y);
3198 if constexpr (__is_avx512_abi<_Abi>())
3199 {
3200 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3201 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3202 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3203 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3204 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3205 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3206 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3207 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3208 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3209 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3210 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3211 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3212 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3213 }
3214 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3215 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
3216 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3217 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
3218 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3219 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
3220 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3221 return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
3222 else
3223 __assert_unreachable<_Tp>();
3224#endif
3225 }
3226
3227 // }}}
3228 // _S_isgreater {{{
3229 template <typename _Tp, size_t _Np>
3230 static constexpr _MaskMember<_Tp>
3231 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3232 {
3233 const auto __xi = __to_intrin(__x);
3234 const auto __yi = __to_intrin(__y);
3235 if constexpr (__is_avx512_abi<_Abi>())
3236 {
3237 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3238 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3239 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3240 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3241 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3242 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3243 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3244 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3245 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3246 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3247 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3248 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3249 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3250 else
3251 __assert_unreachable<_Tp>();
3252 }
3253 else if constexpr (__have_avx)
3254 {
3255 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3256 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3257 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3258 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3259 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3260 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3261 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3262 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3263 else
3264 __assert_unreachable<_Tp>();
3265 }
3266 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3267 && sizeof(_Tp) == 4)
3268 {
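// Note: floats are sign-magnitude, so reinterpreting them as int orders
// only the non-negative values correctly. Mapping negative inputs to
// -(bits & 0x7fff'ffff) yields a monotonic two's-complement ordering, and
// the cmpord_ps factor clears NaN lanes. The <, <=, and >= variants
// further below reuse the same transformation.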
3269 const auto __xn = __vector_bitcast<int>(__xi);
3270 const auto __yn = __vector_bitcast<int>(__yi);
3271 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3272 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3273 return __auto_bitcast(
3274 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
3275 }
3276 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3277 && sizeof(_Tp) == 8)
3278 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3279 -_mm_ucomigt_sd(__xi, __yi),
3280 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
3281 _mm_unpackhi_pd(__yi, __yi))};
3282 else
3283 return _Base::_S_isgreater(__x, __y);
3284 }
3285
3286 // }}}
3287 // _S_isgreaterequal {{{
3288 template <typename _Tp, size_t _Np>
3289 static constexpr _MaskMember<_Tp>
3290 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3291 {
3292 const auto __xi = __to_intrin(__x);
3293 const auto __yi = __to_intrin(__y);
3294 if constexpr (__is_avx512_abi<_Abi>())
3295 {
3296 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3297 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3298 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3299 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3300 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3301 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3302 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3303 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3304 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3305 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3306 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3307 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3308 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3309 else
3310 __assert_unreachable<_Tp>();
3311 }
3312 else if constexpr (__have_avx)
3313 {
3314 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3315 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3316 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3317 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3318 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3319 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3320 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3321 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3322 else
3323 __assert_unreachable<_Tp>();
3324 }
3325 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3326 && sizeof(_Tp) == 4)
3327 {
3328 const auto __xn = __vector_bitcast<int>(__xi);
3329 const auto __yn = __vector_bitcast<int>(__yi);
3330 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3331 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3332 return __auto_bitcast(
3333 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp));
3334 }
3335 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3336 && sizeof(_Tp) == 8)
3337 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3338 -_mm_ucomige_sd(__xi, __yi),
3339 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi),
3340 _mm_unpackhi_pd(__yi, __yi))};
3341 else
3342 return _Base::_S_isgreaterequal(__x, __y);
3343 }
3344
3345 // }}}
3346 // _S_isless {{{
3347 template <typename _Tp, size_t _Np>
3348 static constexpr _MaskMember<_Tp>
3349 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3350 {
3351 const auto __xi = __to_intrin(__x);
3352 const auto __yi = __to_intrin(__y);
3353 if constexpr (__is_avx512_abi<_Abi>())
3354 {
3355 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3356 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3357 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3358 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3359 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3360 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3361 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3362 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3363 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3364 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3365 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3366 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3367 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3368 else
3369 __assert_unreachable<_Tp>();
3370 }
3371 else if constexpr (__have_avx)
3372 {
3373 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3374 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3375 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3376 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3377 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3378 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3379 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3380 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3381 else
3382 __assert_unreachable<_Tp>();
3383 }
3384 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3385 && sizeof(_Tp) == 4)
3386 {
3387 const auto __xn = __vector_bitcast<int>(__xi);
3388 const auto __yn = __vector_bitcast<int>(__yi);
3389 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3390 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3391 return __auto_bitcast(
3392 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp));
3393 }
3394 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3395 && sizeof(_Tp) == 8)
3396 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3397 -_mm_ucomigt_sd(__yi, __xi),
3398 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi),
3399 _mm_unpackhi_pd(__xi, __xi))};
3400 else
3401 return _Base::_S_isless(__x, __y);
3402 }
3403
3404 // }}}
3405 // _S_islessequal {{{
3406 template <typename _Tp, size_t _Np>
3407 static constexpr _MaskMember<_Tp>
3408 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3409 {
3410 const auto __xi = __to_intrin(__x);
3411 const auto __yi = __to_intrin(__y);
3412 if constexpr (__is_avx512_abi<_Abi>())
3413 {
3414 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3415 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3416 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3417 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3418 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3419 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3420 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3421 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3422 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3423 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3424 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3425 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3426 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3427 else
3428 __assert_unreachable<_Tp>();
3429 }
3430 else if constexpr (__have_avx)
3431 {
3432 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3433 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3434 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3435 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3436 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3437 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3438 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3439 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3440 else
3441 __assert_unreachable<_Tp>();
3442 }
3443 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3444 && sizeof(_Tp) == 4)
3445 {
3446 const auto __xn = __vector_bitcast<int>(__xi);
3447 const auto __yn = __vector_bitcast<int>(__yi);
3448 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3449 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3450 return __auto_bitcast(
3451 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp));
3452 }
3453 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3454 && sizeof(_Tp) == 8)
3455 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3456 -_mm_ucomige_sd(__yi, __xi),
3457 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi),
3458 _mm_unpackhi_pd(__xi, __xi))};
3459 else
3460 return _Base::_S_islessequal(__x, __y);
3461 }
3462
3463 // }}}
3464 // _S_islessgreater {{{
3465 template <typename _Tp, size_t _Np>
3466 static constexpr _MaskMember<_Tp>
3467 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3468 {
3469 const auto __xi = __to_intrin(__x);
3470 const auto __yi = __to_intrin(__y);
3471 if constexpr (__is_avx512_abi<_Abi>())
3472 {
3473 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3474 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3475 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3476 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3477 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3478 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3479 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3480 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3481 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3482 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3483 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3484 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3485 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3486 else
3487 __assert_unreachable<_Tp>();
3488 }
3489 else if constexpr (__have_avx)
3490 {
3491 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3492 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3493 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3494 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3495 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3496 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3497 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3498 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3499 else
3500 __assert_unreachable<_Tp>();
3501 }
3502 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3503 return __auto_bitcast(
3504 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi)));
3505 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3506 return __to_masktype(
3507 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi)));
3508 else
3509 __assert_unreachable<_Tp>();
3510 }
3511
3512 //}}} }}}
3513 template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np>
3514 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
3515 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v)
3516 {
3517 if (__k._M_is_constprop_none_of())
3518 return __v;
3519 else if (__k._M_is_constprop_all_of())
3520 {
3521 auto __vv = _Base::_M_make_simd(__v);
3522 _Op<decltype(__vv)> __op;
3523 return __data(__op(__vv));
3524 }
3525 else if constexpr (__is_bitmask_v<decltype(__k)>
3526 && (is_same_v<_Op<void>, __increment<void>>
3527 || is_same_v<_Op<void>, __decrement<void>>))
3528 {
3529 // optimize masked unary increment and decrement as masked sub +/-1
3530 constexpr int __pm_one
3531 = is_same_v<_Op<void>, __increment<void>> ? -1 : 1;
3532#ifdef __clang__
3533 return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data;
3534#else // __clang__
3535 if constexpr (is_integral_v<_Tp>)
3536 {
3537 constexpr bool __lp64 = sizeof(long) == sizeof(long long);
3538 using _Ip = std::make_signed_t<_Tp>;
3539 using _Up = std::conditional_t<
3540 std::is_same_v<_Ip, long>,
3541 std::conditional_t<__lp64, long long, int>,
3542 std::conditional_t<
3543 std::is_same_v<_Ip, signed char>, char, _Ip>>;
3544 const auto __value = __vector_bitcast<_Up>(__v._M_data);
3545#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3546 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \
3547 return __vector_bitcast<_Tp>(__builtin_ia32_##_Instr##_mask(__value, \
3548 __vector_broadcast<_Np>(_Up(__pm_one)), __value, __k._M_data))
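// Example expansion, assuming _Tp = int and 64-byte vectors:
// _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512) expands (roughly) to
//   if constexpr (sizeof(_Tp) == 4 && sizeof(__v) == 64)
//     return __vector_bitcast<_Tp>(__builtin_ia32_psubd512_mask(__value,
//       __vector_broadcast<_Np>(_Up(__pm_one)), __value, __k._M_data));
// i.e. a merge-masked subtraction of +/-1 that leaves elements with a
// cleared mask bit unchanged.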
3549 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512);
3550 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256);
3551 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128);
3552 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512);
3553 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256);
3554 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128);
3555 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512);
3556 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256);
3557 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128);
3558 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512);
3559 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256);
3560 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128);
3561#undef _GLIBCXX_SIMD_MASK_SUB
3562 }
3563 else
3564 {
3565#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3566 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \
3567 return __builtin_ia32_##_Instr##_mask( \
3568 __v._M_data, __vector_broadcast<_Np>(_Tp(__pm_one)), __v._M_data, \
3569 __k._M_data, _MM_FROUND_CUR_DIRECTION)
3570 _GLIBCXX_SIMD_MASK_SUB(4, 64, subps512);
3571 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256);
3572 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128);
3573 _GLIBCXX_SIMD_MASK_SUB(8, 64, subpd512);
3574 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256);
3575 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128);
3576#undef _GLIBCXX_SIMD_MASK_SUB
3577 }
3578#endif // __clang__
3579 }
3580 else
3581 return _Base::template _S_masked_unary<_Op>(__k, __v);
3582 }
3583 };
3584
3585// }}}
3586// _MaskImplX86Mixin {{{
3587struct _MaskImplX86Mixin
3588{
3589 template <typename _Tp>
3590 using _TypeTag = _Tp*;
3591
3592 using _Base = _MaskImplBuiltinMixin;
3593
3594 // _S_to_maskvector(bool) {{{
3595 template <typename _Up, size_t _ToN = 1, typename _Tp>
3596 _GLIBCXX_SIMD_INTRINSIC static constexpr
3597 enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>>
3598 _S_to_maskvector(_Tp __x)
3599 {
3600 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3601 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
3602 : __vector_type_t<_Up, _ToN>();
3603 }
3604
3605 // }}}
3606 // _S_to_maskvector(_SanitizedBitMask) {{{
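// Note: expands an _Np-bit mask into a vector of _ToN elements of type
// _Up, where a set bit becomes an all-ones element. AVX512 targets use
// movm/maskz_mov; the pre-AVX512 paths broadcast the bits and test them
// per element via shuffles, packs, or the multiply tricks below.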
3607 template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? _Np : _UpN>
3608 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3609 _S_to_maskvector(_SanitizedBitMask<_Np> __x)
3610 {
3611 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3612 using _UV = __vector_type_t<_Up, _ToN>;
3613 using _UI = __intrinsic_type_t<_Up, _ToN>;
3614 [[maybe_unused]] const auto __k = __x._M_to_bits();
3615 if constexpr (_Np == 1)
3616 return _S_to_maskvector<_Up, _ToN>(__k);
3617 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3618 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3619 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; });
3620 else if constexpr (sizeof(_Up) == 1)
3621 {
3622 if constexpr (sizeof(_UI) == 16)
3623 {
3624 if constexpr (__have_avx512bw_vl)
3625 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k));
3626 else if constexpr (__have_avx512bw)
3627 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k)));
3628 else if constexpr (__have_avx512f)
3629 {
3630 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3631 auto __as16bits
3632 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3633 __hi256(__as32bits)));
3634 return __intrin_bitcast<_UV>(
3635 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
3636 }
3637 else if constexpr (__have_ssse3)
3638 {
3639 const auto __bitmask = __to_intrin(
3640 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4,
3641 8, 16, 32, 64, 128));
3642 return __intrin_bitcast<_UV>(
3643 __vector_bitcast<_Up>(
3644 _mm_shuffle_epi8(__to_intrin(
3645 __vector_type_t<_ULLong, 2>{__k}),
3646 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1,
3647 1, 1, 1, 1, 1, 1, 1))
3648 & __bitmask)
3649 != 0);
3650 }
3651 // else fall through
3652 }
3653 else if constexpr (sizeof(_UI) == 32)
3654 {
3655 if constexpr (__have_avx512bw_vl)
3656 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k));
3657 else if constexpr (__have_avx512bw)
3658 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k)));
3659 else if constexpr (__have_avx512f)
3660 {
3661 auto __as16bits = // 0 16 1 17 ... 15 31
3662 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()),
3663 16)
3664 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16,
3665 ~__m512i()),
3666 16);
3667 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16(
3668 __lo256(__as16bits),
3669 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ...
3670 );
3671 // deinterleave:
3672 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8(
3673 __0_16_1_17, // 0 16 1 17 2 ...
3674 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9,
3675 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1,
3676 3, 5, 7, 9, 11, 13,
3677 15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3678 // 0-3 8-11 16-19 24-27
3679 // 4-7 12-15 20-23 28-31
3680 }
3681 else if constexpr (__have_avx2)
3682 {
3683 const auto __bitmask
3684 = _mm256_broadcastsi128_si256(__to_intrin(
3685 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
3686 4, 8, 16, 32, 64, 128)));
3687 return __vector_bitcast<_Up>(
3688 __vector_bitcast<_Up>(
3689 _mm256_shuffle_epi8(
3690 _mm256_broadcastsi128_si256(
3691 __to_intrin(__vector_type_t<_ULLong, 2>{__k})),
3692 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3693 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3694 3, 3, 3, 3, 3, 3))
3695 & __bitmask)
3696 != 0);
3697 }
3698 // else fall through
3699 }
3700 else if constexpr (sizeof(_UI) == 64)
3701 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k));
3702 if constexpr (std::min(_ToN, _Np) <= 4)
3703 {
3704 if constexpr (_Np > 7) // avoid overflow
3705 __x &= _SanitizedBitMask<_Np>(0x0f);
3706 const _UInt __char_mask
3707 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL)
3708 * 0xff;
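// Note: 0x0020'4081 == 2^0 + 2^7 + 2^14 + 2^21, so the multiply copies
// mask bit i to bit position 8*i (among other positions); the & 0x01010101
// keeps exactly those positions and the * 0xff widens each 0/1 byte to
// 0x00/0xff. The 7-bit branch below uses 0x408'1020'4081 in the same way.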
3709 _UV __r = {};
3710 __builtin_memcpy(&__r, &__char_mask,
3711 std::min(sizeof(__r), sizeof(__char_mask)));
3712 return __r;
3713 }
3714 else if constexpr (std::min(_ToN, _Np) <= 7)
3715 {
3716 if constexpr (_Np > 7) // avoid overflow
3717 __x &= _SanitizedBitMask<_Np>(0x7f);
3718 const _ULLong __char_mask
3719 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL)
3720 * 0xff;
3721 _UV __r = {};
3722 __builtin_memcpy(&__r, &__char_mask,
3723 std::min(sizeof(__r), sizeof(__char_mask)));
3724 return __r;
3725 }
3726 }
3727 else if constexpr (sizeof(_Up) == 2)
3728 {
3729 if constexpr (sizeof(_UI) == 16)
3730 {
3731 if constexpr (__have_avx512bw_vl)
3732 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k));
3733 else if constexpr (__have_avx512bw)
3734 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k)));
3735 else if constexpr (__have_avx512f)
3736 {
3737 __m256i __as32bits = {};
3738 if constexpr (__have_avx512vl)
3739 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i());
3740 else
3741 __as32bits
3742 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()));
3743 return __intrin_bitcast<_UV>(
3744 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3745 }
3746 // else fall through
3747 }
3748 else if constexpr (sizeof(_UI) == 32)
3749 {
3750 if constexpr (__have_avx512bw_vl)
3751 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k));
3752 else if constexpr (__have_avx512bw)
3753 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k)));
3754 else if constexpr (__have_avx512f)
3755 {
3756 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3757 return __vector_bitcast<_Up>(
3758 __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3759 __hi256(__as32bits))));
3760 }
3761 // else fall through
3762 }
3763 else if constexpr (sizeof(_UI) == 64)
3764 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k));
3765 }
3766 else if constexpr (sizeof(_Up) == 4)
3767 {
3768 if constexpr (sizeof(_UI) == 16)
3769 {
3770 if constexpr (__have_avx512dq_vl)
3771 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k));
3772 else if constexpr (__have_avx512dq)
3773 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k)));
3774 else if constexpr (__have_avx512vl)
3775 return __intrin_bitcast<_UV>(
3776 _mm_maskz_mov_epi32(__k, ~__m128i()));
3777 else if constexpr (__have_avx512f)
3778 return __intrin_bitcast<_UV>(
3779 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3780 // else fall through
3781 }
3782 else if constexpr (sizeof(_UI) == 32)
3783 {
3784 if constexpr (__have_avx512dq_vl)
3785 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k));
3786 else if constexpr (__have_avx512dq)
3787 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k)));
3788 else if constexpr (__have_avx512vl)
3789 return __vector_bitcast<_Up>(
3790 _mm256_maskz_mov_epi32(__k, ~__m256i()));
3791 else if constexpr (__have_avx512f)
3792 return __vector_bitcast<_Up>(
3793 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3794 // else fall through
3795 }
3796 else if constexpr (sizeof(_UI) == 64)
3797 return __vector_bitcast<_Up>(
3798 __have_avx512dq ? _mm512_movm_epi32(__k)
3799 : _mm512_maskz_mov_epi32(__k, ~__m512i()));
3800 }
3801 else if constexpr (sizeof(_Up) == 8)
3802 {
3803 if constexpr (sizeof(_UI) == 16)
3804 {
3805 if constexpr (__have_avx512dq_vl)
3806 return __vector_bitcast<_Up>(_mm_movm_epi64(__k));
3807 else if constexpr (__have_avx512dq)
3808 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k)));
3809 else if constexpr (__have_avx512vl)
3810 return __vector_bitcast<_Up>(
3811 _mm_maskz_mov_epi64(__k, ~__m128i()));
3812 else if constexpr (__have_avx512f)
3813 return __vector_bitcast<_Up>(
3814 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3815 // else fall through
3816 }
3817 else if constexpr (sizeof(_UI) == 32)
3818 {
3819 if constexpr (__have_avx512dq_vl)
3820 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k));
3821 else if constexpr (__have_avx512dq)
3822 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k)));
3823 else if constexpr (__have_avx512vl)
3824 return __vector_bitcast<_Up>(
3825 _mm256_maskz_mov_epi64(__k, ~__m256i()));
3826 else if constexpr (__have_avx512f)
3827 return __vector_bitcast<_Up>(
3828 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3829 // else fall through
3830 }
3831 else if constexpr (sizeof(_UI) == 64)
3832 return __vector_bitcast<_Up>(
3833 __have_avx512dq ? _mm512_movm_epi64(__k)
3834 : _mm512_maskz_mov_epi64(__k, ~__m512i()));
3835 }
3836
3837 using _UpUInt = make_unsigned_t<_Up>;
3838 using _V = __vector_type_t<_UpUInt, _ToN>;
3839 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__;
3840 if constexpr (_ToN == 2)
3841 {
3842 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])});
3843 }
3844 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32)
3845 {
3846 if constexpr (sizeof(_Up) == 4)
3847 return __vector_bitcast<_Up>(_mm256_cmp_ps(
3848 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)),
3849 _mm256_castsi256_ps(_mm256_setr_epi32(
3850 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))),
3851 _mm256_setzero_ps(), _CMP_NEQ_UQ));
3852 else if constexpr (sizeof(_Up) == 8)
3853 return __vector_bitcast<_Up>(_mm256_cmp_pd(
3854 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)),
3855 _mm256_castsi256_pd(
3856 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))),
3857 _mm256_setzero_pd(), _CMP_NEQ_UQ));
3858 else
3859 __assert_unreachable<_Up>();
3860 }
3861 else if constexpr (__bits_per_element >= _ToN)
3862 {
3863 constexpr auto __bitmask
3864 = __generate_vector<_V>([](auto __i)
3865 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt
3866 { return __i < _ToN ? 1ull << __i : 0; });
3867 const auto __bits
3868 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
3869 if constexpr (__bits_per_element > _ToN)
3870 return __vector_bitcast<_Up>(__bits) > 0;
3871 else
3872 return __vector_bitcast<_Up>(__bits != 0);
3873 }
3874 else
3875 {
3876 const _V __tmp
3877 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3878 return static_cast<_UpUInt>(
3879 __k >> (__bits_per_element * (__i / __bits_per_element)));
3880 })
3881 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3882 return static_cast<_UpUInt>(1ull
3883 << (__i % __bits_per_element));
3884 }); // mask bit index
3885 return __intrin_bitcast<_UV>(__tmp != _V());
3886 }
3887 }
3888
3889 // }}}
3890 // _S_to_maskvector(_SimdWrapper) {{{
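// Note: converts a vector (or bit) mask to a vector mask with a possibly
// different element size. Equal element sizes reduce to a bitcast;
// otherwise the conversion widens or narrows via interleaves, packs, and
// byte shuffles while preserving the all-ones/all-zeros element invariant.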
3891 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
3892 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3893 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3894 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
3895 {
3896 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3897 using _TW = _SimdWrapper<_Tp, _Np>;
3898 using _UW = _SimdWrapper<_Up, _ToN>;
3899 using _UI = __intrinsic_type_t<_Up, _ToN>;
3900 if constexpr (is_same_v<_Tp, bool>) // bits -> vector
3901 return _S_to_maskvector<_Up, _ToN>(
3902 _BitMask<_Np>(__x._M_data)._M_sanitized());
3903 // vector -> vector bitcast
3904 else if constexpr (sizeof(_Up) == sizeof(_Tp)
3905 && sizeof(_TW) == sizeof(_UW))
3906 return __wrapper_bitcast<_Up, _ToN>(
3907 _ToN <= _Np
3908 ? __x
3909 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x));
3910 else // vector -> vector {{{
3911 {
3912 if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3913 {
3914 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
3915 return __generate_from_n_evaluations<std::min(_ToN, _Np),
3916 __vector_type_t<_Up, _ToN>>(
3917 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; });
3918 }
3919 using _To = __vector_type_t<_Up, _ToN>;
3920 [[maybe_unused]] constexpr size_t _FromN = _Np;
3921 constexpr int _FromBytes = sizeof(_Tp);
3922 constexpr int _ToBytes = sizeof(_Up);
3923 const auto __k = __x._M_data;
3924
3925 if constexpr (_FromBytes == _ToBytes)
3926 return __intrin_bitcast<_To>(__k);
3927 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16)
3928 { // SSE -> SSE {{{
3929 if constexpr (_FromBytes == 4 && _ToBytes == 8)
3930 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3931 else if constexpr (_FromBytes == 2 && _ToBytes == 8)
3932 {
3933 const auto __y
3934 = __vector_bitcast<int>(__interleave128_lo(__k, __k));
3935 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3936 }
3937 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3938 {
3939 auto __y
3940 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3941 auto __z
3942 = __vector_bitcast<int>(__interleave128_lo(__y, __y));
3943 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z));
3944 }
3945 else if constexpr (_FromBytes == 8 && _ToBytes == 4
3946 && __have_sse2)
3947 return __intrin_bitcast<_To>(
3948 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3949 else if constexpr (_FromBytes == 8 && _ToBytes == 4)
3950 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k),
3951 _UI());
3952 else if constexpr (_FromBytes == 2 && _ToBytes == 4)
3953 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3954 else if constexpr (_FromBytes == 1 && _ToBytes == 4)
3955 {
3956 const auto __y
3957 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3958 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3959 }
3960 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
3961 {
3962 if constexpr (__have_sse2 && !__have_ssse3)
3963 return __intrin_bitcast<_To>(_mm_packs_epi32(
3964 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()),
3965 __m128i()));
3966 else
3967 return __intrin_bitcast<_To>(
3968 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(
3969 __vector_bitcast<_Up>(__k)));
3970 }
3971 else if constexpr (_FromBytes == 4 && _ToBytes == 2)
3972 return __intrin_bitcast<_To>(
3973 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3974 else if constexpr (_FromBytes == 1 && _ToBytes == 2)
3975 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3976 else if constexpr (_FromBytes == 8 && _ToBytes == 1
3977 && __have_ssse3)
3978 return __intrin_bitcast<_To>(
3979 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3980 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
3981 -1, -1, -1, -1, -1, -1, -1,
3982 -1)));
3983 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
3984 {
3985 auto __y
3986 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3987 __y = _mm_packs_epi32(__y, __m128i());
3988 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3989 }
3990 else if constexpr (_FromBytes == 4 && _ToBytes == 1
3991 && __have_ssse3)
3992 return __intrin_bitcast<_To>(
3993 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3994 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
3995 -1, -1, -1, -1, -1, -1, -1,
3996 -1)));
3997 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
3998 {
3999 const auto __y
4000 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
4001 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
4002 }
4003 else if constexpr (_FromBytes == 2 && _ToBytes == 1)
4004 return __intrin_bitcast<_To>(
4005 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()));
4006 else
4007 __assert_unreachable<_Tp>();
4008 } // }}}
4009 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32)
4010 { // AVX -> AVX {{{
4011 if constexpr (_FromBytes == _ToBytes)
4012 __assert_unreachable<_Tp>();
4013 else if constexpr (_FromBytes == _ToBytes * 2)
4014 {
4015 const auto __y = __vector_bitcast<_LLong>(__k);
4016 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
4017 _mm_packs_epi16(__lo128(__y), __hi128(__y))));
4018 }
4019 else if constexpr (_FromBytes == _ToBytes * 4)
4020 {
4021 const auto __y = __vector_bitcast<_LLong>(__k);
4022 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
4023 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4024 __m128i())));
4025 }
4026 else if constexpr (_FromBytes == _ToBytes * 8)
4027 {
4028 const auto __y = __vector_bitcast<_LLong>(__k);
4029 return __intrin_bitcast<_To>(
4030 _mm256_castsi128_si256(_mm_shuffle_epi8(
4031 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4032 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
4033 -1, -1, -1, -1, -1))));
4034 }
4035 else if constexpr (_FromBytes * 2 == _ToBytes)
4036 {
4037 auto __y = __xzyw(__to_intrin(__k));
4038 if constexpr (is_floating_point_v<_Tp>
4039 || (!__have_avx2 && _FromBytes == 4))
4040 {
4041 const auto __yy = __vector_bitcast<float>(__y);
4042 return __intrin_bitcast<_To>(
4043 _mm256_unpacklo_ps(__yy, __yy));
4044 }
4045 else
4046 return __intrin_bitcast<_To>(
4047 _mm256_unpacklo_epi8(__y, __y));
4048 }
4049 else if constexpr (_FromBytes * 4 == _ToBytes)
4050 {
4051 auto __y
4052 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4053 __lo128(__vector_bitcast<_LLong>(
4054 __k))); // drops 3/4 of input
4055 return __intrin_bitcast<_To>(
4056 __concat(_mm_unpacklo_epi16(__y, __y),
4057 _mm_unpackhi_epi16(__y, __y)));
4058 }
4059 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
4060 {
4061 auto __y
4062 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4063 __lo128(__vector_bitcast<_LLong>(
4064 __k))); // drops 3/4 of input
4065 __y
4066 = _mm_unpacklo_epi16(__y,
4067 __y); // drops another 1/2 => 7/8 total
4068 return __intrin_bitcast<_To>(
4069 __concat(_mm_unpacklo_epi32(__y, __y),
4070 _mm_unpackhi_epi32(__y, __y)));
4071 }
4072 else
4073 __assert_unreachable<_Tp>();
4074 } // }}}
4075 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16)
4076 { // SSE -> AVX {{{
4077 if constexpr (_FromBytes == _ToBytes)
4078 return __intrin_bitcast<_To>(
4079 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>(
4080 __zero_extend(__to_intrin(__k))));
4081 else if constexpr (_FromBytes * 2 == _ToBytes)
4082 { // keep all
4083 return __intrin_bitcast<_To>(
4084 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k),
4085 __vector_bitcast<_LLong>(__k)),
4086 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k),
4087 __vector_bitcast<_LLong>(__k))));
4088 }
4089 else if constexpr (_FromBytes * 4 == _ToBytes)
4090 {
4091 if constexpr (__have_avx2)
4092 {
4093 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4094 __concat(__vector_bitcast<_LLong>(__k),
4095 __vector_bitcast<_LLong>(__k)),
4096 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
4097 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
4098 6, 6, 7, 7, 7, 7)));
4099 }
4100 else
4101 {
4102 return __intrin_bitcast<_To>(__concat(
4103 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4104 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
4105 2, 2, 2, 2, 3, 3, 3, 3)),
4106 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4107 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
4108 6, 6, 6, 6, 7, 7, 7,
4109 7))));
4110 }
4111 }
4112 else if constexpr (_FromBytes * 8 == _ToBytes)
4113 {
4114 if constexpr (__have_avx2)
4115 {
4116 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4117 __concat(__vector_bitcast<_LLong>(__k),
4118 __vector_bitcast<_LLong>(__k)),
4119 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
4120 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
4121 3, 3, 3, 3, 3, 3)));
4122 }
4123 else
4124 {
4125 return __intrin_bitcast<_To>(__concat(
4126 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4127 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
4128 1, 1, 1, 1, 1, 1, 1, 1)),
4129 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4130 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
4131 3, 3, 3, 3, 3, 3, 3,
4132 3))));
4133 }
4134 }
4135 else if constexpr (_FromBytes == _ToBytes * 2)
4136 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4137 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()))));
4138 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
4139 {
4140 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4141 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4142 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
4143 -1, -1, -1, -1, -1, -1, -1,
4144 -1)))));
4145 }
4146 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4147 {
4148 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4149 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4150 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4151 -1, -1, -1, -1, -1, -1, -1,
4152 -1)))));
4153 }
4154 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4155 {
4156 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4157 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4158 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
4159 -1, -1, -1, -1, -1, -1, -1,
4160 -1, -1)))));
4161 }
4162 else
4163 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4164 } // }}}
4165 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32)
4166 { // AVX -> SSE {{{
4167 if constexpr (_FromBytes == _ToBytes)
4168 { // keep low 1/2
4169 return __intrin_bitcast<_To>(__lo128(__k));
4170 }
4171 else if constexpr (_FromBytes == _ToBytes * 2)
4172 { // keep all
4173 auto __y = __vector_bitcast<_LLong>(__k);
4174 return __intrin_bitcast<_To>(
4175 _mm_packs_epi16(__lo128(__y), __hi128(__y)));
4176 }
4177 else if constexpr (_FromBytes == _ToBytes * 4)
4178 { // add 1/2 undef
4179 auto __y = __vector_bitcast<_LLong>(__k);
4180 return __intrin_bitcast<_To>(
4181 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4182 __m128i()));
4183 }
4184 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4185 { // add 3/4 undef
4186 auto __y = __vector_bitcast<_LLong>(__k);
4187 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
4188 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4189 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
4190 -1, -1, -1, -1)));
4191 }
4192 else if constexpr (_FromBytes * 2 == _ToBytes)
4193 { // keep low 1/4
4194 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4195 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4196 }
4197 else if constexpr (_FromBytes * 4 == _ToBytes)
4198 { // keep low 1/8
4199 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4200 __y = _mm_unpacklo_epi8(__y, __y);
4201 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4202 }
4203 else if constexpr (_FromBytes * 8 == _ToBytes)
4204 { // keep low 1/16
4205 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4206 __y = _mm_unpacklo_epi8(__y, __y);
4207 __y = _mm_unpacklo_epi8(__y, __y);
4208 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4209 }
4210 else
4211 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4212 } // }}}
4213 else
4214 return _Base::template _S_to_maskvector<_Up, _ToN>(__x);
4215 /*
4216 if constexpr (_FromBytes > _ToBytes) {
4217 const _To __y = __vector_bitcast<_Up>(__k);
4218 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4219 constexpr int _Stride = _FromBytes / _ToBytes;
4220 return _To{__y[(_Is + 1) * _Stride - 1]...};
4221 }(make_index_sequence<std::min(_ToN, _FromN)>());
4222 } else {
4223 // {0, 0, 1, 1} (_Dups = 2, _Is<4>)
4224 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>)
4225 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>)
4226 // ...
4227 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4228 constexpr int __dup = _ToBytes / _FromBytes;
4229 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...});
4230 }(make_index_sequence<_FromN>());
4231 }
4232 */
4233 } // }}}
4234 }
4235
4236 // }}}
4237 // _S_to_bits {{{
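// Note: the inverse of _S_to_maskvector. A vector mask (all-ones per true
// element) is compressed to one bit per element, preferring movepi*_mask
// (AVX512BW/DQ), then cmplt against zero on AVX512F, and finally movemask
// on SSE/AVX, packing 2-byte elements first where no 16-bit movemask
// exists.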
4238 template <typename _Tp, size_t _Np>
4239 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
4240 _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
4241 {
4242 if constexpr (is_same_v<_Tp, bool>)
4243 return _BitMask<_Np>(__x._M_data)._M_sanitized();
4244 else
4245 {
4246 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4247 if (__builtin_is_constant_evaluated()
4248 || __builtin_constant_p(__x._M_data))
4249 {
4250 const auto __bools = -__x._M_data;
4251 const _ULLong __k = __call_with_n_evaluations<_Np>(
4252 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4253 return (__bits | ...);
4254 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4255 return _ULLong(__bools[+__i]) << __i;
4256 });
4257 if (__builtin_is_constant_evaluated()
4258 || __builtin_constant_p(__k))
4259 return __k;
4260 }
4261 const auto __xi = __to_intrin(__x);
4262 if constexpr (sizeof(_Tp) == 1)
4263 if constexpr (sizeof(__xi) == 16)
4264 if constexpr (__have_avx512bw_vl)
4265 return _BitMask<_Np>(_mm_movepi8_mask(__xi));
4266 else // implies SSE2
4267 return _BitMask<_Np>(_mm_movemask_epi8(__xi));
4268 else if constexpr (sizeof(__xi) == 32)
4269 if constexpr (__have_avx512bw_vl)
4270 return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
4271 else // implies AVX2
4272 return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
4273 else // implies AVX512BW
4274 return _BitMask<_Np>(_mm512_movepi8_mask(__xi));
4275
4276 else if constexpr (sizeof(_Tp) == 2)
4277 if constexpr (sizeof(__xi) == 16)
4278 if constexpr (__have_avx512bw_vl)
4279 return _BitMask<_Np>(_mm_movepi16_mask(__xi));
4280 else if constexpr (__have_avx512bw)
4281 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4282 else // implies SSE2
4283 return _BitMask<_Np>(
4284 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
4285 else if constexpr (sizeof(__xi) == 32)
4286 if constexpr (__have_avx512bw_vl)
4287 return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
4288 else if constexpr (__have_avx512bw)
4289 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4290 else // implies SSE2
4291 return _BitMask<_Np>(_mm_movemask_epi8(
4292 _mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
4293 else // implies AVX512BW
4294 return _BitMask<_Np>(_mm512_movepi16_mask(__xi));
4295
4296 else if constexpr (sizeof(_Tp) == 4)
4297 if constexpr (sizeof(__xi) == 16)
4298 if constexpr (__have_avx512dq_vl)
4299 return _BitMask<_Np>(_mm_movepi32_mask(__xi));
4300 else if constexpr (__have_avx512vl)
4301 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
4302 else if constexpr (__have_avx512dq)
4303 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4304 else if constexpr (__have_avx512f)
4305 return _BitMask<_Np>(
4306 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4307 else // implies SSE
4308 return _BitMask<_Np>(
4309 _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
4310 else if constexpr (sizeof(__xi) == 32)
4311 if constexpr (__have_avx512dq_vl)
4312 return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
4313 else if constexpr (__have_avx512dq)
4314 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4315 else if constexpr (__have_avx512vl)
4316 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
4317 else if constexpr (__have_avx512f)
4318 return _BitMask<_Np>(
4319 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4320 else // implies AVX
4321 return _BitMask<_Np>(
4322 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
4323 else // implies AVX512??
4324 if constexpr (__have_avx512dq)
4325 return _BitMask<_Np>(_mm512_movepi32_mask(__xi));
4326 else // implies AVX512F
4327 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i()));
4328
4329 else if constexpr (sizeof(_Tp) == 8)
4330 if constexpr (sizeof(__xi) == 16)
4331 if constexpr (__have_avx512dq_vl)
4332 return _BitMask<_Np>(_mm_movepi64_mask(__xi));
4333 else if constexpr (__have_avx512dq)
4334 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4335 else if constexpr (__have_avx512vl)
4336 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i()));
4337 else if constexpr (__have_avx512f)
4338 return _BitMask<_Np>(
4339 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4340 else // implies SSE2
4341 return _BitMask<_Np>(
4342 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
4343 else if constexpr (sizeof(__xi) == 32)
4344 if constexpr (__have_avx512dq_vl)
4345 return _BitMask<_Np>(_mm256_movepi64_mask(__xi));
4346 else if constexpr (__have_avx512dq)
4347 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4348 else if constexpr (__have_avx512vl)
4349 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i()));
4350 else if constexpr (__have_avx512f)
4351 return _BitMask<_Np>(
4352 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4353 else // implies AVX
4354 return _BitMask<_Np>(
4355 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
4356 else // implies AVX512??
4357 if constexpr (__have_avx512dq)
4358 return _BitMask<_Np>(_mm512_movepi64_mask(__xi));
4359 else // implies AVX512F
4360 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i()));
4361
4362 else
4363 __assert_unreachable<_Tp>();
4364 }
4365 }
4366 // }}}
4367};
4368
4369// }}}
4370// _MaskImplX86 {{{
4371template <typename _Abi, typename>
4372 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
4373 {
4374 using _MaskImplX86Mixin::_S_to_bits;
4375 using _MaskImplX86Mixin::_S_to_maskvector;
4376 using _MaskImplBuiltin<_Abi>::_S_convert;
4377
4378 // member types {{{
4379 template <typename _Tp>
4380 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
4381
4382 template <typename _Tp>
4383 using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
4384
4385 template <typename _Tp>
4386 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
4387
4388 using _Base = _MaskImplBuiltin<_Abi>;
4389
4390 // }}}
4391 // _S_broadcast {{{
4392 template <typename _Tp>
4393 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4394 _S_broadcast(bool __x)
4395 {
4396 if constexpr (__is_avx512_abi<_Abi>())
4397 return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1))
4398 : _MaskMember<_Tp>();
4399 else
4400 return _Base::template _S_broadcast<_Tp>(__x);
4401 }
4402
4403 // }}}
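// Editorial note (not part of the original source): _S_broadcast backs the
// simd_mask broadcast constructor. On AVX-512 ABIs the mask member is a
// bitmask, so `true` becomes the ABI's implicit mask (the low size() bits).
// A minimal user-level sketch, assuming the conventional stdx alias:
#if 0
#include <cassert>
#include <experimental/simd>
namespace stdx = std::experimental;

void
broadcast_demo()
{
  stdx::native_simd_mask<float> k_all(true);   // every element true
  stdx::native_simd_mask<float> k_none(false); // every element false
  assert(stdx::all_of(k_all) && stdx::none_of(k_none));
}
#endif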
4404 // _S_load {{{
4405 template <typename _Tp>
4406 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4407 _S_load(const bool* __mem)
4408 {
4409 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4410 if constexpr (__have_avx512bw)
4411 {
4412 const auto __to_vec_or_bits
4413 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) {
4414 if constexpr (__is_avx512_abi<_Abi>())
4415 return __bits;
4416 else
4417 return _S_to_maskvector<_Tp>(
4418 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
4419 };
4420
4421 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
4422 {
4423 __m128i __a = {};
4424 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4425 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
4426 }
4427 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl)
4428 {
4429 __m256i __a = {};
4430 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4431 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
4432 }
4433 else if constexpr (_S_size<_Tp> <= 64)
4434 {
4435 __m512i __a = {};
4436 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4437 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
4438 }
4439 }
4440 else if constexpr (__is_avx512_abi<_Abi>())
4441 {
4442 if constexpr (_S_size<_Tp> <= 8)
4443 {
4444 __m128i __a = {};
4445 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4446 const auto __b = _mm512_cvtepi8_epi64(__a);
4447 return _mm512_test_epi64_mask(__b, __b);
4448 }
4449 else if constexpr (_S_size<_Tp> <= 16)
4450 {
4451 __m128i __a = {};
4452 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4453 const auto __b = _mm512_cvtepi8_epi32(__a);
4454 return _mm512_test_epi32_mask(__b, __b);
4455 }
4456 else if constexpr (_S_size<_Tp> <= 32)
4457 {
4458 __m128i __a = {};
4459 __builtin_memcpy(&__a, __mem, 16);
4460 const auto __b = _mm512_cvtepi8_epi32(__a);
4461 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16);
4462 const auto __c = _mm512_cvtepi8_epi32(__a);
4463 return _mm512_test_epi32_mask(__b, __b)
4464 | (_mm512_test_epi32_mask(__c, __c) << 16);
4465 }
4466 else if constexpr (_S_size<_Tp> <= 64)
4467 {
4468 __m128i __a = {};
4469 __builtin_memcpy(&__a, __mem, 16);
4470 const auto __b = _mm512_cvtepi8_epi32(__a);
4471 __builtin_memcpy(&__a, __mem + 16, 16);
4472 const auto __c = _mm512_cvtepi8_epi32(__a);
4473 if constexpr (_S_size<_Tp> <= 48)
4474 {
4475 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32);
4476 const auto __d = _mm512_cvtepi8_epi32(__a);
4477 return _mm512_test_epi32_mask(__b, __b)
4478 | (_mm512_test_epi32_mask(__c, __c) << 16)
4479 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
4480 }
4481 else
4482 {
4483 __builtin_memcpy(&__a, __mem + 32, 16);
4484 const auto __d = _mm512_cvtepi8_epi32(__a);
4485 __builtin_memcpy(&__a, __mem + 48, _S_size<_Tp> - 48);
4486 const auto __e = _mm512_cvtepi8_epi32(__a);
4487 return _mm512_test_epi32_mask(__b, __b)
4488 | (_mm512_test_epi32_mask(__c, __c) << 16)
4489 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
4490 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
4491 }
4492 }
4493 else
4494 __assert_unreachable<_Tp>();
4495 }
4496 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2)
4497 return __vector_bitcast<_Tp>(
4498 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]),
4499 -int(__mem[1]), -int(__mem[1])});
4500 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx)
4501 {
4502 int __bool4 = 0;
4503 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>);
4504 const auto __k = __to_intrin(
4505 (__vector_broadcast<4>(__bool4)
4506 & __make_vector<int>(0x1, 0x100, 0x10000,
4507 _S_size<_Tp> == 4 ? 0x1000000 : 0))
4508 != 0);
4509 return __vector_bitcast<_Tp>(
4510 __concat(_mm_unpacklo_epi32(__k, __k),
4511 _mm_unpackhi_epi32(__k, __k)));
4512 }
4513 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4)
4514 {
4515 int __bools = 0;
4516 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>);
4517 if constexpr (__have_sse2)
4518 {
4519 __m128i __k = _mm_cvtsi32_si128(__bools);
4520 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4521 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4522 _mm_unpacklo_epi16(__k, __k));
4523 }
4524 else
4525 {
4526 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
4527 _mm_empty();
4528 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4529 _mm_cmpgt_ps(__k, __m128()));
4530 }
4531 }
4532 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8)
4533 {
4534 __m128i __k = {};
4535 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4536 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4537 return __vector_bitcast<_Tp>(
4538 __concat(_mm_unpacklo_epi16(__k, __k),
4539 _mm_unpackhi_epi16(__k, __k)));
4540 }
4541 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16)
4542 {
4543 __m128i __k = {};
4544 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4545 __k = _mm_cmpgt_epi8(__k, __m128i());
4546 if constexpr (_S_size<_Tp> <= 8)
4547 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4548 _mm_unpacklo_epi8(__k, __k));
4549 else
4550 return __concat(_mm_unpacklo_epi8(__k, __k),
4551 _mm_unpackhi_epi8(__k, __k));
4552 }
4553 else
4554 return _Base::template _S_load<_Tp>(__mem);
4555 }
4556
4557 // }}}
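// Editorial note (not part of the original source): _S_load turns a
// contiguous array of bool (one 0/1 byte per element) into the ABI's mask
// representation; the AVX-512BW paths memcpy the bytes into a zeroed vector
// and let *_test_epi8_mask set one mask bit per non-zero byte. At the user
// level this is what simd_mask::copy_from ends up calling. A hedged sketch:
#if 0
#include <experimental/simd>
namespace stdx = std::experimental;

stdx::native_simd_mask<int>
load_mask(const bool* flags) // flags: at least native_simd<int>::size() bools
{
  stdx::native_simd_mask<int> k;
  k.copy_from(flags, stdx::element_aligned);
  return k;
}
#endif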
4558 // _S_from_bitmask{{{
4559 template <size_t _Np, typename _Tp>
4560 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
4561 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
4562 {
4563 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4564 if constexpr (__is_avx512_abi<_Abi>())
4565 return __bits._M_to_bits();
4566 else
4567 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
4568 }
4569
4570 // }}}
4571 // _S_masked_load {{{2
4572 template <typename _Tp, size_t _Np>
4573 static inline _SimdWrapper<_Tp, _Np>
4574 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
4575 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
4576 {
4577 if constexpr (__is_avx512_abi<_Abi>())
4578 {
4579 if constexpr (__have_avx512bw_vl)
4580 {
4581 if constexpr (_Np <= 16)
4582 {
4583 const auto __a
4584 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem);
4585 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a);
4586 }
4587 else if constexpr (_Np <= 32)
4588 {
4589 const auto __a
4590 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem);
4591 return (__merge & ~__mask)
4592 | _mm256_test_epi8_mask(__a, __a);
4593 }
4594 else if constexpr (_Np <= 64)
4595 {
4596 const auto __a
4597 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem);
4598 return (__merge & ~__mask)
4599 | _mm512_test_epi8_mask(__a, __a);
4600 }
4601 else
4602 __assert_unreachable<_Tp>();
4603 }
4604 else
4605 {
4606 _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4607 __merge._M_set(__i, __mem[__i]);
4608 });
4609 return __merge;
4610 }
4611 }
4612 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1)
4613 {
4614 const auto __k = _S_to_bits(__mask)._M_to_bits();
4615 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(),
4616 _mm256_mask_loadu_epi8(__m256i(),
4617 __k, __mem));
4618 }
4619 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
4620 {
4621 const auto __k = _S_to_bits(__mask)._M_to_bits();
4622 __merge
4623 = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
4624 __m128i(),
4625 _mm_mask_loadu_epi8(__m128i(), __k, __mem));
4626 }
4627 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
4628 {
4629 const auto __k = _S_to_bits(__mask)._M_to_bits();
4630 __merge = _mm256_mask_sub_epi16(
4631 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4632 _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4633 }
4634 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
4635 {
4636 const auto __k = _S_to_bits(__mask)._M_to_bits();
4637 __merge = _mm_mask_sub_epi16(
4638 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4639 _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4640 }
4641 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
4642 {
4643 const auto __k = _S_to_bits(__mask)._M_to_bits();
4644 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
4645 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4646 _mm256_cvtepi8_epi32(
4647 _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4648 }
4649 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
4650 {
4651 const auto __k = _S_to_bits(__mask)._M_to_bits();
4652 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
4653 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4654 _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4655 }
4656 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
4657 {
4658 const auto __k = _S_to_bits(__mask)._M_to_bits();
4659 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
4660 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4661 _mm256_cvtepi8_epi64(
4662 _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4663 }
4664 else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
4665 {
4666 const auto __k = _S_to_bits(__mask)._M_to_bits();
4667 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
4668 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4669 _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4670 }
4671 else
4672 return _Base::_S_masked_load(__merge, __mask, __mem);
4673 return __merge;
4674 }
4675
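// Editorial note (not part of the original source): _S_masked_load overwrites
// only the selected elements of __merge with bools from memory. The
// AVX-512BW/VL branches do this with a masked byte load followed by a masked
// subtraction from zero, turning each loaded 0/1 byte into a 0/-1 lane while
// leaving unselected lanes untouched. A standalone sketch of that trick
// (function name is mine, not from this header):
#if 0
#include <immintrin.h>

// Requires AVX-512BW and AVX-512VL.
__m128i
masked_bool_load(__m128i merge, __mmask16 k, const bool* mem)
{
  const __m128i bytes = _mm_mask_loadu_epi8(_mm_setzero_si128(), k, mem);
  // 0 - 1 == -1: selected lanes become all-ones where the bool was true.
  return _mm_mask_sub_epi8(merge, k, _mm_setzero_si128(), bytes);
}
#endif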
4676 // _S_store {{{2
4677 template <typename _Tp, size_t _Np>
4678 _GLIBCXX_SIMD_INTRINSIC static void
4679 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
4680 {
4681 if constexpr (__is_avx512_abi<_Abi>())
4682 {
4683 if constexpr (__have_avx512bw_vl)
4684 _CommonImplX86::_S_store<_Np>(
4685 __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4686 if constexpr (_Np <= 16)
4687 return _mm_maskz_set1_epi8(__data, 1);
4688 else if constexpr (_Np <= 32)
4689 return _mm256_maskz_set1_epi8(__data, 1);
4690 else
4691 return _mm512_maskz_set1_epi8(__data, 1);
4692 }(__v._M_data)),
4693 __mem);
4694 else if constexpr (_Np <= 8)
4695 _CommonImplX86::_S_store<_Np>(
4696 __vector_bitcast<char>(
4697#if defined __x86_64__
4698 __make_wrapper<_ULLong>(
4699 _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
4700#else
4701 __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
4702 _pdep_u32(__v._M_data >> 4,
4703 0x01010101U))
4704#endif
4705 ),
4706 __mem);
4707 else if constexpr (_Np <= 16)
4708 _mm512_mask_cvtepi32_storeu_epi8(
4709 __mem, 0xffffu >> (16 - _Np),
4710 _mm512_maskz_set1_epi32(__v._M_data, 1));
4711 else
4712 __assert_unreachable<_Tp>();
4713 }
4714 else if constexpr (__is_sse_abi<_Abi>()) //{{{
4715 {
4716 if constexpr (_Np == 2 && sizeof(_Tp) == 8)
4717 {
4718 const auto __k = __vector_bitcast<int>(__v);
4719 __mem[0] = -__k[1];
4720 __mem[1] = -__k[3];
4721 }
4722 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
4723 {
4724 if constexpr (__have_sse2)
4725 {
4726 const unsigned __bool4
4727 = __vector_bitcast<_UInt>(_mm_packs_epi16(
4728 _mm_packs_epi32(__intrin_bitcast<__m128i>(
4729 __to_intrin(__v)),
4730 __m128i()),
4731 __m128i()))[0]
4732 & 0x01010101u;
4733 __builtin_memcpy(__mem, &__bool4, _Np);
4734 }
4735 else if constexpr (__have_mmx)
4736 {
4737 const __m64 __k = _mm_cvtps_pi8(
4738 __and(__to_intrin(__v), _mm_set1_ps(1.f)));
4739 __builtin_memcpy(__mem, &__k, _Np);
4740 _mm_empty();
4741 }
4742 else
4743 return _Base::_S_store(__v, __mem);
4744 }
4745 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
4746 {
4747 _CommonImplX86::_S_store<_Np>(
4748 __vector_bitcast<char>(_mm_packs_epi16(
4749 __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
4750 __m128i())),
4751 __mem);
4752 }
4753 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
4754 _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
4755 else
4756 __assert_unreachable<_Tp>();
4757 } // }}}
4758 else if constexpr (__is_avx_abi<_Abi>()) // {{{
4759 {
4760 if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
4761 {
4762 auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4763 int __bool4;
4764 if constexpr (__have_avx2)
4765 __bool4 = _mm256_movemask_epi8(__k);
4766 else
4767 __bool4 = (_mm_movemask_epi8(__lo128(__k))
4768 | (_mm_movemask_epi8(__hi128(__k)) << 16));
4769 __bool4 &= 0x01010101;
4770 __builtin_memcpy(__mem, &__bool4, _Np);
4771 }
4772 else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
4773 {
4774 const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4775 const auto __k2
4776 = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
4777 15);
4778 const auto __k3
4779 = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
4780 _CommonImplX86::_S_store<_Np>(__k3, __mem);
4781 }
4782 else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
4783 {
4784 if constexpr (__have_avx2)
4785 {
4786 const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
4787 const auto __bools = __vector_bitcast<char>(
4788 _mm_packs_epi16(__lo128(__x), __hi128(__x)));
4789 _CommonImplX86::_S_store<_Np>(__bools, __mem);
4790 }
4791 else
4792 {
4793 const auto __bools
4794 = 1
4795 & __vector_bitcast<_UChar>(
4796 _mm_packs_epi16(__lo128(__to_intrin(__v)),
4797 __hi128(__to_intrin(__v))));
4798 _CommonImplX86::_S_store<_Np>(__bools, __mem);
4799 }
4800 }
4801 else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
4802 _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
4803 else
4804 __assert_unreachable<_Tp>();
4805 } // }}}
4806 else
4807 __assert_unreachable<_Tp>();
4808 }
4809
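// Editorial note (not part of the original source): _S_store writes each mask
// element out as a 0/1 byte. The AVX-512BW paths expand the bitmask with
// maskz_set1_epi8(k, 1); the BMI2 fallback deposits one mask bit into the low
// bit of each byte, e.g. _pdep_u64(0b1011, 0x0101010101010101) == 0x01000101;
// the SSE/AVX paths pack the 0/-1 lanes and mask with 1. The user-level entry
// point is simd_mask::copy_to. A minimal sketch, assuming the stdx alias:
#if 0
#include <experimental/simd>
namespace stdx = std::experimental;

void
store_mask(const stdx::native_simd_mask<int>& k, bool* out)
{
  k.copy_to(out, stdx::element_aligned); // one 0/1 byte per element
}
#endif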
4810 // _S_masked_store {{{2
4811 template <typename _Tp, size_t _Np>
4812 static inline void
4813 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
4814 const _SimdWrapper<_Tp, _Np> __k) noexcept
4815 {
4816 if constexpr (__is_avx512_abi<_Abi>())
4817 {
4818 static_assert(is_same_v<_Tp, bool>);
4819 if constexpr (_Np <= 16 && __have_avx512bw_vl)
4820 _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
4821 else if constexpr (_Np <= 16)
4822 _mm512_mask_cvtepi32_storeu_epi8(__mem, __k,
4823 _mm512_maskz_set1_epi32(__v, 1));
4824 else if constexpr (_Np <= 32 && __have_avx512bw_vl)
4825 _mm256_mask_storeu_epi8(__mem, __k,
4826 _mm256_maskz_set1_epi8(__v, 1));
4827 else if constexpr (_Np <= 32 && __have_avx512bw)
4828 _mm256_mask_storeu_epi8(__mem, __k,
4829 __lo256(_mm512_maskz_set1_epi8(__v, 1)));
4830 else if constexpr (_Np <= 64 && __have_avx512bw)
4831 _mm512_mask_storeu_epi8(__mem, __k,
4832 _mm512_maskz_set1_epi8(__v, 1));
4833 else
4834 __assert_unreachable<_Tp>();
4835 }
4836 else
4837 _Base::_S_masked_store(__v, __mem, __k);
4838 }
4839
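// Editorial note (not part of the original source): _S_masked_store writes
// 0/1 bytes only for the elements selected by __k; with AVX-512BW it expands
// the value bitmask via maskz_set1_epi8 and uses a masked byte store. At the
// user level this is reached through where(); a hedged sketch, assuming the
// Parallelism TS where() overload for simd_mask:
#if 0
#include <experimental/simd>
namespace stdx = std::experimental;

void
masked_store(const stdx::native_simd_mask<int>& sel,
             const stdx::native_simd_mask<int>& vals, bool* out)
{
  // Writes vals[i] as a 0/1 byte to out[i] only where sel[i] is true.
  stdx::where(sel, vals).copy_to(out, stdx::element_aligned);
}
#endif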
4840 // logical and bitwise operators {{{2
4841 template <typename _Tp, size_t _Np>
4842 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4843 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4844 {
4845 if constexpr (is_same_v<_Tp, bool>)
4846 {
4847 if constexpr (__have_avx512dq && _Np <= 8)
4848 return _kand_mask8(__x._M_data, __y._M_data);
4849 else if constexpr (_Np <= 16)
4850 return _kand_mask16(__x._M_data, __y._M_data);
4851 else if constexpr (__have_avx512bw && _Np <= 32)
4852 return _kand_mask32(__x._M_data, __y._M_data);
4853 else if constexpr (__have_avx512bw && _Np <= 64)
4854 return _kand_mask64(__x._M_data, __y._M_data);
4855 else
4856 __assert_unreachable<_Tp>();
4857 }
4858 else
4859 return _Base::_S_logical_and(__x, __y);
4860 }
4861
4862 template <typename _Tp, size_t _Np>
4863 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4864 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4865 {
4866 if constexpr (is_same_v<_Tp, bool>)
4867 {
4868 if constexpr (__have_avx512dq && _Np <= 8)
4869 return _kor_mask8(__x._M_data, __y._M_data);
4870 else if constexpr (_Np <= 16)
4871 return _kor_mask16(__x._M_data, __y._M_data);
4872 else if constexpr (__have_avx512bw && _Np <= 32)
4873 return _kor_mask32(__x._M_data, __y._M_data);
4874 else if constexpr (__have_avx512bw && _Np <= 64)
4875 return _kor_mask64(__x._M_data, __y._M_data);
4876 else
4877 __assert_unreachable<_Tp>();
4878 }
4879 else
4880 return _Base::_S_logical_or(__x, __y);
4881 }
4882
4883 template <typename _Tp, size_t _Np>
4884 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4885 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
4886 {
4887 if constexpr (is_same_v<_Tp, bool>)
4888 {
4889 if constexpr (__have_avx512dq && _Np <= 8)
4890 return _kandn_mask8(__x._M_data,
4891 _Abi::template __implicit_mask_n<_Np>());
4892 else if constexpr (_Np <= 16)
4893 return _kandn_mask16(__x._M_data,
4894 _Abi::template __implicit_mask_n<_Np>());
4895 else if constexpr (__have_avx512bw && _Np <= 32)
4896 return _kandn_mask32(__x._M_data,
4897 _Abi::template __implicit_mask_n<_Np>());
4898 else if constexpr (__have_avx512bw && _Np <= 64)
4899 return _kandn_mask64(__x._M_data,
4900 _Abi::template __implicit_mask_n<_Np>());
4901 else
4902 __assert_unreachable<_Tp>();
4903 }
4904 else
4905 return _Base::_S_bit_not(__x);
4906 }
4907
4908 template <typename _Tp, size_t _Np>
4909 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4910 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4911 {
4912 if constexpr (is_same_v<_Tp, bool>)
4913 {
4914 if constexpr (__have_avx512dq && _Np <= 8)
4915 return _kand_mask8(__x._M_data, __y._M_data);
4916 else if constexpr (_Np <= 16)
4917 return _kand_mask16(__x._M_data, __y._M_data);
4918 else if constexpr (__have_avx512bw && _Np <= 32)
4919 return _kand_mask32(__x._M_data, __y._M_data);
4920 else if constexpr (__have_avx512bw && _Np <= 64)
4921 return _kand_mask64(__x._M_data, __y._M_data);
4922 else
4923 __assert_unreachable<_Tp>();
4924 }
4925 else
4926 return _Base::_S_bit_and(__x, __y);
4927 }
4928
4929 template <typename _Tp, size_t _Np>
4930 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4931 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4932 {
4933 if constexpr (is_same_v<_Tp, bool>)
4934 {
4935 if constexpr (__have_avx512dq && _Np <= 8)
4936 return _kor_mask8(__x._M_data, __y._M_data);
4937 else if constexpr (_Np <= 16)
4938 return _kor_mask16(__x._M_data, __y._M_data);
4939 else if constexpr (__have_avx512bw && _Np <= 32)
4940 return _kor_mask32(__x._M_data, __y._M_data);
4941 else if constexpr (__have_avx512bw && _Np <= 64)
4942 return _kor_mask64(__x._M_data, __y._M_data);
4943 else
4944 __assert_unreachable<_Tp>();
4945 }
4946 else
4947 return _Base::_S_bit_or(__x, __y);
4948 }
4949
4950 template <typename _Tp, size_t _Np>
4951 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4952 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4953 {
4954 if constexpr (is_same_v<_Tp, bool>)
4955 {
4956 if constexpr (__have_avx512dq && _Np <= 8)
4957 return _kxor_mask8(__x._M_data, __y._M_data);
4958 else if constexpr (_Np <= 16)
4959 return _kxor_mask16(__x._M_data, __y._M_data);
4960 else if constexpr (__have_avx512bw && _Np <= 32)
4961 return _kxor_mask32(__x._M_data, __y._M_data);
4962 else if constexpr (__have_avx512bw && _Np <= 64)
4963 return _kxor_mask64(__x._M_data, __y._M_data);
4964 else
4965 __assert_unreachable<_Tp>();
4966 }
4967 else
4968 return _Base::_S_bit_xor(__x, __y);
4969 }
4970
4971 //}}}2
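// Editorial note (not part of the original source): with an AVX-512 ABI the
// mask is a k-register, so the logical and bitwise mask operators above map
// to single k-instructions (_kand_mask*, _kor_mask*, _kxor_mask*), and
// _S_bit_not is an ANDNOT against the ABI's implicit mask so that padding
// bits of partial masks stay cleared. User-level sketch:
#if 0
#include <experimental/simd>
namespace stdx = std::experimental;

void
mask_ops(const stdx::native_simd<float>& a, const stdx::native_simd<float>& b)
{
  const auto lt   = a < b;           // simd_mask<float>
  const auto ge   = !lt;             // _S_bit_not
  const auto both = lt && (b > 0.f); // _S_logical_and
  (void) ge;
  (void) both;
}
#endif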
4972 // _S_masked_assign{{{
4973 template <size_t _Np>
4974 _GLIBCXX_SIMD_INTRINSIC static void
4975 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4976 _SimdWrapper<bool, _Np>& __lhs, _SimdWrapper<bool, _Np> __rhs)
4977 {
4978 __lhs._M_data
4979 = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
4980 }
4981
4982 template <size_t _Np>
4983 _GLIBCXX_SIMD_INTRINSIC static void
4984 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4985 _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
4986 {
4987 if (__rhs)
4988 __lhs._M_data = __k._M_data | __lhs._M_data;
4989 else
4990 __lhs._M_data = ~__k._M_data & __lhs._M_data;
4991 }
4992
4993 using _MaskImplBuiltin<_Abi>::_S_masked_assign;
4994
4995 //}}}
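// Editorial note (not part of the original source): for bitmask ABIs
// _S_masked_assign is the classic bit blend (~k & lhs) | (k & rhs), with the
// cheaper OR/ANDNOT form when the right-hand side is a uniform true/false.
// The same identity on plain unsigned integers:
#if 0
unsigned
blend_bits(unsigned k, unsigned lhs, unsigned rhs)
{ return (~k & lhs) | (k & rhs); }
// blend_bits(0b0110, 0b1010, 0b0101) == 0b1100
#endif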
4996 // _S_all_of {{{
4997 template <typename _Tp>
4998 _GLIBCXX_SIMD_INTRINSIC static bool
4999 _S_all_of(simd_mask<_Tp, _Abi> __k)
5000 {
5001 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5002 {
5003 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5004 using _TI = __intrinsic_type_t<_Tp, _Np>;
5005 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5006 if constexpr (__have_sse4_1)
5007 {
5008 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5009 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5010 return 0 != __testc(__a, __b);
5011 }
5012 else if constexpr (is_same_v<_Tp, float>)
5013 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
5014 == (1 << _Np) - 1;
5015 else if constexpr (is_same_v<_Tp, double>)
5016 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
5017 == (1 << _Np) - 1;
5018 else
5019 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
5020 == (1 << (_Np * sizeof(_Tp))) - 1;
5021 }
5022 else if constexpr (__is_avx512_abi<_Abi>())
5023 {
5024 constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
5025 const auto __kk = __k._M_data._M_data;
5026 if constexpr (sizeof(__kk) == 1)
5027 {
5028 if constexpr (__have_avx512dq)
5029 return _kortestc_mask8_u8(__kk, _Mask == 0xff
5030 ? __kk
5031 : __mmask8(~_Mask));
5032 else
5033 return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
5034 }
5035 else if constexpr (sizeof(__kk) == 2)
5036 return _kortestc_mask16_u8(__kk, _Mask == 0xffff
5037 ? __kk
5038 : __mmask16(~_Mask));
5039 else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
5040 return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
5041 ? __kk
5042 : __mmask32(~_Mask));
5043 else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
5044 return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
5045 ? __kk
5046 : __mmask64(~_Mask));
5047 else
5048 __assert_unreachable<_Tp>();
5049 }
5050 }
5051
5052 // }}}
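// Editorial note (not part of the original source): on SSE4.1/AVX, _S_all_of
// uses a ptest-style __testc(a, implicit_mask), which is non-zero exactly
// when every bit selected by the implicit mask is set in the mask vector;
// without SSE4.1 it compares a movemask against the all-lanes pattern, and
// the AVX-512 path uses kortest. A standalone SSE sketch of the movemask
// form (helper name is mine):
#if 0
#include <immintrin.h>

// True iff all four float lanes of a 0/-1 mask vector are set.
bool
all_of_ps(__m128 mask4)
{ return (_mm_movemask_ps(mask4) & 0xF) == 0xF; }
#endif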
5053 // _S_any_of {{{
5054 template <typename _Tp>
5055 _GLIBCXX_SIMD_INTRINSIC static bool
5056 _S_any_of(simd_mask<_Tp, _Abi> __k)
5057 {
5058 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5059 {
5060 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5061 using _TI = __intrinsic_type_t<_Tp, _Np>;
5062 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5063 if constexpr (__have_sse4_1)
5064 {
5065 if constexpr (_Abi::template _S_is_partial<_Tp>
5066 || sizeof(__k) < 16)
5067 {
5068 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5069 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5070 return 0 == __testz(__a, __b);
5071 }
5072 else
5073 return 0 == __testz(__a, __a);
5074 }
5075 else if constexpr (is_same_v<_Tp, float>)
5076 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
5077 else if constexpr (is_same_v<_Tp, double>)
5078 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
5079 else
5080 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
5081 != 0;
5082 }
5083 else if constexpr (__is_avx512_abi<_Abi>())
5084 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
5085 != 0;
5086 }
5087
5088 // }}}
5089 // _S_none_of {{{
5090 template <typename _Tp>
5091 _GLIBCXX_SIMD_INTRINSIC static bool
5092 _S_none_of(simd_mask<_Tp, _Abi> __k)
5093 {
5094 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5095 {
5096 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5097 using _TI = __intrinsic_type_t<_Tp, _Np>;
5098 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5099 if constexpr (__have_sse4_1)
5100 {
5101 if constexpr (_Abi::template _S_is_partial<_Tp>
5102 || sizeof(__k) < 16)
5103 {
5104 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5105 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5106 return 0 != __testz(__a, __b);
5107 }
5108 else
5109 return 0 != __testz(__a, __a);
5110 }
5111 else if constexpr (is_same_v<_Tp, float>)
5112 return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
5113 else if constexpr (is_same_v<_Tp, double>)
5114 return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
5115 else
5116 return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
5117 == 0;
5118 }
5119 else if constexpr (__is_avx512_abi<_Abi>())
5120 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
5121 == 0;
5122 }
5123
5124 // }}}
5125 // _S_some_of {{{
5126 template <typename _Tp>
5127 _GLIBCXX_SIMD_INTRINSIC static bool
5128 _S_some_of(simd_mask<_Tp, _Abi> __k)
5129 {
5130 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5131 {
5132 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5133 using _TI = __intrinsic_type_t<_Tp, _Np>;
5134 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5135 if constexpr (__have_sse4_1)
5136 {
5137 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5138 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5139 return 0 != __testnzc(__a, __b);
5140 }
5141 else if constexpr (is_same_v<_Tp, float>)
5142 {
5143 constexpr int __allbits = (1 << _Np) - 1;
5144 const auto __tmp = _mm_movemask_ps(__a) & __allbits;
5145 return __tmp > 0 && __tmp < __allbits;
5146 }
5147 else if constexpr (is_same_v<_Tp, double>)
5148 {
5149 constexpr int __allbits = (1 << _Np) - 1;
5150 const auto __tmp = _mm_movemask_pd(__a) & __allbits;
5151 return __tmp > 0 && __tmp < __allbits;
5152 }
5153 else
5154 {
5155 constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
5156 const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
5157 return __tmp > 0 && __tmp < __allbits;
5158 }
5159 }
5160 else if constexpr (__is_avx512_abi<_Abi>())
5161 return _S_any_of(__k) && !_S_all_of(__k);
5162 else
5163 __assert_unreachable<_Tp>();
5164 }
5165
5166 // }}}
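// Editorial note (not part of the original source): _S_any_of and _S_none_of
// test whether the mask intersected with the ABI's implicit mask is non-empty
// (testz on SSE4.1+), while _S_some_of additionally requires it to be a
// strict, non-empty subset (testnzc); the pre-SSE4.1 fallbacks do the same
// with movemask. User-level sketch covering all three reductions:
#if 0
#include <experimental/simd>
namespace stdx = std::experimental;

int
classify(const stdx::native_simd_mask<int>& k)
{
  if (stdx::none_of(k))
    return 0; // no element is true
  if (stdx::some_of(k))
    return 1; // some, but not all, elements are true
  return 2;   // equivalent to stdx::all_of(k)
}
#endif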
5167 // _S_popcount {{{
5168 template <typename _Tp>
5169 _GLIBCXX_SIMD_INTRINSIC static int
5170 _S_popcount(simd_mask<_Tp, _Abi> __k)
5171 {
5172 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5173 const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
5174 if constexpr (__is_avx512_abi<_Abi>())
5175 {
5176 if constexpr (_Np > 32)
5177 return __builtin_popcountll(__kk);
5178 else
5179 return __builtin_popcount(__kk);
5180 }
5181 else
5182 {
5183 if constexpr (__have_popcnt)
5184 {
5185 int __bits
5186 = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
5187 const int __count = __builtin_popcount(__bits);
5188 return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
5189 }
5190 else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
5191 {
5192 const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
5193 return __mask - (__mask >> 1);
5194 }
5195 else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
5196 {
5197 auto __x = -(__lo128(__kk) + __hi128(__kk));
5198 return __x[0] + __x[1];
5199 }
5200 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
5201 {
5202 if constexpr (__have_sse2)
5203 {
5204 __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
5205 __x = _mm_add_epi32(
5206 __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5207 __x = _mm_add_epi32(
5208 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
5209 return -_mm_cvtsi128_si32(__x);
5210 }
5211 else
5212 return __builtin_popcount(
5213 _mm_movemask_ps(__auto_bitcast(__kk)));
5214 }
5215 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
5216 {
5217 auto __x = __to_intrin(__kk);
5218 __x = _mm_add_epi16(__x,
5219 _mm_shuffle_epi32(__x,
5220 _MM_SHUFFLE(0, 1, 2, 3)));
5221 __x = _mm_add_epi16(
5222 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5223 __x = _mm_add_epi16(
5224 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
5225 return -short(_mm_extract_epi16(__x, 0));
5226 }
5227 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
5228 {
5229 auto __x = __to_intrin(__kk);
5230 __x = _mm_add_epi8(__x,
5231 _mm_shuffle_epi32(__x,
5232 _MM_SHUFFLE(0, 1, 2, 3)));
5233 __x = _mm_add_epi8(__x,
5234 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2,
5235 3)));
5236 __x = _mm_add_epi8(__x,
5237 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0,
5238 1)));
5239 auto __y = -__vector_bitcast<_UChar>(__x);
5240 if constexpr (__have_sse4_1)
5241 return __y[0] + __y[1];
5242 else
5243 {
5244 unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
5245 return (__z & 0xff) + (__z >> 8);
5246 }
5247 }
5248 else if constexpr (sizeof(__kk) == 32)
5249 {
5250 // The following works only as long as the implementations above
5251 // use a summation
5252 using _I = __int_for_sizeof_t<_Tp>;
5253 const auto __as_int = __vector_bitcast<_I>(__kk);
5254 return _MaskImplX86<simd_abi::__sse>::_S_popcount(
5255 simd_mask<_I, simd_abi::__sse>(__private_init,
5256 __lo128(__as_int)
5257 + __hi128(__as_int)));
5258 }
5259 else
5260 __assert_unreachable<_Tp>();
5261 }
5262 }
5263
5264 // }}}
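// Editorial note (not part of the original source): with POPCNT available,
// _S_popcount reduces the mask to a movemask integer and bit-counts it
// (dividing by sizeof(_Tp) when the byte-granular movemask_epi8 was used);
// otherwise it sums the 0/-1 lanes and negates. 32-byte AVX masks are first
// folded to 16 bytes by adding the two halves, which is why the comment above
// requires the 16-byte paths to use a summation. User-level sketch:
#if 0
#include <experimental/simd>
namespace stdx = std::experimental;

int
count_true(const stdx::native_simd_mask<float>& k)
{ return stdx::popcount(k); } // number of true elements
#endif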
5265 // _S_find_first_set {{{
5266 template <typename _Tp>
5267 _GLIBCXX_SIMD_INTRINSIC static int
5268 _S_find_first_set(simd_mask<_Tp, _Abi> __k)
5269 {
5270 if constexpr (__is_avx512_abi<_Abi>())
5271 return std::__countr_zero(__k._M_data._M_data);
5272 else
5273 return _Base::_S_find_first_set(__k);
5274 }
5275
5276 // }}}
5277 // _S_find_last_set {{{
5278 template <typename _Tp>
5279 _GLIBCXX_SIMD_INTRINSIC static int
5280 _S_find_last_set(simd_mask<_Tp, _Abi> __k)
5281 {
5282 if constexpr (__is_avx512_abi<_Abi>())
5283 return std::__bit_width(__k._M_data._M_data) - 1;
5284 else
5285 return _Base::_S_find_last_set(__k);
5286 }
5287
5288 // }}}
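// Editorial note (not part of the original source): for AVX-512 bitmask ABIs
// find_first_set/find_last_set reduce to countr_zero and bit_width - 1 of the
// k-register. Both require at least one true element; otherwise the behaviour
// is undefined per the Parallelism TS. User-level sketch:
#if 0
#include <experimental/simd>
namespace stdx = std::experimental;

// Precondition: stdx::any_of(k)
int
set_range(const stdx::native_simd_mask<int>& k)
{ return stdx::find_last_set(k) - stdx::find_first_set(k); }
#endif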
5289 };
5290
5291// }}}
5292
5293_GLIBCXX_SIMD_END_NAMESPACE
5294#endif // __cplusplus >= 201703L
5295#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
5296
5297// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80