1// Simd x86 specific implementations -*- C++ -*-
2
3// Copyright (C) 2020-2023 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
26#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
27
28#if __cplusplus >= 201703L
29
30#if !_GLIBCXX_SIMD_X86INTRIN
31#error \
32 "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
33#endif
34
35_GLIBCXX_SIMD_BEGIN_NAMESPACE
36
37// __to_masktype {{{
38// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
39// __vector_type_t.
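// Illustration only (not part of the implementation): with four floats,
//   __vector_type_t<float, 4> __v = {1.f, 2.f, 3.f, 4.f};
//   auto __m = __to_masktype(__v); // __vector_type_t<int, 4>, same bits
// i.e. only the element type is reinterpreted; the element count stays _Np.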
40template <typename _Tp, size_t _Np>
41 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
42 __to_masktype(_SimdWrapper<_Tp, _Np> __x)
43 {
44 return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(
45 __x._M_data);
46 }
47
48template <typename _TV,
49 typename _TVT
50 = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
51 typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
52 _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
53 __to_masktype(_TV __x)
54 { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
55
56// }}}
57// __interleave128_lo {{{
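// Interleaves the low halves of each 128-bit lane of __av and __bv (a
// per-lane unpacklo). Illustration only: for two 8-element, 32-byte vectors
// __a = {a0, ..., a7} and __b = {b0, ..., b7} the result is
// {a0, b0, a1, b1, a4, b4, a5, b5}.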
58template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
59 typename _Trait = _VectorTraits<_Tp>>
60 _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
61 __interleave128_lo(const _Ap& __av, const _Bp& __bv)
62 {
63 const _Tp __a(__av);
64 const _Tp __b(__bv);
65 if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
66 return _Tp{__a[0], __b[0]};
67 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
68 return _Tp{__a[0], __b[0], __a[1], __b[1]};
69 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
70 return _Tp{__a[0], __b[0], __a[1], __b[1],
71 __a[2], __b[2], __a[3], __b[3]};
72 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
73 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
74 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
75 __a[6], __b[6], __a[7], __b[7]};
76 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
77 return _Tp{__a[0], __b[0], __a[2], __b[2]};
78 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
79 return _Tp{__a[0], __b[0], __a[1], __b[1],
80 __a[4], __b[4], __a[5], __b[5]};
81 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
82 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
83 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
84 __a[10], __b[10], __a[11], __b[11]};
85 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
86 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
87 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
88 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
89 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
90 __a[22], __b[22], __a[23], __b[23]};
91 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
92 return _Tp{__a[0], __b[0], __a[2], __b[2],
93 __a[4], __b[4], __a[6], __b[6]};
94 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
95 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
96 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
97 __a[12], __b[12], __a[13], __b[13]};
98 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
99 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
100 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
101 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
102 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
103 __a[26], __b[26], __a[27], __b[27]};
104 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
105 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
106 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
107 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
108 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
109 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
110 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
111 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
112 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
113 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
114 __b[55]};
115 else
116 __assert_unreachable<_Tp>();
117 }
118
119// }}}
120// __is_zero{{{
121template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
122 _GLIBCXX_SIMD_INTRINSIC constexpr bool
123 __is_zero(_Tp __a)
124 {
125 if (!__builtin_is_constant_evaluated())
126 {
127 if constexpr (__have_avx)
128 {
129 if constexpr (_TVT::template _S_is<float, 8>)
130 return _mm256_testz_ps(__a, __a);
131 else if constexpr (_TVT::template _S_is<double, 4>)
132 return _mm256_testz_pd(__a, __a);
133 else if constexpr (sizeof(_Tp) == 32)
134 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
135 else if constexpr (_TVT::template _S_is<float>)
136 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
137 else if constexpr (_TVT::template _S_is<double, 2>)
138 return _mm_testz_pd(__a, __a);
139 else
140 return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
141 }
142 else if constexpr (__have_sse4_1)
143 return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
144 __intrin_bitcast<__m128i>(__a));
145 }
146 else if constexpr (sizeof(_Tp) <= 8)
147 return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
148 else
149 {
150 const auto __b = __vector_bitcast<_LLong>(__a);
151 if constexpr (sizeof(__b) == 16)
152 return (__b[0] | __b[1]) == 0;
153 else if constexpr (sizeof(__b) == 32)
154 return __is_zero(__lo128(__b) | __hi128(__b));
155 else if constexpr (sizeof(__b) == 64)
156 return __is_zero(__lo256(__b) | __hi256(__b));
157 else
158 __assert_unreachable<_Tp>();
159 }
160 }
161
162// }}}
163// __movemask{{{
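// Collects the most significant bit of each element (of each byte for
// integer vectors) into the low bits of an int via the movmsk instruction
// family. Illustration only: __movemask on {-1.f, 0.f, -1.f, 0.f} yields
// 0b0101.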
164template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
165 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
166 __movemask(_Tp __a)
167 {
168 if constexpr (sizeof(_Tp) == 32)
169 {
170 if constexpr (_TVT::template _S_is<float>)
171 return _mm256_movemask_ps(__to_intrin(__a));
172 else if constexpr (_TVT::template _S_is<double>)
173 return _mm256_movemask_pd(__to_intrin(__a));
174 else
175 return _mm256_movemask_epi8(__to_intrin(__a));
176 }
177 else if constexpr (_TVT::template _S_is<float>)
178 return _mm_movemask_ps(__to_intrin(__a));
179 else if constexpr (_TVT::template _S_is<double>)
180 return _mm_movemask_pd(__to_intrin(__a));
181 else
182 return _mm_movemask_epi8(__to_intrin(__a));
183 }
184
185// }}}
186// __testz{{{
187template <typename _TI, typename _TVT = _VectorTraits<_TI>>
188 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
189 __testz(_TI __a, _TI __b)
190 {
191 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
192 _TVT::_S_full_size>>);
193 if (!__builtin_is_constant_evaluated())
194 {
195 if constexpr (sizeof(_TI) == 32)
196 {
197 if constexpr (_TVT::template _S_is<float>)
198 return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
199 else if constexpr (_TVT::template _S_is<double>)
200 return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
201 else
202 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
203 }
204 else if constexpr (_TVT::template _S_is<float> && __have_avx)
205 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
206 else if constexpr (_TVT::template _S_is<double> && __have_avx)
207 return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
208 else if constexpr (__have_sse4_1)
209 return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
210 __intrin_bitcast<__m128i>(__to_intrin(__b)));
211 else
212 return __movemask(0 == __and(__a, __b)) != 0;
213 }
214 else
215 return __is_zero(__and(__a, __b));
216 }
217
218// }}}
219// __testc{{{
220// requires SSE4.1 or above
221template <typename _TI, typename _TVT = _VectorTraits<_TI>>
222 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
223 __testc(_TI __a, _TI __b)
224 {
225 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
226 _TVT::_S_full_size>>);
227 if (__builtin_is_constant_evaluated())
228 return __is_zero(__andnot(__a, __b));
229
230 if constexpr (sizeof(_TI) == 32)
231 {
232 if constexpr (_TVT::template _S_is<float>)
233 return _mm256_testc_ps(__a, __b);
234 else if constexpr (_TVT::template _S_is<double>)
235 return _mm256_testc_pd(__a, __b);
236 else
237 return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
238 }
239 else if constexpr (_TVT::template _S_is<float> && __have_avx)
240 return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
241 else if constexpr (_TVT::template _S_is<double> && __have_avx)
242 return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
243 else
244 {
245 static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
246 return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
247 __intrin_bitcast<__m128i>(__to_intrin(__b)));
248 }
249 }
250
251// }}}
252// __testnzc{{{
253template <typename _TI, typename _TVT = _VectorTraits<_TI>>
254 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
255 __testnzc(_TI __a, _TI __b)
256 {
257 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
258 _TVT::_S_full_size>>);
259 if (!__builtin_is_constant_evaluated())
260 {
261 if constexpr (sizeof(_TI) == 32)
262 {
263 if constexpr (_TVT::template _S_is<float>)
264 return _mm256_testnzc_ps(__a, __b);
265 else if constexpr (_TVT::template _S_is<double>)
266 return _mm256_testnzc_pd(__a, __b);
267 else
268 return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
269 }
270 else if constexpr (_TVT::template _S_is<float> && __have_avx)
271 return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
272 else if constexpr (_TVT::template _S_is<double> && __have_avx)
273 return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
274 else if constexpr (__have_sse4_1)
275 return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
276 __intrin_bitcast<__m128i>(__to_intrin(__b)));
277 else
278 return __movemask(0 == __and(__a, __b)) == 0
279 && __movemask(0 == __andnot(__a, __b)) == 0;
280 }
281 else
282 return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
283 }
284
285// }}}
286// __xzyw{{{
// Shuffles the complete vector, swapping the inner two quarters. Often useful
// on AVX for fixing up a shuffle result.
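// Illustration only: viewing a 32-byte vector as four 64-bit quarters
// {q0, q1, q2, q3}, the result is {q0, q2, q1, q3}; e.g. eight ints
// {0, 1, 2, 3, 4, 5, 6, 7} become {0, 1, 4, 5, 2, 3, 6, 7}.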
289template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
290 _GLIBCXX_SIMD_INTRINSIC _Tp
291 __xzyw(_Tp __a)
292 {
293 if constexpr (sizeof(_Tp) == 16)
294 {
295 const auto __x = __vector_bitcast<conditional_t<
296 is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
297 return reinterpret_cast<_Tp>(
298 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
299 }
300 else if constexpr (sizeof(_Tp) == 32)
301 {
302 const auto __x = __vector_bitcast<conditional_t<
303 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
304 return reinterpret_cast<_Tp>(
305 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
306 }
307 else if constexpr (sizeof(_Tp) == 64)
308 {
309 const auto __x = __vector_bitcast<conditional_t<
310 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
311 return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
312 __x[5], __x[2], __x[3],
313 __x[6], __x[7]});
314 }
315 else
316 __assert_unreachable<_Tp>();
317 }
318
319// }}}
320// __maskload_epi32{{{
321template <typename _Tp>
322 _GLIBCXX_SIMD_INTRINSIC auto
323 __maskload_epi32(const int* __ptr, _Tp __k)
324 {
325 if constexpr (sizeof(__k) == 16)
326 return _mm_maskload_epi32(__ptr, __k);
327 else
328 return _mm256_maskload_epi32(__ptr, __k);
329 }
330
331// }}}
332// __maskload_epi64{{{
333template <typename _Tp>
334 _GLIBCXX_SIMD_INTRINSIC auto
335 __maskload_epi64(const _LLong* __ptr, _Tp __k)
336 {
337 if constexpr (sizeof(__k) == 16)
338 return _mm_maskload_epi64(__ptr, __k);
339 else
340 return _mm256_maskload_epi64(__ptr, __k);
341 }
342
343// }}}
344// __maskload_ps{{{
345template <typename _Tp>
346 _GLIBCXX_SIMD_INTRINSIC auto
347 __maskload_ps(const float* __ptr, _Tp __k)
348 {
349 if constexpr (sizeof(__k) == 16)
350 return _mm_maskload_ps(__ptr, __k);
351 else
352 return _mm256_maskload_ps(__ptr, __k);
353 }
354
355// }}}
356// __maskload_pd{{{
357template <typename _Tp>
358 _GLIBCXX_SIMD_INTRINSIC auto
359 __maskload_pd(const double* __ptr, _Tp __k)
360 {
361 if constexpr (sizeof(__k) == 16)
362 return _mm_maskload_pd(__ptr, __k);
363 else
364 return _mm256_maskload_pd(__ptr, __k);
365 }
366
367// }}}
368
369#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
370#include "simd_x86_conversions.h"
371#endif
372
373// ISA & type detection {{{
374template <typename _Tp, size_t _Np>
375 constexpr bool
376 __is_sse_ps()
377 {
    return __have_sse && is_same_v<_Tp, float>
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
381 }
382
383template <typename _Tp, size_t _Np>
384 constexpr bool
385 __is_sse_pd()
386 {
    return __have_sse2 && is_same_v<_Tp, double>
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
390 }
391
392template <typename _Tp, size_t _Np>
393 constexpr bool
394 __is_avx_ps()
395 {
    return __have_avx && is_same_v<_Tp, float>
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
399 }
400
401template <typename _Tp, size_t _Np>
402 constexpr bool
403 __is_avx_pd()
404 {
    return __have_avx && is_same_v<_Tp, double>
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
408 }
409
410template <typename _Tp, size_t _Np>
411 constexpr bool
412 __is_avx512_ps()
413 {
    return __have_avx512f && is_same_v<_Tp, float>
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
417 }
418
419template <typename _Tp, size_t _Np>
420 constexpr bool
421 __is_avx512_pd()
422 {
    return __have_avx512f && is_same_v<_Tp, double>
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
426 }
427
428// }}}
429struct _MaskImplX86Mixin;
430
431// _CommonImplX86 {{{
432struct _CommonImplX86 : _CommonImplBuiltin
433{
434#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
435 // _S_converts_via_decomposition {{{
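  // Heuristic for conversions that, without the listed ISA extensions, cannot
  // be done with vector instructions and therefore decompose into scalar
  // conversions. Illustration only: converting 64-bit integers to 16-bit
  // integers into a 16-byte destination requires SSSE3; without it the
  // conversion is done element by element.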
436 template <typename _From, typename _To, size_t _ToSize>
437 static constexpr bool _S_converts_via_decomposition()
438 {
      if constexpr (is_integral_v<_From> && is_integral_v<_To>
                    && sizeof(_From) == 8 && _ToSize == 16)
442 return (sizeof(_To) == 2 && !__have_ssse3)
443 || (sizeof(_To) == 1 && !__have_avx512f);
444 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
445 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
446 && !__have_avx512dq)
447 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
448 && _ToSize == 16);
449 else if constexpr (
450 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8
451 && !__have_avx512dq)
452 return (sizeof(_To) == 4 && _ToSize == 16)
453 || (sizeof(_To) == 8 && _ToSize < 64);
454 else
455 return false;
456 }
457
458 template <typename _From, typename _To, size_t _ToSize>
459 static inline constexpr bool __converts_via_decomposition_v
460 = _S_converts_via_decomposition<_From, _To, _ToSize>();
461
462 // }}}
463#endif
464 // _S_store {{{
465 using _CommonImplBuiltin::_S_store;
466
467 template <typename _Tp, size_t _Np>
468 _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __x,
469 void* __addr)
470 {
471 constexpr size_t _Bytes = _Np * sizeof(_Tp);
472
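      // Stores whose size is not a power of two can be done as a single
      // masked store when AVX-512BW+VL is available; the mask has one bit per
      // stored chunk. Illustration only: _Bytes == 6 takes the epi16 branch
      // below with mask 0xffu >> (8 - 3) == 0b111, i.e. three 16-bit chunks.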
473 if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
474 {
475 const auto __v = __to_intrin(__x);
476
477 if constexpr (_Bytes & 1)
478 {
479 if constexpr (_Bytes < 16)
480 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
481 __intrin_bitcast<__m128i>(__v));
482 else if constexpr (_Bytes < 32)
483 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
484 __intrin_bitcast<__m256i>(__v));
485 else
486 _mm512_mask_storeu_epi8(__addr,
487 0xffffffffffffffffull >> (64 - _Bytes),
488 __intrin_bitcast<__m512i>(__v));
489 }
490 else if constexpr (_Bytes & 2)
491 {
492 if constexpr (_Bytes < 16)
493 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
494 __intrin_bitcast<__m128i>(__v));
495 else if constexpr (_Bytes < 32)
496 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
497 __intrin_bitcast<__m256i>(__v));
498 else
499 _mm512_mask_storeu_epi16(__addr,
500 0xffffffffull >> (32 - _Bytes / 2),
501 __intrin_bitcast<__m512i>(__v));
502 }
503 else if constexpr (_Bytes & 4)
504 {
505 if constexpr (_Bytes < 16)
506 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
507 __intrin_bitcast<__m128i>(__v));
508 else if constexpr (_Bytes < 32)
509 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
510 __intrin_bitcast<__m256i>(__v));
511 else
512 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
513 __intrin_bitcast<__m512i>(__v));
514 }
515 else
516 {
517 static_assert(
518 _Bytes > 16,
519 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
520 "- 1)) != 0 is impossible");
521 if constexpr (_Bytes < 32)
522 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
523 __intrin_bitcast<__m256i>(__v));
524 else
525 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
526 __intrin_bitcast<__m512i>(__v));
527 }
528 }
529 else
530 _CommonImplBuiltin::_S_store(__x, __addr);
531 }
532
533 // }}}
534 // _S_store_bool_array(_BitMask) {{{
535 template <size_t _Np, bool _Sanitized>
536 _GLIBCXX_SIMD_INTRINSIC static constexpr void
537 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
538 {
539 if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
540 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
541 [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
542 if constexpr (_Np <= 16)
543 return _mm_movm_epi8(__x._M_to_bits());
544 else if constexpr (_Np <= 32)
545 return _mm256_movm_epi8(__x._M_to_bits());
546 else if constexpr (_Np <= 64)
547 return _mm512_movm_epi8(__x._M_to_bits());
548 else
549 __assert_unreachable<_SizeConstant<_Np>>();
550 }()),
551 __mem);
552 else if constexpr (__have_bmi2)
553 {
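          // _pdep deposits each mask bit into the low bit of one byte, which
          // is exactly the storage format of a bool array. Illustration only:
          // _pdep_u32(0b1010, 0x01010101U) == 0x01000100, i.e. the bytes
          // {0, 1, 0, 1} == {false, true, false, true} in memory.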
554 if constexpr (_Np <= 4)
555 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
556 else
557 __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
558 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
559 constexpr size_t __offset = __i * sizeof(size_t);
560 constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
561 if constexpr (__todo == 1)
562 __mem[__offset] = __x[__offset];
563 else
564 {
565 const auto __bools =
566#ifdef __x86_64__
567 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
568 0x0101010101010101ULL);
569#else // __x86_64__
570 _pdep_u32(
571 __x.template _M_extract<__offset>()._M_to_bits(),
572 0x01010101U);
573#endif // __x86_64__
574 _S_store<__todo>(__bools, __mem + __offset);
575 }
576 });
577 }
578 else if constexpr (__have_sse2 && _Np > 7)
579 __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
580 constexpr int __offset = __i * 16;
581 constexpr int __todo = std::min(16, int(_Np) - __offset);
582 const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
583 __vector_type16_t<_UChar> __bools;
584 if constexpr (__have_avx512f)
585 {
586 auto __as32bits
587 = _mm512_maskz_mov_epi32(__bits, __to_intrin(
588 __vector_broadcast<16>(1)));
589 auto __as16bits
590 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
591 __todo > 8 ? __hi256(__as32bits)
592 : __m256i()));
593 __bools = __vector_bitcast<_UChar>(
594 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
595 }
596 else
597 {
598 using _V = __vector_type_t<_UChar, 16>;
599 auto __tmp = _mm_cvtsi32_si128(__bits);
600 __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
601 __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
602 __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
603 _V __tmp2 = reinterpret_cast<_V>(__tmp);
604 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
605 1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
606 __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
607 }
608 _S_store<__todo>(__bools, __mem + __offset);
609 });
610 else
611 _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
612 }
613
614 // }}}
615 // _S_blend_avx512 {{{
616 // Returns: __k ? __b : __a
617 // TODO: reverse __a and __b to match COND_EXPR
618 // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask
619 // __k
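  // Illustration only: with _TV = __vector_type_t<int, 4> and __k = 0b0101
  // the result is {__b[0], __a[1], __b[2], __a[3]} (set mask bits select __b).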
620 template <typename _Kp, typename _TV>
621 _GLIBCXX_SIMD_INTRINSIC static _TV
622 _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
623 {
624#ifdef __clang__
625 // FIXME: this does a boolean choice, not a blend
626 return __k ? __a : __b;
627#else
628 static_assert(__is_vector_type_v<_TV>);
629 using _Tp = typename _VectorTraits<_TV>::value_type;
630 static_assert(sizeof(_TV) >= 16);
631 static_assert(sizeof(_Tp) <= 8);
632 using _IntT
633 = conditional_t<(sizeof(_Tp) > 2),
634 conditional_t<sizeof(_Tp) == 4, int, long long>,
635 conditional_t<sizeof(_Tp) == 1, char, short>>;
636 [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
637 [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
638 if constexpr (sizeof(_TV) == 64)
639 {
640 if constexpr (sizeof(_Tp) == 1)
641 return reinterpret_cast<_TV>(
642 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
643 else if constexpr (sizeof(_Tp) == 2)
644 return reinterpret_cast<_TV>(
645 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
646 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
647 return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
648 else if constexpr (sizeof(_Tp) == 4)
649 return reinterpret_cast<_TV>(
650 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
651 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
652 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
653 else if constexpr (sizeof(_Tp) == 8)
654 return reinterpret_cast<_TV>(
655 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
656 }
657 else if constexpr (sizeof(_TV) == 32)
658 {
659 if constexpr (sizeof(_Tp) == 1)
660 return reinterpret_cast<_TV>(
661 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
662 else if constexpr (sizeof(_Tp) == 2)
663 return reinterpret_cast<_TV>(
664 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
665 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
666 return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
667 else if constexpr (sizeof(_Tp) == 4)
668 return reinterpret_cast<_TV>(
669 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
670 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
671 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
672 else if constexpr (sizeof(_Tp) == 8)
673 return reinterpret_cast<_TV>(
674 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
675 }
676 else if constexpr (sizeof(_TV) == 16)
677 {
678 if constexpr (sizeof(_Tp) == 1)
679 return reinterpret_cast<_TV>(
680 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
681 else if constexpr (sizeof(_Tp) == 2)
682 return reinterpret_cast<_TV>(
683 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
684 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
685 return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
686 else if constexpr (sizeof(_Tp) == 4)
687 return reinterpret_cast<_TV>(
688 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
689 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
690 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
691 else if constexpr (sizeof(_Tp) == 8)
692 return reinterpret_cast<_TV>(
693 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
694 }
695#endif
696 }
697
698 // }}}
699 // _S_blend_intrin {{{
700 // Returns: __k ? __b : __a
701 // TODO: reverse __a and __b to match COND_EXPR
702 // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
703 // Bytes wide
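  // Illustration only: the SSE4.1/AVX blendv instructions select on the most
  // significant bit of each mask element (of each byte for pblendvb), so an
  // all-ones element in __k picks __b and an all-zeros element picks __a.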
704 template <typename _Tp>
705 _GLIBCXX_SIMD_INTRINSIC static _Tp _S_blend_intrin(_Tp __k, _Tp __a,
706 _Tp __b) noexcept
707 {
708 static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
709 constexpr struct
710 {
711 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
712 __m128 __k) const noexcept
713 {
714 return __builtin_ia32_blendvps(__a, __b, __k);
715 }
716 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
717 __m128d __k) const noexcept
718 {
719 return __builtin_ia32_blendvpd(__a, __b, __k);
720 }
721 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
722 __m128i __k) const noexcept
723 {
724 return reinterpret_cast<__m128i>(
725 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
726 reinterpret_cast<__v16qi>(__b),
727 reinterpret_cast<__v16qi>(__k)));
728 }
729 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
730 __m256 __k) const noexcept
731 {
732 return __builtin_ia32_blendvps256(__a, __b, __k);
733 }
734 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
735 __m256d __k) const noexcept
736 {
737 return __builtin_ia32_blendvpd256(__a, __b, __k);
738 }
739 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
740 __m256i __k) const noexcept
741 {
742 if constexpr (__have_avx2)
743 return reinterpret_cast<__m256i>(
744 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
745 reinterpret_cast<__v32qi>(__b),
746 reinterpret_cast<__v32qi>(__k)));
747 else
748 return reinterpret_cast<__m256i>(
749 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
750 reinterpret_cast<__v8sf>(__b),
751 reinterpret_cast<__v8sf>(__k)));
752 }
753 } __eval;
754 return __eval(__a, __b, __k);
755 }
756
757 // }}}
758 // _S_blend {{{
759 // Returns: __k ? __at1 : __at0
760 // TODO: reverse __at0 and __at1 to match COND_EXPR
761 template <typename _Tp, size_t _Np>
762 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
763 _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
764 _SimdWrapper<_Tp, _Np> __at1)
765 {
766 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
767 if (__k._M_is_constprop() && __at0._M_is_constprop()
768 && __at1._M_is_constprop())
769 return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
770 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
771 return __k[__i] ? __at1[__i] : __at0[__i];
772 });
773 else if constexpr (sizeof(__at0) == 64
774 || (__have_avx512vl && sizeof(__at0) >= 16))
775 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
776 else
777 {
778 static_assert((__have_avx512vl && sizeof(__at0) < 16)
779 || !__have_avx512vl);
780 constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
781 return __vector_bitcast<_Tp, _Np>(
782 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
783 __vector_bitcast<_Tp, __size>(__at1)));
784 }
785 }
786
787 template <typename _Tp, size_t _Np>
788 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
789 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
790 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
791 {
792 const auto __kk = __wrapper_bitcast<_Tp>(__k);
793 if (__builtin_is_constant_evaluated()
794 || (__kk._M_is_constprop() && __at0._M_is_constprop()
795 && __at1._M_is_constprop()))
796 {
797 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
798 if (__r._M_is_constprop())
799 return __r;
800 }
801 if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
802 && (sizeof(_Tp) >= 4 || __have_avx512bw))
803 // convert to bitmask and call overload above
804 return _S_blend(
805 _SimdWrapper<bool, _Np>(
806 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
807 ._M_to_bits()),
808 __at0, __at1);
809 else
810 {
811 // Since GCC does not assume __k to be a mask, using the builtin
812 // conditional operator introduces an extra compare against 0 before
813 // blending. So we rather call the intrinsic here.
814 if constexpr (__have_sse4_1)
815 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
816 __to_intrin(__at1));
817 else
818 return __or(__andnot(__kk, __at0), __and(__kk, __at1));
819 }
820 }
821
822 // }}}
823};
824
825// }}}
826// _SimdImplX86 {{{
827template <typename _Abi, typename>
828 struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
829 {
830 using _Base = _SimdImplBuiltin<_Abi>;
831
832 template <typename _Tp>
833 using _MaskMember = typename _Base::template _MaskMember<_Tp>;
834
835 template <typename _Tp>
836 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
837
838 template <typename _Tp>
839 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
840
841 template <typename _Tp>
842 static constexpr size_t _S_max_store_size
843 = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
          : (is_floating_point_v<_Tp> && __have_avx) || __have_avx2 ? 32
845 : 16;
846 using _MaskImpl = typename _Abi::_MaskImpl;
847
848 // _S_masked_load {{{
849 template <typename _Tp, size_t _Np, typename _Up>
850 static inline _SimdWrapper<_Tp, _Np>
851 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
852 const _Up* __mem) noexcept
853 {
854 static_assert(_Np == _S_size<_Tp>);
        // Either no conversion is needed, or _Tp and _Up have the same size
        // and are both integral or both floating-point, so the conversion is
        // a plain bit reinterpretation.
        if constexpr (is_same_v<_Tp, _Up>
                      || (sizeof(_Tp) == sizeof(_Up)
                          && is_integral_v<_Tp> == is_integral_v<_Up>))
861 {
862 [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
863 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
864 && sizeof(_Tp) == 1)
865 {
866 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
867 if constexpr (sizeof(__intrin) == 16)
868 __merge = __vector_bitcast<_Tp, _Np>(
869 _mm_mask_loadu_epi8(__intrin, __kk, __mem));
870 else if constexpr (sizeof(__merge) == 32)
871 __merge = __vector_bitcast<_Tp, _Np>(
872 _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
873 else if constexpr (sizeof(__merge) == 64)
874 __merge = __vector_bitcast<_Tp, _Np>(
875 _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
876 else
877 __assert_unreachable<_Tp>();
878 }
879 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
880 && sizeof(_Tp) == 2)
881 {
882 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
883 if constexpr (sizeof(__intrin) == 16)
884 __merge = __vector_bitcast<_Tp, _Np>(
885 _mm_mask_loadu_epi16(__intrin, __kk, __mem));
886 else if constexpr (sizeof(__intrin) == 32)
887 __merge = __vector_bitcast<_Tp, _Np>(
888 _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
889 else if constexpr (sizeof(__intrin) == 64)
890 __merge = __vector_bitcast<_Tp, _Np>(
891 _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
892 else
893 __assert_unreachable<_Tp>();
894 }
895 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
896 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
897 {
898 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
899 if constexpr (sizeof(__intrin) == 16)
900 __merge = __vector_bitcast<_Tp, _Np>(
901 _mm_mask_loadu_epi32(__intrin, __kk, __mem));
902 else if constexpr (sizeof(__intrin) == 32)
903 __merge = __vector_bitcast<_Tp, _Np>(
904 _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
905 else if constexpr (sizeof(__intrin) == 64)
906 __merge = __vector_bitcast<_Tp, _Np>(
907 _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
908 else
909 __assert_unreachable<_Tp>();
910 }
911 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
912 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
913 {
914 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
915 if constexpr (sizeof(__intrin) == 16)
916 __merge = __vector_bitcast<_Tp, _Np>(
917 _mm_mask_loadu_ps(__intrin, __kk, __mem));
918 else if constexpr (sizeof(__intrin) == 32)
919 __merge = __vector_bitcast<_Tp, _Np>(
920 _mm256_mask_loadu_ps(__intrin, __kk, __mem));
921 else if constexpr (sizeof(__intrin) == 64)
922 __merge = __vector_bitcast<_Tp, _Np>(
923 _mm512_mask_loadu_ps(__intrin, __kk, __mem));
924 else
925 __assert_unreachable<_Tp>();
926 }
927 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
928 && is_integral_v<_Up>)
929 {
930 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
931 __merge
932 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
933 __vector_bitcast<_Tp, _Np>(
934 __maskload_epi32(reinterpret_cast<const int*>(__mem),
935 __to_intrin(__k))));
936 }
937 else if constexpr (__have_avx && sizeof(_Tp) == 4)
938 {
939 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
940 __merge
941 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
942 __vector_bitcast<_Tp, _Np>(
943 __maskload_ps(reinterpret_cast<const float*>(__mem),
944 __to_intrin(__k))));
945 }
946 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
947 && sizeof(_Tp) == 8 && is_integral_v<_Up>)
948 {
949 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
950 if constexpr (sizeof(__intrin) == 16)
951 __merge = __vector_bitcast<_Tp, _Np>(
952 _mm_mask_loadu_epi64(__intrin, __kk, __mem));
953 else if constexpr (sizeof(__intrin) == 32)
954 __merge = __vector_bitcast<_Tp, _Np>(
955 _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
956 else if constexpr (sizeof(__intrin) == 64)
957 __merge = __vector_bitcast<_Tp, _Np>(
958 _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
959 else
960 __assert_unreachable<_Tp>();
961 }
962 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
963 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
964 {
965 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
966 if constexpr (sizeof(__intrin) == 16)
967 __merge = __vector_bitcast<_Tp, _Np>(
968 _mm_mask_loadu_pd(__intrin, __kk, __mem));
969 else if constexpr (sizeof(__intrin) == 32)
970 __merge = __vector_bitcast<_Tp, _Np>(
971 _mm256_mask_loadu_pd(__intrin, __kk, __mem));
972 else if constexpr (sizeof(__intrin) == 64)
973 __merge = __vector_bitcast<_Tp, _Np>(
974 _mm512_mask_loadu_pd(__intrin, __kk, __mem));
975 else
976 __assert_unreachable<_Tp>();
977 }
978 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
979 && is_integral_v<_Up>)
980 {
981 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
982 __merge
983 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
984 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
985 reinterpret_cast<const _LLong*>(__mem),
986 __to_intrin(__k))));
987 }
988 else if constexpr (__have_avx && sizeof(_Tp) == 8)
989 {
990 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
991 __merge
992 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
993 __vector_bitcast<_Tp, _Np>(
994 __maskload_pd(reinterpret_cast<const double*>(__mem),
995 __to_intrin(__k))));
996 }
997 else
998 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
999 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1000 __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1001 });
1002 }
      /* It is very uncertain that the following improves anything; it needs
       * benchmarking before it is activated.
1006 else if constexpr (sizeof(_Up) <= 8 && // no long double
1007 !__converts_via_decomposition_v<
1008 _Up, _Tp,
1009 sizeof(__merge)> // conversion via decomposition
1010 // is better handled via the
1011 // bit_iteration fallback below
1012 )
1013 {
1014 // TODO: copy pattern from _S_masked_store, which doesn't resort to
1015 // fixed_size
1016 using _Ap = simd_abi::deduce_t<_Up, _Np>;
1017 using _ATraits = _SimdTraits<_Up, _Ap>;
1018 using _AImpl = typename _ATraits::_SimdImpl;
1019 typename _ATraits::_SimdMember __uncvted{};
1020 typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template
1021 _S_convert<_Up>(__k);
1022 __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
1023 _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
1024 _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
1025 }
1026 */
1027 else
1028 __merge = _Base::_S_masked_load(__merge, __k, __mem);
1029 return __merge;
1030 }
1031
1032 // }}}
1033 // _S_masked_store_nocvt {{{
1034 template <typename _Tp, size_t _Np>
1035 _GLIBCXX_SIMD_INTRINSIC static void
1036 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1037 _SimdWrapper<bool, _Np> __k)
1038 {
1039 [[maybe_unused]] const auto __vi = __to_intrin(__v);
1040 if constexpr (sizeof(__vi) == 64)
1041 {
1042 static_assert(sizeof(__v) == 64 && __have_avx512f);
1043 if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
1044 _mm512_mask_storeu_epi8(__mem, __k, __vi);
1045 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
1046 _mm512_mask_storeu_epi16(__mem, __k, __vi);
1047 else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
1048 {
1049 if constexpr (is_integral_v<_Tp>)
1050 _mm512_mask_storeu_epi32(__mem, __k, __vi);
1051 else
1052 _mm512_mask_storeu_ps(__mem, __k, __vi);
1053 }
1054 else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
1055 {
1056 if constexpr (is_integral_v<_Tp>)
1057 _mm512_mask_storeu_epi64(__mem, __k, __vi);
1058 else
1059 _mm512_mask_storeu_pd(__mem, __k, __vi);
1060 }
1061#if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32
1062 // with Skylake-AVX512, __have_avx512bw is true
1063 else if constexpr (__have_sse2)
1064 {
1065 using _M = __vector_type_t<_Tp, _Np>;
1066 using _MVT = _VectorTraits<_M>;
1067 _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
1068 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
1069 reinterpret_cast<char*>(__mem));
1070 _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
1071 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1072 __k._M_data >> 1 * _MVT::_S_full_size)),
1073 reinterpret_cast<char*>(__mem) + 1 * 16);
1074 _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
1075 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1076 __k._M_data >> 2 * _MVT::_S_full_size)),
1077 reinterpret_cast<char*>(__mem) + 2 * 16);
1078 if constexpr (_Np > 48 / sizeof(_Tp))
1079 _mm_maskmoveu_si128(
1080 __auto_bitcast(__extract<3, 4>(__v._M_data)),
1081 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1082 __k._M_data >> 3 * _MVT::_S_full_size)),
1083 reinterpret_cast<char*>(__mem) + 3 * 16);
1084 }
1085#endif
1086 else
1087 __assert_unreachable<_Tp>();
1088 }
1089 else if constexpr (sizeof(__vi) == 32)
1090 {
1091 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1092 _mm256_mask_storeu_epi8(__mem, __k, __vi);
1093 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1094 _mm256_mask_storeu_epi16(__mem, __k, __vi);
1095 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1096 {
1097 if constexpr (is_integral_v<_Tp>)
1098 _mm256_mask_storeu_epi32(__mem, __k, __vi);
1099 else
1100 _mm256_mask_storeu_ps(__mem, __k, __vi);
1101 }
1102 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1103 {
1104 if constexpr (is_integral_v<_Tp>)
1105 _mm256_mask_storeu_epi64(__mem, __k, __vi);
1106 else
1107 _mm256_mask_storeu_pd(__mem, __k, __vi);
1108 }
1109 else if constexpr (__have_avx512f
1110 && (sizeof(_Tp) >= 4 || __have_avx512bw))
1111 {
1112 // use a 512-bit maskstore, using zero-extension of the bitmask
1113 _S_masked_store_nocvt(
1114 _SimdWrapper64<_Tp>(
1115 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
1116 __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1117 }
1118 else
1119 _S_masked_store_nocvt(__v, __mem,
1120 _MaskImpl::template _S_to_maskvector<
1121 __int_for_sizeof_t<_Tp>, _Np>(__k));
1122 }
1123 else if constexpr (sizeof(__vi) == 16)
1124 {
1125 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1126 _mm_mask_storeu_epi8(__mem, __k, __vi);
1127 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1128 _mm_mask_storeu_epi16(__mem, __k, __vi);
1129 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1130 {
1131 if constexpr (is_integral_v<_Tp>)
1132 _mm_mask_storeu_epi32(__mem, __k, __vi);
1133 else
1134 _mm_mask_storeu_ps(__mem, __k, __vi);
1135 }
1136 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1137 {
1138 if constexpr (is_integral_v<_Tp>)
1139 _mm_mask_storeu_epi64(__mem, __k, __vi);
1140 else
1141 _mm_mask_storeu_pd(__mem, __k, __vi);
1142 }
1143 else if constexpr (__have_avx512f
1144 && (sizeof(_Tp) >= 4 || __have_avx512bw))
1145 {
1146 // use a 512-bit maskstore, using zero-extension of the bitmask
1147 _S_masked_store_nocvt(
1148 _SimdWrapper64<_Tp>(
1149 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
1150 __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1151 }
1152 else
1153 _S_masked_store_nocvt(__v, __mem,
1154 _MaskImpl::template _S_to_maskvector<
1155 __int_for_sizeof_t<_Tp>, _Np>(__k));
1156 }
1157 else
1158 __assert_unreachable<_Tp>();
1159 }
1160
1161 template <typename _Tp, size_t _Np>
1162 _GLIBCXX_SIMD_INTRINSIC static void
1163 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1164 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
1165 {
1166 if constexpr (sizeof(__v) <= 16)
1167 {
1168 [[maybe_unused]] const auto __vi
1169 = __intrin_bitcast<__m128i>(__as_vector(__v));
1170 [[maybe_unused]] const auto __ki
1171 = __intrin_bitcast<__m128i>(__as_vector(__k));
1172 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1173 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
1174 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1175 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
1176 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1177 && is_integral_v<_Tp>)
1178 _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1179 else if constexpr (__have_avx && sizeof(_Tp) == 4)
1180 _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1181 __vector_bitcast<float>(__vi));
1182 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1183 && is_integral_v<_Tp>)
1184 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
1185 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1186 _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1187 __vector_bitcast<double>(__vi));
1188 else if constexpr (__have_sse2)
1189 _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<char*>(__mem));
1190 }
1191 else if constexpr (sizeof(__v) == 32)
1192 {
1193 [[maybe_unused]] const auto __vi
1194 = __intrin_bitcast<__m256i>(__as_vector(__v));
1195 [[maybe_unused]] const auto __ki
1196 = __intrin_bitcast<__m256i>(__as_vector(__k));
1197 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1198 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
1199 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1200 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
1201 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1202 && is_integral_v<_Tp>)
1203 _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1204 else if constexpr (sizeof(_Tp) == 4)
1205 _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1206 __vector_bitcast<float>(__v));
1207 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1208 && is_integral_v<_Tp>)
1209 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
1210 __vi);
1211 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1212 _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1213 __vector_bitcast<double>(__v));
1214 else if constexpr (__have_sse2)
1215 {
1216 _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
1217 reinterpret_cast<char*>(__mem));
1218 _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
1219 reinterpret_cast<char*>(__mem) + 16);
1220 }
1221 }
1222 else
1223 __assert_unreachable<_Tp>();
1224 }
1225
1226 // }}}
1227 // _S_masked_store {{{
1228 template <typename _Tp, size_t _Np, typename _Up>
1229 _GLIBCXX_SIMD_INTRINSIC static void
1230 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
1231 const _MaskMember<_Tp> __k) noexcept
1232 {
      if constexpr (is_integral_v<_Tp> && is_integral_v<_Up>
                    && sizeof(_Tp) > sizeof(_Up) && __have_avx512f
                    && (sizeof(_Tp) >= 4 || __have_avx512bw)
                    && (sizeof(__v) == 64 || __have_avx512vl))
1237 { // truncating store
1238 const auto __vi = __to_intrin(__v);
1239 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1240 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1241 && sizeof(__vi) == 64)
1242 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1243 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1244 && sizeof(__vi) == 32)
1245 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1246 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1247 && sizeof(__vi) == 16)
1248 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1249 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1250 && sizeof(__vi) == 64)
1251 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1252 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1253 && sizeof(__vi) == 32)
1254 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1255 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1256 && sizeof(__vi) == 16)
1257 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1258 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1259 && sizeof(__vi) == 64)
1260 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1261 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1262 && sizeof(__vi) == 32)
1263 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1264 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1265 && sizeof(__vi) == 16)
1266 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1267 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1268 && sizeof(__vi) == 64)
1269 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1270 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1271 && sizeof(__vi) == 32)
1272 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1273 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1274 && sizeof(__vi) == 16)
1275 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1276 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1277 && sizeof(__vi) == 64)
1278 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1279 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1280 && sizeof(__vi) == 32)
1281 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1282 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1283 && sizeof(__vi) == 16)
1284 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1285 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1286 && sizeof(__vi) == 64)
1287 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1288 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1289 && sizeof(__vi) == 32)
1290 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1291 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1292 && sizeof(__vi) == 16)
1293 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1294 else
1295 __assert_unreachable<_Tp>();
1296 }
1297 else
1298 _Base::_S_masked_store(__v, __mem, __k);
1299 }
1300
1301 // }}}
1302 // _S_multiplies {{{
1303 template <typename _V, typename _VVT = _VectorTraits<_V>>
1304 _GLIBCXX_SIMD_INTRINSIC static constexpr _V _S_multiplies(_V __x, _V __y)
1305 {
1306 using _Tp = typename _VVT::value_type;
1307 if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
1308 || __y._M_is_constprop())
1309 return __as_vector(__x) * __as_vector(__y);
1310 else if constexpr (sizeof(_Tp) == 1)
1311 {
1312 if constexpr (sizeof(_V) == 2)
1313 {
1314 const auto __xs = reinterpret_cast<short>(__x._M_data);
1315 const auto __ys = reinterpret_cast<short>(__y._M_data);
1316 return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
1317 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
1318 }
1319 else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
1320 {
1321 const auto __xi = reinterpret_cast<int>(__x._M_data);
1322 const auto __yi = reinterpret_cast<int>(__y._M_data);
1323 return reinterpret_cast<__vector_type_t<_Tp, 3>>(
1324 ((__xi * __yi) & 0xff)
1325 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1326 | ((__xi >> 16) * (__yi & 0xff0000)));
1327 }
1328 else if constexpr (sizeof(_V) == 4)
1329 {
1330 const auto __xi = reinterpret_cast<int>(__x._M_data);
1331 const auto __yi = reinterpret_cast<int>(__y._M_data);
1332 return reinterpret_cast<__vector_type_t<_Tp, 4>>(
1333 ((__xi * __yi) & 0xff)
1334 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1335 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
1336 | ((__xi >> 24) * (__yi & 0xff000000u)));
1337 }
1338 else if constexpr (sizeof(_V) == 8 && __have_avx2
1339 && is_signed_v<_Tp>)
1340 return __convert<typename _VVT::type>(
1341 __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
1342 * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
1343 else if constexpr (sizeof(_V) == 8 && __have_avx2
1344 && is_unsigned_v<_Tp>)
1345 return __convert<typename _VVT::type>(
1346 __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
1347 * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
1348 else
1349 {
1350 // codegen of `x*y` is suboptimal (as of GCC 9.0.1)
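            // Sketch of the strategy below: multiply in 16-bit lanes. __even
            // holds the products of the even (low) bytes in its low bytes;
            // __odd computes (__x >> 8) * (__y & 0xff00), which leaves the
            // products of the odd (high) bytes in the high bytes; the two are
            // then blended together byte-wise.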
1351 constexpr size_t __full_size = _VVT::_S_full_size;
1352 constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
1353 using _ShortW = _SimdWrapper<short, _Np>;
1354 const _ShortW __even = __vector_bitcast<short, _Np>(__x)
1355 * __vector_bitcast<short, _Np>(__y);
1356 _ShortW __high_byte = _ShortW()._M_data - 256;
1357 //[&]() { asm("" : "+x"(__high_byte._M_data)); }();
1358 const _ShortW __odd
1359 = (__vector_bitcast<short, _Np>(__x) >> 8)
1360 * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
1361 if constexpr (__have_avx512bw && sizeof(_V) > 2)
1362 return _CommonImplX86::_S_blend_avx512(
1363 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
1364 __vector_bitcast<_Tp>(__odd));
1365 else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
1366 return _CommonImplX86::_S_blend_intrin(__to_intrin(
1367 __high_byte),
1368 __to_intrin(__even),
1369 __to_intrin(__odd));
1370 else
1371 return __to_intrin(
1372 __or(__andnot(__high_byte, __even), __odd));
1373 }
1374 }
1375 else
1376 return _Base::_S_multiplies(__x, __y);
1377 }
1378
1379 // }}}
1380 // _S_divides {{{
1381#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
1382 template <typename _Tp, size_t _Np>
1383 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1384 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1385 {
1386 if (!__builtin_is_constant_evaluated()
1387 && !__builtin_constant_p(__y._M_data))
1388 if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
          { // use floating-point division (divpd/divps); codegen of `x/y` is
            // suboptimal (as of GCC 9.0.1)
1390 // Note that using floating-point division is likely to raise the
1391 // *Inexact* exception flag and thus appears like an invalid
1392 // "as-if" transformation. However, C++ doesn't specify how the
1393 // fpenv can be observed and points to C. C says that function
1394 // calls are assumed to potentially raise fp exceptions, unless
1395 // documented otherwise. Consequently, operator/, which is a
1396 // function call, may raise fp exceptions.
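            // Illustration only: for _Tp = int and _Np = 4 with plain SSE2,
            // _Float is double and __n_intermediate == 2, so the division is
            // carried out as two 2-element double divisions on converted
            // operands, and the quotients are converted back to int.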
1397 /*const struct _CsrGuard
1398 {
1399 const unsigned _M_data = _mm_getcsr();
1400 _CsrGuard()
1401 {
1402 _mm_setcsr(0x9f80); // turn off FP exceptions and
1403 flush-to-zero
1404 }
1405 ~_CsrGuard() { _mm_setcsr(_M_data); }
1406 } __csr;*/
1407 using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
1408 constexpr size_t __n_intermediate
1409 = std::min(_Np, (__have_avx512f ? 64
1410 : __have_avx ? 32
1411 : 16)
1412 / sizeof(_Float));
1413 using _FloatV = __vector_type_t<_Float, __n_intermediate>;
1414 constexpr size_t __n_floatv
1415 = __div_roundup(_Np, __n_intermediate);
1416 using _R = __vector_type_t<_Tp, _Np>;
1417 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
1418 const auto __yf = __convert_all<_FloatV, __n_floatv>(
1419 _Abi::__make_padding_nonzero(__as_vector(__y)));
1420 return __call_with_n_evaluations<__n_floatv>(
1421 [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1422 return __vector_convert<_R>(__quotients...);
1423 },
1424 [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1425 -> _SimdWrapper<_Float, __n_intermediate>
1426 {
1427#if !defined __clang__ && __GCC_IEC_559 == 0
1428 // If -freciprocal-math is active, using the `/` operator is
1429 // incorrect because it may be translated to an imprecise
1430 // multiplication with reciprocal. We need to use inline
1431 // assembly to force a real division.
1432 _FloatV __r;
1433 if constexpr (__have_avx) // -mno-sse2avx is irrelevant
1434 // because once -mavx is given, GCC
1435 // emits VEX encoded vdivp[sd]
1436 {
1437 if constexpr (sizeof(_Tp) == 4)
1438 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
1439 : "=x"(__r)
1440 : "x"(__xf[__i]), "x"(__yf[__i]));
1441 else
1442 asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
1443 : "=x"(__r)
1444 : "x"(__xf[__i]), "x"(__yf[__i]));
1445 }
1446 else
1447 {
1448 __r = __xf[__i];
1449 if constexpr (sizeof(_Tp) == 4)
1450 asm("divpd\t{%1, %0|%0, %1}"
1451 : "=x"(__r)
1452 : "x"(__yf[__i]));
1453 else
1454 asm("divps\t{%1, %0|%0, %1}"
1455 : "=x"(__r)
1456 : "x"(__yf[__i]));
1457 }
1458 return __r;
1459#else
1460 return __xf[__i] / __yf[__i];
1461#endif
1462 });
1463 }
1464 /* 64-bit int division is potentially optimizable via double division if
1465 * the value in __x is small enough and the conversion between
1466 * int<->double is efficient enough:
1467 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
1468 sizeof(_Tp) == 8)
1469 {
1470 if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1471 {
1472 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
1473 0xffe0'0000'0000'0000ull}))
1474 {
1475 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
1476 }
1477 }
1478 }
1479 */
1480 return _Base::_S_divides(__x, __y);
1481 }
1482 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993
1483
1484 // }}}
1485 // _S_modulus {{{
1486 template <typename _Tp, size_t _Np>
1487 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1488 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1489 {
1490 if (__builtin_is_constant_evaluated()
1491 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
1492 return _Base::_S_modulus(__x, __y);
1493 else
1494 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
1495 }
1496
1497 // }}}
1498 // _S_bit_shift_left {{{
1499 // Notes on UB. C++2a [expr.shift] says:
1500 // -1- [...] The operands shall be of integral or unscoped enumeration type
1501 // and integral promotions are performed. The type of the result is that
1502 // of the promoted left operand. The behavior is undefined if the right
1503 // operand is negative, or greater than or equal to the width of the
1504 // promoted left operand.
1505 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo
1506 // 2^N, where N is the width of the type of the result.
1507 //
1508 // C++17 [expr.shift] says:
1509 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
1510 // bits are zero-filled. If E1 has an unsigned type, the value of the
1511 // result is E1 × 2^E2 , reduced modulo one more than the maximum value
1512 // representable in the result type. Otherwise, if E1 has a signed type
1513 // and non-negative value, and E1 × 2^E2 is representable in the
1514 // corresponding unsigned type of the result type, then that value,
1515 // converted to the result type, is the resulting value; otherwise, the
1516 // behavior is undefined.
1517 //
1518 // Consequences:
1519 // With C++2a signed and unsigned types have the same UB
1520 // characteristics:
1521 // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
1522 //
1523 // With C++17 there's little room for optimizations because the standard
1524 // requires all shifts to happen on promoted integrals (i.e. int). Thus,
1525 // short and char shifts must assume shifts affect bits of neighboring
    // values.
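    // Illustration only: when 8-bit elements are shifted as wider lanes, bits
    // that would cross into the neighbouring byte must be masked off. E.g.
    // shifting every byte left by 3 can be done as a 32-bit shift followed by
    // an AND with (0xff & (0xff << 3)) * 0x01010101u == 0xf8f8f8f8u, which is
    // the pattern used in _S_bit_shift_left below.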
1527 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1528 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1529 inline _GLIBCXX_CONST static typename _TVT::type
1530 _S_bit_shift_left(_Tp __xx, int __y)
1531 {
1532 using _V = typename _TVT::type;
1533 using _Up = typename _TVT::value_type;
1534 _V __x = __xx;
1535 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1536 if (__builtin_is_constant_evaluated())
1537 return __x << __y;
1538#if __cplusplus > 201703
1539 // after C++17, signed shifts have no UB, and behave just like unsigned
1540 // shifts
1541 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>)
1542 return __vector_bitcast<_Up>(
1543 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1544 __y));
1545#endif
1546 else if constexpr (sizeof(_Up) == 1)
1547 {
1548 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894)
1549 if (__builtin_constant_p(__y))
1550 {
1551 if (__y == 0)
1552 return __x;
1553 else if (__y == 1)
1554 return __x + __x;
1555 else if (__y == 2)
1556 {
1557 __x = __x + __x;
1558 return __x + __x;
1559 }
1560 else if (__y > 2 && __y < 8)
1561 {
1562 if constexpr (sizeof(__x) > sizeof(unsigned))
1563 {
1564 const _UChar __mask = 0xff << __y; // precomputed vector
1565 return __vector_bitcast<_Up>(
1566 __vector_bitcast<_UChar>(
1567 __vector_bitcast<unsigned>(__x) << __y)
1568 & __mask);
1569 }
1570 else
1571 {
1572 const unsigned __mask
1573 = (0xff & (0xff << __y)) * 0x01010101u;
1574 return reinterpret_cast<_V>(
1575 static_cast<__int_for_sizeof_t<_V>>(
1576 unsigned(
1577 reinterpret_cast<__int_for_sizeof_t<_V>>(__x)
1578 << __y)
1579 & __mask));
1580 }
1581 }
1582 else if (__y >= 8 && __y < 32)
1583 return _V();
1584 else
1585 __builtin_unreachable();
1586 }
1587 // general strategy in the following: use an sllv instead of sll
1588 // instruction, because it's 2 to 4 times faster:
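          // (sll applies one shift count, taken from the low 64 bits of its
          //  count operand, to all elements; sllv takes a per-element count
          //  vector, which is why the scalar __y is broadcast with set1
          //  below even though the count is uniform here)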
1589 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16)
1590 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8(
1591 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix),
1592 _mm256_set1_epi16(__y))));
1593 else if constexpr (__have_avx512bw && sizeof(__x) == 32)
1594 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1595 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix),
1596 _mm512_set1_epi16(__y))));
1597 else if constexpr (__have_avx512bw && sizeof(__x) == 64)
1598 {
1599 const auto __shift = _mm512_set1_epi16(__y);
1600 return __vector_bitcast<_Up>(
1601 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1602 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)),
1603 _mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1604 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift))));
1605 }
1606 else if constexpr (__have_avx2 && sizeof(__x) == 32)
1607 {
1608#if 1
1609 const auto __shift = _mm_cvtsi32_si128(__y);
1610 auto __k
1611 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
1612 __k |= _mm256_srli_epi16(__k, 8);
1613 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift)
1614 & __k);
1615#else
1616 const _Up __k = 0xff << __y;
1617 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y)
1618 & __k;
1619#endif
1620 }
1621 else
1622 {
1623 const auto __shift = _mm_cvtsi32_si128(__y);
1624 auto __k
1625 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
1626 __k |= _mm_srli_epi16(__k, 8);
1627 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k);
1628 }
1629 }
1630 return __x << __y;
1631 }
1632
1633 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1634 inline _GLIBCXX_CONST static typename _TVT::type
1635 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y)
1636 {
1637 using _V = typename _TVT::type;
1638 using _Up = typename _TVT::value_type;
1639 _V __x = __xx;
1640 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1641 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1642 if (__builtin_is_constant_evaluated())
1643 return __x << __y;
1644#if __cplusplus > 201703
1645 // after C++17, signed shifts have no UB, and behave just like unsigned
1646 // shifts
1647 else if constexpr (is_signed_v<_Up>)
1648 return __vector_bitcast<_Up>(
1649 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1650 __vector_bitcast<make_unsigned_t<_Up>>(__y)));
1651#endif
1652 else if constexpr (sizeof(_Up) == 1)
1653 {
1654 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1655 return __vector_bitcast<_Up>(__concat(
1656 _mm512_cvtepi16_epi8(
1657 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)),
1658 _mm512_cvtepu8_epi16(__lo256(__iy)))),
1659 _mm512_cvtepi16_epi8(
1660 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)),
1661 _mm512_cvtepu8_epi16(__hi256(__iy))))));
1662 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1663 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1664 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix),
1665 _mm512_cvtepu8_epi16(__iy))));
1666 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl)
1667 return __intrin_bitcast<_V>(
1668 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix),
1669 _mm_cvtepu8_epi16(__iy))));
1670 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1671 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1672 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix),
1673 _mm256_cvtepu8_epi16(__iy))));
1674 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1675 return __intrin_bitcast<_V>(
1676 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1677 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)),
1678 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy))))));
1679 else if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1680 {
1681 auto __mask
1682 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5);
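              // << 5 moves bit 2 of each 8-bit shift count into the sign bit
              // of its byte, which is the bit the byte blend tests (cf. the
              // `< 0` comparison in the SSE2 fallback below); each subsequent
              // __mask += __mask exposes the next lower count bit there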
1683 auto __x4
1684 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1685 __x4 &= char(0xf0);
1686 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1687 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4)));
1688 __mask += __mask;
1689 auto __x2
1690 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1691 __x2 &= char(0xfc);
1692 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1693 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2)));
1694 __mask += __mask;
1695 auto __x1 = __x + __x;
1696 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1697 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1)));
1698 return __x
1699 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1700 }
1701 else if constexpr (sizeof(__x) == 16)
1702 {
1703 auto __mask
1704 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5);
1705 auto __x4
1706 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1707 __x4 &= char(0xf0);
1708 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x;
1709 __mask += __mask;
1710 auto __x2
1711 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1712 __x2 &= char(0xfc);
1713 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x;
1714 __mask += __mask;
1715 auto __x1 = __x + __x;
1716 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x;
1717 return __x
1718 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1719 }
1720 else
1721 return __x << __y;
1722 }
1723 else if constexpr (sizeof(_Up) == 2)
1724 {
1725 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1726 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy));
1727 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl)
1728 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy));
1729 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1730 return __vector_bitcast<_Up>(
1731 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix),
1732 _mm512_castsi256_si512(__iy))));
1733 else if constexpr (sizeof __ix == 32 && __have_avx2)
1734 {
1735 const auto __ux = __vector_bitcast<unsigned>(__x);
1736 const auto __uy = __vector_bitcast<unsigned>(__y);
1737 return __vector_bitcast<_Up>(_mm256_blend_epi16(
1738 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1739 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1740 }
1741 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1742 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy));
1743 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1744 return __intrin_bitcast<_V>(
1745 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix),
1746 _mm512_castsi128_si512(__iy))));
1747 else if constexpr (sizeof __ix == 16 && __have_avx2)
1748 {
1749 const auto __ux = __vector_bitcast<unsigned>(__ix);
1750 const auto __uy = __vector_bitcast<unsigned>(__iy);
1751 return __intrin_bitcast<_V>(_mm_blend_epi16(
1752 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1753 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1754 }
1755 else if constexpr (sizeof __ix == 16)
1756 {
1757 using _Float4 = __vector_type_t<float, 4>;
1758 using _Int4 = __vector_type_t<int, 4>;
1759 using _UInt4 = __vector_type_t<unsigned, 4>;
1760 const _UInt4 __yu
1761 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3)));
1762 return __x
1763 * __intrin_bitcast<_V>(
1764 __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1765 reinterpret_cast<_Float4>(__yu << 23)))
1766 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1767 reinterpret_cast<_Float4>((__yu >> 16) << 23)))
1768 << 16));
1769 }
1770 else
1771 __assert_unreachable<_Tp>();
1772 }
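        // The sizeof(_Up) == 2 fallback above and the sizeof(_Up) == 4 case
        // below use the same trick: without a per-element shift instruction,
        // x << y is computed as x * 2^y, where 2^y is produced by writing
        // y + 127 into the exponent field of a float (equivalently
        // (y << 23) + 0x3f80'0000) followed by a float->int conversion.
        // A standalone sketch for one vector of 32-bit lanes (hypothetical
        // helper; assumes SSE4.1 for _mm_mullo_epi32 and 0 <= y <= 30):
        //
        //   __m128i
        //   __shl_u32(__m128i __x, __m128i __y)
        //   {
        //     const __m128i __pow2 = _mm_cvttps_epi32(_mm_castsi128_ps(
        //       _mm_add_epi32(_mm_slli_epi32(__y, 23),
        //                     _mm_set1_epi32(0x3f80'0000))));
        //     return _mm_mullo_epi32(__x, __pow2); // x * 2^y == x << y
        //   }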
1773 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16
1774 && !__have_avx2)
1775          // latency is suboptimal, but throughput still achieves the full speedup
1776 return __intrin_bitcast<_V>(
1777 __vector_bitcast<unsigned>(__ix)
1778 * __vector_convert<__vector_type16_t<int>>(
1779 _SimdWrapper<float, 4>(__vector_bitcast<float>(
1780 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000))));
1781 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16
1782 && !__have_avx2)
1783 {
1784 const auto __lo = _mm_sll_epi64(__ix, __iy);
1785 const auto __hi
1786 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy));
1787 if constexpr (__have_sse4_1)
1788 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0));
1789 else
1790 return __vector_bitcast<_Up>(
1791 _mm_move_sd(__vector_bitcast<double>(__hi),
1792 __vector_bitcast<double>(__lo)));
1793 }
1794 else
1795 return __x << __y;
1796 }
1797#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
1798
1799 // }}}
1800 // _S_bit_shift_right {{{
1801#ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1802 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1803 inline _GLIBCXX_CONST static typename _TVT::type
1804 _S_bit_shift_right(_Tp __xx, int __y)
1805 {
1806 using _V = typename _TVT::type;
1807 using _Up = typename _TVT::value_type;
1808 _V __x = __xx;
1809 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1810 if (__builtin_is_constant_evaluated())
1811 return __x >> __y;
1812 else if (__builtin_constant_p(__y)
1813 && is_unsigned_v<
1814 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1815 return _V();
1816 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{
1817 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y)
1818 & _Up(0xff >> __y);
1819 //}}}
1820 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{
1821 return __intrin_bitcast<_V>(
1822 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix)
1823 >> (__y + 8))
1824 << 8)
1825 | (__vector_bitcast<_UShort>(
1826 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8)
1827 >> __y)
1828 >> 8));
1829 //}}}
1830 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected
1831 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{
1832 {
1833 if (__y > 32)
1834 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32)
1835 & _Up(0xffff'ffff'0000'0000ull))
1836 | __vector_bitcast<_Up>(
1837 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix)
1838 >> 32)
1839 >> (__y - 32));
1840 else
1841 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix)
1842 >> __y)
1843 | __vector_bitcast<_Up>(
1844 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll)
1845 >> __y);
1846 }
1847 //}}}
1848 else
1849 return __x >> __y;
1850 }
1851
1852 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1853 inline _GLIBCXX_CONST static typename _TVT::type
1854 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y)
1855 {
1856 using _V = typename _TVT::type;
1857 using _Up = typename _TVT::value_type;
1858 _V __x = __xx;
1859 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1860 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1861 if (__builtin_is_constant_evaluated()
1862 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
1863 return __x >> __y;
1864 else if constexpr (sizeof(_Up) == 1) //{{{
1865 {
1866 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl)
1867 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8(
1868 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix),
1869 _mm_cvtepi8_epi16(__iy))
1870 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix),
1871 _mm_cvtepu8_epi16(__iy))));
1872 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1873 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1874 is_signed_v<_Up>
1875 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix),
1876 _mm256_cvtepi8_epi16(__iy))
1877 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix),
1878 _mm256_cvtepu8_epi16(__iy))));
1879 else if constexpr (sizeof(__x) == 32 && __have_avx512bw)
1880 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1881 is_signed_v<_Up>
1882 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix),
1883 _mm512_cvtepi8_epi16(__iy))
1884 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix),
1885 _mm512_cvtepu8_epi16(__iy))));
1886 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>)
1887 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1888 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1889 0x5555'5555'5555'5555ull,
1890 _mm512_srav_epi16(
1891 _mm512_slli_epi16(__ix, 8),
1892 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy,
1893 _mm512_set1_epi16(8)))));
1894 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>)
1895 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1896 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1897 0x5555'5555'5555'5555ull,
1898 _mm512_srlv_epi16(
1899 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix),
1900 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy))));
1901 /* This has better throughput but higher latency than the impl below
1902 else if constexpr (__have_avx2 && sizeof(__x) == 16 &&
1903 is_unsigned_v<_Up>)
1904 {
1905 const auto __shorts = __to_intrin(_S_bit_shift_right(
1906 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)),
1907 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy))));
1908 return __vector_bitcast<_Up>(
1909 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts)));
1910 }
1911 */
1912 else if constexpr (__have_avx2 && sizeof(__x) > 8)
1913 // the following uses vpsr[al]vd, which requires AVX2
1914 if constexpr (is_signed_v<_Up>)
1915 {
1916 const auto r3 = __vector_bitcast<_UInt>(
1917 (__vector_bitcast<int>(__x)
1918 >> (__vector_bitcast<_UInt>(__y) >> 24)))
1919 & 0xff000000u;
1920 const auto r2
1921 = __vector_bitcast<_UInt>(
1922 ((__vector_bitcast<int>(__x) << 8)
1923 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)))
1924 & 0xff000000u;
1925 const auto r1
1926 = __vector_bitcast<_UInt>(
1927 ((__vector_bitcast<int>(__x) << 16)
1928 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)))
1929 & 0xff000000u;
1930 const auto r0 = __vector_bitcast<_UInt>(
1931 (__vector_bitcast<int>(__x) << 24)
1932 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24));
1933 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1934 | (r0 >> 24));
1935 }
1936 else
1937 {
1938 const auto r3 = (__vector_bitcast<_UInt>(__x)
1939 >> (__vector_bitcast<_UInt>(__y) >> 24))
1940 & 0xff000000u;
1941 const auto r2
1942 = ((__vector_bitcast<_UInt>(__x) << 8)
1943 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))
1944 & 0xff000000u;
1945 const auto r1
1946 = ((__vector_bitcast<_UInt>(__x) << 16)
1947 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))
1948 & 0xff000000u;
1949 const auto r0
1950 = (__vector_bitcast<_UInt>(__x) << 24)
1951 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24);
1952 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1953 | (r0 >> 24));
1954 }
1955 else if constexpr (__have_sse4_1
1956 && is_unsigned_v<_Up> && sizeof(__x) > 2)
1957 {
1958 auto __x128 = __vector_bitcast<_Up>(__ix);
1959 auto __mask
1960 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5);
1961 auto __x4 = __vector_bitcast<_Up>(
1962 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f));
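              // the 16-bit shift pulls 4 bits of each high byte into the low
              // byte; & 0xff0f clears them so both bytes are effectively
              // shifted independently (0xff3f and 0xff7f below do the same
              // for the 2- and 1-bit steps)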
1963 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1964 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4)));
1965 __mask += __mask;
1966 auto __x2 = __vector_bitcast<_Up>(
1967 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f));
1968 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1969 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2)));
1970 __mask += __mask;
1971 auto __x1 = __vector_bitcast<_Up>(
1972 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f));
1973 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1974 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1)));
1975 return __intrin_bitcast<_V>(
1976 __x128
1977 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
1978 == 0)); // y > 7 nulls the result
1979 }
1980 else if constexpr (__have_sse4_1
1981 && is_signed_v<_Up> && sizeof(__x) > 2)
1982 {
1983 auto __mask = __vector_bitcast<_UChar>(
1984 __vector_bitcast<_UShort>(__iy) << 5);
1985 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1986 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
1987 };
1988 auto __xh = __vector_bitcast<short>(__ix);
1989 auto __xl = __vector_bitcast<short>(__ix) << 8;
1990 auto __xh4 = __xh >> 4;
1991 auto __xl4 = __xl >> 4;
1992 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
1993 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4)));
1994 __xl = __vector_bitcast<short>(
1995 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
1996 __to_intrin(__xl4)));
1997 __mask += __mask;
1998 auto __xh2 = __xh >> 2;
1999 auto __xl2 = __xl >> 2;
2000 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2001 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2)));
2002 __xl = __vector_bitcast<short>(
2003 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2004 __to_intrin(__xl2)));
2005 __mask += __mask;
2006 auto __xh1 = __xh >> 1;
2007 auto __xl1 = __xl >> 1;
2008 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2009 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1)));
2010 __xl = __vector_bitcast<short>(
2011 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2012 __to_intrin(__xl1)));
2013 return __intrin_bitcast<_V>(
2014 (__vector_bitcast<_Up>((__xh & short(0xff00)))
2015 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2016 >> 8))
2017 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2018 == 0)); // y > 7 nulls the result
2019 }
2020 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2
2021 {
2022 auto __mask
2023 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5);
2024 auto __x4 = __vector_bitcast<_Up>(
2025 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f));
2026 __x = __mask > 0x7f ? __x4 : __x;
2027 __mask += __mask;
2028 auto __x2 = __vector_bitcast<_Up>(
2029 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f));
2030 __x = __mask > 0x7f ? __x2 : __x;
2031 __mask += __mask;
2032 auto __x1 = __vector_bitcast<_Up>(
2033 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f));
2034 __x = __mask > 0x7f ? __x1 : __x;
2035 return __x
2036 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2037 }
2038 else if constexpr (sizeof(__x) > 2) // signed SSE2
2039 {
2040 static_assert(is_signed_v<_Up>);
2041 auto __maskh = __vector_bitcast<_UShort>(__y) << 5;
2042 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8);
2043 auto __xh = __vector_bitcast<short>(__x);
2044 auto __xl = __vector_bitcast<short>(__x) << 8;
2045 auto __xh4 = __xh >> 4;
2046 auto __xl4 = __xl >> 4;
2047 __xh = __maskh > 0x7fff ? __xh4 : __xh;
2048 __xl = __maskl > 0x7fff ? __xl4 : __xl;
2049 __maskh += __maskh;
2050 __maskl += __maskl;
2051 auto __xh2 = __xh >> 2;
2052 auto __xl2 = __xl >> 2;
2053 __xh = __maskh > 0x7fff ? __xh2 : __xh;
2054 __xl = __maskl > 0x7fff ? __xl2 : __xl;
2055 __maskh += __maskh;
2056 __maskl += __maskl;
2057 auto __xh1 = __xh >> 1;
2058 auto __xl1 = __xl >> 1;
2059 __xh = __maskh > 0x7fff ? __xh1 : __xh;
2060 __xl = __maskl > 0x7fff ? __xl1 : __xl;
2061 __x = __vector_bitcast<_Up>((__xh & short(0xff00)))
2062 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2063 >> 8);
2064 return __x
2065 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2066 }
2067 else
2068 return __x >> __y;
2069 } //}}}
2070 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
2071 {
2072 [[maybe_unused]] auto __blend_0xaa
2073 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2074 if constexpr (sizeof(__a) == 16)
2075 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2076 0xaa);
2077 else if constexpr (sizeof(__a) == 32)
2078 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2079 0xaa);
2080 else if constexpr (sizeof(__a) == 64)
2081 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
2082 __to_intrin(__b));
2083 else
2084 __assert_unreachable<decltype(__a)>();
2085 };
2086 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
2087 return __intrin_bitcast<_V>(is_signed_v<_Up>
2088 ? _mm_srav_epi16(__ix, __iy)
2089 : _mm_srlv_epi16(__ix, __iy));
2090 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32)
2091 return __vector_bitcast<_Up>(is_signed_v<_Up>
2092 ? _mm256_srav_epi16(__ix, __iy)
2093 : _mm256_srlv_epi16(__ix, __iy));
2094 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64)
2095 return __vector_bitcast<_Up>(is_signed_v<_Up>
2096 ? _mm512_srav_epi16(__ix, __iy)
2097 : _mm512_srlv_epi16(__ix, __iy));
2098 else if constexpr (__have_avx2 && is_signed_v<_Up>)
2099 return __intrin_bitcast<_V>(
2100 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16)
2101 >> (__vector_bitcast<int>(__iy) & 0xffffu))
2102 >> 16,
2103 __vector_bitcast<int>(__ix)
2104 >> (__vector_bitcast<int>(__iy) >> 16)));
2105 else if constexpr (__have_avx2 && is_unsigned_v<_Up>)
2106 return __intrin_bitcast<_V>(
2107 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu)
2108 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu),
2109 __vector_bitcast<_UInt>(__ix)
2110 >> (__vector_bitcast<_UInt>(__iy) >> 16)));
2111 else if constexpr (__have_sse4_1)
2112 {
2113 auto __mask = __vector_bitcast<_UShort>(__iy);
2114 auto __x128 = __vector_bitcast<_Up>(__ix);
2115 //__mask *= 0x0808;
2116 __mask = (__mask << 3) | (__mask << 11);
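              // this places bit 4 of each 16-bit shift count into the sign
              // bits of both bytes of its lane (bits 7 and 15), which is
              // what pblendvb tests; each __mask += __mask below moves the
              // next lower count bit into that position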
2117 // do __x128 = 0 where __y[4] is set
2118 __x128 = __vector_bitcast<_Up>(
2119 _mm_blendv_epi8(__to_intrin(__x128), __m128i(),
2120 __to_intrin(__mask)));
2121 // do __x128 =>> 8 where __y[3] is set
2122 __x128 = __vector_bitcast<_Up>(
2123 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8),
2124 __to_intrin(__mask += __mask)));
2125 // do __x128 =>> 4 where __y[2] is set
2126 __x128 = __vector_bitcast<_Up>(
2127 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4),
2128 __to_intrin(__mask += __mask)));
2129 // do __x128 =>> 2 where __y[1] is set
2130 __x128 = __vector_bitcast<_Up>(
2131 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2),
2132 __to_intrin(__mask += __mask)));
2133 // do __x128 =>> 1 where __y[0] is set
2134 return __intrin_bitcast<_V>(
2135 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1),
2136 __to_intrin(__mask + __mask)));
2137 }
2138 else
2139 {
2140 auto __k = __vector_bitcast<_UShort>(__iy) << 11;
2141 auto __x128 = __vector_bitcast<_Up>(__ix);
2142 auto __mask
2143 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2144 return __vector_bitcast<short>(__kk) < 0;
2145 };
2146 // do __x128 = 0 where __y[4] is set
2147 __x128 = __mask(__k) ? decltype(__x128)() : __x128;
2148 // do __x128 =>> 8 where __y[3] is set
2149 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128;
2150 // do __x128 =>> 4 where __y[2] is set
2151 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128;
2152 // do __x128 =>> 2 where __y[1] is set
2153 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128;
2154 // do __x128 =>> 1 where __y[0] is set
2155 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1
2156 : __x128);
2157 }
2158 } //}}}
2159 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{
2160 {
2161 if constexpr (is_unsigned_v<_Up>)
2162 {
2163 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31
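                  // e.g. x = 201, y = 3: factor = 2^28, and the 64-bit
                  // product 201 * 2^28 shifted right by 31 yields
                  // floor(201 / 8) = 25 == 201 >> 3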
2164 const __m128 __factor_f = reinterpret_cast<__m128>(
2165 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23));
2166 const __m128i __factor
2167 = __builtin_constant_p(__factor_f)
2168 ? __to_intrin(
2169 __make_vector<unsigned>(__factor_f[0], __factor_f[1],
2170 __factor_f[2], __factor_f[3]))
2171 : _mm_cvttps_epi32(__factor_f);
2172 const auto __r02
2173 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31);
2174 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4),
2175 _mm_srli_si128(__factor, 4));
2176 if constexpr (__have_sse4_1)
2177 return __intrin_bitcast<_V>(
2178 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33));
2179 else
2180 return __intrin_bitcast<_V>(
2181 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4));
2182 }
2183 else
2184 {
2185 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2186 if constexpr (is_signed_v<_Up>)
2187 return _mm_sra_epi32(__a, __b);
2188 else
2189 return _mm_srl_epi32(__a, __b);
2190 };
2191 const auto __r0
2192 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i()));
2193 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32));
2194 const auto __r2
2195 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i()));
2196 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12));
2197 if constexpr (__have_sse4_1)
2198 return __intrin_bitcast<_V>(
2199 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3),
2200 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0));
2201 else
2202 return __intrin_bitcast<_V>(_mm_unpacklo_epi64(
2203 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)),
2204 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4))));
2205 }
2206 } //}}}
2207 else
2208 return __x >> __y;
2209 }
2210#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
2211
2212 // }}}
2213 // compares {{{
2214 // _S_equal_to {{{
2215 template <typename _Tp, size_t _Np>
2216 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2217 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2218 {
2219 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2220 {
2221 if (__builtin_is_constant_evaluated()
2222 || (__x._M_is_constprop() && __y._M_is_constprop()))
2223 return _MaskImpl::_S_to_bits(
2224 __as_wrapper<_Np>(__x._M_data == __y._M_data));
2225
2226 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2227 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2228 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2229 if constexpr (is_floating_point_v<_Tp>)
2230 {
2231 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2232 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2233 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2234 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2235 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2236 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2237 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2238 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2239 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2240 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2241 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2242 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2243 else
2244 __assert_unreachable<_Tp>();
2245 }
2246 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2247 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2248 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2249 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2250 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2251 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2252 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2253 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2254 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2255 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2256 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2257 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2258 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2259 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2260 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2261 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2262 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2263 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2264 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2265 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2266 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2267 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2268 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2269 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2270 else
2271 __assert_unreachable<_Tp>();
2272 } // }}}
2273 else if (__builtin_is_constant_evaluated())
2274 return _Base::_S_equal_to(__x, __y);
2275 else if constexpr (sizeof(__x) == 8) // {{{
2276 {
2277 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2278 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2279 _MaskMember<_Tp> __r64;
2280 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2281 return __r64;
2282 } // }}}
2283 else
2284 return _Base::_S_equal_to(__x, __y);
2285 }
2286
2287 // }}}
2288 // _S_not_equal_to {{{
2289 template <typename _Tp, size_t _Np>
2290 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2291 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2292 {
2293 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2294 {
2295 if (__builtin_is_constant_evaluated()
2296 || (__x._M_is_constprop() && __y._M_is_constprop()))
2297 return _MaskImpl::_S_to_bits(
2298 __as_wrapper<_Np>(__x._M_data != __y._M_data));
2299
2300 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2301 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2302 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2303 if constexpr (is_floating_point_v<_Tp>)
2304 {
2305 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2306 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2307 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2308 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2309 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2310 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2311 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2312 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2313 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2314 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2315 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2316 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2317 else
2318 __assert_unreachable<_Tp>();
2319 }
2320 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2321 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2322 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2323 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2324 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2325 return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2326 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2327 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2328 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2329 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2330 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2331 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2332 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2333 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2334 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2335 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2336 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2337 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2338 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2339 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2340 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2341 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2342 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2343 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2344 else
2345 __assert_unreachable<_Tp>();
2346 } // }}}
2347 else if (__builtin_is_constant_evaluated())
2348 return _Base::_S_not_equal_to(__x, __y);
2349 else if constexpr (sizeof(__x) == 8)
2350 {
2351 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2352 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2353 _MaskMember<_Tp> __r64;
2354 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2355 return __r64;
2356 }
2357 else
2358 return _Base::_S_not_equal_to(__x, __y);
2359 }
2360
2361 // }}}
2362 // _S_less {{{
2363 template <typename _Tp, size_t _Np>
2364 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2365 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2366 {
2367 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2368 {
2369 if (__builtin_is_constant_evaluated()
2370 || (__x._M_is_constprop() && __y._M_is_constprop()))
2371 return _MaskImpl::_S_to_bits(
2372 __as_wrapper<_Np>(__x._M_data < __y._M_data));
2373
2374 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2375 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2376 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2377 if constexpr (sizeof(__xi) == 64)
2378 {
2379 if constexpr (is_same_v<_Tp, float>)
2380 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2381 else if constexpr (is_same_v<_Tp, double>)
2382 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2383 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2384 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2385 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2386 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2387 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2388 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2389 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2390 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2391 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2392 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2393 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2394 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2395 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2396 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2397 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2398 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2399 else
2400 __assert_unreachable<_Tp>();
2401 }
2402 else if constexpr (sizeof(__xi) == 32)
2403 {
2404 if constexpr (is_same_v<_Tp, float>)
2405 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2406 else if constexpr (is_same_v<_Tp, double>)
2407 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2408 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2409 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2410 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2411 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2412 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2413 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2414 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2415 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2416 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2417 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2418 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2419 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2420 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2421 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2422 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2423 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2424 else
2425 __assert_unreachable<_Tp>();
2426 }
2427 else if constexpr (sizeof(__xi) == 16)
2428 {
2429 if constexpr (is_same_v<_Tp, float>)
2430 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2431 else if constexpr (is_same_v<_Tp, double>)
2432 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2433 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2434 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2435 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2436 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2437 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2438 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2439 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2440 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2441 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2442 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2443 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2444 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2445 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2446 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2447 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2448 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2449 else
2450 __assert_unreachable<_Tp>();
2451 }
2452 else
2453 __assert_unreachable<_Tp>();
2454 } // }}}
2455 else if (__builtin_is_constant_evaluated())
2456 return _Base::_S_less(__x, __y);
2457 else if constexpr (sizeof(__x) == 8)
2458 {
2459 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2460 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2461 _MaskMember<_Tp> __r64;
2462 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2463 return __r64;
2464 }
2465 else
2466 return _Base::_S_less(__x, __y);
2467 }
2468
2469 // }}}
2470 // _S_less_equal {{{
2471 template <typename _Tp, size_t _Np>
2472 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2473 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2474 {
2475 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2476 {
2477 if (__builtin_is_constant_evaluated()
2478 || (__x._M_is_constprop() && __y._M_is_constprop()))
2479 return _MaskImpl::_S_to_bits(
2480 __as_wrapper<_Np>(__x._M_data <= __y._M_data));
2481
2482 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2483 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2484 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2485 if constexpr (sizeof(__xi) == 64)
2486 {
2487 if constexpr (is_same_v<_Tp, float>)
2488 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2489 else if constexpr (is_same_v<_Tp, double>)
2490 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2491 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2492 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi);
2493 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2494 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi);
2495 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2496 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi);
2497 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2498 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi);
2499 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2500 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi);
2501 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2502 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi);
2503 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2504 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi);
2505 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2506 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi);
2507 else
2508 __assert_unreachable<_Tp>();
2509 }
2510 else if constexpr (sizeof(__xi) == 32)
2511 {
2512 if constexpr (is_same_v<_Tp, float>)
2513 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2514 else if constexpr (is_same_v<_Tp, double>)
2515 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2516 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2517 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi);
2518 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2519 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi);
2520 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2521 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi);
2522 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2523 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi);
2524 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2525 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi);
2526 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2527 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi);
2528 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2529 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi);
2530 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2531 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi);
2532 else
2533 __assert_unreachable<_Tp>();
2534 }
2535 else if constexpr (sizeof(__xi) == 16)
2536 {
2537 if constexpr (is_same_v<_Tp, float>)
2538 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2539 else if constexpr (is_same_v<_Tp, double>)
2540 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2541 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2542 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi);
2543 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2544 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi);
2545 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2546 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi);
2547 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2548 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi);
2549 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2550 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi);
2551 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2552 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi);
2553 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2554 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
2555 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2556 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
2557 else
2558 __assert_unreachable<_Tp>();
2559 }
2560 else
2561 __assert_unreachable<_Tp>();
2562 } // }}}
2563 else if (__builtin_is_constant_evaluated())
2564 return _Base::_S_less_equal(__x, __y);
2565 else if constexpr (sizeof(__x) == 8)
2566 {
2567 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2568 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2569 _MaskMember<_Tp> __r64;
2570 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2571 return __r64;
2572 }
2573 else
2574 return _Base::_S_less_equal(__x, __y);
2575 }
2576
2577 // }}} }}}
2578 // negation {{{
2579 template <typename _Tp, size_t _Np>
2580 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2581 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
2582 {
2583 if constexpr (__is_avx512_abi<_Abi>())
2584 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
2585 else
2586 return _Base::_S_negate(__x);
2587 }
2588
2589 // }}}
2590 // math {{{
2591 using _Base::_S_abs;
2592
2593 // _S_sqrt {{{
2594 template <typename _Tp, size_t _Np>
2595 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2596 _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
2597 {
2598 if constexpr (__is_sse_ps<_Tp, _Np>())
2599 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
2600 else if constexpr (__is_sse_pd<_Tp, _Np>())
2601 return _mm_sqrt_pd(__x);
2602 else if constexpr (__is_avx_ps<_Tp, _Np>())
2603 return _mm256_sqrt_ps(__x);
2604 else if constexpr (__is_avx_pd<_Tp, _Np>())
2605 return _mm256_sqrt_pd(__x);
2606 else if constexpr (__is_avx512_ps<_Tp, _Np>())
2607 return _mm512_sqrt_ps(__x);
2608 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2609 return _mm512_sqrt_pd(__x);
2610 else
2611 __assert_unreachable<_Tp>();
2612 }
2613
2614 // }}}
2615 // _S_ldexp {{{
2616 template <typename _Tp, size_t _Np>
2617 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2618 _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
2619 __fixed_size_storage_t<int, _Np> __exp)
2620 {
2621 if constexpr (sizeof(__x) == 64 || __have_avx512vl)
2622 {
2623 const auto __xi = __to_intrin(__x);
2624 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
2625 __cvt;
2626 const auto __expi = __to_intrin(__cvt(__exp));
2627 using _Up = __bool_storage_member_type_t<_Np>;
2628 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up();
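            // __k1 masks the _Np live lanes (all bits set if _Np fills the
            // mask type); the maskz_scalef intrinsics below zero any padding
            // lanes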
2629 if constexpr (sizeof(__xi) == 16)
2630 {
2631 if constexpr (sizeof(_Tp) == 8)
2632 return _mm_maskz_scalef_pd(__k1, __xi, __expi);
2633 else
2634 return _mm_maskz_scalef_ps(__k1, __xi, __expi);
2635 }
2636 else if constexpr (sizeof(__xi) == 32)
2637 {
2638 if constexpr (sizeof(_Tp) == 8)
2639 return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
2640 else
2641 return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
2642 }
2643 else
2644 {
2645 static_assert(sizeof(__xi) == 64);
2646 if constexpr (sizeof(_Tp) == 8)
2647 return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
2648 else
2649 return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
2650 }
2651 }
2652 else
2653 return _Base::_S_ldexp(__x, __exp);
2654 }
2655
2656 // }}}
2657 // _S_trunc {{{
2658 template <typename _Tp, size_t _Np>
2659 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2660 _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2661 {
2662 if constexpr (__is_avx512_ps<_Tp, _Np>())
2663 return _mm512_roundscale_ps(__x, 0x0b);
2664 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2665 return _mm512_roundscale_pd(__x, 0x0b);
2666 else if constexpr (__is_avx_ps<_Tp, _Np>())
2667 return _mm256_round_ps(__x, 0xb);
2668 else if constexpr (__is_avx_pd<_Tp, _Np>())
2669 return _mm256_round_pd(__x, 0xb);
2670 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2671 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
2672 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2673 return _mm_round_pd(__x, 0xb);
2674 else if constexpr (__is_sse_ps<_Tp, _Np>())
2675 {
2676 auto __truncated
2677 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
2678 const auto __no_fractional_values
2679 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
2680 & 0x7f800000u)
2681 < 0x4b000000; // the exponent is so large that no mantissa bits
2682 // signify fractional values (0x3f8 + 23*8 =
2683 // 0x4b0)
2684 return __no_fractional_values ? __truncated : __to_intrin(__x);
2685 }
2686 else
2687 return _Base::_S_trunc(__x);
2688 }
2689
2690 // }}}
2691 // _S_round {{{
2692 template <typename _Tp, size_t _Np>
2693 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2694 _S_round(_SimdWrapper<_Tp, _Np> __x)
2695 {
2696 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
2697 // from zero as required by std::round. Therefore this function is more
2698 // complicated.
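        // Scalar model of the computation below (a standalone sketch; the
        // helper name __round_away is hypothetical and needs <cmath>):
        // truncate, then add a copysigned 1 whenever the dropped fraction is
        // at least one half.
        //
        //   float __round_away(float __x)
        //   {
        //     const float __t = std::trunc(__x);
        //     return std::fabs(__x - __t) >= .5f
        //              ? __t + std::copysign(1.f, __x) : __t;
        //   }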
2699 using _V = __vector_type_t<_Tp, _Np>;
2700 _V __truncated;
2701 if constexpr (__is_avx512_ps<_Tp, _Np>())
2702 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
2703 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2704 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
2705 else if constexpr (__is_avx_ps<_Tp, _Np>())
2706 __truncated = _mm256_round_ps(__x._M_data,
2707 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2708 else if constexpr (__is_avx_pd<_Tp, _Np>())
2709 __truncated = _mm256_round_pd(__x._M_data,
2710 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2711 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2712 __truncated = __auto_bitcast(
2713 _mm_round_ps(__to_intrin(__x),
2714 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
2715 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2716 __truncated
2717 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2718 else if constexpr (__is_sse_ps<_Tp, _Np>())
2719 __truncated = __auto_bitcast(
2720 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
2721 else
2722 return _Base::_S_round(__x);
2723
2724 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
2725 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0
2726
2727 const _V __rounded
2728 = __truncated
2729 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
2730 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
2731 : _V());
2732 if constexpr (__have_sse4_1)
2733 return __rounded;
2734        else // adjust for the limited range of cvttps_epi32 (any |x| >= 2^23 is integral anyway)
2735 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded
2736 : __x._M_data;
2737 }
2738
2739 // }}}
2740 // _S_nearbyint {{{
2741 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2742 _GLIBCXX_SIMD_INTRINSIC static _Tp _S_nearbyint(_Tp __x) noexcept
2743 {
2744 if constexpr (_TVT::template _S_is<float, 16>)
2745 return _mm512_roundscale_ps(__x, 0x0c);
2746 else if constexpr (_TVT::template _S_is<double, 8>)
2747 return _mm512_roundscale_pd(__x, 0x0c);
2748 else if constexpr (_TVT::template _S_is<float, 8>)
2749 return _mm256_round_ps(__x,
2750 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2751 else if constexpr (_TVT::template _S_is<double, 4>)
2752 return _mm256_round_pd(__x,
2753 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2754 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2755 return _mm_round_ps(__x,
2756 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2757 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2758 return _mm_round_pd(__x,
2759 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2760 else
2761 return _Base::_S_nearbyint(__x);
2762 }
2763
2764 // }}}
2765 // _S_rint {{{
2766 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2767 _GLIBCXX_SIMD_INTRINSIC static _Tp _S_rint(_Tp __x) noexcept
2768 {
2769 if constexpr (_TVT::template _S_is<float, 16>)
2770 return _mm512_roundscale_ps(__x, 0x04);
2771 else if constexpr (_TVT::template _S_is<double, 8>)
2772 return _mm512_roundscale_pd(__x, 0x04);
2773 else if constexpr (_TVT::template _S_is<float, 8>)
2774 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2775 else if constexpr (_TVT::template _S_is<double, 4>)
2776 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2777 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2778 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2779 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2780 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2781 else
2782 return _Base::_S_rint(__x);
2783 }
2784
2785 // }}}
2786 // _S_floor {{{
2787 template <typename _Tp, size_t _Np>
2788 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2789 _S_floor(_SimdWrapper<_Tp, _Np> __x)
2790 {
2791 if constexpr (__is_avx512_ps<_Tp, _Np>())
2792 return _mm512_roundscale_ps(__x, 0x09);
2793 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2794 return _mm512_roundscale_pd(__x, 0x09);
2795 else if constexpr (__is_avx_ps<_Tp, _Np>())
2796 return _mm256_round_ps(__x, 0x9);
2797 else if constexpr (__is_avx_pd<_Tp, _Np>())
2798 return _mm256_round_pd(__x, 0x9);
2799 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2800 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9));
2801 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2802 return _mm_round_pd(__x, 0x9);
2803 else
2804 return _Base::_S_floor(__x);
2805 }
2806
2807 // }}}
2808 // _S_ceil {{{
2809 template <typename _Tp, size_t _Np>
2810 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2811 _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2812 {
2813 if constexpr (__is_avx512_ps<_Tp, _Np>())
2814 return _mm512_roundscale_ps(__x, 0x0a);
2815 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2816 return _mm512_roundscale_pd(__x, 0x0a);
2817 else if constexpr (__is_avx_ps<_Tp, _Np>())
2818 return _mm256_round_ps(__x, 0xa);
2819 else if constexpr (__is_avx_pd<_Tp, _Np>())
2820 return _mm256_round_pd(__x, 0xa);
2821 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2822 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa));
2823 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2824 return _mm_round_pd(__x, 0xa);
2825 else
2826 return _Base::_S_ceil(__x);
2827 }
2828
2829 // }}}
2830 // _S_signbit {{{
2831 template <typename _Tp, size_t _Np>
2832 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2833 _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2834 {
2835 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2836 {
2837 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
2838 return _mm512_movepi32_mask(
2839 __intrin_bitcast<__m512i>(__x._M_data));
2840 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
2841 return _mm512_movepi64_mask(
2842 __intrin_bitcast<__m512i>(__x._M_data));
2843 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
2844 return _mm256_movepi32_mask(
2845 __intrin_bitcast<__m256i>(__x._M_data));
2846 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
2847 return _mm256_movepi64_mask(
2848 __intrin_bitcast<__m256i>(__x._M_data));
2849 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4)
2850 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data));
2851 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8)
2852 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data));
2853 }
2854 else if constexpr (__is_avx512_abi<_Abi>())
2855 {
2856 const auto __xi = __to_intrin(__x);
2857 [[maybe_unused]] constexpr auto __k1
2858 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2859 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2860 return _mm_movemask_ps(__xi);
2861 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2862 return _mm_movemask_pd(__xi);
2863 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2864 return _mm256_movemask_ps(__xi);
2865 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2866 return _mm256_movemask_pd(__xi);
2867 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2868 return _mm512_mask_cmplt_epi32_mask(
2869 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2870 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2871 return _mm512_mask_cmplt_epi64_mask(
2872 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2873 else
2874 __assert_unreachable<_Tp>();
2875 }
2876 else
2877 return _Base::_S_signbit(__x);
2878 /*{
2879 using _I = __int_for_sizeof_t<_Tp>;
2880 if constexpr (sizeof(__x) == 64)
2881 return _S_less(__vector_bitcast<_I>(__x), _I());
2882 else
2883 {
2884 const auto __xx = __vector_bitcast<_I>(__x._M_data);
2885 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>;
2886 if constexpr ((sizeof(_Tp) == 4 &&
2887 (__have_avx2 || sizeof(__x) == 16)) ||
2888 __have_avx512vl)
2889 {
2890 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>);
2891 }
2892 else if constexpr ((__have_avx2 ||
2893 (__have_ssse3 && sizeof(__x) == 16)))
2894 {
2895 return __vector_bitcast<_Tp>((__xx & __signmask) ==
2896 __signmask);
2897 }
2898 else
2899 { // SSE2/3 or AVX (w/o AVX2)
2900 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1);
2901 return __vector_bitcast<_Tp>(
2902 __vector_bitcast<_Tp>(
2903 (__xx & __signmask) |
2904 __vector_bitcast<_I>(__one)) // -1 or 1
2905 != __one);
2906 }
2907 }
2908 }*/
2909 }
2910
2911 // }}}
2912 // _S_isnonzerovalue_mask {{{
2913  // ((isnormal || issubnormal) == (!isinf && !isnan && !iszero))
2914 template <typename _Tp>
2915 _GLIBCXX_SIMD_INTRINSIC static auto _S_isnonzerovalue_mask(_Tp __x)
2916 {
2917 using _Traits = _VectorTraits<_Tp>;
2918 if constexpr (__have_avx512dq_vl)
2919 {
2920 if constexpr (_Traits::template _S_is<
2921 float, 2> || _Traits::template _S_is<float, 4>)
2922 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f));
2923 else if constexpr (_Traits::template _S_is<float, 8>)
2924 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f));
2925 else if constexpr (_Traits::template _S_is<float, 16>)
2926 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f));
2927 else if constexpr (_Traits::template _S_is<double, 2>)
2928 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f));
2929 else if constexpr (_Traits::template _S_is<double, 4>)
2930 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f));
2931 else if constexpr (_Traits::template _S_is<double, 8>)
2932 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f));
2933 else
2934 __assert_unreachable<_Tp>();
2935 }
2936 else
2937 {
2938 using _Up = typename _Traits::value_type;
2939 constexpr size_t _Np = _Traits::_S_full_size;
2940 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0
2941 const auto __b = __x * _Up(); // NaN if __x == inf
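	  // A finite, non-zero __x is the only case in which neither __a nor __b
	  // is NaN; the ordered (_CMP_ORD_Q) comparisons below therefore yield
	  // true exactly for non-zero finite values.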
2942 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>())
2943 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b),
2944 _CMP_ORD_Q);
2945 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>())
2946 return __mmask8(0xf
2947 & _mm512_cmp_ps_mask(__auto_bitcast(__a),
2948 __auto_bitcast(__b),
2949 _CMP_ORD_Q));
2950 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>())
2951 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2952 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>())
2953 return __mmask8(0x3
2954 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
2955 __auto_bitcast(__b),
2956 _CMP_ORD_Q));
2957 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>())
2958 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
2959 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>())
2960 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a),
2961 __auto_bitcast(__b),
2962 _CMP_ORD_Q));
2963 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>())
2964 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2965 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>())
2966 return __mmask8(0xf
2967 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
2968 __auto_bitcast(__b),
2969 _CMP_ORD_Q));
2970 else if constexpr (__is_avx512_ps<_Up, _Np>())
2971 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
2972 else if constexpr (__is_avx512_pd<_Up, _Np>())
2973 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2974 else
2975 __assert_unreachable<_Tp>();
2976 }
2977 }
2978
2979 // }}}
2980 // _S_isfinite {{{
2981 template <typename _Tp, size_t _Np>
2982 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2983 _S_isfinite(_SimdWrapper<_Tp, _Np> __x)
2984 {
2985 static_assert(is_floating_point_v<_Tp>);
2986#if !__FINITE_MATH_ONLY__
2987 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2988 {
2989 const auto __xi = __to_intrin(__x);
2990 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2991 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2992 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99);
2993 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2994 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99);
2995 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2996 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99);
2997 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2998 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99);
2999 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3000 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3001 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3002 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3003 }
3004 else if constexpr (__is_avx512_abi<_Abi>())
3005 {
3006 // if all exponent bits are set, __x is either inf or NaN
3007 using _I = __int_for_sizeof_t<_Tp>;
3008 const auto __inf = __vector_bitcast<_I>(
3009 __vector_broadcast<_Np>(__infinity_v<_Tp>));
3010 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf);
3011 }
3012 else
3013#endif
3014 return _Base::_S_isfinite(__x);
3015 }
3016
3017 // }}}
3018 // _S_isinf {{{
3019 template <typename _Tp, size_t _Np>
3020 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3021 _S_isinf(_SimdWrapper<_Tp, _Np> __x)
3022 {
3023#if !__FINITE_MATH_ONLY__
3024 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3025 {
3026 const auto __xi = __to_intrin(__x);
3027 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3028 return _mm512_fpclass_ps_mask(__xi, 0x18);
3029 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3030 return _mm512_fpclass_pd_mask(__xi, 0x18);
3031 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3032 return _mm256_fpclass_ps_mask(__xi, 0x18);
3033 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3034 return _mm256_fpclass_pd_mask(__xi, 0x18);
3035 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3036 return _mm_fpclass_ps_mask(__xi, 0x18);
3037 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3038 return _mm_fpclass_pd_mask(__xi, 0x18);
3039 else
3040 __assert_unreachable<_Tp>();
3041 }
3042 else if constexpr (__have_avx512dq_vl)
3043 {
3044 if constexpr (__is_sse_pd<_Tp, _Np>())
3045 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18));
3046 else if constexpr (__is_avx_pd<_Tp, _Np>())
3047 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18));
3048 else if constexpr (__is_sse_ps<_Tp, _Np>())
3049 return _mm_movm_epi32(
3050 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18));
3051 else if constexpr (__is_avx_ps<_Tp, _Np>())
3052 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18));
3053 else
3054 __assert_unreachable<_Tp>();
3055 }
3056 else
3057#endif
3058 return _Base::_S_isinf(__x);
3059 }
3060
3061 // }}}
3062 // _S_isnormal {{{
3063 template <typename _Tp, size_t _Np>
3064 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3065 _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
3066 {
3067#if __FINITE_MATH_ONLY__
3068 [[maybe_unused]] constexpr int __mode = 0x26;
3069#else
3070 [[maybe_unused]] constexpr int __mode = 0xbf;
3071#endif
3072 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3073 {
3074 const auto __xi = __to_intrin(__x);
3075 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3076 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3077 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode);
3078 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3079 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode);
3080 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3081 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode);
3082 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3083 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode);
3084 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3085 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode);
3086 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3087 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode);
3088 else
3089 __assert_unreachable<_Tp>();
3090 }
3091 else if constexpr (__have_avx512dq)
3092 {
3093 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
3094 return _mm_movm_epi32(
3095 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
3096 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>())
3097 return _mm256_movm_epi32(
3098 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
3099 else if constexpr (__is_avx512_ps<_Tp, _Np>())
3100 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode));
3101 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>())
3102 return _mm_movm_epi64(
3103 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
3104 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>())
3105 return _mm256_movm_epi64(
3106 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
3107 else if constexpr (__is_avx512_pd<_Tp, _Np>())
3108 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode));
3109 else
3110 __assert_unreachable<_Tp>();
3111 }
3112 else if constexpr (__is_avx512_abi<_Abi>())
3113 {
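	  // Without AVX-512DQ: __x is normal iff __norm_min <= |__x| and (unless
	  // __FINITE_MATH_ONLY__) |__x| < infinity. Both checks are done on the
	  // integer representation, whose ordering matches the floating-point
	  // ordering for the non-negative values produced by _S_abs; NaNs compare
	  // greater than infinity and therefore fail the second check.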
3114 using _I = __int_for_sizeof_t<_Tp>;
3115 const auto absn = __vector_bitcast<_I>(_S_abs(__x));
3116 const auto minn = __vector_bitcast<_I>(
3117 __vector_broadcast<_Np>(__norm_min_v<_Tp>));
3118#if __FINITE_MATH_ONLY__
3119 return _S_less_equal<_I, _Np>(minn, absn);
3120#else
3121 const auto infn
3122 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
3123 return __and(_S_less_equal<_I, _Np>(minn, absn),
3124 _S_less<_I, _Np>(absn, infn));
3125#endif
3126 }
3127 else
3128 return _Base::_S_isnormal(__x);
3129 }
3130
3131 // }}}
3132 // _S_isnan {{{
3133 template <typename _Tp, size_t _Np>
3134 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3135 _S_isnan(_SimdWrapper<_Tp, _Np> __x)
3136 { return _S_isunordered(__x, __x); }
3137
3138 // }}}
3139 // _S_isunordered {{{
3140 template <typename _Tp, size_t _Np>
3141 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3142 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
3143 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
3144 {
3145#if __FINITE_MATH_ONLY__
3146 return {}; // false
3147#else
3148 const auto __xi = __to_intrin(__x);
3149 const auto __yi = __to_intrin(__y);
3150 if constexpr (__is_avx512_abi<_Abi>())
3151 {
3152 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3153 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3154 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3155 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3156 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3157 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3158 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3159 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3160 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3161 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3162 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3163 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3164 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3165 }
3166 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3167 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
3168 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3169 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
3170 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3171 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
3172 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3173 return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
3174 else
3175 __assert_unreachable<_Tp>();
3176#endif
3177 }
3178
3179 // }}}
3180 // _S_isgreater {{{
3181 template <typename _Tp, size_t _Np>
3182 static constexpr _MaskMember<_Tp> _S_isgreater(_SimdWrapper<_Tp, _Np> __x,
3183 _SimdWrapper<_Tp, _Np> __y)
3184 {
3185 const auto __xi = __to_intrin(__x);
3186 const auto __yi = __to_intrin(__y);
3187 if constexpr (__is_avx512_abi<_Abi>())
3188 {
3189 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3190 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3191 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3192 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3193 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3194 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3195 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3196 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3197 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3198 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3199 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3200 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3201 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3202 else
3203 __assert_unreachable<_Tp>();
3204 }
3205 else if constexpr (__have_avx)
3206 {
3207 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3208 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3209 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3210 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3211 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3212 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3213 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3214 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3215 else
3216 __assert_unreachable<_Tp>();
3217 }
3218 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3219 && sizeof(_Tp) == 4)
3220 {
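	    // SSE2 without AVX: compare through the integer representation.
	    // IEEE-754 floats use sign-magnitude, so negating the magnitude of
	    // the negative values yields a two's-complement encoding whose
	    // integer order matches the floating-point order; ANDing with the
	    // cmpord mask afterwards drops the NaN lanes.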
3221 const auto __xn = __vector_bitcast<int>(__xi);
3222 const auto __yn = __vector_bitcast<int>(__yi);
3223 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3224 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3225 return __auto_bitcast(
3226 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
3227 }
3228 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3229 && sizeof(_Tp) == 8)
3230 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3231 -_mm_ucomigt_sd(__xi, __yi),
3232 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
3233 _mm_unpackhi_pd(__yi, __yi))};
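	// _mm_ucomigt_sd compares only the low lanes and returns int(1) for
	// greater, int(0) otherwise (including unordered, matching isgreater's
	// quiet-NaN semantics); negating gives the 0/-1 mask element, and
	// unpackhi_pd moves the high lanes into the low position for the second
	// element.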
3234 else
3235 return _Base::_S_isgreater(__x, __y);
3236 }
3237
3238 // }}}
3239 // _S_isgreaterequal {{{
3240 template <typename _Tp, size_t _Np>
3241 static constexpr _MaskMember<_Tp>
3242 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3243 {
3244 const auto __xi = __to_intrin(__x);
3245 const auto __yi = __to_intrin(__y);
3246 if constexpr (__is_avx512_abi<_Abi>())
3247 {
3248 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3249 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3250 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3251 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3252 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3253 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3254 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3255 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3256 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3257 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3258 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3259 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3260 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3261 else
3262 __assert_unreachable<_Tp>();
3263 }
3264 else if constexpr (__have_avx)
3265 {
3266 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3267 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3268 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3269 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3270 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3271 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3272 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3273 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3274 else
3275 __assert_unreachable<_Tp>();
3276 }
3277 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3278 && sizeof(_Tp) == 4)
3279 {
3280 const auto __xn = __vector_bitcast<int>(__xi);
3281 const auto __yn = __vector_bitcast<int>(__yi);
3282 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3283 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3284 return __auto_bitcast(
3285 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp));
3286 }
3287 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3288 && sizeof(_Tp) == 8)
3289 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3290 -_mm_ucomige_sd(__xi, __yi),
3291 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi),
3292 _mm_unpackhi_pd(__yi, __yi))};
3293 else
3294 return _Base::_S_isgreaterequal(__x, __y);
3295 }
3296
3297 // }}}
3298 // _S_isless {{{
3299 template <typename _Tp, size_t _Np>
3300 static constexpr _MaskMember<_Tp> _S_isless(_SimdWrapper<_Tp, _Np> __x,
3301 _SimdWrapper<_Tp, _Np> __y)
3302 {
3303 const auto __xi = __to_intrin(__x);
3304 const auto __yi = __to_intrin(__y);
3305 if constexpr (__is_avx512_abi<_Abi>())
3306 {
3307 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3308 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3309 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3310 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3311 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3312 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3313 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3314 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3315 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3316 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3317 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3318 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3319 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3320 else
3321 __assert_unreachable<_Tp>();
3322 }
3323 else if constexpr (__have_avx)
3324 {
3325 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3326 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3327 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3328 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3329 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3330 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3331 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3332 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3333 else
3334 __assert_unreachable<_Tp>();
3335 }
3336 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3337 && sizeof(_Tp) == 4)
3338 {
3339 const auto __xn = __vector_bitcast<int>(__xi);
3340 const auto __yn = __vector_bitcast<int>(__yi);
3341 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3342 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3343 return __auto_bitcast(
3344 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp));
3345 }
3346 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3347 && sizeof(_Tp) == 8)
3348 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3349 -_mm_ucomigt_sd(__yi, __xi),
3350 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi),
3351 _mm_unpackhi_pd(__xi, __xi))};
3352 else
3353 return _Base::_S_isless(__x, __y);
3354 }
3355
3356 // }}}
3357 // _S_islessequal {{{
3358 template <typename _Tp, size_t _Np>
3359 static constexpr _MaskMember<_Tp>
3360 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3361 {
3362 const auto __xi = __to_intrin(__x);
3363 const auto __yi = __to_intrin(__y);
3364 if constexpr (__is_avx512_abi<_Abi>())
3365 {
3366 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3367 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3368 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3369 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3370 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3371 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3372 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3373 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3374 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3375 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3376 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3377 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3378 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3379 else
3380 __assert_unreachable<_Tp>();
3381 }
3382 else if constexpr (__have_avx)
3383 {
3384 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3385 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3386 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3387 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3388 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3389 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3390 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3391 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3392 else
3393 __assert_unreachable<_Tp>();
3394 }
3395 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3396 && sizeof(_Tp) == 4)
3397 {
3398 const auto __xn = __vector_bitcast<int>(__xi);
3399 const auto __yn = __vector_bitcast<int>(__yi);
3400 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3401 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3402 return __auto_bitcast(
3403 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp));
3404 }
3405 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3406 && sizeof(_Tp) == 8)
3407 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3408 -_mm_ucomige_sd(__yi, __xi),
3409 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi),
3410 _mm_unpackhi_pd(__xi, __xi))};
3411 else
3412 return _Base::_S_islessequal(__x, __y);
3413 }
3414
3415 // }}}
3416 // _S_islessgreater {{{
3417 template <typename _Tp, size_t _Np>
3418 static constexpr _MaskMember<_Tp>
3419 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3420 {
3421 const auto __xi = __to_intrin(__x);
3422 const auto __yi = __to_intrin(__y);
3423 if constexpr (__is_avx512_abi<_Abi>())
3424 {
3425 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3426 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3427 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3428 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3429 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3430 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3431 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3432 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3433 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3434 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3435 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3436 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3437 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3438 else
3439 __assert_unreachable<_Tp>();
3440 }
3441 else if constexpr (__have_avx)
3442 {
3443 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3444 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3445 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3446 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3447 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3448 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3449 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3450 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3451 else
3452 __assert_unreachable<_Tp>();
3453 }
3454 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3455 return __auto_bitcast(
3456 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi)));
3457 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3458 return __to_masktype(
3459 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi)));
3460 else
3461 __assert_unreachable<_Tp>();
3462 }
3463
3464 //}}} }}}
3465 };
3466
3467// }}}
3468// _MaskImplX86Mixin {{{
3469struct _MaskImplX86Mixin
3470{
3471 template <typename _Tp>
3472 using _TypeTag = _Tp*;
3473
3474 using _Base = _MaskImplBuiltinMixin;
3475
3476 // _S_to_maskvector(bool) {{{
3477 template <typename _Up, size_t _ToN = 1, typename _Tp>
3478 _GLIBCXX_SIMD_INTRINSIC static constexpr enable_if_t<
3479 is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>>
3480 _S_to_maskvector(_Tp __x)
3481 {
3482 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3483 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
3484 : __vector_type_t<_Up, _ToN>();
3485 }
3486
3487 // }}}
3488 // _S_to_maskvector(_SanitizedBitMask) {{{
3489 template <typename _Up, size_t _UpN = 0, size_t _Np,
3490 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3491 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3492 _S_to_maskvector(_SanitizedBitMask<_Np> __x)
3493 {
3494 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3495 using _UV = __vector_type_t<_Up, _ToN>;
3496 using _UI = __intrinsic_type_t<_Up, _ToN>;
3497 [[maybe_unused]] const auto __k = __x._M_to_bits();
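      // Strategy: expand the _Np-bit mask into a vector of 0/-1 elements.
      // With AVX-512BW/DQ the vpmovm2[bwdq] instructions do this directly,
      // with plain AVX-512F a zero-masked move (plus packing) is used, and
      // without AVX-512 the bits are spread via shuffles, multiplications, or
      // the broadcast-and-test fallback at the end of this function.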
3498 if constexpr (_Np == 1)
3499 return _S_to_maskvector<_Up, _ToN>(__k);
3500 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3501 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3502 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; });
3503 else if constexpr (sizeof(_Up) == 1)
3504 {
3505 if constexpr (sizeof(_UI) == 16)
3506 {
3507 if constexpr (__have_avx512bw_vl)
3508 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k));
3509 else if constexpr (__have_avx512bw)
3510 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k)));
3511 else if constexpr (__have_avx512f)
3512 {
3513 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3514 auto __as16bits
3515 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3516 __hi256(__as32bits)));
3517 return __intrin_bitcast<_UV>(
3518 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
3519 }
3520 else if constexpr (__have_ssse3)
3521 {
3522 const auto __bitmask = __to_intrin(
3523 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4,
3524 8, 16, 32, 64, 128));
3525 return __intrin_bitcast<_UV>(
3526 __vector_bitcast<_Up>(
3527 _mm_shuffle_epi8(__to_intrin(
3528 __vector_type_t<_ULLong, 2>{__k}),
3529 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1,
3530 1, 1, 1, 1, 1, 1, 1))
3531 & __bitmask)
3532 != 0);
3533 }
3534 // else fall through
3535 }
3536 else if constexpr (sizeof(_UI) == 32)
3537 {
3538 if constexpr (__have_avx512bw_vl)
3539 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k));
3540 else if constexpr (__have_avx512bw)
3541 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k)));
3542 else if constexpr (__have_avx512f)
3543 {
3544 auto __as16bits = // 0 16 1 17 ... 15 31
3545 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()),
3546 16)
3547 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16,
3548 ~__m512i()),
3549 16);
3550 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16(
3551 __lo256(__as16bits),
3552 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ...
3553 );
3554 // deinterleave:
3555 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8(
3556 __0_16_1_17, // 0 16 1 17 2 ...
3557 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9,
3558 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1,
3559 3, 5, 7, 9, 11, 13,
3560 15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3561 // 0-3 8-11 16-19 24-27
3562 // 4-7 12-15 20-23 28-31
3563 }
3564 else if constexpr (__have_avx2)
3565 {
3566 const auto __bitmask
3567 = _mm256_broadcastsi128_si256(__to_intrin(
3568 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
3569 4, 8, 16, 32, 64, 128)));
3570 return __vector_bitcast<_Up>(
3571 __vector_bitcast<_Up>(
3572 _mm256_shuffle_epi8(
3573 _mm256_broadcastsi128_si256(
3574 __to_intrin(__vector_type_t<_ULLong, 2>{__k})),
3575 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3576 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3577 3, 3, 3, 3, 3, 3))
3578 & __bitmask)
3579 != 0);
3580 }
3581 // else fall through
3582 }
3583 else if constexpr (sizeof(_UI) == 64)
3584 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k));
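	// Fallback for up to 8 char elements: multiplying the (at most 4- resp.
	// 7-bit) mask value by 0x00204081 resp. 0x40810204081 -- constants whose
	// set bits are 7 positions apart -- places mask bit i at bit 0 of byte i;
	// masking with 0x01...01 isolates those bits and the final * 0xff widens
	// each 0/1 byte to 0x00/0xff.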
3585 if constexpr (std::min(_ToN, _Np) <= 4)
3586 {
3587 if constexpr (_Np > 7) // avoid overflow
3588 __x &= _SanitizedBitMask<_Np>(0x0f);
3589 const _UInt __char_mask
3590 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL)
3591 * 0xff;
3592 _UV __r = {};
3593 __builtin_memcpy(&__r, &__char_mask,
3594 std::min(sizeof(__r), sizeof(__char_mask)));
3595 return __r;
3596 }
3597 else if constexpr (std::min(_ToN, _Np) <= 7)
3598 {
3599 if constexpr (_Np > 7) // avoid overflow
3600 __x &= _SanitizedBitMask<_Np>(0x7f);
3601 const _ULLong __char_mask
3602 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL)
3603 * 0xff;
3604 _UV __r = {};
3605 __builtin_memcpy(&__r, &__char_mask,
3606 std::min(sizeof(__r), sizeof(__char_mask)));
3607 return __r;
3608 }
3609 }
3610 else if constexpr (sizeof(_Up) == 2)
3611 {
3612 if constexpr (sizeof(_UI) == 16)
3613 {
3614 if constexpr (__have_avx512bw_vl)
3615 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k));
3616 else if constexpr (__have_avx512bw)
3617 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k)));
3618 else if constexpr (__have_avx512f)
3619 {
3620 __m256i __as32bits = {};
3621 if constexpr (__have_avx512vl)
3622 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i());
3623 else
3624 __as32bits
3625 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()));
3626 return __intrin_bitcast<_UV>(
3627 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3628 }
3629 // else fall through
3630 }
3631 else if constexpr (sizeof(_UI) == 32)
3632 {
3633 if constexpr (__have_avx512bw_vl)
3634 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k));
3635 else if constexpr (__have_avx512bw)
3636 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k)));
3637 else if constexpr (__have_avx512f)
3638 {
3639 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3640 return __vector_bitcast<_Up>(
3641 __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3642 __hi256(__as32bits))));
3643 }
3644 // else fall through
3645 }
3646 else if constexpr (sizeof(_UI) == 64)
3647 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k));
3648 }
3649 else if constexpr (sizeof(_Up) == 4)
3650 {
3651 if constexpr (sizeof(_UI) == 16)
3652 {
3653 if constexpr (__have_avx512dq_vl)
3654 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k));
3655 else if constexpr (__have_avx512dq)
3656 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k)));
3657 else if constexpr (__have_avx512vl)
3658 return __intrin_bitcast<_UV>(
3659 _mm_maskz_mov_epi32(__k, ~__m128i()));
3660 else if constexpr (__have_avx512f)
3661 return __intrin_bitcast<_UV>(
3662 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3663 // else fall through
3664 }
3665 else if constexpr (sizeof(_UI) == 32)
3666 {
3667 if constexpr (__have_avx512dq_vl)
3668 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k));
3669 else if constexpr (__have_avx512dq)
3670 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k)));
3671 else if constexpr (__have_avx512vl)
3672 return __vector_bitcast<_Up>(
3673 _mm256_maskz_mov_epi32(__k, ~__m256i()));
3674 else if constexpr (__have_avx512f)
3675 return __vector_bitcast<_Up>(
3676 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3677 // else fall through
3678 }
3679 else if constexpr (sizeof(_UI) == 64)
3680 return __vector_bitcast<_Up>(
3681 __have_avx512dq ? _mm512_movm_epi32(__k)
3682 : _mm512_maskz_mov_epi32(__k, ~__m512i()));
3683 }
3684 else if constexpr (sizeof(_Up) == 8)
3685 {
3686 if constexpr (sizeof(_UI) == 16)
3687 {
3688 if constexpr (__have_avx512dq_vl)
3689 return __vector_bitcast<_Up>(_mm_movm_epi64(__k));
3690 else if constexpr (__have_avx512dq)
3691 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k)));
3692 else if constexpr (__have_avx512vl)
3693 return __vector_bitcast<_Up>(
3694 _mm_maskz_mov_epi64(__k, ~__m128i()));
3695 else if constexpr (__have_avx512f)
3696 return __vector_bitcast<_Up>(
3697 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3698 // else fall through
3699 }
3700 else if constexpr (sizeof(_UI) == 32)
3701 {
3702 if constexpr (__have_avx512dq_vl)
3703 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k));
3704 else if constexpr (__have_avx512dq)
3705 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k)));
3706 else if constexpr (__have_avx512vl)
3707 return __vector_bitcast<_Up>(
3708 _mm256_maskz_mov_epi64(__k, ~__m256i()));
3709 else if constexpr (__have_avx512f)
3710 return __vector_bitcast<_Up>(
3711 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3712 // else fall through
3713 }
3714 else if constexpr (sizeof(_UI) == 64)
3715 return __vector_bitcast<_Up>(
3716 __have_avx512dq ? _mm512_movm_epi64(__k)
3717 : _mm512_maskz_mov_epi64(__k, ~__m512i()));
3718 }
3719
3720 using _UpUInt = make_unsigned_t<_Up>;
3721 using _V = __vector_type_t<_UpUInt, _ToN>;
3722 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__;
3723 if constexpr (_ToN == 2)
3724 {
3725 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])});
3726 }
3727 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32)
3728 {
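	// AVX without AVX2 offers no 256-bit integer operations, so the bit
	// test is done with floating-point bitwise ANDs and an unordered
	// not-equal comparison against zero instead.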
3729 if constexpr (sizeof(_Up) == 4)
3730 return __vector_bitcast<_Up>(_mm256_cmp_ps(
3731 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)),
3732 _mm256_castsi256_ps(_mm256_setr_epi32(
3733 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))),
3734 _mm256_setzero_ps(), _CMP_NEQ_UQ));
3735 else if constexpr (sizeof(_Up) == 8)
3736 return __vector_bitcast<_Up>(_mm256_cmp_pd(
3737 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)),
3738 _mm256_castsi256_pd(
3739 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))),
3740 _mm256_setzero_pd(), _CMP_NEQ_UQ));
3741 else
3742 __assert_unreachable<_Up>();
3743 }
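	// Generic fallback: broadcast the bits to every element, AND each
	// element with its single-bit constant, and compare against zero. When
	// there are more elements than bits per element, __k is first shifted
	// right by a multiple of __bits_per_element per group (second branch).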
3744 else if constexpr (__bits_per_element >= _ToN)
3745 {
3746 constexpr auto __bitmask
3747 = __generate_vector<_V>([](auto __i)
3748 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt
3749 { return __i < _ToN ? 1ull << __i : 0; });
3750 const auto __bits
3751 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
3752 if constexpr (__bits_per_element > _ToN)
3753 return __vector_bitcast<_Up>(__bits) > 0;
3754 else
3755 return __vector_bitcast<_Up>(__bits != 0);
3756 }
3757 else
3758 {
3759 const _V __tmp
3760 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3761 return static_cast<_UpUInt>(
3762 __k >> (__bits_per_element * (__i / __bits_per_element)));
3763 })
3764 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3765 return static_cast<_UpUInt>(1ull
3766 << (__i % __bits_per_element));
3767 }); // mask bit index
3768 return __intrin_bitcast<_UV>(__tmp != _V());
3769 }
3770 }
3771
3772 // }}}
3773 // _S_to_maskvector(_SimdWrapper) {{{
3774 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
3775 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3776 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3777 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
3778 {
3779 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3780 using _TW = _SimdWrapper<_Tp, _Np>;
3781 using _UW = _SimdWrapper<_Up, _ToN>;
3782 using _UI = __intrinsic_type_t<_Up, _ToN>;
3783 if constexpr (is_same_v<_Tp, bool>) // bits -> vector
3784 return _S_to_maskvector<_Up, _ToN>(
3785 _BitMask<_Np>(__x._M_data)._M_sanitized());
3786 // vector -> vector bitcast
3787 else if constexpr (sizeof(_Up) == sizeof(_Tp)
3788 && sizeof(_TW) == sizeof(_UW))
3789 return __wrapper_bitcast<_Up, _ToN>(
3790 _ToN <= _Np
3791 ? __x
3792 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x));
3793 else // vector -> vector {{{
3794 {
3795 if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3796 {
3797 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
3798 return __generate_from_n_evaluations<std::min(_ToN, _Np),
3799 __vector_type_t<_Up, _ToN>>(
3800 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; });
3801 }
3802 using _To = __vector_type_t<_Up, _ToN>;
3803 [[maybe_unused]] constexpr size_t _FromN = _Np;
3804 constexpr int _FromBytes = sizeof(_Tp);
3805 constexpr int _ToBytes = sizeof(_Up);
3806 const auto __k = __x._M_data;
3807
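	  // Mask vectors hold 0 or ~0 in every element, so changing the element
	  // size only needs to replicate or select bytes: widening interleaves
	  // the mask with itself (__interleave128_lo / unpack), narrowing packs
	  // or shuffles the significant bytes together.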
3808 if constexpr (_FromBytes == _ToBytes)
3809 return __intrin_bitcast<_To>(__k);
3810 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16)
3811 { // SSE -> SSE {{{
3812 if constexpr (_FromBytes == 4 && _ToBytes == 8)
3813 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3814 else if constexpr (_FromBytes == 2 && _ToBytes == 8)
3815 {
3816 const auto __y
3817 = __vector_bitcast<int>(__interleave128_lo(__k, __k));
3818 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3819 }
3820 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3821 {
3822 auto __y
3823 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3824 auto __z
3825 = __vector_bitcast<int>(__interleave128_lo(__y, __y));
3826 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z));
3827 }
3828 else if constexpr (_FromBytes == 8 && _ToBytes == 4
3829 && __have_sse2)
3830 return __intrin_bitcast<_To>(
3831 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3832 else if constexpr (_FromBytes == 8 && _ToBytes == 4)
3833 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k),
3834 _UI());
3835 else if constexpr (_FromBytes == 2 && _ToBytes == 4)
3836 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3837 else if constexpr (_FromBytes == 1 && _ToBytes == 4)
3838 {
3839 const auto __y
3840 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3841 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3842 }
3843 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
3844 {
3845 if constexpr (__have_sse2 && !__have_ssse3)
3846 return __intrin_bitcast<_To>(_mm_packs_epi32(
3847 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()),
3848 __m128i()));
3849 else
3850 return __intrin_bitcast<_To>(
3851 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(
3852 __vector_bitcast<_Up>(__k)));
3853 }
3854 else if constexpr (_FromBytes == 4 && _ToBytes == 2)
3855 return __intrin_bitcast<_To>(
3856 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3857 else if constexpr (_FromBytes == 1 && _ToBytes == 2)
3858 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3859 else if constexpr (_FromBytes == 8 && _ToBytes == 1
3860 && __have_ssse3)
3861 return __intrin_bitcast<_To>(
3862 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3863 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
3864 -1, -1, -1, -1, -1, -1, -1,
3865 -1)));
3866 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
3867 {
3868 auto __y
3869 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3870 __y = _mm_packs_epi32(__y, __m128i());
3871 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3872 }
3873 else if constexpr (_FromBytes == 4 && _ToBytes == 1
3874 && __have_ssse3)
3875 return __intrin_bitcast<_To>(
3876 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3877 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
3878 -1, -1, -1, -1, -1, -1, -1,
3879 -1)));
3880 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
3881 {
3882 const auto __y
3883 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3884 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3885 }
3886 else if constexpr (_FromBytes == 2 && _ToBytes == 1)
3887 return __intrin_bitcast<_To>(
3888 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()));
3889 else
3890 __assert_unreachable<_Tp>();
3891 } // }}}
3892 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32)
3893 { // AVX -> AVX {{{
3894 if constexpr (_FromBytes == _ToBytes)
3895 __assert_unreachable<_Tp>();
3896 else if constexpr (_FromBytes == _ToBytes * 2)
3897 {
3898 const auto __y = __vector_bitcast<_LLong>(__k);
3899 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
3900 _mm_packs_epi16(__lo128(__y), __hi128(__y))));
3901 }
3902 else if constexpr (_FromBytes == _ToBytes * 4)
3903 {
3904 const auto __y = __vector_bitcast<_LLong>(__k);
3905 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
3906 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
3907 __m128i())));
3908 }
3909 else if constexpr (_FromBytes == _ToBytes * 8)
3910 {
3911 const auto __y = __vector_bitcast<_LLong>(__k);
3912 return __intrin_bitcast<_To>(
3913 _mm256_castsi128_si256(_mm_shuffle_epi8(
3914 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
3915 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
3916 -1, -1, -1, -1, -1))));
3917 }
3918 else if constexpr (_FromBytes * 2 == _ToBytes)
3919 {
3920 auto __y = __xzyw(__to_intrin(__k));
3921		  if constexpr (is_floating_point_v<_Tp>
3922				|| (!__have_avx2 && _FromBytes == 4))
3923 {
3924 const auto __yy = __vector_bitcast<float>(__y);
3925 return __intrin_bitcast<_To>(
3926 _mm256_unpacklo_ps(__yy, __yy));
3927 }
3928 else
3929 return __intrin_bitcast<_To>(
3930 _mm256_unpacklo_epi8(__y, __y));
3931 }
3932 else if constexpr (_FromBytes * 4 == _ToBytes)
3933 {
3934 auto __y
3935 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
3936 __lo128(__vector_bitcast<_LLong>(
3937 __k))); // drops 3/4 of input
3938 return __intrin_bitcast<_To>(
3939 __concat(_mm_unpacklo_epi16(__y, __y),
3940 _mm_unpackhi_epi16(__y, __y)));
3941 }
3942 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3943 {
3944 auto __y
3945 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
3946 __lo128(__vector_bitcast<_LLong>(
3947 __k))); // drops 3/4 of input
3948 __y
3949 = _mm_unpacklo_epi16(__y,
3950 __y); // drops another 1/2 => 7/8 total
3951 return __intrin_bitcast<_To>(
3952 __concat(_mm_unpacklo_epi32(__y, __y),
3953 _mm_unpackhi_epi32(__y, __y)));
3954 }
3955 else
3956 __assert_unreachable<_Tp>();
3957 } // }}}
3958 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16)
3959 { // SSE -> AVX {{{
3960 if constexpr (_FromBytes == _ToBytes)
3961 return __intrin_bitcast<_To>(
3962 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>(
3963 __zero_extend(__to_intrin(__k))));
3964 else if constexpr (_FromBytes * 2 == _ToBytes)
3965 { // keep all
3966 return __intrin_bitcast<_To>(
3967 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k),
3968 __vector_bitcast<_LLong>(__k)),
3969 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k),
3970 __vector_bitcast<_LLong>(__k))));
3971 }
3972 else if constexpr (_FromBytes * 4 == _ToBytes)
3973 {
3974 if constexpr (__have_avx2)
3975 {
3976 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
3977 __concat(__vector_bitcast<_LLong>(__k),
3978 __vector_bitcast<_LLong>(__k)),
3979 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
3980 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
3981 6, 6, 7, 7, 7, 7)));
3982 }
3983 else
3984 {
3985 return __intrin_bitcast<_To>(__concat(
3986 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3987 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
3988 2, 2, 2, 2, 3, 3, 3, 3)),
3989 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3990 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
3991 6, 6, 6, 6, 7, 7, 7,
3992 7))));
3993 }
3994 }
3995 else if constexpr (_FromBytes * 8 == _ToBytes)
3996 {
3997 if constexpr (__have_avx2)
3998 {
3999 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4000 __concat(__vector_bitcast<_LLong>(__k),
4001 __vector_bitcast<_LLong>(__k)),
4002 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
4003 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
4004 3, 3, 3, 3, 3, 3)));
4005 }
4006 else
4007 {
4008 return __intrin_bitcast<_To>(__concat(
4009 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4010 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
4011 1, 1, 1, 1, 1, 1, 1, 1)),
4012 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4013 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
4014 3, 3, 3, 3, 3, 3, 3,
4015 3))));
4016 }
4017 }
4018 else if constexpr (_FromBytes == _ToBytes * 2)
4019 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4020 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()))));
4021 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
4022 {
4023 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4024 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4025 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
4026 -1, -1, -1, -1, -1, -1, -1,
4027 -1)))));
4028 }
4029 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4030 {
4031 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4032 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4033 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4034 -1, -1, -1, -1, -1, -1, -1,
4035 -1)))));
4036 }
4037 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4038 {
4039 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4040 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4041 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
4042 -1, -1, -1, -1, -1, -1, -1,
4043 -1, -1)))));
4044 }
4045 else
4046 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4047 } // }}}
4048 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32)
4049 { // AVX -> SSE {{{
4050 if constexpr (_FromBytes == _ToBytes)
4051 { // keep low 1/2
4052 return __intrin_bitcast<_To>(__lo128(__k));
4053 }
4054 else if constexpr (_FromBytes == _ToBytes * 2)
4055 { // keep all
4056 auto __y = __vector_bitcast<_LLong>(__k);
4057 return __intrin_bitcast<_To>(
4058 _mm_packs_epi16(__lo128(__y), __hi128(__y)));
4059 }
4060 else if constexpr (_FromBytes == _ToBytes * 4)
4061 { // add 1/2 undef
4062 auto __y = __vector_bitcast<_LLong>(__k);
4063 return __intrin_bitcast<_To>(
4064 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4065 __m128i()));
4066 }
4067 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4068 { // add 3/4 undef
4069 auto __y = __vector_bitcast<_LLong>(__k);
4070 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
4071 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4072 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
4073 -1, -1, -1, -1)));
4074 }
4075 else if constexpr (_FromBytes * 2 == _ToBytes)
4076 { // keep low 1/4
4077 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4078 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4079 }
4080 else if constexpr (_FromBytes * 4 == _ToBytes)
4081 { // keep low 1/8
4082 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4083 __y = _mm_unpacklo_epi8(__y, __y);
4084 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4085 }
4086 else if constexpr (_FromBytes * 8 == _ToBytes)
4087 { // keep low 1/16
4088 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4089 __y = _mm_unpacklo_epi8(__y, __y);
4090 __y = _mm_unpacklo_epi8(__y, __y);
4091 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4092 }
4093 else
4094 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4095 } // }}}
4096 else
4097 return _Base::template _S_to_maskvector<_Up, _ToN>(__x);
4098 /*
4099 if constexpr (_FromBytes > _ToBytes) {
4100 const _To __y = __vector_bitcast<_Up>(__k);
4101 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4102 constexpr int _Stride = _FromBytes / _ToBytes;
4103 return _To{__y[(_Is + 1) * _Stride - 1]...};
4104 }(make_index_sequence<std::min(_ToN, _FromN)>());
4105 } else {
4106 // {0, 0, 1, 1} (_Dups = 2, _Is<4>)
4107 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>)
4108 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>)
4109 // ...
4110 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4111 constexpr int __dup = _ToBytes / _FromBytes;
4112 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...});
4113 }(make_index_sequence<_FromN>());
4114 }
4115 */
4116 } // }}}
4117 }
4118
4119 // }}}
4120 // _S_to_bits {{{
4121 template <typename _Tp, size_t _Np>
4122 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
4123 _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
4124 {
4125 if constexpr (is_same_v<_Tp, bool>)
4126 return _BitMask<_Np>(__x._M_data)._M_sanitized();
4127 else
4128 {
4129 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4130 if (__builtin_is_constant_evaluated()
4131 || __builtin_constant_p(__x._M_data))
4132 {
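	      // Constant-evaluation / constant-propagation path: the elements
	      // are 0 or -1, so negating yields 0 or 1 and the bitmask can be
	      // assembled with plain shifts and ORs.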
4133 const auto __bools = -__x._M_data;
4134 const _ULLong __k = __call_with_n_evaluations<_Np>(
4135 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4136 return (__bits | ...);
4137 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4138 return _ULLong(__bools[+__i]) << __i;
4139 });
4140 if (__builtin_is_constant_evaluated()
4141 || __builtin_constant_p(__k))
4142 return __k;
4143 }
4144 const auto __xi = __to_intrin(__x);
4145 if constexpr (sizeof(_Tp) == 1)
4146 if constexpr (sizeof(__xi) == 16)
4147 if constexpr (__have_avx512bw_vl)
4148 return _BitMask<_Np>(_mm_movepi8_mask(__xi));
4149 else // implies SSE2
4150 return _BitMask<_Np>(_mm_movemask_epi8(__xi));
4151 else if constexpr (sizeof(__xi) == 32)
4152 if constexpr (__have_avx512bw_vl)
4153 return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
4154 else // implies AVX2
4155 return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
4156 else // implies AVX512BW
4157 return _BitMask<_Np>(_mm512_movepi8_mask(__xi));
4158
4159 else if constexpr (sizeof(_Tp) == 2)
4160 if constexpr (sizeof(__xi) == 16)
4161 if constexpr (__have_avx512bw_vl)
4162 return _BitMask<_Np>(_mm_movepi16_mask(__xi));
4163 else if constexpr (__have_avx512bw)
4164 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4165 else // implies SSE2
4166 return _BitMask<_Np>(
4167 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
4168 else if constexpr (sizeof(__xi) == 32)
4169 if constexpr (__have_avx512bw_vl)
4170 return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
4171 else if constexpr (__have_avx512bw)
4172 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4173 else // implies SSE2
4174 return _BitMask<_Np>(_mm_movemask_epi8(
4175 _mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
4176 else // implies AVX512BW
4177 return _BitMask<_Np>(_mm512_movepi16_mask(__xi));
4178
4179 else if constexpr (sizeof(_Tp) == 4)
4180 if constexpr (sizeof(__xi) == 16)
4181 if constexpr (__have_avx512dq_vl)
4182 return _BitMask<_Np>(_mm_movepi32_mask(__xi));
4183 else if constexpr (__have_avx512vl)
4184 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
4185 else if constexpr (__have_avx512dq)
4186 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4187 else if constexpr (__have_avx512f)
4188 return _BitMask<_Np>(
4189 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4190 else // implies SSE
4191 return _BitMask<_Np>(
4192 _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
4193 else if constexpr (sizeof(__xi) == 32)
4194 if constexpr (__have_avx512dq_vl)
4195 return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
4196 else if constexpr (__have_avx512dq)
4197 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4198 else if constexpr (__have_avx512vl)
4199 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
4200 else if constexpr (__have_avx512f)
4201 return _BitMask<_Np>(
4202 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4203 else // implies AVX
4204 return _BitMask<_Np>(
4205 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
4206	      else // sizeof(__xi) == 64 implies AVX-512F
4207 if constexpr (__have_avx512dq)
4208 return _BitMask<_Np>(_mm512_movepi32_mask(__xi));
4209 else // implies AVX512F
4210 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i()));
4211
4212 else if constexpr (sizeof(_Tp) == 8)
4213 if constexpr (sizeof(__xi) == 16)
4214 if constexpr (__have_avx512dq_vl)
4215 return _BitMask<_Np>(_mm_movepi64_mask(__xi));
4216 else if constexpr (__have_avx512dq)
4217 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4218 else if constexpr (__have_avx512vl)
4219 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i()));
4220 else if constexpr (__have_avx512f)
4221 return _BitMask<_Np>(
4222 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4223 else // implies SSE2
4224 return _BitMask<_Np>(
4225 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
4226 else if constexpr (sizeof(__xi) == 32)
4227 if constexpr (__have_avx512dq_vl)
4228 return _BitMask<_Np>(_mm256_movepi64_mask(__xi));
4229 else if constexpr (__have_avx512dq)
4230 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4231 else if constexpr (__have_avx512vl)
4232 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i()));
4233 else if constexpr (__have_avx512f)
4234 return _BitMask<_Np>(
4235 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4236 else // implies AVX
4237 return _BitMask<_Np>(
4238 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
4239	      else // sizeof(__xi) == 64 implies AVX-512F
4240 if constexpr (__have_avx512dq)
4241 return _BitMask<_Np>(_mm512_movepi64_mask(__xi));
4242 else // implies AVX512F
4243 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i()));
4244
4245 else
4246 __assert_unreachable<_Tp>();
4247 }
4248 }
4249 // }}}
4250};
4251
4252// }}}
4253// _MaskImplX86 {{{
4254template <typename _Abi, typename>
4255 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
4256 {
4257 using _MaskImplX86Mixin::_S_to_bits;
4258 using _MaskImplX86Mixin::_S_to_maskvector;
4259 using _MaskImplBuiltin<_Abi>::_S_convert;
4260
4261 // member types {{{
4262 template <typename _Tp>
4263 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
4264
4265 template <typename _Tp>
4266 using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
4267
4268 template <typename _Tp>
4269 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
4270
4271 using _Base = _MaskImplBuiltin<_Abi>;
4272
4273 // }}}
4274 // _S_broadcast {{{
4275 template <typename _Tp>
4276 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4277 _S_broadcast(bool __x)
4278 {
4279 if constexpr (__is_avx512_abi<_Abi>())
4280 return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1))
4281 : _MaskMember<_Tp>();
4282 else
4283 return _Base::template _S_broadcast<_Tp>(__x);
4284 }
4285
4286 // }}}
4287 // _S_load {{{
4288 template <typename _Tp>
4289 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4290 _S_load(const bool* __mem)
4291 {
4292 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4293 if constexpr (__have_avx512bw)
4294 {
4295 const auto __to_vec_or_bits
4296 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) {
4297 if constexpr (__is_avx512_abi<_Abi>())
4298 return __bits;
4299 else
4300 return _S_to_maskvector<_Tp>(
4301 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
4302 };
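	      // With AVX-512BW the bools are loaded into a vector register and
	      // converted to a bitmask with a single vptestmb (byte != 0); the
	      // VL variants handle the 16- and 32-byte loads.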
4303
4304 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
4305 {
4306 __m128i __a = {};
4307 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4308 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
4309 }
4310 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl)
4311 {
4312 __m256i __a = {};
4313 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4314 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
4315 }
4316 else if constexpr (_S_size<_Tp> <= 64)
4317 {
4318 __m512i __a = {};
4319 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4320 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
4321 }
4322 }
4323 else if constexpr (__is_avx512_abi<_Abi>())
4324 {
4325 if constexpr (_S_size<_Tp> <= 8)
4326 {
4327 __m128i __a = {};
4328 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4329 const auto __b = _mm512_cvtepi8_epi64(__a);
4330 return _mm512_test_epi64_mask(__b, __b);
4331 }
4332 else if constexpr (_S_size<_Tp> <= 16)
4333 {
4334 __m128i __a = {};
4335 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4336 const auto __b = _mm512_cvtepi8_epi32(__a);
4337 return _mm512_test_epi32_mask(__b, __b);
4338 }
4339 else if constexpr (_S_size<_Tp> <= 32)
4340 {
4341 __m128i __a = {};
4342 __builtin_memcpy(&__a, __mem, 16);
4343 const auto __b = _mm512_cvtepi8_epi32(__a);
4344 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16);
4345 const auto __c = _mm512_cvtepi8_epi32(__a);
4346 return _mm512_test_epi32_mask(__b, __b)
4347 | (_mm512_test_epi32_mask(__c, __c) << 16);
4348 }
4349 else if constexpr (_S_size<_Tp> <= 64)
4350 {
4351 __m128i __a = {};
4352 __builtin_memcpy(&__a, __mem, 16);
4353 const auto __b = _mm512_cvtepi8_epi32(__a);
4354 __builtin_memcpy(&__a, __mem + 16, 16);
4355 const auto __c = _mm512_cvtepi8_epi32(__a);
4356 if constexpr (_S_size<_Tp> <= 48)
4357 {
4358 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32);
4359 const auto __d = _mm512_cvtepi8_epi32(__a);
4360 return _mm512_test_epi32_mask(__b, __b)
4361 | (_mm512_test_epi32_mask(__c, __c) << 16)
4362 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
4363 }
4364 else
4365 {
4366		    __builtin_memcpy(&__a, __mem + 32, 16);	       // bools 32..47
4367		    const auto __d = _mm512_cvtepi8_epi32(__a);
4368		    __builtin_memcpy(&__a, __mem + 48, _S_size<_Tp> - 48); // bools 48..
4369 const auto __e = _mm512_cvtepi8_epi32(__a);
4370 return _mm512_test_epi32_mask(__b, __b)
4371 | (_mm512_test_epi32_mask(__c, __c) << 16)
4372 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
4373 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
4374 }
4375 }
4376 else
4377 __assert_unreachable<_Tp>();
4378 }
4379 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2)
4380 return __vector_bitcast<_Tp>(
4381 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]),
4382 -int(__mem[1]), -int(__mem[1])});
4383 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx)
4384 {
4385 int __bool4 = 0;
4386 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>);
4387 const auto __k = __to_intrin(
4388 (__vector_broadcast<4>(__bool4)
4389 & __make_vector<int>(0x1, 0x100, 0x10000,
4390 _S_size<_Tp> == 4 ? 0x1000000 : 0))
4391 != 0);
4392 return __vector_bitcast<_Tp>(
4393 __concat(_mm_unpacklo_epi32(__k, __k),
4394 _mm_unpackhi_epi32(__k, __k)));
4395 }
4396 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4)
4397 {
4398 int __bools = 0;
4399 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>);
4400 if constexpr (__have_sse2)
4401 {
4402 __m128i __k = _mm_cvtsi32_si128(__bools);
4403 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4404 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4405 _mm_unpacklo_epi16(__k, __k));
4406 }
4407 else
4408 {
4409 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
4410 _mm_empty();
4411 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4412 _mm_cmpgt_ps(__k, __m128()));
4413 }
4414 }
4415 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8)
4416 {
4417 __m128i __k = {};
4418 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4419 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4420 return __vector_bitcast<_Tp>(
4421 __concat(_mm_unpacklo_epi16(__k, __k),
4422 _mm_unpackhi_epi16(__k, __k)));
4423 }
4424 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16)
4425 {
4426 __m128i __k = {};
4427 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4428 __k = _mm_cmpgt_epi8(__k, __m128i());
4429 if constexpr (_S_size<_Tp> <= 8)
4430 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4431 _mm_unpacklo_epi8(__k, __k));
4432 else
4433 return __concat(_mm_unpacklo_epi8(__k, __k),
4434 _mm_unpackhi_epi8(__k, __k));
4435 }
4436 else
4437 return _Base::template _S_load<_Tp>(__mem);
4438 }
4439
4440 // }}}
4441 // _S_from_bitmask{{{
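    // For AVX-512 ABIs the sanitized bit mask already is the native mask
    // representation; for vector-mask ABIs the bits are expanded into a
    // vector of 0/-1 elements via _S_to_maskvector.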
4442 template <size_t _Np, typename _Tp>
4443 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
4444 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
4445 {
4446 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4447 if constexpr (__is_avx512_abi<_Abi>())
4448 return __bits._M_to_bits();
4449 else
4450 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
4451 }
4452
4453 // }}}
4454 // _S_masked_load {{{2
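    // Merge bools loaded from __mem into __merge where __mask is set.
    // With AVX512BW/VL the bools are loaded under a k-mask and the 0/1 bytes
    // are turned back into mask state: either via a byte test (bitmask ABIs)
    // or by a masked subtraction from zero (0 - 1 == -1), which yields the
    // 0/-1 lanes of a vector mask after widening to the element size.
    // Without those ISA extensions the load falls back to per-bit iteration
    // or to the generic _Base implementation.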
4455 template <typename _Tp, size_t _Np>
4456 static inline _SimdWrapper<_Tp, _Np>
4457 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
4458 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
4459 {
4460 if constexpr (__is_avx512_abi<_Abi>())
4461 {
4462 if constexpr (__have_avx512bw_vl)
4463 {
4464 if constexpr (_Np <= 16)
4465 {
4466 const auto __a
4467 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem);
4468 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a);
4469 }
4470 else if constexpr (_Np <= 32)
4471 {
4472 const auto __a
4473 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem);
4474 return (__merge & ~__mask)
4475 | _mm256_test_epi8_mask(__a, __a);
4476 }
4477 else if constexpr (_Np <= 64)
4478 {
4479 const auto __a
4480 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem);
4481 return (__merge & ~__mask)
4482 | _mm512_test_epi8_mask(__a, __a);
4483 }
4484 else
4485 __assert_unreachable<_Tp>();
4486 }
4487 else
4488 {
4489 _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4490 __merge._M_set(__i, __mem[__i]);
4491 });
4492 return __merge;
4493 }
4494 }
4495 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1)
4496 {
4497 const auto __k = _S_to_bits(__mask)._M_to_bits();
4498 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(),
4499 _mm256_mask_loadu_epi8(__m256i(),
4500 __k, __mem));
4501 }
4502 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
4503 {
4504 const auto __k = _S_to_bits(__mask)._M_to_bits();
4505 __merge
4506 = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
4507 __m128i(),
4508 _mm_mask_loadu_epi8(__m128i(), __k, __mem));
4509 }
4510 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
4511 {
4512 const auto __k = _S_to_bits(__mask)._M_to_bits();
4513 __merge = _mm256_mask_sub_epi16(
4514 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4515 _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4516 }
4517 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
4518 {
4519 const auto __k = _S_to_bits(__mask)._M_to_bits();
4520 __merge = _mm_mask_sub_epi16(
4521 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4522 _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4523 }
4524 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
4525 {
4526 const auto __k = _S_to_bits(__mask)._M_to_bits();
4527 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
4528 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4529 _mm256_cvtepi8_epi32(
4530 _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4531 }
4532 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
4533 {
4534 const auto __k = _S_to_bits(__mask)._M_to_bits();
4535 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
4536 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4537 _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4538 }
4539 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
4540 {
4541 const auto __k = _S_to_bits(__mask)._M_to_bits();
4542 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
4543 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4544 _mm256_cvtepi8_epi64(
4545 _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4546 }
4547 else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
4548 {
4549 const auto __k = _S_to_bits(__mask)._M_to_bits();
4550 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
4551 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4552 _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4553 }
4554 else
4555 return _Base::_S_masked_load(__merge, __mask, __mem);
4556 return __merge;
4557 }
4558
4559 // _S_store {{{2
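    // Store the mask as an array of bool (one 0/1 byte per element).
    // AVX-512 bitmasks are materialized with maskz_set1_epi8 (or, without
    // AVX512BW, by PDEP-ing up to 8 mask bits into 0x01 bytes, or via a
    // masked epi32->epi8 down-converting store). SSE/AVX vector masks are
    // packed down to bytes and reduced to 0/1 by shifting or masking with 1.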
4560 template <typename _Tp, size_t _Np>
4561 _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __v,
4562 bool* __mem) noexcept
4563 {
4564 if constexpr (__is_avx512_abi<_Abi>())
4565 {
4566 if constexpr (__have_avx512bw_vl)
4567 _CommonImplX86::_S_store<_Np>(
4568 __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4569 if constexpr (_Np <= 16)
4570 return _mm_maskz_set1_epi8(__data, 1);
4571 else if constexpr (_Np <= 32)
4572 return _mm256_maskz_set1_epi8(__data, 1);
4573 else
4574 return _mm512_maskz_set1_epi8(__data, 1);
4575 }(__v._M_data)),
4576 __mem);
4577 else if constexpr (_Np <= 8)
4578 _CommonImplX86::_S_store<_Np>(
4579 __vector_bitcast<char>(
4580#if defined __x86_64__
4581 __make_wrapper<_ULLong>(
4582 _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
4583#else
4584 __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
4585 _pdep_u32(__v._M_data >> 4,
4586 0x01010101U))
4587#endif
4588 ),
4589 __mem);
4590 else if constexpr (_Np <= 16)
4591 _mm512_mask_cvtepi32_storeu_epi8(
4592 __mem, 0xffffu >> (16 - _Np),
4593 _mm512_maskz_set1_epi32(__v._M_data, 1));
4594 else
4595 __assert_unreachable<_Tp>();
4596 }
4597 else if constexpr (__is_sse_abi<_Abi>()) //{{{
4598 {
4599 if constexpr (_Np == 2 && sizeof(_Tp) == 8)
4600 {
4601 const auto __k = __vector_bitcast<int>(__v);
4602 __mem[0] = -__k[1];
4603 __mem[1] = -__k[3];
4604 }
4605 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
4606 {
4607 if constexpr (__have_sse2)
4608 {
4609 const unsigned __bool4
4610 = __vector_bitcast<_UInt>(_mm_packs_epi16(
4611 _mm_packs_epi32(__intrin_bitcast<__m128i>(
4612 __to_intrin(__v)),
4613 __m128i()),
4614 __m128i()))[0]
4615 & 0x01010101u;
4616 __builtin_memcpy(__mem, &__bool4, _Np);
4617 }
4618 else if constexpr (__have_mmx)
4619 {
4620 const __m64 __k = _mm_cvtps_pi8(
4621 __and(__to_intrin(__v), _mm_set1_ps(1.f)));
4622 __builtin_memcpy(__mem, &__k, _Np);
4623 _mm_empty();
4624 }
4625 else
4626 return _Base::_S_store(__v, __mem);
4627 }
4628 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
4629 {
4630 _CommonImplX86::_S_store<_Np>(
4631 __vector_bitcast<char>(_mm_packs_epi16(
4632 __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
4633 __m128i())),
4634 __mem);
4635 }
4636 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
4637 _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
4638 else
4639 __assert_unreachable<_Tp>();
4640 } // }}}
4641 else if constexpr (__is_avx_abi<_Abi>()) // {{{
4642 {
4643 if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
4644 {
4645 auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4646 int __bool4;
4647 if constexpr (__have_avx2)
4648 __bool4 = _mm256_movemask_epi8(__k);
4649 else
4650 __bool4 = (_mm_movemask_epi8(__lo128(__k))
4651 | (_mm_movemask_epi8(__hi128(__k)) << 16));
4652 __bool4 &= 0x01010101;
4653 __builtin_memcpy(__mem, &__bool4, _Np);
4654 }
4655 else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
4656 {
4657 const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4658 const auto __k2
4659 = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
4660 15);
4661 const auto __k3
4662 = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
4663 _CommonImplX86::_S_store<_Np>(__k3, __mem);
4664 }
4665 else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
4666 {
4667 if constexpr (__have_avx2)
4668 {
4669 const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
4670 const auto __bools = __vector_bitcast<char>(
4671 _mm_packs_epi16(__lo128(__x), __hi128(__x)));
4672 _CommonImplX86::_S_store<_Np>(__bools, __mem);
4673 }
4674 else
4675 {
4676 const auto __bools
4677 = 1
4678 & __vector_bitcast<_UChar>(
4679 _mm_packs_epi16(__lo128(__to_intrin(__v)),
4680 __hi128(__to_intrin(__v))));
4681 _CommonImplX86::_S_store<_Np>(__bools, __mem);
4682 }
4683 }
4684 else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
4685 _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
4686 else
4687 __assert_unreachable<_Tp>();
4688 } // }}}
4689 else
4690 __assert_unreachable<_Tp>();
4691 }
4692
4693 // _S_masked_store {{{2
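    // Masked variant of _S_store: only bytes selected by __k are written.
    // For AVX-512 bitmasks the 0/1 bytes are produced with maskz_set1 and
    // written with a masked byte store; other ABIs defer to _Base.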
4694 template <typename _Tp, size_t _Np>
4695 static inline void
4696 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
4697 const _SimdWrapper<_Tp, _Np> __k) noexcept
4698 {
4699 if constexpr (__is_avx512_abi<_Abi>())
4700 {
4701 static_assert(is_same_v<_Tp, bool>);
4702 if constexpr (_Np <= 16 && __have_avx512bw_vl)
4703 _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
4704 else if constexpr (_Np <= 16)
4705 _mm512_mask_cvtepi32_storeu_epi8(__mem, __k,
4706 _mm512_maskz_set1_epi32(__v, 1));
4707 else if constexpr (_Np <= 32 && __have_avx512bw_vl)
4708 _mm256_mask_storeu_epi8(__mem, __k,
4709 _mm256_maskz_set1_epi8(__v, 1));
4710 else if constexpr (_Np <= 32 && __have_avx512bw)
4711 _mm256_mask_storeu_epi8(__mem, __k,
4712 __lo256(_mm512_maskz_set1_epi8(__v, 1)));
4713 else if constexpr (_Np <= 64 && __have_avx512bw)
4714 _mm512_mask_storeu_epi8(__mem, __k,
4715 _mm512_maskz_set1_epi8(__v, 1));
4716 else
4717 __assert_unreachable<_Tp>();
4718 }
4719 else
4720 _Base::_S_masked_store(__v, __mem, __k);
4721 }
4722
4723 // logical and bitwise operators {{{2
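    // For bool (bitmask) members these map directly onto k-register
    // intrinsics, picking the narrowest variant the ISA offers for _Np bits:
    // the 8-bit forms are guarded by AVX512DQ and the 32/64-bit forms by
    // AVX512BW. _S_bit_not uses kandn against the ABI's implicit mask so
    // that padding bits stay clear. Vector masks defer to _Base.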
4724 template <typename _Tp, size_t _Np>
4725 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4726 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x,
4727 const _SimdWrapper<_Tp, _Np>& __y)
4728 {
4729 if constexpr (is_same_v<_Tp, bool>)
4730 {
4731 if constexpr (__have_avx512dq && _Np <= 8)
4732 return _kand_mask8(__x._M_data, __y._M_data);
4733 else if constexpr (_Np <= 16)
4734 return _kand_mask16(__x._M_data, __y._M_data);
4735 else if constexpr (__have_avx512bw && _Np <= 32)
4736 return _kand_mask32(__x._M_data, __y._M_data);
4737 else if constexpr (__have_avx512bw && _Np <= 64)
4738 return _kand_mask64(__x._M_data, __y._M_data);
4739 else
4740 __assert_unreachable<_Tp>();
4741 }
4742 else
4743 return _Base::_S_logical_and(__x, __y);
4744 }
4745
4746 template <typename _Tp, size_t _Np>
4747 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4748 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x,
4749 const _SimdWrapper<_Tp, _Np>& __y)
4750 {
4751 if constexpr (is_same_v<_Tp, bool>)
4752 {
4753 if constexpr (__have_avx512dq && _Np <= 8)
4754 return _kor_mask8(__x._M_data, __y._M_data);
4755 else if constexpr (_Np <= 16)
4756 return _kor_mask16(__x._M_data, __y._M_data);
4757 else if constexpr (__have_avx512bw && _Np <= 32)
4758 return _kor_mask32(__x._M_data, __y._M_data);
4759 else if constexpr (__have_avx512bw && _Np <= 64)
4760 return _kor_mask64(__x._M_data, __y._M_data);
4761 else
4762 __assert_unreachable<_Tp>();
4763 }
4764 else
4765 return _Base::_S_logical_or(__x, __y);
4766 }
4767
4768 template <typename _Tp, size_t _Np>
4769 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4770 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
4771 {
4772 if constexpr (is_same_v<_Tp, bool>)
4773 {
4774 if constexpr (__have_avx512dq && _Np <= 8)
4775 return _kandn_mask8(__x._M_data,
4776 _Abi::template __implicit_mask_n<_Np>());
4777 else if constexpr (_Np <= 16)
4778 return _kandn_mask16(__x._M_data,
4779 _Abi::template __implicit_mask_n<_Np>());
4780 else if constexpr (__have_avx512bw && _Np <= 32)
4781 return _kandn_mask32(__x._M_data,
4782 _Abi::template __implicit_mask_n<_Np>());
4783 else if constexpr (__have_avx512bw && _Np <= 64)
4784 return _kandn_mask64(__x._M_data,
4785 _Abi::template __implicit_mask_n<_Np>());
4786 else
4787 __assert_unreachable<_Tp>();
4788 }
4789 else
4790 return _Base::_S_bit_not(__x);
4791 }
4792
4793 template <typename _Tp, size_t _Np>
4794 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4795 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x,
4796 const _SimdWrapper<_Tp, _Np>& __y)
4797 {
4798 if constexpr (is_same_v<_Tp, bool>)
4799 {
4800 if constexpr (__have_avx512dq && _Np <= 8)
4801 return _kand_mask8(__x._M_data, __y._M_data);
4802 else if constexpr (_Np <= 16)
4803 return _kand_mask16(__x._M_data, __y._M_data);
4804 else if constexpr (__have_avx512bw && _Np <= 32)
4805 return _kand_mask32(__x._M_data, __y._M_data);
4806 else if constexpr (__have_avx512bw && _Np <= 64)
4807 return _kand_mask64(__x._M_data, __y._M_data);
4808 else
4809 __assert_unreachable<_Tp>();
4810 }
4811 else
4812 return _Base::_S_bit_and(__x, __y);
4813 }
4814
4815 template <typename _Tp, size_t _Np>
4816 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4817 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x,
4818 const _SimdWrapper<_Tp, _Np>& __y)
4819 {
4820 if constexpr (is_same_v<_Tp, bool>)
4821 {
4822 if constexpr (__have_avx512dq && _Np <= 8)
4823 return _kor_mask8(__x._M_data, __y._M_data);
4824 else if constexpr (_Np <= 16)
4825 return _kor_mask16(__x._M_data, __y._M_data);
4826 else if constexpr (__have_avx512bw && _Np <= 32)
4827 return _kor_mask32(__x._M_data, __y._M_data);
4828 else if constexpr (__have_avx512bw && _Np <= 64)
4829 return _kor_mask64(__x._M_data, __y._M_data);
4830 else
4831 __assert_unreachable<_Tp>();
4832 }
4833 else
4834 return _Base::_S_bit_or(__x, __y);
4835 }
4836
4837 template <typename _Tp, size_t _Np>
4838 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4839 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x,
4840 const _SimdWrapper<_Tp, _Np>& __y)
4841 {
4842 if constexpr (is_same_v<_Tp, bool>)
4843 {
4844 if constexpr (__have_avx512dq && _Np <= 8)
4845 return _kxor_mask8(__x._M_data, __y._M_data);
4846 else if constexpr (_Np <= 16)
4847 return _kxor_mask16(__x._M_data, __y._M_data);
4848 else if constexpr (__have_avx512bw && _Np <= 32)
4849 return _kxor_mask32(__x._M_data, __y._M_data);
4850 else if constexpr (__have_avx512bw && _Np <= 64)
4851 return _kxor_mask64(__x._M_data, __y._M_data);
4852 else
4853 __assert_unreachable<_Tp>();
4854 }
4855 else
4856 return _Base::_S_bit_xor(__x, __y);
4857 }
4858
4859 //}}}2
4860 // _S_masked_assign{{{
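    // Bitmask blend: lhs = (~k & lhs) | (k & rhs). When rhs is a uniform
    // bool the blend degenerates to a single OR (rhs == true) or ANDNOT
    // (rhs == false).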
4861 template <size_t _Np>
4862 _GLIBCXX_SIMD_INTRINSIC static void
4863 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4864 _SimdWrapper<bool, _Np>& __lhs,
4865 _SimdWrapper<bool, _Np> __rhs)
4866 {
4867 __lhs._M_data
4868 = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
4869 }
4870
4871 template <size_t _Np>
4872 _GLIBCXX_SIMD_INTRINSIC static void
4873 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4874 _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
4875 {
4876 if (__rhs)
4877 __lhs._M_data = __k._M_data | __lhs._M_data;
4878 else
4879 __lhs._M_data = ~__k._M_data & __lhs._M_data;
4880 }
4881
4882 using _MaskImplBuiltin<_Abi>::_S_masked_assign;
4883
4884 //}}}
4885 // _S_all_of {{{
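    // SSE4.1 and later use PTEST: __testc(__a, __b) sets CF iff every bit of
    // the implicit mask __b is also set in __a, i.e. all participating lanes
    // are true. Pre-SSE4.1 compares the movemask result against the all-bits
    // pattern. AVX-512 uses kortestc, OR-ing in the complement of the
    // implicit mask so that non-participating bits read as set.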
4886 template <typename _Tp>
4887 _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
4888 {
4889 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
4890 {
4891 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
4892 using _TI = __intrinsic_type_t<_Tp, _Np>;
4893 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
4894 if constexpr (__have_sse4_1)
4895 {
4896 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
4897 = _Abi::template _S_implicit_mask_intrin<_Tp>();
4898 return 0 != __testc(__a, __b);
4899 }
4900 else if constexpr (is_same_v<_Tp, float>)
4901 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
4902 == (1 << _Np) - 1;
4903 else if constexpr (is_same_v<_Tp, double>)
4904 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
4905 == (1 << _Np) - 1;
4906 else
4907 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
4908 == (1 << (_Np * sizeof(_Tp))) - 1;
4909 }
4910 else if constexpr (__is_avx512_abi<_Abi>())
4911 {
4912 constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
4913 const auto __kk = __k._M_data._M_data;
4914 if constexpr (sizeof(__kk) == 1)
4915 {
4916 if constexpr (__have_avx512dq)
4917 return _kortestc_mask8_u8(__kk, _Mask == 0xff
4918 ? __kk
4919 : __mmask8(~_Mask));
4920 else
4921 return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
4922 }
4923 else if constexpr (sizeof(__kk) == 2)
4924 return _kortestc_mask16_u8(__kk, _Mask == 0xffff
4925 ? __kk
4926 : __mmask16(~_Mask));
4927 else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
4928 return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
4929 ? __kk
4930 : __mmask32(~_Mask));
4931 else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
4932 return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
4933 ? __kk
4934 : __mmask64(~_Mask));
4935 else
4936 __assert_unreachable<_Tp>();
4937 }
4938 }
4939
4940 // }}}
4941 // _S_any_of {{{
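    // Mirror image of _S_all_of: __testz(__a, __b) sets ZF iff (__a & __b)
    // is zero, so "any" is its negation; without SSE4.1 the movemask value
    // is tested against zero. For AVX-512 a plain AND with the implicit
    // mask suffices.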
4942 template <typename _Tp>
4943 _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
4944 {
4945 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
4946 {
4947 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
4948 using _TI = __intrinsic_type_t<_Tp, _Np>;
4949 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
4950 if constexpr (__have_sse4_1)
4951 {
4952 if constexpr (_Abi::template _S_is_partial<
4953 _Tp> || sizeof(__k) < 16)
4954 {
4955 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
4956 = _Abi::template _S_implicit_mask_intrin<_Tp>();
4957 return 0 == __testz(__a, __b);
4958 }
4959 else
4960 return 0 == __testz(__a, __a);
4961 }
4962 else if constexpr (is_same_v<_Tp, float>)
4963 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
4964 else if constexpr (is_same_v<_Tp, double>)
4965 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
4966 else
4967 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
4968 != 0;
4969 }
4970 else if constexpr (__is_avx512_abi<_Abi>())
4971 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
4972 != 0;
4973 }
4974
4975 // }}}
4976 // _S_none_of {{{
4977 template <typename _Tp>
4978 _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
4979 {
4980 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
4981 {
4982 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
4983 using _TI = __intrinsic_type_t<_Tp, _Np>;
4984 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
4985 if constexpr (__have_sse4_1)
4986 {
4987 if constexpr (_Abi::template _S_is_partial<
4988 _Tp> || sizeof(__k) < 16)
4989 {
4990 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
4991 = _Abi::template _S_implicit_mask_intrin<_Tp>();
4992 return 0 != __testz(__a, __b);
4993 }
4994 else
4995 return 0 != __testz(__a, __a);
4996 }
4997 else if constexpr (is_same_v<_Tp, float>)
4998 return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
4999 else if constexpr (is_same_v<_Tp, double>)
5000 return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
5001 else
5002 return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
5003 == 0;
5004 }
5005 else if constexpr (__is_avx512_abi<_Abi>())
5006 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
5007 == 0;
5008 }
5009
5010 // }}}
5011 // _S_some_of {{{
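    // "Some but not all": __testnzc(__a, __b) is true iff both ZF and CF of
    // PTEST are clear, i.e. the implicit mask __b has set bits both inside
    // and outside __a. Without SSE4.1 the movemask value must lie strictly
    // between 0 and the all-bits pattern; AVX-512 composes any_of && !all_of.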
5012 template <typename _Tp>
5013 _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
5014 {
5015 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5016 {
5017 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5018 using _TI = __intrinsic_type_t<_Tp, _Np>;
5019 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5020 if constexpr (__have_sse4_1)
5021 {
5022 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5023 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5024 return 0 != __testnzc(__a, __b);
5025 }
5026 else if constexpr (is_same_v<_Tp, float>)
5027 {
5028 constexpr int __allbits = (1 << _Np) - 1;
5029 const auto __tmp = _mm_movemask_ps(__a) & __allbits;
5030 return __tmp > 0 && __tmp < __allbits;
5031 }
5032 else if constexpr (is_same_v<_Tp, double>)
5033 {
5034 constexpr int __allbits = (1 << _Np) - 1;
5035 const auto __tmp = _mm_movemask_pd(__a) & __allbits;
5036 return __tmp > 0 && __tmp < __allbits;
5037 }
5038 else
5039 {
5040 constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
5041 const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
5042 return __tmp > 0 && __tmp < __allbits;
5043 }
5044 }
5045 else if constexpr (__is_avx512_abi<_Abi>())
5046 return _S_any_of(__k) && !_S_all_of(__k);
5047 else
5048 __assert_unreachable<_Tp>();
5049 }
5050
5051 // }}}
5052 // _S_popcount {{{
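    // AVX-512 popcounts the k-register directly. With POPCNT available the
    // vector mask is reduced via movemask (dividing by sizeof(_Tp) for the
    // byte-granular pmovmskb). Otherwise the 0/-1 lanes are summed
    // horizontally, yielding the negated element count; 32-byte masks are
    // first folded into 16 bytes by adding the two halves, which is why the
    // summation note below matters.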
5053 template <typename _Tp>
5054 _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
5055 {
5056 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5057 const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
5058 if constexpr (__is_avx512_abi<_Abi>())
5059 {
5060 if constexpr (_Np > 32)
5061 return __builtin_popcountll(__kk);
5062 else
5063 return __builtin_popcount(__kk);
5064 }
5065 else
5066 {
5067 if constexpr (__have_popcnt)
5068 {
5069 int __bits
5070 = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
5071 const int __count = __builtin_popcount(__bits);
5072 return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
5073 }
5074 else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
5075 {
// x - (x >> 1) is the popcount of the two movemask_pd bits
5076 const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
5077 return __mask - (__mask >> 1);
5078 }
5079 else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
5080 {
5081 auto __x = -(__lo128(__kk) + __hi128(__kk));
5082 return __x[0] + __x[1];
5083 }
5084 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
5085 {
5086 if constexpr (__have_sse2)
5087 {
5088 __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
5089 __x = _mm_add_epi32(
5090 __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5091 __x = _mm_add_epi32(
5092 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
5093 return -_mm_cvtsi128_si32(__x);
5094 }
5095 else
5096 return __builtin_popcount(
5097 _mm_movemask_ps(__auto_bitcast(__kk)));
5098 }
5099 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
5100 {
5101 auto __x = __to_intrin(__kk);
5102 __x = _mm_add_epi16(__x,
5103 _mm_shuffle_epi32(__x,
5104 _MM_SHUFFLE(0, 1, 2, 3)));
5105 __x = _mm_add_epi16(
5106 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5107 __x = _mm_add_epi16(
5108 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
5109 return -short(_mm_extract_epi16(__x, 0));
5110 }
5111 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
5112 {
5113 auto __x = __to_intrin(__kk);
5114 __x = _mm_add_epi8(__x,
5115 _mm_shuffle_epi32(__x,
5116 _MM_SHUFFLE(0, 1, 2, 3)));
5117 __x = _mm_add_epi8(__x,
5118 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2,
5119 3)));
5120 __x = _mm_add_epi8(__x,
5121 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0,
5122 1)));
5123 auto __y = -__vector_bitcast<_UChar>(__x);
5124 if constexpr (__have_sse4_1)
5125 return __y[0] + __y[1];
5126 else
5127 {
5128 unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
5129 return (__z & 0xff) + (__z >> 8);
5130 }
5131 }
5132 else if constexpr (sizeof(__kk) == 32)
5133 {
5134 // The following works only as long as the implementations above
5135 // use a summation
5136 using _I = __int_for_sizeof_t<_Tp>;
5137 const auto __as_int = __vector_bitcast<_I>(__kk);
5138 return _MaskImplX86<simd_abi::__sse>::_S_popcount(
5139 simd_mask<_I, simd_abi::__sse>(__private_init,
5140 __lo128(__as_int)
5141 + __hi128(__as_int)));
5142 }
5143 else
5144 __assert_unreachable<_Tp>();
5145 }
5146 }
5147
5148 // }}}
5149 // _S_find_first_set {{{
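    // For AVX-512 bitmasks the index of the lowest set bit is just
    // countr_zero of the k-register; other ABIs use the generic fallback.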
5150 template <typename _Tp>
5151 _GLIBCXX_SIMD_INTRINSIC static int
5152 _S_find_first_set(simd_mask<_Tp, _Abi> __k)
5153 {
5154 if constexpr (__is_avx512_abi<_Abi>())
5155 return std::__countr_zero(__k._M_data._M_data);
5156 else
5157 return _Base::_S_find_first_set(__k);
5158 }
5159
5160 // }}}
5161 // _S_find_last_set {{{
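    // Likewise, bit_width(k) - 1 is the index of the highest set bit.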
5162 template <typename _Tp>
5163 _GLIBCXX_SIMD_INTRINSIC static int
5164 _S_find_last_set(simd_mask<_Tp, _Abi> __k)
5165 {
5166 if constexpr (__is_avx512_abi<_Abi>())
5167 return std::__bit_width(__k._M_data._M_data) - 1;
5168 else
5169 return _Base::_S_find_last_set(__k);
5170 }
5171
5172 // }}}
5173 };
5174
5175// }}}
5176
5177_GLIBCXX_SIMD_END_NAMESPACE
5178#endif // __cplusplus >= 201703L
5179#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
5180
5181// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80