1// Simd x86 specific implementations -*- C++ -*-
2
3// Copyright (C) 2020-2023 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
26#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
27
28#if __cplusplus >= 201703L
29
30#if !_GLIBCXX_SIMD_X86INTRIN
31#error \
32 "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
33#endif
34
35_GLIBCXX_SIMD_BEGIN_NAMESPACE
36
37// __to_masktype {{{
38// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
39// __vector_type_t.
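// Illustrative usage sketch (added comment, not part of the implementation):
// for a float vector the mask element type is __int_for_sizeof_t<float>, i.e.
// a 4-byte signed integer type, and the bits are reinterpreted unchanged:
//   __vector_type_t<float, 4> __v = {1.f, 2.f, 3.f, 4.f};
//   auto __k = __to_masktype(__v); // __vector_type_t<__int_for_sizeof_t<float>, 4>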
40template <typename _Tp, size_t _Np>
41 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
42 __to_masktype(_SimdWrapper<_Tp, _Np> __x)
43 { return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(__x._M_data); }
44
45template <typename _TV,
46 typename _TVT
47 = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
48 typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
49 _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
50 __to_masktype(_TV __x)
51 { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
52
53// }}}
54// __interleave128_lo {{{
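// Added note: interleaves the low half of every 128-bit lane of __av and __bv,
// i.e. the generalization of punpckl*/unpcklp* to 32- and 64-byte vectors.
// E.g. for 32-byte float vectors __a = {a0..a7}, __b = {b0..b7} the result is
// {a0, b0, a1, b1, a4, b4, a5, b5}.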
55template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
56 typename _Trait = _VectorTraits<_Tp>>
57 _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
58 __interleave128_lo(const _Ap& __av, const _Bp& __bv)
59 {
60 const _Tp __a(__av);
61 const _Tp __b(__bv);
62 if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
63 return _Tp{__a[0], __b[0]};
64 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
65 return _Tp{__a[0], __b[0], __a[1], __b[1]};
66 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
67 return _Tp{__a[0], __b[0], __a[1], __b[1],
68 __a[2], __b[2], __a[3], __b[3]};
69 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
70 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
71 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
72 __a[6], __b[6], __a[7], __b[7]};
73 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
74 return _Tp{__a[0], __b[0], __a[2], __b[2]};
75 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
76 return _Tp{__a[0], __b[0], __a[1], __b[1],
77 __a[4], __b[4], __a[5], __b[5]};
78 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
79 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
80 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
81 __a[10], __b[10], __a[11], __b[11]};
82 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
83 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
84 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
85 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
86 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
87 __a[22], __b[22], __a[23], __b[23]};
88 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
89 return _Tp{__a[0], __b[0], __a[2], __b[2],
90 __a[4], __b[4], __a[6], __b[6]};
91 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
92 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
93 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
94 __a[12], __b[12], __a[13], __b[13]};
95 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
96 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
97 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
98 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
99 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
100 __a[26], __b[26], __a[27], __b[27]};
101 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
102 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
103 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
104 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
105 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
106 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
107 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
108 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
109 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
110 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
111 __b[55]};
112 else
113 __assert_unreachable<_Tp>();
114 }
115
116// }}}
117// __is_zero{{{
118template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
119 _GLIBCXX_SIMD_INTRINSIC constexpr bool
120 __is_zero(_Tp __a)
121 {
122 if (!__builtin_is_constant_evaluated())
123 {
124 if constexpr (__have_avx)
125 {
126 if constexpr (_TVT::template _S_is<float, 8>)
127 return _mm256_testz_ps(__a, __a);
128 else if constexpr (_TVT::template _S_is<double, 4>)
129 return _mm256_testz_pd(__a, __a);
130 else if constexpr (sizeof(_Tp) == 32)
131 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
132 else if constexpr (_TVT::template _S_is<float>)
133 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
134 else if constexpr (_TVT::template _S_is<double, 2>)
135 return _mm_testz_pd(__a, __a);
136 else
137 return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
138 }
139 else if constexpr (__have_sse4_1)
140 return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
141 __intrin_bitcast<__m128i>(__a));
142 }
143 else if constexpr (sizeof(_Tp) <= 8)
144 return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
145 else
146 {
147 const auto __b = __vector_bitcast<_LLong>(__a);
148 if constexpr (sizeof(__b) == 16)
149 return (__b[0] | __b[1]) == 0;
150 else if constexpr (sizeof(__b) == 32)
151 return __is_zero(__lo128(__b) | __hi128(__b));
152 else if constexpr (sizeof(__b) == 64)
153 return __is_zero(__lo256(__b) | __hi256(__b));
154 else
155 __assert_unreachable<_Tp>();
156 }
157 }
158
159// }}}
160// __movemask{{{
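// Added note: collects the most significant bit of every element (ps/pd) or of
// every byte (the epi8 fallback) into an int. E.g. a 4-float vector whose
// elements 0 and 2 have the sign bit set yields 0b0101 == 5.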
161template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
162 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
163 __movemask(_Tp __a)
164 {
165 if constexpr (sizeof(_Tp) == 32)
166 {
167 if constexpr (_TVT::template _S_is<float>)
168 return _mm256_movemask_ps(__to_intrin(__a));
169 else if constexpr (_TVT::template _S_is<double>)
170 return _mm256_movemask_pd(__to_intrin(__a));
171 else
172 return _mm256_movemask_epi8(__to_intrin(__a));
173 }
174 else if constexpr (_TVT::template _S_is<float>)
175 return _mm_movemask_ps(__to_intrin(__a));
176 else if constexpr (_TVT::template _S_is<double>)
177 return _mm_movemask_pd(__to_intrin(__a));
178 else
179 return _mm_movemask_epi8(__to_intrin(__a));
180 }
181
182// }}}
183// __testz{{{
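// Added summary of the PTEST-style helpers below (on the SSE4.1/AVX paths):
//   __testz(a, b)   != 0 iff (a & b) == 0           (ZF)
//   __testc(a, b)   != 0 iff (b & ~a) == 0          (CF)
//   __testnzc(a, b) != 0 iff neither of the above   (!ZF && !CF)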
184template <typename _TI, typename _TVT = _VectorTraits<_TI>>
185 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
186 __testz(_TI __a, _TI __b)
187 {
188 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
189 _TVT::_S_full_size>>);
190 if (!__builtin_is_constant_evaluated())
191 {
192 if constexpr (sizeof(_TI) == 32)
193 {
194 if constexpr (_TVT::template _S_is<float>)
195 return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
196 else if constexpr (_TVT::template _S_is<double>)
197 return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
198 else
199 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
200 }
201 else if constexpr (_TVT::template _S_is<float> && __have_avx)
202 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
203 else if constexpr (_TVT::template _S_is<double> && __have_avx)
204 return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
205 else if constexpr (__have_sse4_1)
206 return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
207 __intrin_bitcast<__m128i>(__to_intrin(__b)));
208 else
209 return __movemask(0 == __and(__a, __b)) != 0;
210 }
211 else
212 return __is_zero(__and(__a, __b));
213 }
214
215// }}}
216// __testc{{{
217// requires SSE4.1 or above
218template <typename _TI, typename _TVT = _VectorTraits<_TI>>
219 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
220 __testc(_TI __a, _TI __b)
221 {
222 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
223 _TVT::_S_full_size>>);
224 if (__builtin_is_constant_evaluated())
225 return __is_zero(__andnot(__a, __b));
226
227 if constexpr (sizeof(_TI) == 32)
228 {
229 if constexpr (_TVT::template _S_is<float>)
230 return _mm256_testc_ps(__a, __b);
231 else if constexpr (_TVT::template _S_is<double>)
232 return _mm256_testc_pd(__a, __b);
233 else
234 return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
235 }
236 else if constexpr (_TVT::template _S_is<float> && __have_avx)
237 return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
238 else if constexpr (_TVT::template _S_is<double> && __have_avx)
239 return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
240 else
241 {
242 static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
243 return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
244 __intrin_bitcast<__m128i>(__to_intrin(__b)));
245 }
246 }
247
248// }}}
249// __testnzc{{{
250template <typename _TI, typename _TVT = _VectorTraits<_TI>>
251 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
252 __testnzc(_TI __a, _TI __b)
253 {
254 static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
255 _TVT::_S_full_size>>);
256 if (!__builtin_is_constant_evaluated())
257 {
258 if constexpr (sizeof(_TI) == 32)
259 {
260 if constexpr (_TVT::template _S_is<float>)
261 return _mm256_testnzc_ps(__a, __b);
262 else if constexpr (_TVT::template _S_is<double>)
263 return _mm256_testnzc_pd(__a, __b);
264 else
265 return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
266 }
267 else if constexpr (_TVT::template _S_is<float> && __have_avx)
268 return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
269 else if constexpr (_TVT::template _S_is<double> && __have_avx)
270 return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
271 else if constexpr (__have_sse4_1)
272 return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
273 __intrin_bitcast<__m128i>(__to_intrin(__b)));
274 else
275 return __movemask(0 == __and(__a, __b)) == 0
276 && __movemask(0 == __andnot(__a, __b)) == 0;
277 }
278 else
279 return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
280 }
281
282// }}}
283// __xzyw{{{
// Shuffles the complete vector, swapping the inner two quarters. Often useful
// with AVX for fixing up a shuffle result.
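// Added example: a 32-byte vector of 8 ints {0, 1, 2, 3, 4, 5, 6, 7} has the
// 64-bit quarters x = {0,1}, y = {2,3}, z = {4,5}, w = {6,7} and becomes
// {0, 1, 4, 5, 2, 3, 6, 7}, i.e. x z y w.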
286template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
287 _GLIBCXX_SIMD_INTRINSIC _Tp
288 __xzyw(_Tp __a)
289 {
290 if constexpr (sizeof(_Tp) == 16)
291 {
292 const auto __x = __vector_bitcast<conditional_t<
293 is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
294 return reinterpret_cast<_Tp>(
295 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
296 }
297 else if constexpr (sizeof(_Tp) == 32)
298 {
299 const auto __x = __vector_bitcast<conditional_t<
300 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
301 return reinterpret_cast<_Tp>(
302 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
303 }
304 else if constexpr (sizeof(_Tp) == 64)
305 {
306 const auto __x = __vector_bitcast<conditional_t<
307 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
308 return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
309 __x[5], __x[2], __x[3],
310 __x[6], __x[7]});
311 }
312 else
313 __assert_unreachable<_Tp>();
314 }
315
316// }}}
317// __maskload_epi32{{{
318template <typename _Tp>
319 _GLIBCXX_SIMD_INTRINSIC auto
320 __maskload_epi32(const int* __ptr, _Tp __k)
321 {
322 if constexpr (sizeof(__k) == 16)
323 return _mm_maskload_epi32(__ptr, __k);
324 else
325 return _mm256_maskload_epi32(__ptr, __k);
326 }
327
328// }}}
329// __maskload_epi64{{{
330template <typename _Tp>
331 _GLIBCXX_SIMD_INTRINSIC auto
332 __maskload_epi64(const _LLong* __ptr, _Tp __k)
333 {
334 if constexpr (sizeof(__k) == 16)
335 return _mm_maskload_epi64(__ptr, __k);
336 else
337 return _mm256_maskload_epi64(__ptr, __k);
338 }
339
340// }}}
341// __maskload_ps{{{
342template <typename _Tp>
343 _GLIBCXX_SIMD_INTRINSIC auto
344 __maskload_ps(const float* __ptr, _Tp __k)
345 {
346 if constexpr (sizeof(__k) == 16)
347 return _mm_maskload_ps(__ptr, __k);
348 else
349 return _mm256_maskload_ps(__ptr, __k);
350 }
351
352// }}}
353// __maskload_pd{{{
354template <typename _Tp>
355 _GLIBCXX_SIMD_INTRINSIC auto
356 __maskload_pd(const double* __ptr, _Tp __k)
357 {
358 if constexpr (sizeof(__k) == 16)
359 return _mm_maskload_pd(__ptr, __k);
360 else
361 return _mm256_maskload_pd(__ptr, __k);
362 }
363
364// }}}
365
366#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
367#include "simd_x86_conversions.h"
368#endif
369
370// ISA & type detection {{{
371template <typename _Tp, size_t _Np>
372 constexpr bool
373 __is_sse_ps()
374 {
375 return __have_sse
376 && is_same_v<_Tp,
377 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
378 }
379
380template <typename _Tp, size_t _Np>
381 constexpr bool
382 __is_sse_pd()
383 {
384 return __have_sse2
385 && is_same_v<_Tp,
386 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
387 }
388
389template <typename _Tp, size_t _Np>
390 constexpr bool
391 __is_avx_ps()
392 {
393 return __have_avx
394 && is_same_v<_Tp,
395 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
396 }
397
398template <typename _Tp, size_t _Np>
399 constexpr bool
400 __is_avx_pd()
401 {
402 return __have_avx
403 && is_same_v<_Tp,
404 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
405 }
406
407template <typename _Tp, size_t _Np>
408 constexpr bool
409 __is_avx512_ps()
410 {
411 return __have_avx512f
412 && is_same_v<_Tp,
413 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
414 }
415
416template <typename _Tp, size_t _Np>
417 constexpr bool
418 __is_avx512_pd()
419 {
420 return __have_avx512f
421 && is_same_v<_Tp,
422 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
423 }
424
425// }}}
426struct _MaskImplX86Mixin;
427
428// _CommonImplX86 {{{
429struct _CommonImplX86 : _CommonImplBuiltin
430{
431#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
432 // _S_converts_via_decomposition {{{
433 template <typename _From, typename _To, size_t _ToSize>
434 static constexpr bool
435 _S_converts_via_decomposition()
436 {
437 if constexpr (is_integral_v<
438 _From> && is_integral_v<_To> && sizeof(_From) == 8
439 && _ToSize == 16)
440 return (sizeof(_To) == 2 && !__have_ssse3)
441 || (sizeof(_To) == 1 && !__have_avx512f);
442 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
443 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
444 && !__have_avx512dq)
445 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
446 && _ToSize == 16);
447 else if constexpr (
448 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8
449 && !__have_avx512dq)
450 return (sizeof(_To) == 4 && _ToSize == 16)
451 || (sizeof(_To) == 8 && _ToSize < 64);
452 else
453 return false;
454 }
455
456 template <typename _From, typename _To, size_t _ToSize>
457 static inline constexpr bool __converts_via_decomposition_v
458 = _S_converts_via_decomposition<_From, _To, _ToSize>();
459
460 // }}}
461#endif
462 // _S_store {{{
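  // Added note: for byte counts that are a power of two this defers to
  // _CommonImplBuiltin::_S_store; otherwise, with AVX512BW+VL, a single masked
  // store covers exactly _Np * sizeof(_Tp) bytes. Worked example (illustrative):
  // 6 bytes take the epi16 branch and issue
  //   _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - 6 / 2), ...)  // mask 0x07
  // i.e. three 16-bit element stores.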
463 using _CommonImplBuiltin::_S_store;
464
465 template <typename _Tp, size_t _Np>
466 _GLIBCXX_SIMD_INTRINSIC static void
467 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
468 {
469 constexpr size_t _Bytes = _Np * sizeof(_Tp);
470
471 if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
472 {
473 const auto __v = __to_intrin(__x);
474
475 if constexpr (_Bytes & 1)
476 {
477 if constexpr (_Bytes < 16)
478 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
479 __intrin_bitcast<__m128i>(__v));
480 else if constexpr (_Bytes < 32)
481 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
482 __intrin_bitcast<__m256i>(__v));
483 else
484 _mm512_mask_storeu_epi8(__addr,
485 0xffffffffffffffffull >> (64 - _Bytes),
486 __intrin_bitcast<__m512i>(__v));
487 }
488 else if constexpr (_Bytes & 2)
489 {
490 if constexpr (_Bytes < 16)
491 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
492 __intrin_bitcast<__m128i>(__v));
493 else if constexpr (_Bytes < 32)
494 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
495 __intrin_bitcast<__m256i>(__v));
496 else
497 _mm512_mask_storeu_epi16(__addr,
498 0xffffffffull >> (32 - _Bytes / 2),
499 __intrin_bitcast<__m512i>(__v));
500 }
501 else if constexpr (_Bytes & 4)
502 {
503 if constexpr (_Bytes < 16)
504 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
505 __intrin_bitcast<__m128i>(__v));
506 else if constexpr (_Bytes < 32)
507 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
508 __intrin_bitcast<__m256i>(__v));
509 else
510 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
511 __intrin_bitcast<__m512i>(__v));
512 }
513 else
514 {
515 static_assert(
516 _Bytes > 16,
517 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
518 "- 1)) != 0 is impossible");
519 if constexpr (_Bytes < 32)
520 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
521 __intrin_bitcast<__m256i>(__v));
522 else
523 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
524 __intrin_bitcast<__m512i>(__v));
525 }
526 }
527 else
528 _CommonImplBuiltin::_S_store(__x, __addr);
529 }
530
531 // }}}
532 // _S_store_bool_array(_BitMask) {{{
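  // Added note: expands a bitmask into an array of bool (one byte per bit).
  // On the BMI2 path each mask bit is deposited into its own byte, e.g.
  //   _pdep_u32(0b1101, 0x01010101u) == 0x01010001u
  // which stores the bytes {1, 0, 1, 1} in memory order (little endian).
  // The AVX512BW+VL path widens each bit to 0xff via movm_epi8 and then
  // masks with 1 to obtain 0/1 values.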
533 template <size_t _Np, bool _Sanitized>
534 _GLIBCXX_SIMD_INTRINSIC static constexpr void
535 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
536 {
537 if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
538 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
539 [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
540 if constexpr (_Np <= 16)
541 return _mm_movm_epi8(__x._M_to_bits());
542 else if constexpr (_Np <= 32)
543 return _mm256_movm_epi8(__x._M_to_bits());
544 else if constexpr (_Np <= 64)
545 return _mm512_movm_epi8(__x._M_to_bits());
546 else
547 __assert_unreachable<_SizeConstant<_Np>>();
548 }()),
549 __mem);
550 else if constexpr (__have_bmi2)
551 {
552 if constexpr (_Np <= 4)
553 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
554 else
555 __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
556 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
557 constexpr size_t __offset = __i * sizeof(size_t);
558 constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
559 if constexpr (__todo == 1)
560 __mem[__offset] = __x[__offset];
561 else
562 {
563 const auto __bools =
564#ifdef __x86_64__
565 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
566 0x0101010101010101ULL);
567#else // __x86_64__
568 _pdep_u32(
569 __x.template _M_extract<__offset>()._M_to_bits(),
570 0x01010101U);
571#endif // __x86_64__
572 _S_store<__todo>(__bools, __mem + __offset);
573 }
574 });
575 }
576 else if constexpr (__have_sse2 && _Np > 7)
577 __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
578 constexpr int __offset = __i * 16;
579 constexpr int __todo = std::min(16, int(_Np) - __offset);
580 const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
581 __vector_type16_t<_UChar> __bools;
582 if constexpr (__have_avx512f)
583 {
584 auto __as32bits
585 = _mm512_maskz_mov_epi32(__bits, __to_intrin(
586 __vector_broadcast<16>(1)));
587 auto __as16bits
588 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
589 __todo > 8 ? __hi256(__as32bits)
590 : __m256i()));
591 __bools = __vector_bitcast<_UChar>(
592 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
593 }
594 else
595 {
596 using _V = __vector_type_t<_UChar, 16>;
597 auto __tmp = _mm_cvtsi32_si128(__bits);
598 __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
599 __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
600 __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
601 _V __tmp2 = reinterpret_cast<_V>(__tmp);
602 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
603 1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
604 __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
605 }
606 _S_store<__todo>(__bools, __mem + __offset);
607 });
608 else
609 _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
610 }
611
612 // }}}
613 // _S_blend_avx512 {{{
614 // Returns: __k ? __b : __a
615 // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _TV to be a __vector_type_t whose value_type matches the bitmask
617 // __k
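  // Added example: with __k = 0b0101 and 4-element vectors the result is
  // {__b[0], __a[1], __b[2], __a[3]}, i.e. bit __i of __k selects __b[__i].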
618 template <typename _Kp, typename _TV>
619 _GLIBCXX_SIMD_INTRINSIC static _TV
620 _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
621 {
622#ifdef __clang__
623 // FIXME: this does a boolean choice, not a blend
624 return __k ? __a : __b;
625#else
626 static_assert(__is_vector_type_v<_TV>);
627 using _Tp = typename _VectorTraits<_TV>::value_type;
628 static_assert(sizeof(_TV) >= 16);
629 static_assert(sizeof(_Tp) <= 8);
630 using _IntT
631 = conditional_t<(sizeof(_Tp) > 2),
632 conditional_t<sizeof(_Tp) == 4, int, long long>,
633 conditional_t<sizeof(_Tp) == 1, char, short>>;
634 [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
635 [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
636 if constexpr (sizeof(_TV) == 64)
637 {
638 if constexpr (sizeof(_Tp) == 1)
639 return reinterpret_cast<_TV>(
640 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
641 else if constexpr (sizeof(_Tp) == 2)
642 return reinterpret_cast<_TV>(
643 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
644 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
645 return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
646 else if constexpr (sizeof(_Tp) == 4)
647 return reinterpret_cast<_TV>(
648 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
649 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
650 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
651 else if constexpr (sizeof(_Tp) == 8)
652 return reinterpret_cast<_TV>(
653 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
654 }
655 else if constexpr (sizeof(_TV) == 32)
656 {
657 if constexpr (sizeof(_Tp) == 1)
658 return reinterpret_cast<_TV>(
659 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
660 else if constexpr (sizeof(_Tp) == 2)
661 return reinterpret_cast<_TV>(
662 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
663 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
664 return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
665 else if constexpr (sizeof(_Tp) == 4)
666 return reinterpret_cast<_TV>(
667 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
668 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
669 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
670 else if constexpr (sizeof(_Tp) == 8)
671 return reinterpret_cast<_TV>(
672 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
673 }
674 else if constexpr (sizeof(_TV) == 16)
675 {
676 if constexpr (sizeof(_Tp) == 1)
677 return reinterpret_cast<_TV>(
678 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
679 else if constexpr (sizeof(_Tp) == 2)
680 return reinterpret_cast<_TV>(
681 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
682 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
683 return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
684 else if constexpr (sizeof(_Tp) == 4)
685 return reinterpret_cast<_TV>(
686 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
687 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
688 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
689 else if constexpr (sizeof(_Tp) == 8)
690 return reinterpret_cast<_TV>(
691 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
692 }
693#endif
694 }
695
696 // }}}
697 // _S_blend_intrin {{{
698 // Returns: __k ? __b : __a
699 // TODO: reverse __a and __b to match COND_EXPR
700 // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
701 // Bytes wide
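  // Added note: the blendv/pblendvb instructions used here select on the most
  // significant bit of each element (ps/pd) or of each byte (pblendvb), so for
  // a clean per-element blend __k is expected to be element-wise all-ones or
  // all-zero.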
702 template <typename _Tp>
703 _GLIBCXX_SIMD_INTRINSIC static _Tp
704 _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
705 {
706 static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
707 constexpr struct
708 {
709 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
710 __m128 __k) const noexcept
711 {
712 return __builtin_ia32_blendvps(__a, __b, __k);
713 }
714 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
715 __m128d __k) const noexcept
716 {
717 return __builtin_ia32_blendvpd(__a, __b, __k);
718 }
719 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
720 __m128i __k) const noexcept
721 {
722 return reinterpret_cast<__m128i>(
723 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
724 reinterpret_cast<__v16qi>(__b),
725 reinterpret_cast<__v16qi>(__k)));
726 }
727 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
728 __m256 __k) const noexcept
729 {
730 return __builtin_ia32_blendvps256(__a, __b, __k);
731 }
732 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
733 __m256d __k) const noexcept
734 {
735 return __builtin_ia32_blendvpd256(__a, __b, __k);
736 }
737 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
738 __m256i __k) const noexcept
739 {
740 if constexpr (__have_avx2)
741 return reinterpret_cast<__m256i>(
742 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
743 reinterpret_cast<__v32qi>(__b),
744 reinterpret_cast<__v32qi>(__k)));
745 else
746 return reinterpret_cast<__m256i>(
747 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
748 reinterpret_cast<__v8sf>(__b),
749 reinterpret_cast<__v8sf>(__k)));
750 }
751 } __eval;
752 return __eval(__a, __b, __k);
753 }
754
755 // }}}
756 // _S_blend {{{
757 // Returns: __k ? __at1 : __at0
758 // TODO: reverse __at0 and __at1 to match COND_EXPR
759 template <typename _Tp, size_t _Np>
760 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
761 _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
762 _SimdWrapper<_Tp, _Np> __at1)
763 {
764 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
765 if (__k._M_is_constprop() && __at0._M_is_constprop()
766 && __at1._M_is_constprop())
767 return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
768 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
769 return __k[__i] ? __at1[__i] : __at0[__i];
770 });
771 else if constexpr (sizeof(__at0) == 64
772 || (__have_avx512vl && sizeof(__at0) >= 16))
773 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
774 else
775 {
776 static_assert((__have_avx512vl && sizeof(__at0) < 16)
777 || !__have_avx512vl);
778 constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
779 return __vector_bitcast<_Tp, _Np>(
780 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
781 __vector_bitcast<_Tp, __size>(__at1)));
782 }
783 }
784
785 template <typename _Tp, size_t _Np>
786 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
787 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
788 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
789 {
790 const auto __kk = __wrapper_bitcast<_Tp>(__k);
791 if (__builtin_is_constant_evaluated()
792 || (__kk._M_is_constprop() && __at0._M_is_constprop()
793 && __at1._M_is_constprop()))
794 {
795 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
796 if (__r._M_is_constprop())
797 return __r;
798 }
799 if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
800 && (sizeof(_Tp) >= 4 || __have_avx512bw))
801 // convert to bitmask and call overload above
802 return _S_blend(
803 _SimdWrapper<bool, _Np>(
804 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
805 ._M_to_bits()),
806 __at0, __at1);
807 else
808 {
809 // Since GCC does not assume __k to be a mask, using the builtin
810 // conditional operator introduces an extra compare against 0 before
811 // blending. So we rather call the intrinsic here.
812 if constexpr (__have_sse4_1)
813 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
814 __to_intrin(__at1));
815 else
816 return __or(__andnot(__kk, __at0), __and(__kk, __at1));
817 }
818 }
819
820 // }}}
821};
822
823// }}}
824// _SimdImplX86 {{{
825template <typename _Abi, typename>
826 struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
827 {
828 using _Base = _SimdImplBuiltin<_Abi>;
829
830 template <typename _Tp>
831 using _MaskMember = typename _Base::template _MaskMember<_Tp>;
832
833 template <typename _Tp>
834 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
835
836 template <typename _Tp>
837 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
838
839 template <typename _Tp>
840 static constexpr size_t _S_max_store_size
841 = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
842 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32
843 : 16;
844
845 using _MaskImpl = typename _Abi::_MaskImpl;
846
847 // _S_masked_load {{{
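    // Added semantics sketch: for every element __i with __k[__i] set,
    // __merge[__i] is replaced by static_cast<_Tp>(__mem[__i]); unselected
    // elements keep their previous value. The branches below only pick the
    // cheapest masked-load instruction for the given element size and ISA.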
848 template <typename _Tp, size_t _Np, typename _Up>
849 static inline _SimdWrapper<_Tp, _Np>
850 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
851 const _Up* __mem) noexcept
852 {
853 static_assert(_Np == _S_size<_Tp>);
854 if constexpr (is_same_v<_Tp, _Up> || // no conversion
855 (sizeof(_Tp) == sizeof(_Up)
856 && is_integral_v<
857 _Tp> == is_integral_v<_Up>) // conversion via bit
858 // reinterpretation
859 )
860 {
861 [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
862 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
863 && sizeof(_Tp) == 1)
864 {
865 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
866 if constexpr (sizeof(__intrin) == 16)
867 __merge = __vector_bitcast<_Tp, _Np>(
868 _mm_mask_loadu_epi8(__intrin, __kk, __mem));
869 else if constexpr (sizeof(__merge) == 32)
870 __merge = __vector_bitcast<_Tp, _Np>(
871 _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
872 else if constexpr (sizeof(__merge) == 64)
873 __merge = __vector_bitcast<_Tp, _Np>(
874 _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
875 else
876 __assert_unreachable<_Tp>();
877 }
878 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
879 && sizeof(_Tp) == 2)
880 {
881 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
882 if constexpr (sizeof(__intrin) == 16)
883 __merge = __vector_bitcast<_Tp, _Np>(
884 _mm_mask_loadu_epi16(__intrin, __kk, __mem));
885 else if constexpr (sizeof(__intrin) == 32)
886 __merge = __vector_bitcast<_Tp, _Np>(
887 _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
888 else if constexpr (sizeof(__intrin) == 64)
889 __merge = __vector_bitcast<_Tp, _Np>(
890 _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
891 else
892 __assert_unreachable<_Tp>();
893 }
894 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
895 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
896 {
897 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
898 if constexpr (sizeof(__intrin) == 16)
899 __merge = __vector_bitcast<_Tp, _Np>(
900 _mm_mask_loadu_epi32(__intrin, __kk, __mem));
901 else if constexpr (sizeof(__intrin) == 32)
902 __merge = __vector_bitcast<_Tp, _Np>(
903 _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
904 else if constexpr (sizeof(__intrin) == 64)
905 __merge = __vector_bitcast<_Tp, _Np>(
906 _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
907 else
908 __assert_unreachable<_Tp>();
909 }
910 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
911 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
912 {
913 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
914 if constexpr (sizeof(__intrin) == 16)
915 __merge = __vector_bitcast<_Tp, _Np>(
916 _mm_mask_loadu_ps(__intrin, __kk, __mem));
917 else if constexpr (sizeof(__intrin) == 32)
918 __merge = __vector_bitcast<_Tp, _Np>(
919 _mm256_mask_loadu_ps(__intrin, __kk, __mem));
920 else if constexpr (sizeof(__intrin) == 64)
921 __merge = __vector_bitcast<_Tp, _Np>(
922 _mm512_mask_loadu_ps(__intrin, __kk, __mem));
923 else
924 __assert_unreachable<_Tp>();
925 }
926 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
927 && is_integral_v<_Up>)
928 {
929 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
930 __merge
931 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
932 __vector_bitcast<_Tp, _Np>(
933 __maskload_epi32(reinterpret_cast<const int*>(__mem),
934 __to_intrin(__k))));
935 }
936 else if constexpr (__have_avx && sizeof(_Tp) == 4)
937 {
938 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
939 __merge
940 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
941 __vector_bitcast<_Tp, _Np>(
942 __maskload_ps(reinterpret_cast<const float*>(__mem),
943 __to_intrin(__k))));
944 }
945 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
946 && sizeof(_Tp) == 8 && is_integral_v<_Up>)
947 {
948 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
949 if constexpr (sizeof(__intrin) == 16)
950 __merge = __vector_bitcast<_Tp, _Np>(
951 _mm_mask_loadu_epi64(__intrin, __kk, __mem));
952 else if constexpr (sizeof(__intrin) == 32)
953 __merge = __vector_bitcast<_Tp, _Np>(
954 _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
955 else if constexpr (sizeof(__intrin) == 64)
956 __merge = __vector_bitcast<_Tp, _Np>(
957 _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
958 else
959 __assert_unreachable<_Tp>();
960 }
961 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
962 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
963 {
964 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
965 if constexpr (sizeof(__intrin) == 16)
966 __merge = __vector_bitcast<_Tp, _Np>(
967 _mm_mask_loadu_pd(__intrin, __kk, __mem));
968 else if constexpr (sizeof(__intrin) == 32)
969 __merge = __vector_bitcast<_Tp, _Np>(
970 _mm256_mask_loadu_pd(__intrin, __kk, __mem));
971 else if constexpr (sizeof(__intrin) == 64)
972 __merge = __vector_bitcast<_Tp, _Np>(
973 _mm512_mask_loadu_pd(__intrin, __kk, __mem));
974 else
975 __assert_unreachable<_Tp>();
976 }
977 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
978 && is_integral_v<_Up>)
979 {
980 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
981 __merge
982 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
983 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
984 reinterpret_cast<const _LLong*>(__mem),
985 __to_intrin(__k))));
986 }
987 else if constexpr (__have_avx && sizeof(_Tp) == 8)
988 {
989 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
990 __merge
991 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
992 __vector_bitcast<_Tp, _Np>(
993 __maskload_pd(reinterpret_cast<const double*>(__mem),
994 __to_intrin(__k))));
995 }
996 else
997 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
998 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
999 __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1000 });
1001 }
      /* Very uncertain that the following improves anything. Needs
       * benchmarking before it's activated.
1005 else if constexpr (sizeof(_Up) <= 8 && // no long double
1006 !__converts_via_decomposition_v<
1007 _Up, _Tp,
1008 sizeof(__merge)> // conversion via decomposition
1009 // is better handled via the
1010 // bit_iteration fallback below
1011 )
1012 {
1013 // TODO: copy pattern from _S_masked_store, which doesn't resort to
1014 // fixed_size
1015 using _Ap = simd_abi::deduce_t<_Up, _Np>;
1016 using _ATraits = _SimdTraits<_Up, _Ap>;
1017 using _AImpl = typename _ATraits::_SimdImpl;
1018 typename _ATraits::_SimdMember __uncvted{};
1019 typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template
1020 _S_convert<_Up>(__k);
1021 __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
1022 _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
1023 _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
1024 }
1025 */
1026 else
1027 __merge = _Base::_S_masked_load(__merge, __k, __mem);
1028 return __merge;
1029 }
1030
1031 // }}}
1032 // _S_masked_store_nocvt {{{
1033 template <typename _Tp, size_t _Np>
1034 _GLIBCXX_SIMD_INTRINSIC static void
1035 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper<bool, _Np> __k)
1036 {
1037 [[maybe_unused]] const auto __vi = __to_intrin(__v);
1038 if constexpr (sizeof(__vi) == 64)
1039 {
1040 static_assert(sizeof(__v) == 64 && __have_avx512f);
1041 if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
1042 _mm512_mask_storeu_epi8(__mem, __k, __vi);
1043 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
1044 _mm512_mask_storeu_epi16(__mem, __k, __vi);
1045 else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
1046 {
1047 if constexpr (is_integral_v<_Tp>)
1048 _mm512_mask_storeu_epi32(__mem, __k, __vi);
1049 else
1050 _mm512_mask_storeu_ps(__mem, __k, __vi);
1051 }
1052 else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
1053 {
1054 if constexpr (is_integral_v<_Tp>)
1055 _mm512_mask_storeu_epi64(__mem, __k, __vi);
1056 else
1057 _mm512_mask_storeu_pd(__mem, __k, __vi);
1058 }
1059#if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32
1060 // with Skylake-AVX512, __have_avx512bw is true
1061 else if constexpr (__have_sse2)
1062 {
1063 using _M = __vector_type_t<_Tp, _Np>;
1064 using _MVT = _VectorTraits<_M>;
1065 _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
1066 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
1067 reinterpret_cast<char*>(__mem));
1068 _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
1069 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1070 __k._M_data >> 1 * _MVT::_S_full_size)),
1071 reinterpret_cast<char*>(__mem) + 1 * 16);
1072 _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
1073 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1074 __k._M_data >> 2 * _MVT::_S_full_size)),
1075 reinterpret_cast<char*>(__mem) + 2 * 16);
1076 if constexpr (_Np > 48 / sizeof(_Tp))
1077 _mm_maskmoveu_si128(
1078 __auto_bitcast(__extract<3, 4>(__v._M_data)),
1079 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1080 __k._M_data >> 3 * _MVT::_S_full_size)),
1081 reinterpret_cast<char*>(__mem) + 3 * 16);
1082 }
1083#endif
1084 else
1085 __assert_unreachable<_Tp>();
1086 }
1087 else if constexpr (sizeof(__vi) == 32)
1088 {
1089 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1090 _mm256_mask_storeu_epi8(__mem, __k, __vi);
1091 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1092 _mm256_mask_storeu_epi16(__mem, __k, __vi);
1093 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1094 {
1095 if constexpr (is_integral_v<_Tp>)
1096 _mm256_mask_storeu_epi32(__mem, __k, __vi);
1097 else
1098 _mm256_mask_storeu_ps(__mem, __k, __vi);
1099 }
1100 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1101 {
1102 if constexpr (is_integral_v<_Tp>)
1103 _mm256_mask_storeu_epi64(__mem, __k, __vi);
1104 else
1105 _mm256_mask_storeu_pd(__mem, __k, __vi);
1106 }
1107 else if constexpr (__have_avx512f
1108 && (sizeof(_Tp) >= 4 || __have_avx512bw))
1109 {
1110 // use a 512-bit maskstore, using zero-extension of the bitmask
1111 _S_masked_store_nocvt(
1112 _SimdWrapper64<_Tp>(
1113 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
1114 __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1115 }
1116 else
1117 _S_masked_store_nocvt(__v, __mem,
1118 _MaskImpl::template _S_to_maskvector<
1119 __int_for_sizeof_t<_Tp>, _Np>(__k));
1120 }
1121 else if constexpr (sizeof(__vi) == 16)
1122 {
1123 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1124 _mm_mask_storeu_epi8(__mem, __k, __vi);
1125 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1126 _mm_mask_storeu_epi16(__mem, __k, __vi);
1127 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1128 {
1129 if constexpr (is_integral_v<_Tp>)
1130 _mm_mask_storeu_epi32(__mem, __k, __vi);
1131 else
1132 _mm_mask_storeu_ps(__mem, __k, __vi);
1133 }
1134 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
1135 {
1136 if constexpr (is_integral_v<_Tp>)
1137 _mm_mask_storeu_epi64(__mem, __k, __vi);
1138 else
1139 _mm_mask_storeu_pd(__mem, __k, __vi);
1140 }
1141 else if constexpr (__have_avx512f
1142 && (sizeof(_Tp) >= 4 || __have_avx512bw))
1143 {
1144 // use a 512-bit maskstore, using zero-extension of the bitmask
1145 _S_masked_store_nocvt(
1146 _SimdWrapper64<_Tp>(
1147 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
1148 __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
1149 }
1150 else
1151 _S_masked_store_nocvt(__v, __mem,
1152 _MaskImpl::template _S_to_maskvector<
1153 __int_for_sizeof_t<_Tp>, _Np>(__k));
1154 }
1155 else
1156 __assert_unreachable<_Tp>();
1157 }
1158
1159 template <typename _Tp, size_t _Np>
1160 _GLIBCXX_SIMD_INTRINSIC static void
1161 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1162 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
1163 {
1164 if constexpr (sizeof(__v) <= 16)
1165 {
1166 [[maybe_unused]] const auto __vi
1167 = __intrin_bitcast<__m128i>(__as_vector(__v));
1168 [[maybe_unused]] const auto __ki
1169 = __intrin_bitcast<__m128i>(__as_vector(__k));
1170 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1171 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
1172 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1173 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
1174 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1175 && is_integral_v<_Tp>)
1176 _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1177 else if constexpr (__have_avx && sizeof(_Tp) == 4)
1178 _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1179 __vector_bitcast<float>(__vi));
1180 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1181 && is_integral_v<_Tp>)
1182 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
1183 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1184 _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1185 __vector_bitcast<double>(__vi));
1186 else if constexpr (__have_sse2)
1187 _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<char*>(__mem));
1188 }
1189 else if constexpr (sizeof(__v) == 32)
1190 {
1191 [[maybe_unused]] const auto __vi
1192 = __intrin_bitcast<__m256i>(__as_vector(__v));
1193 [[maybe_unused]] const auto __ki
1194 = __intrin_bitcast<__m256i>(__as_vector(__k));
1195 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
1196 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
1197 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1198 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
1199 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1200 && is_integral_v<_Tp>)
1201 _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
1202 else if constexpr (sizeof(_Tp) == 4)
1203 _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
1204 __vector_bitcast<float>(__v));
1205 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1206 && is_integral_v<_Tp>)
1207 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
1208 __vi);
1209 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1210 _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
1211 __vector_bitcast<double>(__v));
1212 else if constexpr (__have_sse2)
1213 {
1214 _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
1215 reinterpret_cast<char*>(__mem));
1216 _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
1217 reinterpret_cast<char*>(__mem) + 16);
1218 }
1219 }
1220 else
1221 __assert_unreachable<_Tp>();
1222 }
1223
1224 // }}}
1225 // _S_masked_store {{{
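    // Added example: with AVX512VL, storing a simd of 64-bit integers to an
    // array of 32-bit integers takes the truncating-store branch below, e.g.
    // _mm256_mask_cvtepi64_storeu_epi32, so no separate conversion is needed.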
1226 template <typename _Tp, size_t _Np, typename _Up>
1227 _GLIBCXX_SIMD_INTRINSIC static void
1228 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
1229 const _MaskMember<_Tp> __k) noexcept
1230 {
1231 if constexpr (is_integral_v<
1232 _Tp> && is_integral_v<_Up> && sizeof(_Tp) > sizeof(_Up)
1233 && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw)
1234 && (sizeof(__v) == 64 || __have_avx512vl))
1235 { // truncating store
1236 const auto __vi = __to_intrin(__v);
1237 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1238 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1239 && sizeof(__vi) == 64)
1240 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1241 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1242 && sizeof(__vi) == 32)
1243 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1244 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1245 && sizeof(__vi) == 16)
1246 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1247 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1248 && sizeof(__vi) == 64)
1249 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1250 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1251 && sizeof(__vi) == 32)
1252 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1253 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1254 && sizeof(__vi) == 16)
1255 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1256 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1257 && sizeof(__vi) == 64)
1258 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1259 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1260 && sizeof(__vi) == 32)
1261 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1262 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1263 && sizeof(__vi) == 16)
1264 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1265 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1266 && sizeof(__vi) == 64)
1267 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1268 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1269 && sizeof(__vi) == 32)
1270 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1271 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1272 && sizeof(__vi) == 16)
1273 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1274 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1275 && sizeof(__vi) == 64)
1276 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1277 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1278 && sizeof(__vi) == 32)
1279 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1280 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1281 && sizeof(__vi) == 16)
1282 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1283 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1284 && sizeof(__vi) == 64)
1285 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1286 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1287 && sizeof(__vi) == 32)
1288 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1289 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1290 && sizeof(__vi) == 16)
1291 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1292 else
1293 __assert_unreachable<_Tp>();
1294 }
1295 else
1296 _Base::_S_masked_store(__v, __mem, __k);
1297 }
1298
1299 // }}}
1300 // _S_multiplies {{{
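    // Added strategy note: x86 has no 8-bit SIMD multiply, so byte vectors are
    // multiplied as 16-bit lanes of the even and odd bytes and blended back
    // together: in each 16-bit lane, __even's low byte is lo_x * lo_y (mod 256)
    // and __odd's high byte is hi_x * hi_y (mod 256). The tiny 2/3/4/8-byte
    // cases are handled in scalar registers or via epi16 widening instead.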
1301 template <typename _V, typename _VVT = _VectorTraits<_V>>
1302 _GLIBCXX_SIMD_INTRINSIC static constexpr _V
1303 _S_multiplies(_V __x, _V __y)
1304 {
1305 using _Tp = typename _VVT::value_type;
1306 if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
1307 || __y._M_is_constprop())
1308 return __as_vector(__x) * __as_vector(__y);
1309 else if constexpr (sizeof(_Tp) == 1)
1310 {
1311 if constexpr (sizeof(_V) == 2)
1312 {
1313 const auto __xs = reinterpret_cast<short>(__x._M_data);
1314 const auto __ys = reinterpret_cast<short>(__y._M_data);
1315 return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
1316 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
1317 }
1318 else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
1319 {
1320 const auto __xi = reinterpret_cast<int>(__x._M_data);
1321 const auto __yi = reinterpret_cast<int>(__y._M_data);
1322 return reinterpret_cast<__vector_type_t<_Tp, 3>>(
1323 ((__xi * __yi) & 0xff)
1324 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1325 | ((__xi >> 16) * (__yi & 0xff0000)));
1326 }
1327 else if constexpr (sizeof(_V) == 4)
1328 {
1329 const auto __xi = reinterpret_cast<int>(__x._M_data);
1330 const auto __yi = reinterpret_cast<int>(__y._M_data);
1331 return reinterpret_cast<__vector_type_t<_Tp, 4>>(
1332 ((__xi * __yi) & 0xff)
1333 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1334 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
1335 | ((__xi >> 24) * (__yi & 0xff000000u)));
1336 }
1337 else if constexpr (sizeof(_V) == 8 && __have_avx2
1338 && is_signed_v<_Tp>)
1339 return __convert<typename _VVT::type>(
1340 __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
1341 * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
1342 else if constexpr (sizeof(_V) == 8 && __have_avx2
1343 && is_unsigned_v<_Tp>)
1344 return __convert<typename _VVT::type>(
1345 __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
1346 * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
1347 else
1348 {
1349 // codegen of `x*y` is suboptimal (as of GCC 9.0.1)
1350 constexpr size_t __full_size = _VVT::_S_full_size;
1351 constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
1352 using _ShortW = _SimdWrapper<short, _Np>;
1353 const _ShortW __even = __vector_bitcast<short, _Np>(__x)
1354 * __vector_bitcast<short, _Np>(__y);
1355 _ShortW __high_byte = _ShortW()._M_data - 256;
1356 //[&]() { asm("" : "+x"(__high_byte._M_data)); }();
1357 const _ShortW __odd
1358 = (__vector_bitcast<short, _Np>(__x) >> 8)
1359 * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
1360 if constexpr (__have_avx512bw && sizeof(_V) > 2)
1361 return _CommonImplX86::_S_blend_avx512(
1362 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
1363 __vector_bitcast<_Tp>(__odd));
1364 else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
1365 return _CommonImplX86::_S_blend_intrin(__to_intrin(
1366 __high_byte),
1367 __to_intrin(__even),
1368 __to_intrin(__odd));
1369 else
1370 return __to_intrin(
1371 __or(__andnot(__high_byte, __even), __odd));
1372 }
1373 }
1374 else
1375 return _Base::_S_multiplies(__x, __y);
1376 }
1377
1378 // }}}
1379 // _S_divides {{{
1380#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
1381 template <typename _Tp, size_t _Np>
1382 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1383 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1384 {
1385 if (!__builtin_is_constant_evaluated()
1386 && !__builtin_constant_p(__y._M_data))
1387 if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
1388 { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
1389 // Note that using floating-point division is likely to raise the
1390 // *Inexact* exception flag and thus appears like an invalid
1391 // "as-if" transformation. However, C++ doesn't specify how the
1392 // fpenv can be observed and points to C. C says that function
1393 // calls are assumed to potentially raise fp exceptions, unless
1394 // documented otherwise. Consequently, operator/, which is a
1395 // function call, may raise fp exceptions.
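          // Added note on exactness: the operands convert exactly to the
          // chosen _Float (double for 32-bit, float for 8/16-bit element
          // types), and the rounding error of the division is too small to
          // push the quotient across an integer boundary, so truncating the
          // floating-point quotient reproduces the integer division result.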
1396 /*const struct _CsrGuard
1397 {
1398 const unsigned _M_data = _mm_getcsr();
1399 _CsrGuard()
1400 {
1401 _mm_setcsr(0x9f80); // turn off FP exceptions and
1402 flush-to-zero
1403 }
1404 ~_CsrGuard() { _mm_setcsr(_M_data); }
1405 } __csr;*/
1406 using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
1407 constexpr size_t __n_intermediate
1408 = std::min(_Np, (__have_avx512f ? 64
1409 : __have_avx ? 32
1410 : 16)
1411 / sizeof(_Float));
1412 using _FloatV = __vector_type_t<_Float, __n_intermediate>;
1413 constexpr size_t __n_floatv
1414 = __div_roundup(_Np, __n_intermediate);
1415 using _R = __vector_type_t<_Tp, _Np>;
1416 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
1417 const auto __yf = __convert_all<_FloatV, __n_floatv>(
1418 _Abi::__make_padding_nonzero(__as_vector(__y)));
1419 return __call_with_n_evaluations<__n_floatv>(
1420 [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1421 return __vector_convert<_R>(__quotients...);
1422 },
1423 [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1424 -> _SimdWrapper<_Float, __n_intermediate>
1425 {
1426#if !defined __clang__ && __GCC_IEC_559 == 0
1427 // If -freciprocal-math is active, using the `/` operator is
1428 // incorrect because it may be translated to an imprecise
1429 // multiplication with reciprocal. We need to use inline
1430 // assembly to force a real division.
1431 _FloatV __r;
1432 if constexpr (__have_avx) // -mno-sse2avx is irrelevant
1433 // because once -mavx is given, GCC
1434 // emits VEX encoded vdivp[sd]
1435 {
1436 if constexpr (sizeof(_Tp) == 4)
1437 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
1438 : "=x"(__r)
1439 : "x"(__xf[__i]), "x"(__yf[__i]));
1440 else
1441 asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
1442 : "=x"(__r)
1443 : "x"(__xf[__i]), "x"(__yf[__i]));
1444 }
1445 else
1446 {
1447 __r = __xf[__i];
1448 if constexpr (sizeof(_Tp) == 4)
1449 asm("divpd\t{%1, %0|%0, %1}"
1450 : "=x"(__r)
1451 : "x"(__yf[__i]));
1452 else
1453 asm("divps\t{%1, %0|%0, %1}"
1454 : "=x"(__r)
1455 : "x"(__yf[__i]));
1456 }
1457 return __r;
1458#else
1459 return __xf[__i] / __yf[__i];
1460#endif
1461 });
1462 }
1463 /* 64-bit int division is potentially optimizable via double division if
1464 * the value in __x is small enough and the conversion between
1465 * int<->double is efficient enough:
1466 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
1467 sizeof(_Tp) == 8)
1468 {
1469 if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1470 {
1471 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
1472 0xffe0'0000'0000'0000ull}))
1473 {
1474 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
1475 }
1476 }
1477 }
1478 */
1479 return _Base::_S_divides(__x, __y);
1480 }
1481 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993
1482
1483 // }}}
1484 // _S_modulus {{{
1485 template <typename _Tp, size_t _Np>
1486 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1487 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1488 {
1489 if (__builtin_is_constant_evaluated()
1490 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
1491 return _Base::_S_modulus(__x, __y);
1492 else
1493 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
1494 }
1495
1496 // }}}
1497 // _S_bit_shift_left {{{
1498 // Notes on UB. C++2a [expr.shift] says:
1499 // -1- [...] The operands shall be of integral or unscoped enumeration type
1500 // and integral promotions are performed. The type of the result is that
1501 // of the promoted left operand. The behavior is undefined if the right
1502 // operand is negative, or greater than or equal to the width of the
1503 // promoted left operand.
1504 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo
1505 // 2^N, where N is the width of the type of the result.
1506 //
1507 // C++17 [expr.shift] says:
1508 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
1509 // bits are zero-filled. If E1 has an unsigned type, the value of the
1510 // result is E1 × 2^E2 , reduced modulo one more than the maximum value
1511 // representable in the result type. Otherwise, if E1 has a signed type
1512 // and non-negative value, and E1 × 2^E2 is representable in the
1513 // corresponding unsigned type of the result type, then that value,
1514 // converted to the result type, is the resulting value; otherwise, the
1515 // behavior is undefined.
1516 //
1517 // Consequences:
1518 // With C++2a signed and unsigned types have the same UB
1519 // characteristics:
1520 // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
1521 //
1522 // With C++17 there's little room for optimizations because the standard
1523 // requires all shifts to happen on promoted integrals (i.e. int). Thus,
1524 // short and char shifts must assume shifts affect bits of neighboring
1525 // values.
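  // Added example: left-shifting a vector of unsigned char by __y = 3 under
  // C++17 must behave as if each element were promoted to int; the byte paths
  // below therefore shift in wider lanes and mask with (0xff << 3) = 0xf8 so
  // that no bits shifted up from the neighbouring lower byte remain.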
1526 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1527 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1528 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1529 _S_bit_shift_left(_Tp __xx, int __y)
1530 {
1531 using _V = typename _TVT::type;
1532 using _Up = typename _TVT::value_type;
1533 _V __x = __xx;
1534 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1535 if (__builtin_is_constant_evaluated())
1536 return __x << __y;
1537#if __cplusplus > 201703
1538 // after C++17, signed shifts have no UB, and behave just like unsigned
1539 // shifts
1540 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>)
1541 return __vector_bitcast<_Up>(
1542 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1543 __y));
1544#endif
1545 else if constexpr (sizeof(_Up) == 1)
1546 {
1547 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894)
1548 if (__builtin_constant_p(__y))
1549 {
1550 if (__y == 0)
1551 return __x;
1552 else if (__y == 1)
1553 return __x + __x;
1554 else if (__y == 2)
1555 {
1556 __x = __x + __x;
1557 return __x + __x;
1558 }
1559 else if (__y > 2 && __y < 8)
1560 {
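              // Shift in wider chunks and mask off the bits that crossed a
              // byte boundary: 0xff << __y, replicated into every byte, keeps
              // exactly the bits that belong to each element.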
1561 if constexpr (sizeof(__x) > sizeof(unsigned))
1562 {
1563 const _UChar __mask = 0xff << __y; // broadcast to every byte by the & below
1564 return __vector_bitcast<_Up>(
1565 __vector_bitcast<_UChar>(
1566 __vector_bitcast<unsigned>(__x) << __y)
1567 & __mask);
1568 }
1569 else
1570 {
1571 const unsigned __mask
1572 = (0xff & (0xff << __y)) * 0x01010101u;
1573 return reinterpret_cast<_V>(
1574 static_cast<__int_for_sizeof_t<_V>>(
1575 unsigned(
1576 reinterpret_cast<__int_for_sizeof_t<_V>>(__x)
1577 << __y)
1578 & __mask));
1579 }
1580 }
1581 else if (__y >= 8 && __y < 32)
1582 return _V();
1583 else
1584 __builtin_unreachable();
1585 }
1586 // General strategy in the following: use an sllv instead of an sll
1587 // instruction, because it's 2 to 4 times faster:
1588 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16)
1589 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8(
1590 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix),
1591 _mm256_set1_epi16(__y))));
1592 else if constexpr (__have_avx512bw && sizeof(__x) == 32)
1593 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1594 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix),
1595 _mm512_set1_epi16(__y))));
1596 else if constexpr (__have_avx512bw && sizeof(__x) == 64)
1597 {
1598 const auto __shift = _mm512_set1_epi16(__y);
1599 return __vector_bitcast<_Up>(
1600 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1601 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)),
1602 _mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1603 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift))));
1604 }
1605 else if constexpr (__have_avx2 && sizeof(__x) == 32)
1606 {
1607#if 1
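            // Build the per-byte mask (0xff << __y): start from 0xff00 in
            // every 16-bit lane, shift it left by __y, then OR in the same
            // value shifted right by 8 so both bytes of the lane carry the
            // mask; a 32-bit lane shift ANDed with this mask implements the
            // byte shift.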
1608 const auto __shift = _mm_cvtsi32_si128(__y);
1609 auto __k
1610 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
1611 __k |= _mm256_srli_epi16(__k, 8);
1612 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift)
1613 & __k);
1614#else
1615 const _Up __k = 0xff << __y;
1616 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y)
1617 & __k;
1618#endif
1619 }
1620 else
1621 {
1622 const auto __shift = _mm_cvtsi32_si128(__y);
1623 auto __k
1624 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
1625 __k |= _mm_srli_epi16(__k, 8);
1626 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k);
1627 }
1628 }
1629 return __x << __y;
1630 }
1631
1632 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1633 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1634 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y)
1635 {
1636 using _V = typename _TVT::type;
1637 using _Up = typename _TVT::value_type;
1638 _V __x = __xx;
1639 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1640 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1641 if (__builtin_is_constant_evaluated())
1642 return __x << __y;
1643#if __cplusplus > 201703
1644 // after C++17, signed shifts have no UB, and behave just like unsigned
1645 // shifts
1646 else if constexpr (is_signed_v<_Up>)
1647 return __vector_bitcast<_Up>(
1648 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1649 __vector_bitcast<make_unsigned_t<_Up>>(__y)));
1650#endif
1651 else if constexpr (sizeof(_Up) == 1)
1652 {
1653 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1654 return __vector_bitcast<_Up>(__concat(
1655 _mm512_cvtepi16_epi8(
1656 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)),
1657 _mm512_cvtepu8_epi16(__lo256(__iy)))),
1658 _mm512_cvtepi16_epi8(
1659 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)),
1660 _mm512_cvtepu8_epi16(__hi256(__iy))))));
1661 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1662 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1663 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix),
1664 _mm512_cvtepu8_epi16(__iy))));
1665 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl)
1666 return __intrin_bitcast<_V>(
1667 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix),
1668 _mm_cvtepu8_epi16(__iy))));
1669 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1670 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1671 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix),
1672 _mm256_cvtepu8_epi16(__iy))));
1673 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1674 return __intrin_bitcast<_V>(
1675 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1676 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)),
1677 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy))))));
1678 else if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1679 {
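            // Only bits 0-2 of __y matter. __y << 5 moves bit 2 into the sign
            // bit of each byte, which is what the pblendvb-based blend keys
            // on: conditionally shift by 4, then 2, then 1, doubling the mask
            // after each step to expose the next lower bit of __y. The final
            // AND zeroes elements with __y > 7.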
1680 auto __mask
1681 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5);
1682 auto __x4
1683 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1684 __x4 &= char(0xf0);
1685 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1686 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4)));
1687 __mask += __mask;
1688 auto __x2
1689 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1690 __x2 &= char(0xfc);
1691 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1692 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2)));
1693 __mask += __mask;
1694 auto __x1 = __x + __x;
1695 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1696 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1)));
1697 return __x
1698 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1699 }
1700 else if constexpr (sizeof(__x) == 16)
1701 {
1702 auto __mask
1703 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5);
1704 auto __x4
1705 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1706 __x4 &= char(0xf0);
1707 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x;
1708 __mask += __mask;
1709 auto __x2
1710 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1711 __x2 &= char(0xfc);
1712 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x;
1713 __mask += __mask;
1714 auto __x1 = __x + __x;
1715 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x;
1716 return __x
1717 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1718 }
1719 else
1720 return __x << __y;
1721 }
1722 else if constexpr (sizeof(_Up) == 2)
1723 {
1724 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1725 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy));
1726 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl)
1727 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy));
1728 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1729 return __vector_bitcast<_Up>(
1730 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix),
1731 _mm512_castsi256_si512(__iy))));
1732 else if constexpr (sizeof __ix == 32 && __have_avx2)
1733 {
1734 const auto __ux = __vector_bitcast<unsigned>(__x);
1735 const auto __uy = __vector_bitcast<unsigned>(__y);
1736 return __vector_bitcast<_Up>(_mm256_blend_epi16(
1737 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1738 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1739 }
1740 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1741 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy));
1742 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1743 return __intrin_bitcast<_V>(
1744 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix),
1745 _mm512_castsi128_si512(__iy))));
1746 else if constexpr (sizeof __ix == 16 && __have_avx2)
1747 {
1748 const auto __ux = __vector_bitcast<unsigned>(__ix);
1749 const auto __uy = __vector_bitcast<unsigned>(__iy);
1750 return __intrin_bitcast<_V>(_mm_blend_epi16(
1751 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1752 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1753 }
1754 else if constexpr (sizeof __ix == 16)
1755 {
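            // (__y + 127) << 23 is the bit pattern of the float 2^__y, so
            // converting that float back to int yields the factor 1 << __y;
            // a 16-bit multiply by it performs the shift. The even and odd
            // 16-bit elements of every 32-bit lane are converted separately
            // and recombined.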
1756 using _Float4 = __vector_type_t<float, 4>;
1757 using _Int4 = __vector_type_t<int, 4>;
1758 using _UInt4 = __vector_type_t<unsigned, 4>;
1759 const _UInt4 __yu
1760 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3)));
1761 return __x
1762 * __intrin_bitcast<_V>(
1763 __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1764 reinterpret_cast<_Float4>(__yu << 23)))
1765 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1766 reinterpret_cast<_Float4>((__yu >> 16) << 23)))
1767 << 16));
1768 }
1769 else
1770 __assert_unreachable<_Tp>();
1771 }
1772 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16
1773 && !__have_avx2)
1774 // latency is suboptimal, but throughput still achieves the full SIMD speedup
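        // (__y << 23) + 0x3f80'0000 is the bit pattern of the float 2^__y
        // (the exponent of 1.0f incremented by __y); converting it back to
        // int yields 1 << __y, and the 32-bit multiply performs the shift.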
1775 return __intrin_bitcast<_V>(
1776 __vector_bitcast<unsigned>(__ix)
1777 * __vector_convert<__vector_type16_t<int>>(
1778 _SimdWrapper<float, 4>(__vector_bitcast<float>(
1779 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000))));
1780 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16
1781 && !__have_avx2)
1782 {
1783 const auto __lo = _mm_sll_epi64(__ix, __iy);
1784 const auto __hi
1785 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy));
1786 if constexpr (__have_sse4_1)
1787 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0));
1788 else
1789 return __vector_bitcast<_Up>(
1790 _mm_move_sd(__vector_bitcast<double>(__hi),
1791 __vector_bitcast<double>(__lo)));
1792 }
1793 else
1794 return __x << __y;
1795 }
1796#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
1797
1798 // }}}
1799 // _S_bit_shift_right {{{
1800#ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1801 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1802 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1803 _S_bit_shift_right(_Tp __xx, int __y)
1804 {
1805 using _V = typename _TVT::type;
1806 using _Up = typename _TVT::value_type;
1807 _V __x = __xx;
1808 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1809 if (__builtin_is_constant_evaluated())
1810 return __x >> __y;
1811 else if (__builtin_constant_p(__y)
1812          && is_unsigned_v<_Up>
1813          && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1814 return _V();
1815 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{
1816 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y)
1817 & _Up(0xff >> __y);
1818 //}}}
1819 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{
1820 return __intrin_bitcast<_V>(
1821 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix)
1822 >> (__y + 8))
1823 << 8)
1824 | (__vector_bitcast<_UShort>(
1825 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8)
1826 >> __y)
1827 >> 8));
1828 //}}}
1829 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected
1830 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{
1831 {
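        // There is no 64-bit arithmetic right shift before AVX-512 (vpsraq),
        // so emulate it with a logical 64-bit shift combined with sign bits
        // generated by 32-bit arithmetic shifts (psrad) of the upper halves.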
1832 if (__y > 32)
1833 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32)
1834 & _Up(0xffff'ffff'0000'0000ull))
1835 | __vector_bitcast<_Up>(
1836 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix)
1837 >> 32)
1838 >> (__y - 32));
1839 else
1840 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix)
1841 >> __y)
1842 | __vector_bitcast<_Up>(
1843 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll)
1844 >> __y);
1845 }
1846 //}}}
1847 else
1848 return __x >> __y;
1849 }
1850
1851 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1852 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1853 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y)
1854 {
1855 using _V = typename _TVT::type;
1856 using _Up = typename _TVT::value_type;
1857 _V __x = __xx;
1858 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1859 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1860 if (__builtin_is_constant_evaluated()
1861 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
1862 return __x >> __y;
1863 else if constexpr (sizeof(_Up) == 1) //{{{
1864 {
1865 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl)
1866 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8(
1867 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix),
1868 _mm_cvtepi8_epi16(__iy))
1869 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix),
1870 _mm_cvtepu8_epi16(__iy))));
1871 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1872 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1873 is_signed_v<_Up>
1874 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix),
1875 _mm256_cvtepi8_epi16(__iy))
1876 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix),
1877 _mm256_cvtepu8_epi16(__iy))));
1878 else if constexpr (sizeof(__x) == 32 && __have_avx512bw)
1879 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1880 is_signed_v<_Up>
1881 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix),
1882 _mm512_cvtepi8_epi16(__iy))
1883 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix),
1884 _mm512_cvtepu8_epi16(__iy))));
1885 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>)
1886 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1887 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1888 0x5555'5555'5555'5555ull,
1889 _mm512_srav_epi16(
1890 _mm512_slli_epi16(__ix, 8),
1891 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy,
1892 _mm512_set1_epi16(8)))));
1893 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>)
1894 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1895 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1896 0x5555'5555'5555'5555ull,
1897 _mm512_srlv_epi16(
1898 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix),
1899 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy))));
1900 /* This has better throughput but higher latency than the impl below
1901 else if constexpr (__have_avx2 && sizeof(__x) == 16 &&
1902 is_unsigned_v<_Up>)
1903 {
1904 const auto __shorts = __to_intrin(_S_bit_shift_right(
1905 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)),
1906 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy))));
1907 return __vector_bitcast<_Up>(
1908 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts)));
1909 }
1910 */
1911 else if constexpr (__have_avx2 && sizeof(__x) > 8)
1912 // the following uses vpsr[al]vd, which requires AVX2
1913 if constexpr (is_signed_v<_Up>)
1914 {
1915 const auto r3 = __vector_bitcast<_UInt>(
1916 (__vector_bitcast<int>(__x)
1917 >> (__vector_bitcast<_UInt>(__y) >> 24)))
1918 & 0xff000000u;
1919 const auto r2
1920 = __vector_bitcast<_UInt>(
1921 ((__vector_bitcast<int>(__x) << 8)
1922 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)))
1923 & 0xff000000u;
1924 const auto r1
1925 = __vector_bitcast<_UInt>(
1926 ((__vector_bitcast<int>(__x) << 16)
1927 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)))
1928 & 0xff000000u;
1929 const auto r0 = __vector_bitcast<_UInt>(
1930 (__vector_bitcast<int>(__x) << 24)
1931 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24));
1932 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1933 | (r0 >> 24));
1934 }
1935 else
1936 {
1937 const auto r3 = (__vector_bitcast<_UInt>(__x)
1938 >> (__vector_bitcast<_UInt>(__y) >> 24))
1939 & 0xff000000u;
1940 const auto r2
1941 = ((__vector_bitcast<_UInt>(__x) << 8)
1942 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))
1943 & 0xff000000u;
1944 const auto r1
1945 = ((__vector_bitcast<_UInt>(__x) << 16)
1946 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))
1947 & 0xff000000u;
1948 const auto r0
1949 = (__vector_bitcast<_UInt>(__x) << 24)
1950 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24);
1951 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1952 | (r0 >> 24));
1953 }
1954 else if constexpr (__have_sse4_1
1955 && is_unsigned_v<_Up> && sizeof(__x) > 2)
1956 {
1957 auto __x128 = __vector_bitcast<_Up>(__ix);
1958 auto __mask
1959 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5);
1960 auto __x4 = __vector_bitcast<_Up>(
1961 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f));
1962 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1963 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4)));
1964 __mask += __mask;
1965 auto __x2 = __vector_bitcast<_Up>(
1966 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f));
1967 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1968 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2)));
1969 __mask += __mask;
1970 auto __x1 = __vector_bitcast<_Up>(
1971 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f));
1972 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1973 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1)));
1974 return __intrin_bitcast<_V>(
1975 __x128
1976 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
1977 == 0)); // y > 7 nulls the result
1978 }
1979 else if constexpr (__have_sse4_1
1980 && is_signed_v<_Up> && sizeof(__x) > 2)
1981 {
1982 auto __mask = __vector_bitcast<_UChar>(
1983 __vector_bitcast<_UShort>(__iy) << 5);
1984 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1985 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
1986 };
1987 auto __xh = __vector_bitcast<short>(__ix);
1988 auto __xl = __vector_bitcast<short>(__ix) << 8;
1989 auto __xh4 = __xh >> 4;
1990 auto __xl4 = __xl >> 4;
1991 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
1992 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4)));
1993 __xl = __vector_bitcast<short>(
1994 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
1995 __to_intrin(__xl4)));
1996 __mask += __mask;
1997 auto __xh2 = __xh >> 2;
1998 auto __xl2 = __xl >> 2;
1999 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2000 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2)));
2001 __xl = __vector_bitcast<short>(
2002 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2003 __to_intrin(__xl2)));
2004 __mask += __mask;
2005 auto __xh1 = __xh >> 1;
2006 auto __xl1 = __xl >> 1;
2007 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2008 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1)));
2009 __xl = __vector_bitcast<short>(
2010 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2011 __to_intrin(__xl1)));
2012 return __intrin_bitcast<_V>(
2013 (__vector_bitcast<_Up>((__xh & short(0xff00)))
2014 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2015 >> 8))
2016 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2017 == 0)); // y > 7 nulls the result
2018 }
2019 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2
2020 {
2021 auto __mask
2022 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5);
2023 auto __x4 = __vector_bitcast<_Up>(
2024 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f));
2025 __x = __mask > 0x7f ? __x4 : __x;
2026 __mask += __mask;
2027 auto __x2 = __vector_bitcast<_Up>(
2028 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f));
2029 __x = __mask > 0x7f ? __x2 : __x;
2030 __mask += __mask;
2031 auto __x1 = __vector_bitcast<_Up>(
2032 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f));
2033 __x = __mask > 0x7f ? __x1 : __x;
2034 return __x
2035 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2036 }
2037 else if constexpr (sizeof(__x) > 2) // signed SSE2
2038 {
2039 static_assert(is_signed_v<_Up>);
2040 auto __maskh = __vector_bitcast<_UShort>(__y) << 5;
2041 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8);
2042 auto __xh = __vector_bitcast<short>(__x);
2043 auto __xl = __vector_bitcast<short>(__x) << 8;
2044 auto __xh4 = __xh >> 4;
2045 auto __xl4 = __xl >> 4;
2046 __xh = __maskh > 0x7fff ? __xh4 : __xh;
2047 __xl = __maskl > 0x7fff ? __xl4 : __xl;
2048 __maskh += __maskh;
2049 __maskl += __maskl;
2050 auto __xh2 = __xh >> 2;
2051 auto __xl2 = __xl >> 2;
2052 __xh = __maskh > 0x7fff ? __xh2 : __xh;
2053 __xl = __maskl > 0x7fff ? __xl2 : __xl;
2054 __maskh += __maskh;
2055 __maskl += __maskl;
2056 auto __xh1 = __xh >> 1;
2057 auto __xl1 = __xl >> 1;
2058 __xh = __maskh > 0x7fff ? __xh1 : __xh;
2059 __xl = __maskl > 0x7fff ? __xl1 : __xl;
2060 __x = __vector_bitcast<_Up>((__xh & short(0xff00)))
2061 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2062 >> 8);
2063 return __x
2064 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2065 }
2066 else
2067 return __x >> __y;
2068 } //}}}
2069 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
2070 {
2071 [[maybe_unused]] auto __blend_0xaa
2072 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2073 if constexpr (sizeof(__a) == 16)
2074 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2075 0xaa);
2076 else if constexpr (sizeof(__a) == 32)
2077 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2078 0xaa);
2079 else if constexpr (sizeof(__a) == 64)
2080 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
2081 __to_intrin(__b));
2082 else
2083 __assert_unreachable<decltype(__a)>();
2084 };
2085 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
2086 return __intrin_bitcast<_V>(is_signed_v<_Up>
2087 ? _mm_srav_epi16(__ix, __iy)
2088 : _mm_srlv_epi16(__ix, __iy));
2089 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32)
2090 return __vector_bitcast<_Up>(is_signed_v<_Up>
2091 ? _mm256_srav_epi16(__ix, __iy)
2092 : _mm256_srlv_epi16(__ix, __iy));
2093 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64)
2094 return __vector_bitcast<_Up>(is_signed_v<_Up>
2095 ? _mm512_srav_epi16(__ix, __iy)
2096 : _mm512_srlv_epi16(__ix, __iy));
2097 else if constexpr (__have_avx2 && is_signed_v<_Up>)
2098 return __intrin_bitcast<_V>(
2099 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16)
2100 >> (__vector_bitcast<int>(__iy) & 0xffffu))
2101 >> 16,
2102 __vector_bitcast<int>(__ix)
2103 >> (__vector_bitcast<int>(__iy) >> 16)));
2104 else if constexpr (__have_avx2 && is_unsigned_v<_Up>)
2105 return __intrin_bitcast<_V>(
2106 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu)
2107 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu),
2108 __vector_bitcast<_UInt>(__ix)
2109 >> (__vector_bitcast<_UInt>(__iy) >> 16)));
2110 else if constexpr (__have_sse4_1)
2111 {
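            // Decompose the shift into conditional shifts by 8, 4, 2 and 1:
            // the relevant bit of __y is moved into the sign bit of every
            // byte (pblendvb selects on the byte's sign bit), and the mask is
            // doubled after each step to expose the next lower bit. The first
            // blend zeroes elements with __y >= 16.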
2112 auto __mask = __vector_bitcast<_UShort>(__iy);
2113 auto __x128 = __vector_bitcast<_Up>(__ix);
2114 //__mask *= 0x0808;
2115 __mask = (__mask << 3) | (__mask << 11);
2116 // do __x128 = 0 where __y[4] is set
2117 __x128 = __vector_bitcast<_Up>(
2118 _mm_blendv_epi8(__to_intrin(__x128), __m128i(),
2119 __to_intrin(__mask)));
2120 // do __x128 >>= 8 where __y[3] is set
2121 __x128 = __vector_bitcast<_Up>(
2122 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8),
2123 __to_intrin(__mask += __mask)));
2124 // do __x128 >>= 4 where __y[2] is set
2125 __x128 = __vector_bitcast<_Up>(
2126 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4),
2127 __to_intrin(__mask += __mask)));
2128 // do __x128 >>= 2 where __y[1] is set
2129 __x128 = __vector_bitcast<_Up>(
2130 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2),
2131 __to_intrin(__mask += __mask)));
2132 // do __x128 >>= 1 where __y[0] is set
2133 return __intrin_bitcast<_V>(
2134 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1),
2135 __to_intrin(__mask + __mask)));
2136 }
2137 else
2138 {
2139 auto __k = __vector_bitcast<_UShort>(__iy) << 11;
2140 auto __x128 = __vector_bitcast<_Up>(__ix);
2141 auto __mask
2142 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2143 return __vector_bitcast<short>(__kk) < 0;
2144 };
2145 // do __x128 = 0 where __y[4] is set
2146 __x128 = __mask(__k) ? decltype(__x128)() : __x128;
2147 // do __x128 >>= 8 where __y[3] is set
2148 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128;
2149 // do __x128 >>= 4 where __y[2] is set
2150 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128;
2151 // do __x128 >>= 2 where __y[1] is set
2152 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128;
2153 // do __x128 >>= 1 where __y[0] is set
2154 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1
2155 : __x128);
2156 }
2157 } //}}}
2158 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{
2159 {
2160 if constexpr (is_unsigned_v<_Up>)
2161 {
2162 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31
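            // 0x4f00'0000 is the bit pattern of the float 2^31; subtracting
            // __y << 23 decrements the exponent, giving 2^(31-__y). Converted
            // to an integer factor, the 64-bit products from _mm_mul_epu32,
            // shifted right by 31, yield x >> y for the even elements; the
            // odd elements use the copies shifted down by 4 bytes below.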
2163 const __m128 __factor_f = reinterpret_cast<__m128>(
2164 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23));
2165 const __m128i __factor
2166 = __builtin_constant_p(__factor_f)
2167 ? __to_intrin(
2168 __make_vector<unsigned>(__factor_f[0], __factor_f[1],
2169 __factor_f[2], __factor_f[3]))
2170 : _mm_cvttps_epi32(__factor_f);
2171 const auto __r02
2172 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31);
2173 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4),
2174 _mm_srli_si128(__factor, 4));
2175 if constexpr (__have_sse4_1)
2176 return __intrin_bitcast<_V>(
2177 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33));
2178 else
2179 return __intrin_bitcast<_V>(
2180 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4));
2181 }
2182 else
2183 {
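            // Without a variable 32-bit shift instruction (vpsravd/vpsrlvd
            // need AVX2), shift the whole vector once per element, taking the
            // scalar count from the low 64 bits of the shuffled __iy, and
            // blend the four partial results together.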
2184 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2185 if constexpr (is_signed_v<_Up>)
2186 return _mm_sra_epi32(__a, __b);
2187 else
2188 return _mm_srl_epi32(__a, __b);
2189 };
2190 const auto __r0
2191 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i()));
2192 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32));
2193 const auto __r2
2194 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i()));
2195 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12));
2196 if constexpr (__have_sse4_1)
2197 return __intrin_bitcast<_V>(
2198 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3),
2199 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0));
2200 else
2201 return __intrin_bitcast<_V>(_mm_unpacklo_epi64(
2202 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)),
2203 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4))));
2204 }
2205 } //}}}
2206 else
2207 return __x >> __y;
2208 }
2209#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
2210
2211 // }}}
2212 // compares {{{
2213 // _S_equal_to {{{
2214 template <typename _Tp, size_t _Np>
2215 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2216 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2217 {
2218 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2219 {
2220 if (__builtin_is_constant_evaluated()
2221 || (__x._M_is_constprop() && __y._M_is_constprop()))
2222 return _MaskImpl::_S_to_bits(
2223 __as_wrapper<_Np>(__x._M_data == __y._M_data));
2224
2225 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2226 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2227 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2228 if constexpr (is_floating_point_v<_Tp>)
2229 {
2230 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2231 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2232 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2233 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2234 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2235 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2236 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2237 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2238 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2239 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2240 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2241 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2242 else
2243 __assert_unreachable<_Tp>();
2244 }
2245 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2246 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2247 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2248 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2249 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2250 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2251 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2252 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2253 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2254 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2255 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2256 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2257 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2258 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2259 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2260 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2261 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2262 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2263 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2264 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2265 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2266 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2267 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2268 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2269 else
2270 __assert_unreachable<_Tp>();
2271 } // }}}
2272 else if (__builtin_is_constant_evaluated())
2273 return _Base::_S_equal_to(__x, __y);
2274 else if constexpr (sizeof(__x) == 8) // {{{
2275 {
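          // 8-byte vectors: compare in the full 16-byte register and copy the
          // low half of the result back.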
2276 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2277 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2278 _MaskMember<_Tp> __r64;
2279 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2280 return __r64;
2281 } // }}}
2282 else
2283 return _Base::_S_equal_to(__x, __y);
2284 }
2285
2286 // }}}
2287 // _S_not_equal_to {{{
2288 template <typename _Tp, size_t _Np>
2289 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2290 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2291 {
2292 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2293 {
2294 if (__builtin_is_constant_evaluated()
2295 || (__x._M_is_constprop() && __y._M_is_constprop()))
2296 return _MaskImpl::_S_to_bits(
2297 __as_wrapper<_Np>(__x._M_data != __y._M_data));
2298
2299 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2300 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2301 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2302 if constexpr (is_floating_point_v<_Tp>)
2303 {
2304 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2305 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2306 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2307 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2308 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2309 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2310 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2311 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2312 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2313 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2314 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2315 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2316 else
2317 __assert_unreachable<_Tp>();
2318 }
2319 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2320 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2321 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2322 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2323 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2324 return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2325 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2326 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2327 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2328 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2329 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2330 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2331 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2332 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2333 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2334 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2335 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2336 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2337 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2338 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2339 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2340 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2341 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2342 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2343 else
2344 __assert_unreachable<_Tp>();
2345 } // }}}
2346 else if (__builtin_is_constant_evaluated())
2347 return _Base::_S_not_equal_to(__x, __y);
2348 else if constexpr (sizeof(__x) == 8)
2349 {
2350 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2351 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2352 _MaskMember<_Tp> __r64;
2353 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2354 return __r64;
2355 }
2356 else
2357 return _Base::_S_not_equal_to(__x, __y);
2358 }
2359
2360 // }}}
2361 // _S_less {{{
2362 template <typename _Tp, size_t _Np>
2363 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2364 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2365 {
2366 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2367 {
2368 if (__builtin_is_constant_evaluated()
2369 || (__x._M_is_constprop() && __y._M_is_constprop()))
2370 return _MaskImpl::_S_to_bits(
2371 __as_wrapper<_Np>(__x._M_data < __y._M_data));
2372
2373 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2374 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2375 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2376 if constexpr (sizeof(__xi) == 64)
2377 {
2378 if constexpr (is_same_v<_Tp, float>)
2379 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2380 else if constexpr (is_same_v<_Tp, double>)
2381 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2382 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2383 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2384 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2385 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2386 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2387 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2388 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2389 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2390 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2391 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2392 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2393 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2394 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2395 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2396 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2397 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2398 else
2399 __assert_unreachable<_Tp>();
2400 }
2401 else if constexpr (sizeof(__xi) == 32)
2402 {
2403 if constexpr (is_same_v<_Tp, float>)
2404 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2405 else if constexpr (is_same_v<_Tp, double>)
2406 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2407 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2408 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2409 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2410 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2411 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2412 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2413 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2414 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2415 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2416 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2417 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2418 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2419 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2420 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2421 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2422 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2423 else
2424 __assert_unreachable<_Tp>();
2425 }
2426 else if constexpr (sizeof(__xi) == 16)
2427 {
2428 if constexpr (is_same_v<_Tp, float>)
2429 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2430 else if constexpr (is_same_v<_Tp, double>)
2431 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2432 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2433 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2435 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2436 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2437 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2438 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2439 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2440 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2441 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2443 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2444 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2445 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2446 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2447 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2448 else
2449 __assert_unreachable<_Tp>();
2450 }
2451 else
2452 __assert_unreachable<_Tp>();
2453 } // }}}
2454 else if (__builtin_is_constant_evaluated())
2455 return _Base::_S_less(__x, __y);
2456 else if constexpr (sizeof(__x) == 8)
2457 {
2458 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2459 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2460 _MaskMember<_Tp> __r64;
2461 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2462 return __r64;
2463 }
2464 else
2465 return _Base::_S_less(__x, __y);
2466 }
2467
2468 // }}}
2469 // _S_less_equal {{{
2470 template <typename _Tp, size_t _Np>
2471 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2472 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2473 {
2474 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2475 {
2476 if (__builtin_is_constant_evaluated()
2477 || (__x._M_is_constprop() && __y._M_is_constprop()))
2478 return _MaskImpl::_S_to_bits(
2479 __as_wrapper<_Np>(__x._M_data <= __y._M_data));
2480
2481 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2482 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2483 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2484 if constexpr (sizeof(__xi) == 64)
2485 {
2486 if constexpr (is_same_v<_Tp, float>)
2487 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2488 else if constexpr (is_same_v<_Tp, double>)
2489 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2490 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2491 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi);
2492 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2493 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi);
2494 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2495 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi);
2496 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2497 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi);
2498 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2499 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi);
2500 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2501 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi);
2502 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2503 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi);
2504 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2505 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi);
2506 else
2507 __assert_unreachable<_Tp>();
2508 }
2509 else if constexpr (sizeof(__xi) == 32)
2510 {
2511 if constexpr (is_same_v<_Tp, float>)
2512 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2513 else if constexpr (is_same_v<_Tp, double>)
2514 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2515 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2516 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi);
2517 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2518 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi);
2519 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2520 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi);
2521 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2522 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi);
2523 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2524 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi);
2525 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2526 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi);
2527 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2528 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi);
2529 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2530 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi);
2531 else
2532 __assert_unreachable<_Tp>();
2533 }
2534 else if constexpr (sizeof(__xi) == 16)
2535 {
2536 if constexpr (is_same_v<_Tp, float>)
2537 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2538 else if constexpr (is_same_v<_Tp, double>)
2539 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2540 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2541 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi);
2542 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2543 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi);
2544 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2545 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi);
2546 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2547 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi);
2548 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2549 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi);
2550 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2551 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi);
2552 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2553 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
2554 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2555 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
2556 else
2557 __assert_unreachable<_Tp>();
2558 }
2559 else
2560 __assert_unreachable<_Tp>();
2561 } // }}}
2562 else if (__builtin_is_constant_evaluated())
2563 return _Base::_S_less_equal(__x, __y);
2564 else if constexpr (sizeof(__x) == 8)
2565 {
2566 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2567 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2568 _MaskMember<_Tp> __r64;
2569 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2570 return __r64;
2571 }
2572 else
2573 return _Base::_S_less_equal(__x, __y);
2574 }
2575
2576 // }}} }}}
2577 // negation {{{
2578 template <typename _Tp, size_t _Np>
2579 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2580 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
2581 {
2582 if constexpr (__is_avx512_abi<_Abi>())
2583 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
2584 else
2585 return _Base::_S_negate(__x);
2586 }
2587
2588 // }}}
2589 // math {{{
2590 using _Base::_S_abs;
2591
2592 // _S_sqrt {{{
2593 template <typename _Tp, size_t _Np>
2594 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2595 _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
2596 {
2597 if constexpr (__is_sse_ps<_Tp, _Np>())
2598 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
2599 else if constexpr (__is_sse_pd<_Tp, _Np>())
2600 return _mm_sqrt_pd(__x);
2601 else if constexpr (__is_avx_ps<_Tp, _Np>())
2602 return _mm256_sqrt_ps(__x);
2603 else if constexpr (__is_avx_pd<_Tp, _Np>())
2604 return _mm256_sqrt_pd(__x);
2605 else if constexpr (__is_avx512_ps<_Tp, _Np>())
2606 return _mm512_sqrt_ps(__x);
2607 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2608 return _mm512_sqrt_pd(__x);
2609 else
2610 __assert_unreachable<_Tp>();
2611 }
2612
2613 // }}}
2614 // _S_ldexp {{{
2615 template <typename _Tp, size_t _Np>
2616 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2617 _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
2618 __fixed_size_storage_t<int, _Np> __exp)
2619 {
2620 if constexpr (sizeof(__x) == 64 || __have_avx512vl)
2621 {
2622 const auto __xi = __to_intrin(__x);
2623 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
2624 __cvt;
2625 const auto __expi = __to_intrin(__cvt(__exp));
2626 using _Up = __bool_storage_member_type_t<_Np>;
2627 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up();
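          // __k1 has one bit per active element so that the maskz_scalef
          // intrinsics zero any padding elements of the register.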
2628 if constexpr (sizeof(__xi) == 16)
2629 {
2630 if constexpr (sizeof(_Tp) == 8)
2631 return _mm_maskz_scalef_pd(__k1, __xi, __expi);
2632 else
2633 return _mm_maskz_scalef_ps(__k1, __xi, __expi);
2634 }
2635 else if constexpr (sizeof(__xi) == 32)
2636 {
2637 if constexpr (sizeof(_Tp) == 8)
2638 return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
2639 else
2640 return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
2641 }
2642 else
2643 {
2644 static_assert(sizeof(__xi) == 64);
2645 if constexpr (sizeof(_Tp) == 8)
2646 return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
2647 else
2648 return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
2649 }
2650 }
2651 else
2652 return _Base::_S_ldexp(__x, __exp);
2653 }
2654
2655 // }}}
2656 // _S_trunc {{{
2657 template <typename _Tp, size_t _Np>
2658 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2659 _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2660 {
2661 if constexpr (__is_avx512_ps<_Tp, _Np>())
2662 return _mm512_roundscale_ps(__x, 0x0b);
2663 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2664 return _mm512_roundscale_pd(__x, 0x0b);
2665 else if constexpr (__is_avx_ps<_Tp, _Np>())
2666 return _mm256_round_ps(__x, 0xb);
2667 else if constexpr (__is_avx_pd<_Tp, _Np>())
2668 return _mm256_round_pd(__x, 0xb);
2669 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2670 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
2671 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2672 return _mm_round_pd(__x, 0xb);
2673 else if constexpr (__is_sse_ps<_Tp, _Np>())
2674 {
2675 auto __truncated
2676 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
2677 const auto __no_fractional_values
2678 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
2679 & 0x7f800000u)
2680 < 0x4b000000; // 0x4b00'0000 is the exponent at which no mantissa bits
2681               // signify fractional values anymore (1.0f is 0x3f80'0000,
2682               // plus 23 << 23 gives 0x4b00'0000, i.e. 2^23)
2683 return __no_fractional_values ? __truncated : __to_intrin(__x);
2684 }
2685 else
2686 return _Base::_S_trunc(__x);
2687 }
2688
2689 // }}}
2690 // _S_round {{{
2691 template <typename _Tp, size_t _Np>
2692 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2693 _S_round(_SimdWrapper<_Tp, _Np> __x)
2694 {
2695 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
2696 // from zero as required by std::round. Therefore this function is more
2697 // complicated.
2698 using _V = __vector_type_t<_Tp, _Np>;
2699 _V __truncated;
2700 if constexpr (__is_avx512_ps<_Tp, _Np>())
2701 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
2702 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2703 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
2704 else if constexpr (__is_avx_ps<_Tp, _Np>())
2705 __truncated = _mm256_round_ps(__x._M_data,
2706 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2707 else if constexpr (__is_avx_pd<_Tp, _Np>())
2708 __truncated = _mm256_round_pd(__x._M_data,
2709 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2710 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2711 __truncated = __auto_bitcast(
2712 _mm_round_ps(__to_intrin(__x),
2713 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
2714 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2715 __truncated
2716 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2717 else if constexpr (__is_sse_ps<_Tp, _Np>())
2718 __truncated = __auto_bitcast(
2719 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
2720 else
2721 return _Base::_S_round(__x);
2722
2723 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
2724 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0
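      // If the discarded fraction is at least .5 in magnitude, add 1 with the
      // sign of __x to the truncated value: this turns truncation into the
      // round-half-away-from-zero behavior required by std::round.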
2725
2726 const _V __rounded
2727 = __truncated
2728 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
2729 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
2730 : _V());
2731 if constexpr (__have_sse4_1)
2732 return __rounded;
2733 else // adjust for missing range in cvttps_epi32
2734 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded
2735 : __x._M_data;
2736 }
2737
2738 // }}}
2739 // _S_nearbyint {{{
2740 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2741 _GLIBCXX_SIMD_INTRINSIC static _Tp
2742 _S_nearbyint(_Tp __x) noexcept
2743 {
2744 if constexpr (_TVT::template _S_is<float, 16>)
2745 return _mm512_roundscale_ps(__x, 0x0c);
2746 else if constexpr (_TVT::template _S_is<double, 8>)
2747 return _mm512_roundscale_pd(__x, 0x0c);
2748 else if constexpr (_TVT::template _S_is<float, 8>)
2749 return _mm256_round_ps(__x,
2750 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2751 else if constexpr (_TVT::template _S_is<double, 4>)
2752 return _mm256_round_pd(__x,
2753 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2754 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2755 return _mm_round_ps(__x,
2756 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2757 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2758 return _mm_round_pd(__x,
2759 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2760 else
2761 return _Base::_S_nearbyint(__x);
2762 }
2763
2764 // }}}
2765 // _S_rint {{{
2766 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2767 _GLIBCXX_SIMD_INTRINSIC static _Tp
2768 _S_rint(_Tp __x) noexcept
2769 {
2770 if constexpr (_TVT::template _S_is<float, 16>)
2771 return _mm512_roundscale_ps(__x, 0x04);
2772 else if constexpr (_TVT::template _S_is<double, 8>)
2773 return _mm512_roundscale_pd(__x, 0x04);
2774 else if constexpr (_TVT::template _S_is<float, 8>)
2775 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2776 else if constexpr (_TVT::template _S_is<double, 4>)
2777 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2778 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2779 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2780 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2781 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2782 else
2783 return _Base::_S_rint(__x);
2784 }
2785
2786 // }}}
2787 // _S_floor {{{
2788 template <typename _Tp, size_t _Np>
2789 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2790 _S_floor(_SimdWrapper<_Tp, _Np> __x)
2791 {
2792 if constexpr (__is_avx512_ps<_Tp, _Np>())
2793 return _mm512_roundscale_ps(__x, 0x09);
2794 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2795 return _mm512_roundscale_pd(__x, 0x09);
2796 else if constexpr (__is_avx_ps<_Tp, _Np>())
2797 return _mm256_round_ps(__x, 0x9);
2798 else if constexpr (__is_avx_pd<_Tp, _Np>())
2799 return _mm256_round_pd(__x, 0x9);
2800 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2801 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9));
2802 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2803 return _mm_round_pd(__x, 0x9);
2804 else
2805 return _Base::_S_floor(__x);
2806 }
2807
2808 // }}}
2809 // _S_ceil {{{
2810 template <typename _Tp, size_t _Np>
2811 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2812 _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2813 {
2814 if constexpr (__is_avx512_ps<_Tp, _Np>())
2815 return _mm512_roundscale_ps(__x, 0x0a);
2816 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2817 return _mm512_roundscale_pd(__x, 0x0a);
2818 else if constexpr (__is_avx_ps<_Tp, _Np>())
2819 return _mm256_round_ps(__x, 0xa);
2820 else if constexpr (__is_avx_pd<_Tp, _Np>())
2821 return _mm256_round_pd(__x, 0xa);
2822 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2823 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa));
2824 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2825 return _mm_round_pd(__x, 0xa);
2826 else
2827 return _Base::_S_ceil(__x);
2828 }
2829
2830 // }}}
2831 // _S_signbit {{{
2832 template <typename _Tp, size_t _Np>
2833 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2834 _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2835 {
2836 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2837 {
2838 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
2839 return _mm512_movepi32_mask(
2840 __intrin_bitcast<__m512i>(__x._M_data));
2841 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
2842 return _mm512_movepi64_mask(
2843 __intrin_bitcast<__m512i>(__x._M_data));
2844 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
2845 return _mm256_movepi32_mask(
2846 __intrin_bitcast<__m256i>(__x._M_data));
2847 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
2848 return _mm256_movepi64_mask(
2849 __intrin_bitcast<__m256i>(__x._M_data));
2850 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4)
2851 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data));
2852 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8)
2853 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data));
2854 }
2855 else if constexpr (__is_avx512_abi<_Abi>())
2856 {
2857 const auto __xi = __to_intrin(__x);
2858 [[maybe_unused]] constexpr auto __k1
2859 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2860 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2861 return _mm_movemask_ps(__xi);
2862 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2863 return _mm_movemask_pd(__xi);
2864 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2865 return _mm256_movemask_ps(__xi);
2866 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2867 return _mm256_movemask_pd(__xi);
2868 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2869 return _mm512_mask_cmplt_epi32_mask(
2870 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2871 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2872 return _mm512_mask_cmplt_epi64_mask(
2873 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2874 else
2875 __assert_unreachable<_Tp>();
2876 }
2877 else
2878 return _Base::_S_signbit(__x);
2879 /*{
2880 using _I = __int_for_sizeof_t<_Tp>;
2881 if constexpr (sizeof(__x) == 64)
2882 return _S_less(__vector_bitcast<_I>(__x), _I());
2883 else
2884 {
2885 const auto __xx = __vector_bitcast<_I>(__x._M_data);
2886 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>;
2887 if constexpr ((sizeof(_Tp) == 4 &&
2888 (__have_avx2 || sizeof(__x) == 16)) ||
2889 __have_avx512vl)
2890 {
2891 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>);
2892 }
2893 else if constexpr ((__have_avx2 ||
2894 (__have_ssse3 && sizeof(__x) == 16)))
2895 {
2896 return __vector_bitcast<_Tp>((__xx & __signmask) ==
2897 __signmask);
2898 }
2899 else
2900 { // SSE2/3 or AVX (w/o AVX2)
2901 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1);
2902 return __vector_bitcast<_Tp>(
2903 __vector_bitcast<_Tp>(
2904 (__xx & __signmask) |
2905 __vector_bitcast<_I>(__one)) // -1 or 1
2906 != __one);
2907 }
2908 }
2909 }*/
2910 }
2911
2912 // }}}
2913 // _S_isnonzerovalue_mask {{{
2914 // (isnormal || issubnormal) == (!isinf && !isnan && nonzero)
2915 template <typename _Tp>
2916 _GLIBCXX_SIMD_INTRINSIC static auto
2917 _S_isnonzerovalue_mask(_Tp __x)
2918 {
2919 using _Traits = _VectorTraits<_Tp>;
2920 if constexpr (__have_avx512dq_vl)
2921 {
2922 if constexpr (_Traits::template _S_is<float, 2>
2923               || _Traits::template _S_is<float, 4>)
2924 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f));
2925 else if constexpr (_Traits::template _S_is<float, 8>)
2926 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f));
2927 else if constexpr (_Traits::template _S_is<float, 16>)
2928 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f));
2929 else if constexpr (_Traits::template _S_is<double, 2>)
2930 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f));
2931 else if constexpr (_Traits::template _S_is<double, 4>)
2932 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f));
2933 else if constexpr (_Traits::template _S_is<double, 8>)
2934 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f));
2935 else
2936 __assert_unreachable<_Tp>();
2937 }
2938 else
2939 {
2940 using _Up = typename _Traits::value_type;
2941 constexpr size_t _Np = _Traits::_S_full_size;
2942 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0
2943 const auto __b = __x * _Up(); // NaN if __x == inf
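          // __a is NaN iff __x is 0 or NaN; __b is NaN iff __x is +-inf or NaN.
          // "ordered(__a, __b)" therefore holds exactly when __x is finite and
          // nonzero.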
2944 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>())
2945 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b),
2946 _CMP_ORD_Q);
2947 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>())
2948 return __mmask8(0xf
2949 & _mm512_cmp_ps_mask(__auto_bitcast(__a),
2950 __auto_bitcast(__b),
2951 _CMP_ORD_Q));
2952 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>())
2953 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2954 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>())
2955 return __mmask8(0x3
2956 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
2957 __auto_bitcast(__b),
2958 _CMP_ORD_Q));
2959 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>())
2960 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
2961 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>())
2962 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a),
2963 __auto_bitcast(__b),
2964 _CMP_ORD_Q));
2965 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>())
2966 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2967 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>())
2968 return __mmask8(0xf
2969 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
2970 __auto_bitcast(__b),
2971 _CMP_ORD_Q));
2972 else if constexpr (__is_avx512_ps<_Up, _Np>())
2973 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
2974 else if constexpr (__is_avx512_pd<_Up, _Np>())
2975 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2976 else
2977 __assert_unreachable<_Tp>();
2978 }
2979 }
2980
2981 // }}}
2982 // _S_isfinite {{{
2983 template <typename _Tp, size_t _Np>
2984 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2985 _S_isfinite(_SimdWrapper<_Tp, _Np> __x)
2986 {
2987 static_assert(is_floating_point_v<_Tp>);
2988#if !__FINITE_MATH_ONLY__
2989 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2990 {
2991 const auto __xi = __to_intrin(__x);
2992 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
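            // fpclass imm 0x99 = QNaN|SNaN|+inf|-inf; XOR with the implicit
            // mask __k1 turns "NaN or inf" into "finite" for the active lanes.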
2993 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2994 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99);
2995 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2996 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99);
2997 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2998 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99);
2999 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3000 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3001 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3002 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3003 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3004 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3005 }
3006 else if constexpr (__is_avx512_abi<_Abi>())
3007 {
3008 // if all exponent bits are set, __x is either inf or NaN
3009 using _I = __int_for_sizeof_t<_Tp>;
3010 const auto __inf = __vector_bitcast<_I>(
3011 __vector_broadcast<_Np>(__infinity_v<_Tp>));
3012 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf);
3013 }
3014 else
3015#endif
3016 return _Base::_S_isfinite(__x);
3017 }
3018
3019 // }}}
3020 // _S_isinf {{{
3021 template <typename _Tp, size_t _Np>
3022 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3023 _S_isinf(_SimdWrapper<_Tp, _Np> __x)
3024 {
3025#if !__FINITE_MATH_ONLY__
3026 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3027 {
3028 const auto __xi = __to_intrin(__x);
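            // fpclass imm 0x18 = +inf|-inf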
3029 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3030 return _mm512_fpclass_ps_mask(__xi, 0x18);
3031 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3032 return _mm512_fpclass_pd_mask(__xi, 0x18);
3033 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3034 return _mm256_fpclass_ps_mask(__xi, 0x18);
3035 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3036 return _mm256_fpclass_pd_mask(__xi, 0x18);
3037 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3038 return _mm_fpclass_ps_mask(__xi, 0x18);
3039 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3040 return _mm_fpclass_pd_mask(__xi, 0x18);
3041 else
3042 __assert_unreachable<_Tp>();
3043 }
3044 else if constexpr (__have_avx512dq_vl)
3045 {
3046 if constexpr (__is_sse_pd<_Tp, _Np>())
3047 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18));
3048 else if constexpr (__is_avx_pd<_Tp, _Np>())
3049 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18));
3050 else if constexpr (__is_sse_ps<_Tp, _Np>())
3051 return _mm_movm_epi32(
3052 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18));
3053 else if constexpr (__is_avx_ps<_Tp, _Np>())
3054 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18));
3055 else
3056 __assert_unreachable<_Tp>();
3057 }
3058 else
3059#endif
3060 return _Base::_S_isinf(__x);
3061 }
3062
3063 // }}}
3064 // _S_isnormal {{{
3065 template <typename _Tp, size_t _Np>
3066 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3067 _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
3068 {
3069#if __FINITE_MATH_ONLY__
3070 [[maybe_unused]] constexpr int __mode = 0x26;
3071#else
3072 [[maybe_unused]] constexpr int __mode = 0xbf;
3073#endif
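      // fpclass imm: 0x26 = denormal|+-0 (inf and NaN cannot occur under
      // -ffinite-math-only); 0xbf = QNaN|SNaN|+-0|+-inf|denormal. Either way,
      // the complement of the fpclass result is the isnormal mask.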
3074 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3075 {
3076 const auto __xi = __to_intrin(__x);
3077 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3078 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3079 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode);
3080 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3081 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode);
3082 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3083 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode);
3084 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3085 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode);
3086 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3087 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode);
3088 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3089 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode);
3090 else
3091 __assert_unreachable<_Tp>();
3092 }
3093 else if constexpr (__have_avx512dq)
3094 {
3095 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
3096 return _mm_movm_epi32(
3097 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
3098 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>())
3099 return _mm256_movm_epi32(
3100 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
3101 else if constexpr (__is_avx512_ps<_Tp, _Np>())
3102 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode));
3103 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>())
3104 return _mm_movm_epi64(
3105 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
3106 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>())
3107 return _mm256_movm_epi64(
3108 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
3109 else if constexpr (__is_avx512_pd<_Tp, _Np>())
3110 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode));
3111 else
3112 __assert_unreachable<_Tp>();
3113 }
3114 else if constexpr (__is_avx512_abi<_Abi>())
3115 {
3116 using _I = __int_for_sizeof_t<_Tp>;
3117 const auto absn = __vector_bitcast<_I>(_S_abs(__x));
3118 const auto minn = __vector_bitcast<_I>(
3119 __vector_broadcast<_Np>(__norm_min_v<_Tp>));
3120#if __FINITE_MATH_ONLY__
3121 return _S_less_equal<_I, _Np>(minn, absn);
3122#else
3123 const auto infn
3124 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
3125 return __and(_S_less_equal<_I, _Np>(minn, absn),
3126 _S_less<_I, _Np>(absn, infn));
3127#endif
3128 }
3129 else
3130 return _Base::_S_isnormal(__x);
3131 }
3132
3133 // }}}
3134 // _S_isnan {{{
3135 template <typename _Tp, size_t _Np>
3136 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3137 _S_isnan(_SimdWrapper<_Tp, _Np> __x)
3138 { return _S_isunordered(__x, __x); }
3139
3140 // }}}
3141 // _S_isunordered {{{
3142 template <typename _Tp, size_t _Np>
3143 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3144 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
3145 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
3146 {
3147#if __FINITE_MATH_ONLY__
3148 return {}; // false
3149#else
3150 const auto __xi = __to_intrin(__x);
3151 const auto __yi = __to_intrin(__y);
3152 if constexpr (__is_avx512_abi<_Abi>())
3153 {
3154 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3155 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3156 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3157 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3158 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3159 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3160 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3161 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3162 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3163 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3164 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3165 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3166 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3167 }
3168 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3169 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
3170 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3171 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
3172 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3173 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
3174 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3175 return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
3176 else
3177 __assert_unreachable<_Tp>();
3178#endif
3179 }
3180
3181 // }}}
3182 // _S_isgreater {{{
3183 template <typename _Tp, size_t _Np>
3184 static constexpr _MaskMember<_Tp>
3185 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3186 {
3187 const auto __xi = __to_intrin(__x);
3188 const auto __yi = __to_intrin(__y);
3189 if constexpr (__is_avx512_abi<_Abi>())
3190 {
3191 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3192 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3193 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3194 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3195 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3196 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3197 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3198 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3199 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3200 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3201 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3202 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3203 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3204 else
3205 __assert_unreachable<_Tp>();
3206 }
3207 else if constexpr (__have_avx)
3208 {
3209 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3210 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3211 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3212 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3213 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3214 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3215 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3216 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3217 else
3218 __assert_unreachable<_Tp>();
3219 }
3220 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3221 && sizeof(_Tp) == 4)
3222 {
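            // Compare the float bit patterns as integers: IEEE-754 ordering
            // matches integer ordering for non-negative values; negative values
            // are mapped from sign-magnitude to two's complement by negating
            // the magnitude. cmpord discards lanes with NaN operands.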
3223 const auto __xn = __vector_bitcast<int>(__xi);
3224 const auto __yn = __vector_bitcast<int>(__yi);
3225 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3226 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3227 return __auto_bitcast(
3228 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
3229 }
3230 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3231 && sizeof(_Tp) == 8)
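          // _mm_ucomigt_sd returns 1 only for an ordered greater-than (0 for
          // unordered), so negation yields the 0/-1 lane mask; the high lane is
          // compared via unpackhi.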
3232 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3233 -_mm_ucomigt_sd(__xi, __yi),
3234 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
3235 _mm_unpackhi_pd(__yi, __yi))};
3236 else
3237 return _Base::_S_isgreater(__x, __y);
3238 }
3239
3240 // }}}
3241 // _S_isgreaterequal {{{
3242 template <typename _Tp, size_t _Np>
3243 static constexpr _MaskMember<_Tp>
3244 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3245 {
3246 const auto __xi = __to_intrin(__x);
3247 const auto __yi = __to_intrin(__y);
3248 if constexpr (__is_avx512_abi<_Abi>())
3249 {
3250 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3251 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3252 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3253 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3254 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3255 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3256 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3257 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3258 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3259 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3260 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3261 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3262 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3263 else
3264 __assert_unreachable<_Tp>();
3265 }
3266 else if constexpr (__have_avx)
3267 {
3268 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3269 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3270 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3271 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3272 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3273 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3274 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3275 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3276 else
3277 __assert_unreachable<_Tp>();
3278 }
3279 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3280 && sizeof(_Tp) == 4)
3281 {
3282 const auto __xn = __vector_bitcast<int>(__xi);
3283 const auto __yn = __vector_bitcast<int>(__yi);
3284 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3285 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3286 return __auto_bitcast(
3287 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp));
3288 }
3289 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3290 && sizeof(_Tp) == 8)
3291 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3292 -_mm_ucomige_sd(__xi, __yi),
3293 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi),
3294 _mm_unpackhi_pd(__yi, __yi))};
3295 else
3296 return _Base::_S_isgreaterequal(__x, __y);
3297 }
3298
3299 // }}}
3300 // _S_isless {{{
3301 template <typename _Tp, size_t _Np>
3302 static constexpr _MaskMember<_Tp>
3303 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3304 {
3305 const auto __xi = __to_intrin(__x);
3306 const auto __yi = __to_intrin(__y);
3307 if constexpr (__is_avx512_abi<_Abi>())
3308 {
3309 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3310 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3311 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3312 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3313 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3314 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3315 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3316 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3317 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3318 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3319 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3320 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3321 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3322 else
3323 __assert_unreachable<_Tp>();
3324 }
3325 else if constexpr (__have_avx)
3326 {
3327 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3328 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3329 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3330 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3331 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3332 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3333 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3334 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3335 else
3336 __assert_unreachable<_Tp>();
3337 }
3338 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3339 && sizeof(_Tp) == 4)
3340 {
3341 const auto __xn = __vector_bitcast<int>(__xi);
3342 const auto __yn = __vector_bitcast<int>(__yi);
3343 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3344 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3345 return __auto_bitcast(
3346 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp));
3347 }
3348 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3349 && sizeof(_Tp) == 8)
3350 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3351 -_mm_ucomigt_sd(__yi, __xi),
3352 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi),
3353 _mm_unpackhi_pd(__xi, __xi))};
3354 else
3355 return _Base::_S_isless(__x, __y);
3356 }
3357
3358 // }}}
3359 // _S_islessequal {{{
3360 template <typename _Tp, size_t _Np>
3361 static constexpr _MaskMember<_Tp>
3362 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3363 {
3364 const auto __xi = __to_intrin(__x);
3365 const auto __yi = __to_intrin(__y);
3366 if constexpr (__is_avx512_abi<_Abi>())
3367 {
3368 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3369 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3370 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3371 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3372 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3373 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3374 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3375 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3376 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3377 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3378 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3379 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3380 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3381 else
3382 __assert_unreachable<_Tp>();
3383 }
3384 else if constexpr (__have_avx)
3385 {
3386 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3387 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3388 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3389 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3390 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3391 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3392 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3393 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3394 else
3395 __assert_unreachable<_Tp>();
3396 }
3397 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3398 && sizeof(_Tp) == 4)
3399 {
3400 const auto __xn = __vector_bitcast<int>(__xi);
3401 const auto __yn = __vector_bitcast<int>(__yi);
3402 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3403 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3404 return __auto_bitcast(
3405 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp));
3406 }
3407 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3408 && sizeof(_Tp) == 8)
3409 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3410 -_mm_ucomige_sd(__yi, __xi),
3411 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi),
3412 _mm_unpackhi_pd(__xi, __xi))};
3413 else
3414 return _Base::_S_islessequal(__x, __y);
3415 }
3416
3417 // }}}
3418 // _S_islessgreater {{{
3419 template <typename _Tp, size_t _Np>
3420 static constexpr _MaskMember<_Tp>
3421 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3422 {
3423 const auto __xi = __to_intrin(__x);
3424 const auto __yi = __to_intrin(__y);
3425 if constexpr (__is_avx512_abi<_Abi>())
3426 {
3427 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3428 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3429 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3430 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3431 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3432 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3433 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3434 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3435 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3436 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3437 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3438 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3439 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3440 else
3441 __assert_unreachable<_Tp>();
3442 }
3443 else if constexpr (__have_avx)
3444 {
3445 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3446 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3447 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3448 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3449 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3450 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3451 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3452 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3453 else
3454 __assert_unreachable<_Tp>();
3455 }
3456 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3457 return __auto_bitcast(
3458 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi)));
3459 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3460 return __to_masktype(
3461 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi)));
3462 else
3463 __assert_unreachable<_Tp>();
3464 }
3465
3466 //}}} }}}
3467 template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np>
3468 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
3469 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v)
3470 {
3471 if (__k._M_is_constprop_none_of())
3472 return __v;
3473 else if (__k._M_is_constprop_all_of())
3474 {
3475 auto __vv = _Base::_M_make_simd(__v);
3476 _Op<decltype(__vv)> __op;
3477 return __data(__op(__vv));
3478 }
3479 else if constexpr (__is_bitmask_v<decltype(__k)>
3480 && (is_same_v<_Op<void>, __increment<void>>
3481 || is_same_v<_Op<void>, __decrement<void>>))
3482 {
3483 // optimize masked unary increment and decrement as masked sub +/-1
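            // x + 1 == x - (-1) and x - 1 == x - (+1); the masked-subtract
            // builtins below keep the passthrough (third) operand in all lanes
            // where __k is false.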
3484 constexpr int __pm_one
3485 = is_same_v<_Op<void>, __increment<void>> ? -1 : 1;
3486 if constexpr (is_integral_v<_Tp>)
3487 {
3488 constexpr bool __lp64 = sizeof(long) == sizeof(long long);
3489 using _Ip = std::make_signed_t<_Tp>;
3490 using _Up = std::conditional_t<
3491                        std::is_same_v<_Ip, long>,
3492                        std::conditional_t<__lp64, long long, int>,
3493                        std::conditional_t<
3494                          std::is_same_v<_Ip, signed char>, char, _Ip>>;
3495 const auto __value = __vector_bitcast<_Up>(__v._M_data);
3496#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3497 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \
3498 return __vector_bitcast<_Tp>(__builtin_ia32_##_Instr##_mask(__value, \
3499 __vector_broadcast<_Np>(_Up(__pm_one)), __value, __k._M_data))
3500 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512);
3501 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256);
3502 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128);
3503 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512);
3504 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256);
3505 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128);
3506 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512);
3507 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256);
3508 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128);
3509 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512);
3510 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256);
3511 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128);
3512#undef _GLIBCXX_SIMD_MASK_SUB
3513 }
3514 else
3515 {
3516#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3517 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \
3518 return __builtin_ia32_##_Instr##_mask( \
3519 __v._M_data, __vector_broadcast<_Np>(_Tp(__pm_one)), __v._M_data, \
3520 __k._M_data, _MM_FROUND_CUR_DIRECTION)
3521 _GLIBCXX_SIMD_MASK_SUB(4, 64, subps512);
3522 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256);
3523 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128);
3524 _GLIBCXX_SIMD_MASK_SUB(8, 64, subpd512);
3525 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256);
3526 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128);
3527#undef _GLIBCXX_SIMD_MASK_SUB
3528 }
3529 }
3530 else
3531 return _Base::template _S_masked_unary<_Op>(__k, __v);
3532 }
3533 };
3534
3535// }}}
3536// _MaskImplX86Mixin {{{
3537struct _MaskImplX86Mixin
3538{
3539 template <typename _Tp>
3540 using _TypeTag = _Tp*;
3541
3542 using _Base = _MaskImplBuiltinMixin;
3543
3544 // _S_to_maskvector(bool) {{{
3545 template <typename _Up, size_t _ToN = 1, typename _Tp>
3546 _GLIBCXX_SIMD_INTRINSIC static constexpr
3547 enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>>
3548 _S_to_maskvector(_Tp __x)
3549 {
3550 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3551 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
3552 : __vector_type_t<_Up, _ToN>();
3553 }
3554
3555 // }}}
3556 // _S_to_maskvector(_SanitizedBitMask) {{{
3557 template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? _Np : _UpN>
3558 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3559 _S_to_maskvector(_SanitizedBitMask<_Np> __x)
3560 {
3561 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3562 using _UV = __vector_type_t<_Up, _ToN>;
3563 using _UI = __intrinsic_type_t<_Up, _ToN>;
3564 [[maybe_unused]] const auto __k = __x._M_to_bits();
3565 if constexpr (_Np == 1)
3566 return _S_to_maskvector<_Up, _ToN>(__k);
3567 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3568 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3569 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; });
3570 else if constexpr (sizeof(_Up) == 1)
3571 {
3572 if constexpr (sizeof(_UI) == 16)
3573 {
3574 if constexpr (__have_avx512bw_vl)
3575 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k));
3576 else if constexpr (__have_avx512bw)
3577 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k)));
3578 else if constexpr (__have_avx512f)
3579 {
3580 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3581 auto __as16bits
3582 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3583 __hi256(__as32bits)));
3584 return __intrin_bitcast<_UV>(
3585 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
3586 }
3587 else if constexpr (__have_ssse3)
3588 {
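                  // pshufb broadcasts mask byte 0 into lanes 0..7 and byte 1
                  // into lanes 8..15; ANDing each lane with its bit
                  // (1, 2, ..., 128) and comparing != 0 yields 0/-1 per byte.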
3589 const auto __bitmask = __to_intrin(
3590 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4,
3591 8, 16, 32, 64, 128));
3592 return __intrin_bitcast<_UV>(
3593 __vector_bitcast<_Up>(
3594 _mm_shuffle_epi8(__to_intrin(
3595 __vector_type_t<_ULLong, 2>{__k}),
3596 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1,
3597 1, 1, 1, 1, 1, 1, 1))
3598 & __bitmask)
3599 != 0);
3600 }
3601 // else fall through
3602 }
3603 else if constexpr (sizeof(_UI) == 32)
3604 {
3605 if constexpr (__have_avx512bw_vl)
3606 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k));
3607 else if constexpr (__have_avx512bw)
3608 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k)));
3609 else if constexpr (__have_avx512f)
3610 {
3611 auto __as16bits = // 0 16 1 17 ... 15 31
3612 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()),
3613 16)
3614 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16,
3615 ~__m512i()),
3616 16);
3617 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16(
3618 __lo256(__as16bits),
3619 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ...
3620 );
3621 // deinterleave:
3622 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8(
3623 __0_16_1_17, // 0 16 1 17 2 ...
3624 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9,
3625 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1,
3626 3, 5, 7, 9, 11, 13,
3627 15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3628 // 0-3 8-11 16-19 24-27
3629 // 4-7 12-15 20-23 28-31
3630 }
3631 else if constexpr (__have_avx2)
3632 {
3633 const auto __bitmask
3634 = _mm256_broadcastsi128_si256(__to_intrin(
3635 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
3636 4, 8, 16, 32, 64, 128)));
3637 return __vector_bitcast<_Up>(
3638 __vector_bitcast<_Up>(
3639 _mm256_shuffle_epi8(
3640 _mm256_broadcastsi128_si256(
3641 __to_intrin(__vector_type_t<_ULLong, 2>{__k})),
3642 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3643 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3644 3, 3, 3, 3, 3, 3))
3645 & __bitmask)
3646 != 0);
3647 }
3648 // else fall through
3649 }
3650 else if constexpr (sizeof(_UI) == 64)
3651 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k));
3652 if constexpr (std::min(_ToN, _Np) <= 4)
3653 {
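            // Spread the low 4 mask bits into separate bytes: multiplying by
            // 0x00204081 (1 + 2^7 + 2^14 + 2^21) moves bit i to bit 8*i, the
            // AND keeps one bit per byte, and * 0xff widens each bit to
            // 0x00/0xff. E.g. 0b1010 * 0x00204081 & 0x01010101 == 0x01000100,
            // which * 0xff gives 0xff00ff00.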
3654 if constexpr (_Np > 7) // avoid overflow
3655 __x &= _SanitizedBitMask<_Np>(0x0f);
3656 const _UInt __char_mask
3657 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL)
3658 * 0xff;
3659 _UV __r = {};
3660 __builtin_memcpy(&__r, &__char_mask,
3661 std::min(sizeof(__r), sizeof(__char_mask)));
3662 return __r;
3663 }
3664 else if constexpr (std::min(_ToN, _Np) <= 7)
3665 {
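            // Same bit-spreading trick widened to 64 bits:
            // 0x40810204081 == sum of 2^(7*i) for i in 0..6.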
3666 if constexpr (_Np > 7) // avoid overflow
3667 __x &= _SanitizedBitMask<_Np>(0x7f);
3668 const _ULLong __char_mask
3669 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL)
3670 * 0xff;
3671 _UV __r = {};
3672 __builtin_memcpy(&__r, &__char_mask,
3673 std::min(sizeof(__r), sizeof(__char_mask)));
3674 return __r;
3675 }
3676 }
3677 else if constexpr (sizeof(_Up) == 2)
3678 {
3679 if constexpr (sizeof(_UI) == 16)
3680 {
3681 if constexpr (__have_avx512bw_vl)
3682 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k));
3683 else if constexpr (__have_avx512bw)
3684 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k)));
3685 else if constexpr (__have_avx512f)
3686 {
3687 __m256i __as32bits = {};
3688 if constexpr (__have_avx512vl)
3689 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i());
3690 else
3691 __as32bits
3692 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()));
3693 return __intrin_bitcast<_UV>(
3694 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3695 }
3696 // else fall through
3697 }
3698 else if constexpr (sizeof(_UI) == 32)
3699 {
3700 if constexpr (__have_avx512bw_vl)
3701 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k));
3702 else if constexpr (__have_avx512bw)
3703 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k)));
3704 else if constexpr (__have_avx512f)
3705 {
3706 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3707 return __vector_bitcast<_Up>(
3708 __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3709 __hi256(__as32bits))));
3710 }
3711 // else fall through
3712 }
3713 else if constexpr (sizeof(_UI) == 64)
3714 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k));
3715 }
3716 else if constexpr (sizeof(_Up) == 4)
3717 {
3718 if constexpr (sizeof(_UI) == 16)
3719 {
3720 if constexpr (__have_avx512dq_vl)
3721 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k));
3722 else if constexpr (__have_avx512dq)
3723 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k)));
3724 else if constexpr (__have_avx512vl)
3725 return __intrin_bitcast<_UV>(
3726 _mm_maskz_mov_epi32(__k, ~__m128i()));
3727 else if constexpr (__have_avx512f)
3728 return __intrin_bitcast<_UV>(
3729 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3730 // else fall through
3731 }
3732 else if constexpr (sizeof(_UI) == 32)
3733 {
3734 if constexpr (__have_avx512dq_vl)
3735 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k));
3736 else if constexpr (__have_avx512dq)
3737 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k)));
3738 else if constexpr (__have_avx512vl)
3739 return __vector_bitcast<_Up>(
3740 _mm256_maskz_mov_epi32(__k, ~__m256i()));
3741 else if constexpr (__have_avx512f)
3742 return __vector_bitcast<_Up>(
3743 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3744 // else fall through
3745 }
3746 else if constexpr (sizeof(_UI) == 64)
3747 return __vector_bitcast<_Up>(
3748 __have_avx512dq ? _mm512_movm_epi32(__k)
3749 : _mm512_maskz_mov_epi32(__k, ~__m512i()));
3750 }
3751 else if constexpr (sizeof(_Up) == 8)
3752 {
3753 if constexpr (sizeof(_UI) == 16)
3754 {
3755 if constexpr (__have_avx512dq_vl)
3756 return __vector_bitcast<_Up>(_mm_movm_epi64(__k));
3757 else if constexpr (__have_avx512dq)
3758 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k)));
3759 else if constexpr (__have_avx512vl)
3760 return __vector_bitcast<_Up>(
3761 _mm_maskz_mov_epi64(__k, ~__m128i()));
3762 else if constexpr (__have_avx512f)
3763 return __vector_bitcast<_Up>(
3764 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3765 // else fall through
3766 }
3767 else if constexpr (sizeof(_UI) == 32)
3768 {
3769 if constexpr (__have_avx512dq_vl)
3770 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k));
3771 else if constexpr (__have_avx512dq)
3772 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k)));
3773 else if constexpr (__have_avx512vl)
3774 return __vector_bitcast<_Up>(
3775 _mm256_maskz_mov_epi64(__k, ~__m256i()));
3776 else if constexpr (__have_avx512f)
3777 return __vector_bitcast<_Up>(
3778 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3779 // else fall through
3780 }
3781 else if constexpr (sizeof(_UI) == 64)
3782 return __vector_bitcast<_Up>(
3783 __have_avx512dq ? _mm512_movm_epi64(__k)
3784 : _mm512_maskz_mov_epi64(__k, ~__m512i()));
3785 }
3786
3787 using _UpUInt = make_unsigned_t<_Up>;
3788 using _V = __vector_type_t<_UpUInt, _ToN>;
3789 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__;
3790 if constexpr (_ToN == 2)
3791 {
3792 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])});
3793 }
3794 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32)
3795 {
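          // Without AVX2 there are no 256-bit integer compares: broadcast __k,
          // isolate each lane's bit with an FP-domain AND, and compare != 0
          // (_CMP_NEQ_UQ) against zero to produce the all-ones lanes.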
3796 if constexpr (sizeof(_Up) == 4)
3797 return __vector_bitcast<_Up>(_mm256_cmp_ps(
3798 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)),
3799 _mm256_castsi256_ps(_mm256_setr_epi32(
3800 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))),
3801 _mm256_setzero_ps(), _CMP_NEQ_UQ));
3802 else if constexpr (sizeof(_Up) == 8)
3803 return __vector_bitcast<_Up>(_mm256_cmp_pd(
3804 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)),
3805 _mm256_castsi256_pd(
3806 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))),
3807 _mm256_setzero_pd(), _CMP_NEQ_UQ));
3808 else
3809 __assert_unreachable<_Up>();
3810 }
3811 else if constexpr (__bits_per_element >= _ToN)
3812 {
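          // Generic fallback: broadcast __k into every lane, AND with a
          // per-lane single-bit constant, and turn nonzero lanes into all-ones.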
3813 constexpr auto __bitmask
3814 = __generate_vector<_V>([](auto __i)
3815 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt
3816 { return __i < _ToN ? 1ull << __i : 0; });
3817 const auto __bits
3818 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
3819 if constexpr (__bits_per_element > _ToN)
3820 return __vector_bitcast<_Up>(__bits) > 0;
3821 else
3822 return __vector_bitcast<_Up>(__bits != 0);
3823 }
3824 else
3825 {
3826 const _V __tmp
3827 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3828 return static_cast<_UpUInt>(
3829 __k >> (__bits_per_element * (__i / __bits_per_element)));
3830 })
3831 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3832 return static_cast<_UpUInt>(1ull
3833 << (__i % __bits_per_element));
3834 }); // mask bit index
3835 return __intrin_bitcast<_UV>(__tmp != _V());
3836 }
3837 }
3838
3839 // }}}
3840 // _S_to_maskvector(_SimdWrapper) {{{
3841 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
3842 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3843 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3844 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
3845 {
3846 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3847 using _TW = _SimdWrapper<_Tp, _Np>;
3848 using _UW = _SimdWrapper<_Up, _ToN>;
3849 using _UI = __intrinsic_type_t<_Up, _ToN>;
3850 if constexpr (is_same_v<_Tp, bool>) // bits -> vector
3851 return _S_to_maskvector<_Up, _ToN>(
3852 _BitMask<_Np>(__x._M_data)._M_sanitized());
3853 // vector -> vector bitcast
3854 else if constexpr (sizeof(_Up) == sizeof(_Tp)
3855 && sizeof(_TW) == sizeof(_UW))
3856 return __wrapper_bitcast<_Up, _ToN>(
3857 _ToN <= _Np
3858 ? __x
3859 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x));
3860 else // vector -> vector {{{
3861 {
3862 if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3863 {
3864 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
3865 return __generate_from_n_evaluations<std::min(_ToN, _Np),
3866 __vector_type_t<_Up, _ToN>>(
3867 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; });
3868 }
3869 using _To = __vector_type_t<_Up, _ToN>;
3870 [[maybe_unused]] constexpr size_t _FromN = _Np;
3871 constexpr int _FromBytes = sizeof(_Tp);
3872 constexpr int _ToBytes = sizeof(_Up);
3873 const auto __k = __x._M_data;
3874
3875 if constexpr (_FromBytes == _ToBytes)
3876 return __intrin_bitcast<_To>(__k);
3877 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16)
3878 { // SSE -> SSE {{{
3879 if constexpr (_FromBytes == 4 && _ToBytes == 8)
3880 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3881 else if constexpr (_FromBytes == 2 && _ToBytes == 8)
3882 {
3883 const auto __y
3884 = __vector_bitcast<int>(__interleave128_lo(__k, __k));
3885 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3886 }
3887 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3888 {
3889 auto __y
3890 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3891 auto __z
3892 = __vector_bitcast<int>(__interleave128_lo(__y, __y));
3893 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z));
3894 }
3895 else if constexpr (_FromBytes == 8 && _ToBytes == 4
3896 && __have_sse2)
3897 return __intrin_bitcast<_To>(
3898 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3899 else if constexpr (_FromBytes == 8 && _ToBytes == 4)
3900 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k),
3901 _UI());
3902 else if constexpr (_FromBytes == 2 && _ToBytes == 4)
3903 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3904 else if constexpr (_FromBytes == 1 && _ToBytes == 4)
3905 {
3906 const auto __y
3907 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3908 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3909 }
3910 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
3911 {
3912 if constexpr (__have_sse2 && !__have_ssse3)
3913 return __intrin_bitcast<_To>(_mm_packs_epi32(
3914 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()),
3915 __m128i()));
3916 else
3917 return __intrin_bitcast<_To>(
3918 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(
3919 __vector_bitcast<_Up>(__k)));
3920 }
3921 else if constexpr (_FromBytes == 4 && _ToBytes == 2)
3922 return __intrin_bitcast<_To>(
3923 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3924 else if constexpr (_FromBytes == 1 && _ToBytes == 2)
3925 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3926 else if constexpr (_FromBytes == 8 && _ToBytes == 1
3927 && __have_ssse3)
3928 return __intrin_bitcast<_To>(
3929 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3930 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
3931 -1, -1, -1, -1, -1, -1, -1,
3932 -1)));
3933 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
3934 {
3935 auto __y
3936 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3937 __y = _mm_packs_epi32(__y, __m128i());
3938 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3939 }
3940 else if constexpr (_FromBytes == 4 && _ToBytes == 1
3941 && __have_ssse3)
3942 return __intrin_bitcast<_To>(
3943 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3944 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
3945 -1, -1, -1, -1, -1, -1, -1,
3946 -1)));
3947 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
3948 {
3949 const auto __y
3950 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3951 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3952 }
3953 else if constexpr (_FromBytes == 2 && _ToBytes == 1)
3954 return __intrin_bitcast<_To>(
3955 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()));
3956 else
3957 __assert_unreachable<_Tp>();
3958 } // }}}
3959 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32)
3960 { // AVX -> AVX {{{
3961 if constexpr (_FromBytes == _ToBytes)
3962 __assert_unreachable<_Tp>();
3963 else if constexpr (_FromBytes == _ToBytes * 2)
3964 {
3965 const auto __y = __vector_bitcast<_LLong>(__k);
3966 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
3967 _mm_packs_epi16(__lo128(__y), __hi128(__y))));
3968 }
3969 else if constexpr (_FromBytes == _ToBytes * 4)
3970 {
3971 const auto __y = __vector_bitcast<_LLong>(__k);
3972 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
3973 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
3974 __m128i())));
3975 }
3976 else if constexpr (_FromBytes == _ToBytes * 8)
3977 {
3978 const auto __y = __vector_bitcast<_LLong>(__k);
3979 return __intrin_bitcast<_To>(
3980 _mm256_castsi128_si256(_mm_shuffle_epi8(
3981 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
3982 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
3983 -1, -1, -1, -1, -1))));
3984 }
3985 else if constexpr (_FromBytes * 2 == _ToBytes)
3986 {
3987 auto __y = __xzyw(__to_intrin(__k));
3988                      if constexpr (is_floating_point_v<_Tp>
3989                                      || (!__have_avx2 && _FromBytes == 4))
3990 {
3991 const auto __yy = __vector_bitcast<float>(__y);
3992 return __intrin_bitcast<_To>(
3993 _mm256_unpacklo_ps(__yy, __yy));
3994 }
3995 else
3996 return __intrin_bitcast<_To>(
3997 _mm256_unpacklo_epi8(__y, __y));
3998 }
3999 else if constexpr (_FromBytes * 4 == _ToBytes)
4000 {
4001 auto __y
4002 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4003 __lo128(__vector_bitcast<_LLong>(
4004 __k))); // drops 3/4 of input
4005 return __intrin_bitcast<_To>(
4006 __concat(_mm_unpacklo_epi16(__y, __y),
4007 _mm_unpackhi_epi16(__y, __y)));
4008 }
4009 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
4010 {
4011 auto __y
4012 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4013 __lo128(__vector_bitcast<_LLong>(
4014 __k))); // drops 3/4 of input
4015 __y
4016 = _mm_unpacklo_epi16(__y,
4017 __y); // drops another 1/2 => 7/8 total
4018 return __intrin_bitcast<_To>(
4019 __concat(_mm_unpacklo_epi32(__y, __y),
4020 _mm_unpackhi_epi32(__y, __y)));
4021 }
4022 else
4023 __assert_unreachable<_Tp>();
4024 } // }}}
4025 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16)
4026 { // SSE -> AVX {{{
4027 if constexpr (_FromBytes == _ToBytes)
4028 return __intrin_bitcast<_To>(
4029 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>(
4030 __zero_extend(__to_intrin(__k))));
4031 else if constexpr (_FromBytes * 2 == _ToBytes)
4032 { // keep all
4033 return __intrin_bitcast<_To>(
4034 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k),
4035 __vector_bitcast<_LLong>(__k)),
4036 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k),
4037 __vector_bitcast<_LLong>(__k))));
4038 }
4039 else if constexpr (_FromBytes * 4 == _ToBytes)
4040 {
4041 if constexpr (__have_avx2)
4042 {
4043 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4044 __concat(__vector_bitcast<_LLong>(__k),
4045 __vector_bitcast<_LLong>(__k)),
4046 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
4047 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
4048 6, 6, 7, 7, 7, 7)));
4049 }
4050 else
4051 {
4052 return __intrin_bitcast<_To>(__concat(
4053 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4054 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
4055 2, 2, 2, 2, 3, 3, 3, 3)),
4056 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4057 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
4058 6, 6, 6, 6, 7, 7, 7,
4059 7))));
4060 }
4061 }
4062 else if constexpr (_FromBytes * 8 == _ToBytes)
4063 {
4064 if constexpr (__have_avx2)
4065 {
4066 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4067 __concat(__vector_bitcast<_LLong>(__k),
4068 __vector_bitcast<_LLong>(__k)),
4069 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
4070 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
4071 3, 3, 3, 3, 3, 3)));
4072 }
4073 else
4074 {
4075 return __intrin_bitcast<_To>(__concat(
4076 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4077 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
4078 1, 1, 1, 1, 1, 1, 1, 1)),
4079 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4080 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
4081 3, 3, 3, 3, 3, 3, 3,
4082 3))));
4083 }
4084 }
4085 else if constexpr (_FromBytes == _ToBytes * 2)
4086 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4087 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()))));
4088 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
4089 {
4090 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4091 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4092 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
4093 -1, -1, -1, -1, -1, -1, -1,
4094 -1)))));
4095 }
4096 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4097 {
4098 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4099 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4100 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4101 -1, -1, -1, -1, -1, -1, -1,
4102 -1)))));
4103 }
4104 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4105 {
4106 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4107 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4108 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
4109 -1, -1, -1, -1, -1, -1, -1,
4110 -1, -1)))));
4111 }
4112 else
4113 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4114 } // }}}
4115 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32)
4116 { // AVX -> SSE {{{
4117 if constexpr (_FromBytes == _ToBytes)
4118 { // keep low 1/2
4119 return __intrin_bitcast<_To>(__lo128(__k));
4120 }
4121 else if constexpr (_FromBytes == _ToBytes * 2)
4122 { // keep all
4123 auto __y = __vector_bitcast<_LLong>(__k);
4124 return __intrin_bitcast<_To>(
4125 _mm_packs_epi16(__lo128(__y), __hi128(__y)));
4126 }
4127 else if constexpr (_FromBytes == _ToBytes * 4)
4128 { // add 1/2 undef
4129 auto __y = __vector_bitcast<_LLong>(__k);
4130 return __intrin_bitcast<_To>(
4131 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4132 __m128i()));
4133 }
4134 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4135 { // add 3/4 undef
4136 auto __y = __vector_bitcast<_LLong>(__k);
4137 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
4138 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4139 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
4140 -1, -1, -1, -1)));
4141 }
4142 else if constexpr (_FromBytes * 2 == _ToBytes)
4143 { // keep low 1/4
4144 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4145 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4146 }
4147 else if constexpr (_FromBytes * 4 == _ToBytes)
4148 { // keep low 1/8
4149 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4150 __y = _mm_unpacklo_epi8(__y, __y);
4151 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4152 }
4153 else if constexpr (_FromBytes * 8 == _ToBytes)
4154 { // keep low 1/16
4155 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4156 __y = _mm_unpacklo_epi8(__y, __y);
4157 __y = _mm_unpacklo_epi8(__y, __y);
4158 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4159 }
4160 else
4161 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4162 } // }}}
4163 else
4164 return _Base::template _S_to_maskvector<_Up, _ToN>(__x);
4165 /*
4166 if constexpr (_FromBytes > _ToBytes) {
4167 const _To __y = __vector_bitcast<_Up>(__k);
4168 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4169 constexpr int _Stride = _FromBytes / _ToBytes;
4170 return _To{__y[(_Is + 1) * _Stride - 1]...};
4171 }(make_index_sequence<std::min(_ToN, _FromN)>());
4172 } else {
4173 // {0, 0, 1, 1} (_Dups = 2, _Is<4>)
4174 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>)
4175 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>)
4176 // ...
4177 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4178 constexpr int __dup = _ToBytes / _FromBytes;
4179 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...});
4180 }(make_index_sequence<_FromN>());
4181 }
4182 */
4183 } // }}}
4184 }
4185
4186 // }}}
4187 // _S_to_bits {{{
4188 template <typename _Tp, size_t _Np>
4189 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
4190 _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
4191 {
4192 if constexpr (is_same_v<_Tp, bool>)
4193 return _BitMask<_Np>(__x._M_data)._M_sanitized();
4194 else
4195 {
4196 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4197 if (__builtin_is_constant_evaluated()
4198 || __builtin_constant_p(__x._M_data))
4199 {
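              // Constant folding: -__x yields 0/1 per element; collect element
              // __i as bit __i. Use the result only if it actually folded to a
              // constant, otherwise fall through to the intrinsic paths below.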
4200 const auto __bools = -__x._M_data;
4201 const _ULLong __k = __call_with_n_evaluations<_Np>(
4202 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4203 return (__bits | ...);
4204 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4205 return _ULLong(__bools[+__i]) << __i;
4206 });
4207 if (__builtin_is_constant_evaluated()
4208 || __builtin_constant_p(__k))
4209 return __k;
4210 }
4211 const auto __xi = __to_intrin(__x);
4212 if constexpr (sizeof(_Tp) == 1)
4213 if constexpr (sizeof(__xi) == 16)
4214 if constexpr (__have_avx512bw_vl)
4215 return _BitMask<_Np>(_mm_movepi8_mask(__xi));
4216 else // implies SSE2
4217 return _BitMask<_Np>(_mm_movemask_epi8(__xi));
4218 else if constexpr (sizeof(__xi) == 32)
4219 if constexpr (__have_avx512bw_vl)
4220 return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
4221 else // implies AVX2
4222 return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
4223 else // implies AVX512BW
4224 return _BitMask<_Np>(_mm512_movepi8_mask(__xi));
4225
4226 else if constexpr (sizeof(_Tp) == 2)
4227 if constexpr (sizeof(__xi) == 16)
4228 if constexpr (__have_avx512bw_vl)
4229 return _BitMask<_Np>(_mm_movepi16_mask(__xi));
4230 else if constexpr (__have_avx512bw)
4231 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4232 else // implies SSE2
4233 return _BitMask<_Np>(
4234 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
4235 else if constexpr (sizeof(__xi) == 32)
4236 if constexpr (__have_avx512bw_vl)
4237 return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
4238 else if constexpr (__have_avx512bw)
4239 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4240 else // implies SSE2
4241 return _BitMask<_Np>(_mm_movemask_epi8(
4242 _mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
4243 else // implies AVX512BW
4244 return _BitMask<_Np>(_mm512_movepi16_mask(__xi));
4245
4246 else if constexpr (sizeof(_Tp) == 4)
4247 if constexpr (sizeof(__xi) == 16)
4248 if constexpr (__have_avx512dq_vl)
4249 return _BitMask<_Np>(_mm_movepi32_mask(__xi));
4250 else if constexpr (__have_avx512vl)
4251 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
4252 else if constexpr (__have_avx512dq)
4253 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4254 else if constexpr (__have_avx512f)
4255 return _BitMask<_Np>(
4256 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4257 else // implies SSE
4258 return _BitMask<_Np>(
4259 _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
4260 else if constexpr (sizeof(__xi) == 32)
4261 if constexpr (__have_avx512dq_vl)
4262 return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
4263 else if constexpr (__have_avx512dq)
4264 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4265 else if constexpr (__have_avx512vl)
4266 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
4267 else if constexpr (__have_avx512f)
4268 return _BitMask<_Np>(
4269 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4270 else // implies AVX
4271 return _BitMask<_Np>(
4272 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
4273            else // 64-byte vector: implies at least AVX512F
4274 if constexpr (__have_avx512dq)
4275 return _BitMask<_Np>(_mm512_movepi32_mask(__xi));
4276 else // implies AVX512F
4277 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i()));
4278
4279 else if constexpr (sizeof(_Tp) == 8)
4280 if constexpr (sizeof(__xi) == 16)
4281 if constexpr (__have_avx512dq_vl)
4282 return _BitMask<_Np>(_mm_movepi64_mask(__xi));
4283 else if constexpr (__have_avx512dq)
4284 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4285 else if constexpr (__have_avx512vl)
4286 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i()));
4287 else if constexpr (__have_avx512f)
4288 return _BitMask<_Np>(
4289 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4290 else // implies SSE2
4291 return _BitMask<_Np>(
4292 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
4293 else if constexpr (sizeof(__xi) == 32)
4294 if constexpr (__have_avx512dq_vl)
4295 return _BitMask<_Np>(_mm256_movepi64_mask(__xi));
4296 else if constexpr (__have_avx512dq)
4297 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4298 else if constexpr (__have_avx512vl)
4299 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i()));
4300 else if constexpr (__have_avx512f)
4301 return _BitMask<_Np>(
4302 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4303 else // implies AVX
4304 return _BitMask<_Np>(
4305 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
4306            else // 64-byte vector: implies at least AVX512F
4307 if constexpr (__have_avx512dq)
4308 return _BitMask<_Np>(_mm512_movepi64_mask(__xi));
4309 else // implies AVX512F
4310 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i()));
4311
4312 else
4313 __assert_unreachable<_Tp>();
4314 }
4315 }
4316 // }}}
4317};
4318
4319// }}}
4320// _MaskImplX86 {{{
4321template <typename _Abi, typename>
4322 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
4323 {
4324 using _MaskImplX86Mixin::_S_to_bits;
4325 using _MaskImplX86Mixin::_S_to_maskvector;
4326 using _MaskImplBuiltin<_Abi>::_S_convert;
4327
4328 // member types {{{
4329 template <typename _Tp>
4330 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
4331
4332 template <typename _Tp>
4333 using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
4334
4335 template <typename _Tp>
4336 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
4337
4338 using _Base = _MaskImplBuiltin<_Abi>;
4339
4340 // }}}
4341 // _S_broadcast {{{
4342 template <typename _Tp>
4343 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4344 _S_broadcast(bool __x)
4345 {
4346 if constexpr (__is_avx512_abi<_Abi>())
4347 return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1))
4348 : _MaskMember<_Tp>();
4349 else
4350 return _Base::template _S_broadcast<_Tp>(__x);
4351 }
4352
4353 // }}}
4354 // _S_load {{{
4355 template <typename _Tp>
4356 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4357 _S_load(const bool* __mem)
4358 {
4359 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4360 if constexpr (__have_avx512bw)
4361 {
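          // bools are stored as bytes 0/1; testing a register against itself
          // (vptestmb) sets mask bit i iff byte i is nonzero. For AVX-512 ABIs
          // the bitmask already is the mask member; otherwise expand it into a
          // vector mask.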
4362 const auto __to_vec_or_bits
4363 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) {
4364 if constexpr (__is_avx512_abi<_Abi>())
4365 return __bits;
4366 else
4367 return _S_to_maskvector<_Tp>(
4368 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
4369 };
4370
4371 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
4372 {
4373 __m128i __a = {};
4374 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4375 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
4376 }
4377 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl)
4378 {
4379 __m256i __a = {};
4380 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4381 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
4382 }
4383 else if constexpr (_S_size<_Tp> <= 64)
4384 {
4385 __m512i __a = {};
4386 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4387 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
4388 }
4389 }
4390 else if constexpr (__is_avx512_abi<_Abi>())
4391 {
4392 if constexpr (_S_size<_Tp> <= 8)
4393 {
4394 __m128i __a = {};
4395 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4396 const auto __b = _mm512_cvtepi8_epi64(__a);
4397 return _mm512_test_epi64_mask(__b, __b);
4398 }
4399 else if constexpr (_S_size<_Tp> <= 16)
4400 {
4401 __m128i __a = {};
4402 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4403 const auto __b = _mm512_cvtepi8_epi32(__a);
4404 return _mm512_test_epi32_mask(__b, __b);
4405 }
4406 else if constexpr (_S_size<_Tp> <= 32)
4407 {
4408 __m128i __a = {};
4409 __builtin_memcpy(&__a, __mem, 16);
4410 const auto __b = _mm512_cvtepi8_epi32(__a);
4411 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16);
4412 const auto __c = _mm512_cvtepi8_epi32(__a);
4413 return _mm512_test_epi32_mask(__b, __b)
4414 | (_mm512_test_epi32_mask(__c, __c) << 16);
4415 }
4416 else if constexpr (_S_size<_Tp> <= 64)
4417 {
4418 __m128i __a = {};
4419 __builtin_memcpy(&__a, __mem, 16);
4420 const auto __b = _mm512_cvtepi8_epi32(__a);
4421 __builtin_memcpy(&__a, __mem + 16, 16);
4422 const auto __c = _mm512_cvtepi8_epi32(__a);
4423 if constexpr (_S_size<_Tp> <= 48)
4424 {
4425 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32);
4426 const auto __d = _mm512_cvtepi8_epi32(__a);
4427 return _mm512_test_epi32_mask(__b, __b)
4428 | (_mm512_test_epi32_mask(__c, __c) << 16)
4429 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
4430 }
4431 else
4432 {
4433		    __builtin_memcpy(&__a, __mem + 32, 16);
4434		    const auto __d = _mm512_cvtepi8_epi32(__a);
4435		    __builtin_memcpy(&__a, __mem + 48, _S_size<_Tp> - 48);
4436 const auto __e = _mm512_cvtepi8_epi32(__a);
4437 return _mm512_test_epi32_mask(__b, __b)
4438 | (_mm512_test_epi32_mask(__c, __c) << 16)
4439 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
4440 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
4441 }
4442 }
4443 else
4444 __assert_unreachable<_Tp>();
4445 }
4446 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2)
4447 return __vector_bitcast<_Tp>(
4448 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]),
4449 -int(__mem[1]), -int(__mem[1])});
4450 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx)
4451 {
4452 int __bool4 = 0;
4453 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>);
4454 const auto __k = __to_intrin(
4455 (__vector_broadcast<4>(__bool4)
4456 & __make_vector<int>(0x1, 0x100, 0x10000,
4457 _S_size<_Tp> == 4 ? 0x1000000 : 0))
4458 != 0);
4459 return __vector_bitcast<_Tp>(
4460 __concat(_mm_unpacklo_epi32(__k, __k),
4461 _mm_unpackhi_epi32(__k, __k)));
4462 }
4463 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4)
4464 {
4465 int __bools = 0;
4466 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>);
4467 if constexpr (__have_sse2)
4468 {
4469 __m128i __k = _mm_cvtsi32_si128(__bools);
4470 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4471 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4472 _mm_unpacklo_epi16(__k, __k));
4473 }
4474 else
4475 {
4476 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
4477 _mm_empty();
4478 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4479 _mm_cmpgt_ps(__k, __m128()));
4480 }
4481 }
4482 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8)
4483 {
4484 __m128i __k = {};
4485 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4486 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4487 return __vector_bitcast<_Tp>(
4488 __concat(_mm_unpacklo_epi16(__k, __k),
4489 _mm_unpackhi_epi16(__k, __k)));
4490 }
4491 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16)
4492 {
4493 __m128i __k = {};
4494 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4495 __k = _mm_cmpgt_epi8(__k, __m128i());
4496 if constexpr (_S_size<_Tp> <= 8)
4497 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4498 _mm_unpacklo_epi8(__k, __k));
4499 else
4500 return __concat(_mm_unpacklo_epi8(__k, __k),
4501 _mm_unpackhi_epi8(__k, __k));
4502 }
4503 else
4504 return _Base::template _S_load<_Tp>(__mem);
4505 }
4506
4507 // }}}
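    // Usage sketch (assuming the element-aligned mask load lowers to _S_load
    // on this ABI; __flags is a hypothetical array of packed bools):
    //
    //   namespace stdx = std::experimental;
    //   bool __flags[stdx::native_simd_mask<int>::size()] = {true};
    //   stdx::native_simd_mask<int> __k;
    //   __k.copy_from(__flags, stdx::element_aligned);
    //
    // With AVX512BW each byte is tested directly via *_test_epi8_mask;
    // otherwise the bools are widened and compared against zero as in the
    // branches above.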
4508 // _S_from_bitmask{{{
4509 template <size_t _Np, typename _Tp>
4510 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
4511 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
4512 {
4513 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4514 if constexpr (__is_avx512_abi<_Abi>())
4515 return __bits._M_to_bits();
4516 else
4517 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
4518 }
4519
4520 // }}}
4521 // _S_masked_load {{{2
4522 template <typename _Tp, size_t _Np>
4523 static inline _SimdWrapper<_Tp, _Np>
4524 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
4525 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
4526 {
4527 if constexpr (__is_avx512_abi<_Abi>())
4528 {
4529 if constexpr (__have_avx512bw_vl)
4530 {
4531 if constexpr (_Np <= 16)
4532 {
4533 const auto __a
4534 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem);
4535 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a);
4536 }
4537 else if constexpr (_Np <= 32)
4538 {
4539 const auto __a
4540 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem);
4541 return (__merge & ~__mask)
4542 | _mm256_test_epi8_mask(__a, __a);
4543 }
4544 else if constexpr (_Np <= 64)
4545 {
4546 const auto __a
4547 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem);
4548 return (__merge & ~__mask)
4549 | _mm512_test_epi8_mask(__a, __a);
4550 }
4551 else
4552 __assert_unreachable<_Tp>();
4553 }
4554 else
4555 {
4556 _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4557 __merge._M_set(__i, __mem[__i]);
4558 });
4559 return __merge;
4560 }
4561 }
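	// The *_mask_sub_* intrinsics below compute 0 - __b under the mask
	// __k, turning the loaded 0/1 bool bytes into 0/-1 vector-mask
	// elements with a single masked instruction.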
4562 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1)
4563 {
4564 const auto __k = _S_to_bits(__mask)._M_to_bits();
4565 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(),
4566 _mm256_mask_loadu_epi8(__m256i(),
4567 __k, __mem));
4568 }
4569 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
4570 {
4571 const auto __k = _S_to_bits(__mask)._M_to_bits();
4572 __merge
4573 = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
4574 __m128i(),
4575 _mm_mask_loadu_epi8(__m128i(), __k, __mem));
4576 }
4577 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
4578 {
4579 const auto __k = _S_to_bits(__mask)._M_to_bits();
4580 __merge = _mm256_mask_sub_epi16(
4581 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4582 _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4583 }
4584 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
4585 {
4586 const auto __k = _S_to_bits(__mask)._M_to_bits();
4587 __merge = _mm_mask_sub_epi16(
4588 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4589 _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
4590 }
4591 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
4592 {
4593 const auto __k = _S_to_bits(__mask)._M_to_bits();
4594 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
4595 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4596 _mm256_cvtepi8_epi32(
4597 _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4598 }
4599 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
4600 {
4601 const auto __k = _S_to_bits(__mask)._M_to_bits();
4602 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
4603 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4604 _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4605 }
4606 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
4607 {
4608 const auto __k = _S_to_bits(__mask)._M_to_bits();
4609 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
4610 __vector_bitcast<_LLong>(__merge), __k, __m256i(),
4611 _mm256_cvtepi8_epi64(
4612 _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4613 }
4614 else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
4615 {
4616 const auto __k = _S_to_bits(__mask)._M_to_bits();
4617 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
4618 __vector_bitcast<_LLong>(__merge), __k, __m128i(),
4619 _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
4620 }
4621 else
4622 return _Base::_S_masked_load(__merge, __mask, __mem);
4623 return __merge;
4624 }
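    // Frontend sketch (assuming where() on a simd_mask reaches
    // _S_masked_load here; all names are local to the example):
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd_mask<int> __k(false), __sel(true);
    //   bool __mem[stdx::native_simd_mask<int>::size()] = {true};
    //   stdx::where(__sel, __k).copy_from(__mem, stdx::element_aligned);
    //
    // Only the elements selected by __sel are re-read; the rest keep their
    // value in __merge (here: __k).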
4625
4626 // _S_store {{{2
4627 template <typename _Tp, size_t _Np>
4628 _GLIBCXX_SIMD_INTRINSIC static void
4629 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
4630 {
4631 if constexpr (__is_avx512_abi<_Abi>())
4632 {
4633 if constexpr (__have_avx512bw_vl)
4634 _CommonImplX86::_S_store<_Np>(
4635 __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4636 if constexpr (_Np <= 16)
4637 return _mm_maskz_set1_epi8(__data, 1);
4638 else if constexpr (_Np <= 32)
4639 return _mm256_maskz_set1_epi8(__data, 1);
4640 else
4641 return _mm512_maskz_set1_epi8(__data, 1);
4642 }(__v._M_data)),
4643 __mem);
4644 else if constexpr (_Np <= 8)
4645 _CommonImplX86::_S_store<_Np>(
4646 __vector_bitcast<char>(
4647#if defined __x86_64__
4648 __make_wrapper<_ULLong>(
4649 _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
4650#else
4651 __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
4652 _pdep_u32(__v._M_data >> 4,
4653 0x01010101U))
4654#endif
4655 ),
4656 __mem);
4657 else if constexpr (_Np <= 16)
4658 _mm512_mask_cvtepi32_storeu_epi8(
4659 __mem, 0xffffu >> (16 - _Np),
4660 _mm512_maskz_set1_epi32(__v._M_data, 1));
4661 else
4662 __assert_unreachable<_Tp>();
4663 }
4664 else if constexpr (__is_sse_abi<_Abi>()) //{{{
4665 {
4666 if constexpr (_Np == 2 && sizeof(_Tp) == 8)
4667 {
4668 const auto __k = __vector_bitcast<int>(__v);
4669 __mem[0] = -__k[1];
4670 __mem[1] = -__k[3];
4671 }
4672 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
4673 {
4674 if constexpr (__have_sse2)
4675 {
4676 const unsigned __bool4
4677 = __vector_bitcast<_UInt>(_mm_packs_epi16(
4678 _mm_packs_epi32(__intrin_bitcast<__m128i>(
4679 __to_intrin(__v)),
4680 __m128i()),
4681 __m128i()))[0]
4682 & 0x01010101u;
4683 __builtin_memcpy(__mem, &__bool4, _Np);
4684 }
4685 else if constexpr (__have_mmx)
4686 {
4687 const __m64 __k = _mm_cvtps_pi8(
4688 __and(__to_intrin(__v), _mm_set1_ps(1.f)));
4689 __builtin_memcpy(__mem, &__k, _Np);
4690 _mm_empty();
4691 }
4692 else
4693 return _Base::_S_store(__v, __mem);
4694 }
4695 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
4696 {
4697 _CommonImplX86::_S_store<_Np>(
4698 __vector_bitcast<char>(_mm_packs_epi16(
4699 __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
4700 __m128i())),
4701 __mem);
4702 }
4703 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
4704 _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
4705 else
4706 __assert_unreachable<_Tp>();
4707 } // }}}
4708 else if constexpr (__is_avx_abi<_Abi>()) // {{{
4709 {
4710 if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
4711 {
4712 auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4713 int __bool4;
4714 if constexpr (__have_avx2)
4715 __bool4 = _mm256_movemask_epi8(__k);
4716 else
4717 __bool4 = (_mm_movemask_epi8(__lo128(__k))
4718 | (_mm_movemask_epi8(__hi128(__k)) << 16));
4719 __bool4 &= 0x01010101;
4720 __builtin_memcpy(__mem, &__bool4, _Np);
4721 }
4722 else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
4723 {
4724 const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
4725 const auto __k2
4726 = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
4727 15);
4728 const auto __k3
4729 = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
4730 _CommonImplX86::_S_store<_Np>(__k3, __mem);
4731 }
4732 else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
4733 {
4734 if constexpr (__have_avx2)
4735 {
4736 const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
4737 const auto __bools = __vector_bitcast<char>(
4738 _mm_packs_epi16(__lo128(__x), __hi128(__x)));
4739 _CommonImplX86::_S_store<_Np>(__bools, __mem);
4740 }
4741 else
4742 {
4743 const auto __bools
4744 = 1
4745 & __vector_bitcast<_UChar>(
4746 _mm_packs_epi16(__lo128(__to_intrin(__v)),
4747 __hi128(__to_intrin(__v))));
4748 _CommonImplX86::_S_store<_Np>(__bools, __mem);
4749 }
4750 }
4751 else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
4752 _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
4753 else
4754 __assert_unreachable<_Tp>();
4755 } // }}}
4756 else
4757 __assert_unreachable<_Tp>();
4758 }
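    // Usage sketch (assuming copy_to on a simd_mask lowers to _S_store on
    // this ABI): each element is written as one 0/1 byte.
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd_mask<float> __k(true);
    //   bool __out[stdx::native_simd_mask<float>::size()];
    //   __k.copy_to(__out, stdx::element_aligned);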
4759
4760 // _S_masked_store {{{2
4761 template <typename _Tp, size_t _Np>
4762 static inline void
4763 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
4764 const _SimdWrapper<_Tp, _Np> __k) noexcept
4765 {
4766 if constexpr (__is_avx512_abi<_Abi>())
4767 {
4768 static_assert(is_same_v<_Tp, bool>);
4769 if constexpr (_Np <= 16 && __have_avx512bw_vl)
4770 _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
4771 else if constexpr (_Np <= 16)
4772 _mm512_mask_cvtepi32_storeu_epi8(__mem, __k,
4773 _mm512_maskz_set1_epi32(__v, 1));
4774 else if constexpr (_Np <= 32 && __have_avx512bw_vl)
4775 _mm256_mask_storeu_epi8(__mem, __k,
4776 _mm256_maskz_set1_epi8(__v, 1));
4777 else if constexpr (_Np <= 32 && __have_avx512bw)
4778 _mm256_mask_storeu_epi8(__mem, __k,
4779 __lo256(_mm512_maskz_set1_epi8(__v, 1)));
4780 else if constexpr (_Np <= 64 && __have_avx512bw)
4781 _mm512_mask_storeu_epi8(__mem, __k,
4782 _mm512_maskz_set1_epi8(__v, 1));
4783 else
4784 __assert_unreachable<_Tp>();
4785 }
4786 else
4787 _Base::_S_masked_store(__v, __mem, __k);
4788 }
4789
4790 // logical and bitwise operators {{{2
4791 template <typename _Tp, size_t _Np>
4792 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4793 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4794 {
4795 if constexpr (is_same_v<_Tp, bool>)
4796 {
4797 if constexpr (__have_avx512dq && _Np <= 8)
4798 return _kand_mask8(__x._M_data, __y._M_data);
4799 else if constexpr (_Np <= 16)
4800 return _kand_mask16(__x._M_data, __y._M_data);
4801 else if constexpr (__have_avx512bw && _Np <= 32)
4802 return _kand_mask32(__x._M_data, __y._M_data);
4803 else if constexpr (__have_avx512bw && _Np <= 64)
4804 return _kand_mask64(__x._M_data, __y._M_data);
4805 else
4806 __assert_unreachable<_Tp>();
4807 }
4808 else
4809 return _Base::_S_logical_and(__x, __y);
4810 }
4811
4812 template <typename _Tp, size_t _Np>
4813 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4814 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4815 {
4816 if constexpr (is_same_v<_Tp, bool>)
4817 {
4818 if constexpr (__have_avx512dq && _Np <= 8)
4819 return _kor_mask8(__x._M_data, __y._M_data);
4820 else if constexpr (_Np <= 16)
4821 return _kor_mask16(__x._M_data, __y._M_data);
4822 else if constexpr (__have_avx512bw && _Np <= 32)
4823 return _kor_mask32(__x._M_data, __y._M_data);
4824 else if constexpr (__have_avx512bw && _Np <= 64)
4825 return _kor_mask64(__x._M_data, __y._M_data);
4826 else
4827 __assert_unreachable<_Tp>();
4828 }
4829 else
4830 return _Base::_S_logical_or(__x, __y);
4831 }
4832
4833 template <typename _Tp, size_t _Np>
4834 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4835 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
4836 {
4837 if constexpr (is_same_v<_Tp, bool>)
4838 {
4839 if constexpr (__have_avx512dq && _Np <= 8)
4840 return _kandn_mask8(__x._M_data,
4841 _Abi::template __implicit_mask_n<_Np>());
4842 else if constexpr (_Np <= 16)
4843 return _kandn_mask16(__x._M_data,
4844 _Abi::template __implicit_mask_n<_Np>());
4845 else if constexpr (__have_avx512bw && _Np <= 32)
4846 return _kandn_mask32(__x._M_data,
4847 _Abi::template __implicit_mask_n<_Np>());
4848 else if constexpr (__have_avx512bw && _Np <= 64)
4849 return _kandn_mask64(__x._M_data,
4850 _Abi::template __implicit_mask_n<_Np>());
4851 else
4852 __assert_unreachable<_Tp>();
4853 }
4854 else
4855 return _Base::_S_bit_not(__x);
4856 }
4857
4858 template <typename _Tp, size_t _Np>
4859 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4860 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4861 {
4862 if constexpr (is_same_v<_Tp, bool>)
4863 {
4864 if constexpr (__have_avx512dq && _Np <= 8)
4865 return _kand_mask8(__x._M_data, __y._M_data);
4866 else if constexpr (_Np <= 16)
4867 return _kand_mask16(__x._M_data, __y._M_data);
4868 else if constexpr (__have_avx512bw && _Np <= 32)
4869 return _kand_mask32(__x._M_data, __y._M_data);
4870 else if constexpr (__have_avx512bw && _Np <= 64)
4871 return _kand_mask64(__x._M_data, __y._M_data);
4872 else
4873 __assert_unreachable<_Tp>();
4874 }
4875 else
4876 return _Base::_S_bit_and(__x, __y);
4877 }
4878
4879 template <typename _Tp, size_t _Np>
4880 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4881 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4882 {
4883 if constexpr (is_same_v<_Tp, bool>)
4884 {
4885 if constexpr (__have_avx512dq && _Np <= 8)
4886 return _kor_mask8(__x._M_data, __y._M_data);
4887 else if constexpr (_Np <= 16)
4888 return _kor_mask16(__x._M_data, __y._M_data);
4889 else if constexpr (__have_avx512bw && _Np <= 32)
4890 return _kor_mask32(__x._M_data, __y._M_data);
4891 else if constexpr (__have_avx512bw && _Np <= 64)
4892 return _kor_mask64(__x._M_data, __y._M_data);
4893 else
4894 __assert_unreachable<_Tp>();
4895 }
4896 else
4897 return _Base::_S_bit_or(__x, __y);
4898 }
4899
4900 template <typename _Tp, size_t _Np>
4901 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
4902 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
4903 {
4904 if constexpr (is_same_v<_Tp, bool>)
4905 {
4906 if constexpr (__have_avx512dq && _Np <= 8)
4907 return _kxor_mask8(__x._M_data, __y._M_data);
4908 else if constexpr (_Np <= 16)
4909 return _kxor_mask16(__x._M_data, __y._M_data);
4910 else if constexpr (__have_avx512bw && _Np <= 32)
4911 return _kxor_mask32(__x._M_data, __y._M_data);
4912 else if constexpr (__have_avx512bw && _Np <= 64)
4913 return _kxor_mask64(__x._M_data, __y._M_data);
4914 else
4915 __assert_unreachable<_Tp>();
4916 }
4917 else
4918 return _Base::_S_bit_xor(__x, __y);
4919 }
4920
4921 //}}}2
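    // On AVX-512 ABIs the mask data is already a bitmask in a k-register, so
    // the operators above can map directly to k-instructions. A rough
    // correspondence (an assumption about code generation, not a guarantee):
    //
    //   __k1 && __k2, __k1 & __k2  ->  _kand_mask{8,16,32,64}
    //   __k1 || __k2, __k1 | __k2  ->  _kor_mask{8,16,32,64}
    //   __k1 ^ __k2                ->  _kxor_mask{8,16,32,64}
    //   !__k1                      ->  _kandn_mask* against the implicit mask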
4922 // _S_masked_assign{{{
4923 template <size_t _Np>
4924 _GLIBCXX_SIMD_INTRINSIC static void
4925 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4926 _SimdWrapper<bool, _Np>& __lhs, _SimdWrapper<bool, _Np> __rhs)
4927 {
4928 __lhs._M_data
4929 = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
4930 }
4931
4932 template <size_t _Np>
4933 _GLIBCXX_SIMD_INTRINSIC static void
4934 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4935 _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
4936 {
4937 if (__rhs)
4938 __lhs._M_data = __k._M_data | __lhs._M_data;
4939 else
4940 __lhs._M_data = ~__k._M_data & __lhs._M_data;
4941 }
4942
4943 using _MaskImplBuiltin<_Abi>::_S_masked_assign;
4944
4945 //}}}
4946 // _S_all_of {{{
4947 template <typename _Tp>
4948 _GLIBCXX_SIMD_INTRINSIC static bool
4949 _S_all_of(simd_mask<_Tp, _Abi> __k)
4950 {
4951 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
4952 {
4953 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
4954 using _TI = __intrinsic_type_t<_Tp, _Np>;
4955 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
4956 if constexpr (__have_sse4_1)
4957 {
4958 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
4959 = _Abi::template _S_implicit_mask_intrin<_Tp>();
4960 return 0 != __testc(__a, __b);
4961 }
4962 else if constexpr (is_same_v<_Tp, float>)
4963 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
4964 == (1 << _Np) - 1;
4965 else if constexpr (is_same_v<_Tp, double>)
4966 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
4967 == (1 << _Np) - 1;
4968 else
4969 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
4970 == (1 << (_Np * sizeof(_Tp))) - 1;
4971 }
4972 else if constexpr (__is_avx512_abi<_Abi>())
4973 {
4974 constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
4975 const auto __kk = __k._M_data._M_data;
4976 if constexpr (sizeof(__kk) == 1)
4977 {
4978 if constexpr (__have_avx512dq)
4979 return _kortestc_mask8_u8(__kk, _Mask == 0xff
4980 ? __kk
4981 : __mmask8(~_Mask));
4982 else
4983 return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
4984 }
4985 else if constexpr (sizeof(__kk) == 2)
4986 return _kortestc_mask16_u8(__kk, _Mask == 0xffff
4987 ? __kk
4988 : __mmask16(~_Mask));
4989 else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
4990 return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
4991 ? __kk
4992 : __mmask32(~_Mask));
4993 else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
4994 return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
4995 ? __kk
4996 : __mmask64(~_Mask));
4997 else
4998 __assert_unreachable<_Tp>();
4999 }
5000 }
5001
5002 // }}}
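    // Frontend sketch (assuming all_of() dispatches to _S_all_of): with
    // SSE4.1 or newer the reduction is a single ptest/vptest against the
    // ABI's implicit mask; pre-SSE4.1 it falls back to movemask plus a bit
    // comparison.
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd_mask<float> __k(true);
    //   bool __r = stdx::all_of(__k);   // true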
5003 // _S_any_of {{{
5004 template <typename _Tp>
5005 _GLIBCXX_SIMD_INTRINSIC static bool
5006 _S_any_of(simd_mask<_Tp, _Abi> __k)
5007 {
5008 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5009 {
5010 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5011 using _TI = __intrinsic_type_t<_Tp, _Np>;
5012 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5013 if constexpr (__have_sse4_1)
5014 {
5015	      if constexpr (_Abi::template _S_is_partial<_Tp>
5016			      || sizeof(__k) < 16)
5017 {
5018 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5019 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5020 return 0 == __testz(__a, __b);
5021 }
5022 else
5023 return 0 == __testz(__a, __a);
5024 }
5025 else if constexpr (is_same_v<_Tp, float>)
5026 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
5027 else if constexpr (is_same_v<_Tp, double>)
5028 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
5029 else
5030 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
5031 != 0;
5032 }
5033 else if constexpr (__is_avx512_abi<_Abi>())
5034 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
5035 != 0;
5036 }
5037
5038 // }}}
5039 // _S_none_of {{{
5040 template <typename _Tp>
5041 _GLIBCXX_SIMD_INTRINSIC static bool
5042 _S_none_of(simd_mask<_Tp, _Abi> __k)
5043 {
5044 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5045 {
5046 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5047 using _TI = __intrinsic_type_t<_Tp, _Np>;
5048 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5049 if constexpr (__have_sse4_1)
5050 {
5051	      if constexpr (_Abi::template _S_is_partial<_Tp>
5052			      || sizeof(__k) < 16)
5053 {
5054 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5055 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5056 return 0 != __testz(__a, __b);
5057 }
5058 else
5059 return 0 != __testz(__a, __a);
5060 }
5061 else if constexpr (is_same_v<_Tp, float>)
5062 return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
5063 else if constexpr (is_same_v<_Tp, double>)
5064 return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
5065 else
5066 return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
5067 == 0;
5068 }
5069 else if constexpr (__is_avx512_abi<_Abi>())
5070 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
5071 == 0;
5072 }
5073
5074 // }}}
5075 // _S_some_of {{{
5076 template <typename _Tp>
5077 _GLIBCXX_SIMD_INTRINSIC static bool
5078 _S_some_of(simd_mask<_Tp, _Abi> __k)
5079 {
5080 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
5081 {
5082 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5083 using _TI = __intrinsic_type_t<_Tp, _Np>;
5084 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
5085 if constexpr (__have_sse4_1)
5086 {
5087 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
5088 = _Abi::template _S_implicit_mask_intrin<_Tp>();
5089 return 0 != __testnzc(__a, __b);
5090 }
5091 else if constexpr (is_same_v<_Tp, float>)
5092 {
5093 constexpr int __allbits = (1 << _Np) - 1;
5094 const auto __tmp = _mm_movemask_ps(__a) & __allbits;
5095 return __tmp > 0 && __tmp < __allbits;
5096 }
5097 else if constexpr (is_same_v<_Tp, double>)
5098 {
5099 constexpr int __allbits = (1 << _Np) - 1;
5100 const auto __tmp = _mm_movemask_pd(__a) & __allbits;
5101 return __tmp > 0 && __tmp < __allbits;
5102 }
5103 else
5104 {
5105 constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
5106 const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
5107 return __tmp > 0 && __tmp < __allbits;
5108 }
5109 }
5110 else if constexpr (__is_avx512_abi<_Abi>())
5111 return _S_any_of(__k) && !_S_all_of(__k);
5112 else
5113 __assert_unreachable<_Tp>();
5114 }
5115
5116 // }}}
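    // The reductions above (_S_any_of, _S_none_of, _S_some_of) follow the
    // same pattern: ptest where available, otherwise movemask restricted to
    // the bits the ABI covers. Frontend sketch (assuming the free functions
    // dispatch here):
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd_mask<int> __k(true);
    //   stdx::any_of(__k);    // true: at least one element is true
    //   stdx::none_of(__k);   // false: no element is true
    //   stdx::some_of(__k);   // false: needs both true and false elements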
5117 // _S_popcount {{{
5118 template <typename _Tp>
5119 _GLIBCXX_SIMD_INTRINSIC static int
5120 _S_popcount(simd_mask<_Tp, _Abi> __k)
5121 {
5122 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5123 const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
5124 if constexpr (__is_avx512_abi<_Abi>())
5125 {
5126 if constexpr (_Np > 32)
5127 return __builtin_popcountll(__kk);
5128 else
5129 return __builtin_popcount(__kk);
5130 }
5131 else
5132 {
5133 if constexpr (__have_popcnt)
5134 {
5135 int __bits
5136 = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
5137 const int __count = __builtin_popcount(__bits);
5138 return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
5139 }
5140 else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
5141 {
5142	      const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
5143	      return __mask - (__mask >> 1);
5144 }
5145 else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
5146 {
5147 auto __x = -(__lo128(__kk) + __hi128(__kk));
5148 return __x[0] + __x[1];
5149 }
5150 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
5151 {
5152 if constexpr (__have_sse2)
5153 {
5154 __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
5155 __x = _mm_add_epi32(
5156 __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5157 __x = _mm_add_epi32(
5158 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
5159 return -_mm_cvtsi128_si32(__x);
5160 }
5161 else
5162 return __builtin_popcount(
5163 _mm_movemask_ps(__auto_bitcast(__kk)));
5164 }
5165 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
5166 {
5167 auto __x = __to_intrin(__kk);
5168 __x = _mm_add_epi16(__x,
5169 _mm_shuffle_epi32(__x,
5170 _MM_SHUFFLE(0, 1, 2, 3)));
5171 __x = _mm_add_epi16(
5172 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5173 __x = _mm_add_epi16(
5174 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
5175 return -short(_mm_extract_epi16(__x, 0));
5176 }
5177 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
5178 {
5179 auto __x = __to_intrin(__kk);
5180 __x = _mm_add_epi8(__x,
5181 _mm_shuffle_epi32(__x,
5182 _MM_SHUFFLE(0, 1, 2, 3)));
5183 __x = _mm_add_epi8(__x,
5184 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2,
5185 3)));
5186 __x = _mm_add_epi8(__x,
5187 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0,
5188 1)));
5189 auto __y = -__vector_bitcast<_UChar>(__x);
5190 if constexpr (__have_sse4_1)
5191 return __y[0] + __y[1];
5192 else
5193 {
5194 unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
5195 return (__z & 0xff) + (__z >> 8);
5196 }
5197 }
5198 else if constexpr (sizeof(__kk) == 32)
5199 {
5200 // The following works only as long as the implementations above
5201 // use a summation
5202 using _I = __int_for_sizeof_t<_Tp>;
5203 const auto __as_int = __vector_bitcast<_I>(__kk);
5204	      return _MaskImplX86<simd_abi::__sse>::_S_popcount(
5205 simd_mask<_I, simd_abi::__sse>(__private_init,
5206 __lo128(__as_int)
5207 + __hi128(__as_int)));
5208 }
5209 else
5210 __assert_unreachable<_Tp>();
5211 }
5212 }
5213
5214 // }}}
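    // Sketch (assuming popcount() dispatches here): AVX-512 popcounts the
    // k-register directly; without POPCNT the 0/-1 elements are summed,
    // which is why each true element contributes exactly -1 in the branches
    // above.
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd_mask<short> __k(true);
    //   int __n = stdx::popcount(__k);   // == __k.size()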
5215 // _S_find_first_set {{{
5216 template <typename _Tp>
5217 _GLIBCXX_SIMD_INTRINSIC static int
5218 _S_find_first_set(simd_mask<_Tp, _Abi> __k)
5219 {
5220 if constexpr (__is_avx512_abi<_Abi>())
5221 return std::__countr_zero(__k._M_data._M_data);
5222 else
5223 return _Base::_S_find_first_set(__k);
5224 }
5225
5226 // }}}
5227 // _S_find_last_set {{{
5228 template <typename _Tp>
5229 _GLIBCXX_SIMD_INTRINSIC static int
5230 _S_find_last_set(simd_mask<_Tp, _Abi> __k)
5231 {
5232 if constexpr (__is_avx512_abi<_Abi>())
5233 return std::__bit_width(__k._M_data._M_data) - 1;
5234 else
5235 return _Base::_S_find_last_set(__k);
5236 }
5237
5238 // }}}
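    // Sketch (assuming find_first_set/find_last_set dispatch here): for
    // AVX-512 bitmasks these reduce to countr_zero and bit_width - 1 of the
    // k-register; other ABIs use the generic _Base implementation.
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd_mask<int> __k(true);
    //   int __lo = stdx::find_first_set(__k);   // 0
    //   int __hi = stdx::find_last_set(__k);    // __k.size() - 1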
5239 };
5240
5241// }}}
5242
5243_GLIBCXX_SIMD_END_NAMESPACE
5244#endif // __cplusplus >= 201703L
5245#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
5246
5247// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80