// libstdc++
// simd_detail.h
// Internal macros for the simd implementation -*- C++ -*-

// Copyright (C) 2020-2023 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_
26#define _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_
27
28#if __cplusplus >= 201703L
29
30#include <cstddef>
31#include <cstdint>
32
/// @cond undocumented
34
// Opens the namespaces std, experimental and the inline namespace
// parallelism_v2, in that order. _GLIBCXX_VISIBILITY and
// _GLIBCXX_BEGIN_NAMESPACE_VERSION are not defined in this file; they are
// expected to be provided by whatever includes this header.
// Every use must be paired with _GLIBCXX_SIMD_END_NAMESPACE.
#define _GLIBCXX_SIMD_BEGIN_NAMESPACE \
  namespace std _GLIBCXX_VISIBILITY(default) \
  { \
    _GLIBCXX_BEGIN_NAMESPACE_VERSION \
    namespace experimental { \
    inline namespace parallelism_v2 {
// Closes what _GLIBCXX_SIMD_BEGIN_NAMESPACE opened: parallelism_v2 and
// experimental first, then _GLIBCXX_END_NAMESPACE_VERSION, then std.
#define _GLIBCXX_SIMD_END_NAMESPACE \
  } \
  } \
  _GLIBCXX_END_NAMESPACE_VERSION \
  }
46
// ISA extension detection. The following defines all the
// _GLIBCXX_SIMD_HAVE_XXX macros.
// ARM{{{
// Every macro in this fold is always defined, to either 1 or 0, so it can be
// used directly in #if expressions.

// 1 if the compiler predefines __ARM_NEON.
#if defined __ARM_NEON
#define _GLIBCXX_SIMD_HAVE_NEON 1
#else
#define _GLIBCXX_SIMD_HAVE_NEON 0
#endif
// 1 if __ARM_NEON is defined and the target is either __ARM_ARCH >= 8 or
// AArch64 (__aarch64__ defined).
#if defined __ARM_NEON && (__ARM_ARCH >= 8 || defined __aarch64__)
#define _GLIBCXX_SIMD_HAVE_NEON_A32 1
#else
#define _GLIBCXX_SIMD_HAVE_NEON_A32 0
#endif
// 1 if __ARM_NEON is defined and the target is AArch64. Whenever this is 1,
// _GLIBCXX_SIMD_HAVE_NEON_A32 is 1 as well, because __aarch64__ also satisfies
// the condition above.
#if defined __ARM_NEON && defined __aarch64__
#define _GLIBCXX_SIMD_HAVE_NEON_A64 1
#else
#define _GLIBCXX_SIMD_HAVE_NEON_A64 0
#endif
//}}}
// x86{{{
// Each _GLIBCXX_SIMD_HAVE_<EXT> macro in this fold is always defined: 1 if the
// compiler predefines the corresponding feature macro, 0 otherwise. That makes
// them safe to combine in #if expressions.
#ifdef __MMX__
#define _GLIBCXX_SIMD_HAVE_MMX 1
#else
#define _GLIBCXX_SIMD_HAVE_MMX 0
#endif
// SSE and SSE2 are also reported as available whenever __x86_64__ is defined,
// even if __SSE__ / __SSE2__ are not.
#if defined __SSE__ || defined __x86_64__
#define _GLIBCXX_SIMD_HAVE_SSE 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE 0
#endif
#if defined __SSE2__ || defined __x86_64__
#define _GLIBCXX_SIMD_HAVE_SSE2 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE2 0
#endif
#ifdef __SSE3__
#define _GLIBCXX_SIMD_HAVE_SSE3 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE3 0
#endif
#ifdef __SSSE3__
#define _GLIBCXX_SIMD_HAVE_SSSE3 1
#else
#define _GLIBCXX_SIMD_HAVE_SSSE3 0
#endif
#ifdef __SSE4_1__
#define _GLIBCXX_SIMD_HAVE_SSE4_1 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE4_1 0
#endif
#ifdef __SSE4_2__
#define _GLIBCXX_SIMD_HAVE_SSE4_2 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE4_2 0
#endif
#ifdef __XOP__
#define _GLIBCXX_SIMD_HAVE_XOP 1
#else
#define _GLIBCXX_SIMD_HAVE_XOP 0
#endif
#ifdef __AVX__
#define _GLIBCXX_SIMD_HAVE_AVX 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX 0
#endif
#ifdef __AVX2__
#define _GLIBCXX_SIMD_HAVE_AVX2 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX2 0
#endif
// Note the name difference: the compiler macro is __BMI__, the library macro
// is ..._HAVE_BMI1.
#ifdef __BMI__
#define _GLIBCXX_SIMD_HAVE_BMI1 1
#else
#define _GLIBCXX_SIMD_HAVE_BMI1 0
#endif
#ifdef __BMI2__
#define _GLIBCXX_SIMD_HAVE_BMI2 1
#else
#define _GLIBCXX_SIMD_HAVE_BMI2 0
#endif
#ifdef __LZCNT__
#define _GLIBCXX_SIMD_HAVE_LZCNT 1
#else
#define _GLIBCXX_SIMD_HAVE_LZCNT 0
#endif
#ifdef __SSE4A__
#define _GLIBCXX_SIMD_HAVE_SSE4A 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE4A 0
#endif
#ifdef __FMA__
#define _GLIBCXX_SIMD_HAVE_FMA 1
#else
#define _GLIBCXX_SIMD_HAVE_FMA 0
#endif
#ifdef __FMA4__
#define _GLIBCXX_SIMD_HAVE_FMA4 1
#else
#define _GLIBCXX_SIMD_HAVE_FMA4 0
#endif
#ifdef __F16C__
#define _GLIBCXX_SIMD_HAVE_F16C 1
#else
#define _GLIBCXX_SIMD_HAVE_F16C 0
#endif
#ifdef __POPCNT__
#define _GLIBCXX_SIMD_HAVE_POPCNT 1
#else
#define _GLIBCXX_SIMD_HAVE_POPCNT 0
#endif
// AVX-512 foundation and its sub-extensions.
#ifdef __AVX512F__
#define _GLIBCXX_SIMD_HAVE_AVX512F 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512F 0
#endif
#ifdef __AVX512DQ__
#define _GLIBCXX_SIMD_HAVE_AVX512DQ 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512DQ 0
#endif
#ifdef __AVX512VL__
#define _GLIBCXX_SIMD_HAVE_AVX512VL 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512VL 0
#endif
#ifdef __AVX512BW__
#define _GLIBCXX_SIMD_HAVE_AVX512BW 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512BW 0
#endif
#ifdef __AVX512BITALG__
#define _GLIBCXX_SIMD_HAVE_AVX512BITALG 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512BITALG 0
#endif
#ifdef __AVX512VBMI2__
#define _GLIBCXX_SIMD_HAVE_AVX512VBMI2 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512VBMI2 0
#endif
#ifdef __AVX512VBMI__
#define _GLIBCXX_SIMD_HAVE_AVX512VBMI 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512VBMI 0
#endif
#ifdef __AVX512IFMA__
#define _GLIBCXX_SIMD_HAVE_AVX512IFMA 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512IFMA 0
#endif
#ifdef __AVX512CD__
#define _GLIBCXX_SIMD_HAVE_AVX512CD 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512CD 0
#endif
#ifdef __AVX512VNNI__
#define _GLIBCXX_SIMD_HAVE_AVX512VNNI 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512VNNI 0
#endif
#ifdef __AVX512VPOPCNTDQ__
#define _GLIBCXX_SIMD_HAVE_AVX512VPOPCNTDQ 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512VPOPCNTDQ 0
#endif
#ifdef __AVX512VP2INTERSECT__
#define _GLIBCXX_SIMD_HAVE_AVX512VP2INTERSECT 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512VP2INTERSECT 0
#endif

// ABI availability, derived from the feature macros above. For each register
// family the plain ..._ABI macro follows the base extension (SSE, AVX,
// AVX512F) and the ..._FULL_..._ABI macro follows the second one (SSE2, AVX2,
// AVX512BW).
#if _GLIBCXX_SIMD_HAVE_SSE
#define _GLIBCXX_SIMD_HAVE_SSE_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE_ABI 0
#endif
#if _GLIBCXX_SIMD_HAVE_SSE2
#define _GLIBCXX_SIMD_HAVE_FULL_SSE_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_FULL_SSE_ABI 0
#endif

#if _GLIBCXX_SIMD_HAVE_AVX
#define _GLIBCXX_SIMD_HAVE_AVX_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX_ABI 0
#endif
#if _GLIBCXX_SIMD_HAVE_AVX2
#define _GLIBCXX_SIMD_HAVE_FULL_AVX_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_FULL_AVX_ABI 0
#endif

#if _GLIBCXX_SIMD_HAVE_AVX512F
#define _GLIBCXX_SIMD_HAVE_AVX512_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512_ABI 0
#endif
#if _GLIBCXX_SIMD_HAVE_AVX512BW
#define _GLIBCXX_SIMD_HAVE_FULL_AVX512_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_FULL_AVX512_ABI 0
#endif

// Sanity check: refuse to compile for AMD64 if SSE2 was not detected.
#if defined __x86_64__ && !_GLIBCXX_SIMD_HAVE_SSE2
#error "Use of SSE2 is required on AMD64"
#endif
//}}}
254
// _GLIBCXX_SIMD_NORMAL_MATH: function attribute that expands to nothing under
// Clang and to a GNU optimize attribute selecting
// "finite-math-only,no-signed-zeros" on other compilers.
#ifdef __clang__
#define _GLIBCXX_SIMD_NORMAL_MATH
#else
#define _GLIBCXX_SIMD_NORMAL_MATH \
  [[__gnu__::__optimize__("finite-math-only,no-signed-zeros")]]
#endif
// Function attribute: never inline.
#define _GLIBCXX_SIMD_NEVER_INLINE [[__gnu__::__noinline__]]
// Function specifier: always_inline plus artificial, plus the inline keyword.
#define _GLIBCXX_SIMD_INTRINSIC \
  [[__gnu__::__always_inline__, __gnu__::__artificial__]] inline
// Function specifier: always_inline plus the inline keyword.
#define _GLIBCXX_SIMD_ALWAYS_INLINE [[__gnu__::__always_inline__]] inline
// Lambda variant: uses the __attribute__ spelling and carries no inline
// keyword.
#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__))
// Branch hints: both evaluate to __x unchanged and tell the compiler the
// expected value is 0 (unlikely) or 1 (likely).
#define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
#define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)
268
// constexpr selection, part 1: depends on strict ANSI mode.
// When __STRICT_ANSI__ is defined and non-zero, _GLIBCXX_SIMD_CONSTEXPR
// expands to nothing and _GLIBCXX_SIMD_USE_CONSTEXPR_API to const; otherwise
// both expand to constexpr.
#if defined __STRICT_ANSI__ && __STRICT_ANSI__
#define _GLIBCXX_SIMD_CONSTEXPR
#define _GLIBCXX_SIMD_USE_CONSTEXPR_API const
#else
#define _GLIBCXX_SIMD_CONSTEXPR constexpr
#define _GLIBCXX_SIMD_USE_CONSTEXPR_API constexpr
#endif

// constexpr selection, part 2: depends on the compiler.
// _GLIBCXX_SIMD_USE_CONSTEXPR is const under Clang and constexpr elsewhere.
#if defined __clang__
#define _GLIBCXX_SIMD_USE_CONSTEXPR const
#else
#define _GLIBCXX_SIMD_USE_CONSTEXPR constexpr
#endif
282
// Operator lists. Each _GLIBCXX_SIMD_LIST_* macro invokes the function-like
// macro passed as __macro once per operator token, in the order written:
//   BINARY:      |  &  ^
//   SHIFTS:      << >>
//   ARITHMETICS: +  -  *  /  %
#define _GLIBCXX_SIMD_LIST_BINARY(__macro) __macro(|) __macro(&) __macro(^)
#define _GLIBCXX_SIMD_LIST_SHIFTS(__macro) __macro(<<) __macro(>>)
#define _GLIBCXX_SIMD_LIST_ARITHMETICS(__macro) \
  __macro(+) __macro(-) __macro(*) __macro(/) __macro(%)

// Same lists followed by `static_assert(true)` with no semicolon, presumably
// so that an invocation can be written with a trailing semicolon.
#define _GLIBCXX_SIMD_ALL_BINARY(__macro) \
  _GLIBCXX_SIMD_LIST_BINARY(__macro) static_assert(true)
#define _GLIBCXX_SIMD_ALL_SHIFTS(__macro) \
  _GLIBCXX_SIMD_LIST_SHIFTS(__macro) static_assert(true)
#define _GLIBCXX_SIMD_ALL_ARITHMETICS(__macro) \
  _GLIBCXX_SIMD_LIST_ARITHMETICS(__macro) static_assert(true)
294
// Opt-out switch: if _GLIBCXX_SIMD_NO_ALWAYS_INLINE is defined, the three
// forced-inlining macros are redefined without their attributes.
// _GLIBCXX_SIMD_ALWAYS_INLINE and _GLIBCXX_SIMD_INTRINSIC become plain
// `inline`; _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA becomes empty.
#ifdef _GLIBCXX_SIMD_NO_ALWAYS_INLINE
#undef _GLIBCXX_SIMD_ALWAYS_INLINE
#define _GLIBCXX_SIMD_ALWAYS_INLINE inline
#undef _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
#undef _GLIBCXX_SIMD_INTRINSIC
#define _GLIBCXX_SIMD_INTRINSIC inline
#endif
303
// 1 if either _GLIBCXX_SIMD_HAVE_SSE or _GLIBCXX_SIMD_HAVE_MMX is non-zero,
// 0 otherwise. Always defined.
#if _GLIBCXX_SIMD_HAVE_SSE || _GLIBCXX_SIMD_HAVE_MMX
#define _GLIBCXX_SIMD_X86INTRIN 1
#else
#define _GLIBCXX_SIMD_X86INTRIN 0
#endif
309
// workaround macros {{{
// Macros guarded by `#if _GLIBCXX_SIMD_X86INTRIN` are defined to 1 only when
// that macro is non-zero and are left undefined otherwise; the remaining
// macros in this fold are always defined to 1.

// use aliasing loads to help GCC understand the data accesses better
// This also seems to hide a miscompilation on swap(x[i], x[i + 1]) with
// fixed_size_simd<float, 16> x.
#define _GLIBCXX_SIMD_USE_ALIASING_LOADS 1

// vector conversions on x86 not optimized:
#if _GLIBCXX_SIMD_X86INTRIN
#define _GLIBCXX_SIMD_WORKAROUND_PR85048 1
#endif

// integer division not optimized
#define _GLIBCXX_SIMD_WORKAROUND_PR90993 1

// very bad codegen for extraction and concatenation of 128/256 "subregisters"
// with sizeof(element type) < 8: https://godbolt.org/g/mqUsgM
#if _GLIBCXX_SIMD_X86INTRIN
#define _GLIBCXX_SIMD_WORKAROUND_XXX_1 1
#endif

// bad codegen for 8 Byte memcpy to __vector_type_t<char, 16>
#define _GLIBCXX_SIMD_WORKAROUND_PR90424 1

// bad codegen for zero-extend using simple concat(__x, 0)
#if _GLIBCXX_SIMD_X86INTRIN
#define _GLIBCXX_SIMD_WORKAROUND_XXX_3 1
#endif

// https://github.com/cplusplus/parallelism-ts/issues/65 (incorrect return type
// of static_simd_cast)
#define _GLIBCXX_SIMD_FIX_P2TS_ISSUE65 1

// https://github.com/cplusplus/parallelism-ts/issues/66 (incorrect SFINAE
// constraint on (static)_simd_cast)
#define _GLIBCXX_SIMD_FIX_P2TS_ISSUE66 1
// }}}
346
/// @endcond
348
349#endif // __cplusplus >= 201703L
350#endif // _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_
351
// vim: foldmethod=marker