libstdc++
unicode.h
Go to the documentation of this file.
1// Unicode utilities -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/** @file include/bits/unicode.h
26 * This is an internal header file, included by other library headers.
27 * Do not attempt to use it directly. @headername{format}
28 */
29
30#ifndef _GLIBCXX_UNICODE_H
31#define _GLIBCXX_UNICODE_H 1
32
33#if __cplusplus >= 202002L
34#include <array>
35#include <bit>
36#include <cstdint>
37#include <bits/stl_algo.h>
38#include <bits/stl_iterator.h>
39#include <bits/ranges_base.h>
40
41namespace std _GLIBCXX_VISIBILITY(default)
42{
43_GLIBCXX_BEGIN_NAMESPACE_VERSION
44namespace __unicode
45{
// Report whether __c is a Unicode scalar value: a code point no greater
// than U+10FFFF that lies outside the surrogate block U+D800..U+DFFF.
constexpr bool
__is_scalar_value(char32_t __c)
{
  constexpr char32_t __first_surrogate = 0xD800;
  constexpr char32_t __last_surrogate = 0xDFFF;
  constexpr char32_t __max_code_point = 0x10FFFF;

  // Everything below the surrogate block is valid; this is the hot path.
  if (__c < __first_surrogate) [[likely]]
    return true;

  // Reject the surrogates themselves.
  if (__c <= __last_surrogate)
    return false;

  // Above the surrogates only the upper limit remains to be checked.
  return __c <= __max_code_point;
}
54
55 // A code point that can be encoded in a single code unit of type _CharT.
56 template<typename _CharT>
57 constexpr bool
58 __is_single_code_unit(char32_t __c)
59 {
60 if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
61 return __c < 0x7F; // ASCII character
62 else
63 return __c < __gnu_cxx::__int_traits<_CharT>::__max
64 && __is_scalar_value(__c);
65 }
66
67 // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
68
// Stateless function object whose call operator yields
// U+FFFD REPLACEMENT CHARACTER.  It is the default _ErrorHandler of
// _Utf_iterator.
struct _Repl
{
  constexpr char32_t
  operator()() const noexcept
  {
    constexpr char32_t __replacement_character = U'\uFFFD';
    return __replacement_character;
  }
};
75
76 struct _Null_sentinel_t
77 {
78 template<input_iterator _It>
79 requires default_initializable<iter_value_t<_It>>
80 && equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
81 friend constexpr auto
82 operator==(_It __it, _Null_sentinel_t)
83 { return *__it == iter_value_t<_It>{}; }
84 };
85
86 template<typename _FromFmt, typename _ToFmt,
87 input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
88 typename _ErrorHandler = _Repl>
89 requires convertible_to<iter_value_t<_Iter>, _FromFmt>
90 class _Utf_iterator
91 {
92 static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
93
94 public:
95 using value_type = _ToFmt;
96 using difference_type = iter_difference_t<_Iter>;
97 using reference = value_type;
98 using iterator_concept
100 bidirectional_iterator_tag>;
101
102 constexpr _Utf_iterator() = default;
103
104 constexpr
105 _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
106 requires bidirectional_iterator<_Iter>
107 : _M_first_and_curr{__first, __it}, _M_last(__last)
108 {
109 if (_M_curr() != _M_last)
110 _M_read();
111 else
112 _M_buf = {};
113 }
114
115 constexpr
116 _Utf_iterator(_Iter __it, _Sent __last)
117 requires (!bidirectional_iterator<_Iter>)
118 : _M_first_and_curr{__it}, _M_last(__last)
119 {
120 if (_M_curr() != _M_last)
121 _M_read();
122 else
123 _M_buf = {};
124 }
125
126 template<class _Iter2, class _Sent2>
127 requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
128 constexpr
129 _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
130 _ErrorHandler>& __other)
131 : _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
132 _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
133 _M_last(__other._M_last)
134 { }
135
136 [[nodiscard]]
137 constexpr _Iter
138 begin() const requires bidirectional_iterator<_Iter>
139 { return _M_first(); }
140
141 [[nodiscard]]
142 constexpr _Sent
143 end() const { return _M_last; }
144
145 [[nodiscard]]
146 constexpr _Iter
147 base() const requires forward_iterator<_Iter>
148 { return _M_curr(); }
149
150 [[nodiscard]]
151 constexpr value_type
152 operator*() const { return _M_buf[_M_buf_index]; }
153
154 constexpr _Utf_iterator&
155 operator++()
156 {
157 if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last)
158 {
159 if constexpr (forward_iterator<_Iter>)
160 std::advance(_M_curr(), _M_to_increment);
161 if (_M_curr() == _M_last)
162 _M_buf_index = 0;
163 else
164 _M_read();
165 }
166 else if (_M_buf_index + 1 < _M_buf_last)
167 ++_M_buf_index;
168 return *this;
169 }
170
171 constexpr _Utf_iterator
172 operator++(int)
173 {
174 auto __tmp = *this;
175 ++*this;
176 return __tmp;
177 }
178
179 constexpr _Utf_iterator&
180 operator--() requires bidirectional_iterator<_Iter>
181 {
182 if (!_M_buf_index && _M_curr() != _M_first())
183 _M_read_reverse();
184 else if (_M_buf_index)
185 --_M_buf_index;
186 return *this;
187 }
188
189 constexpr _Utf_iterator
190 operator--(int)
191 {
192 auto __tmp = *this;
193 --*this;
194 return __tmp;
195 }
196
197 [[nodiscard]]
198 friend constexpr bool
199 operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
200 requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
201 {
202 if constexpr (forward_iterator<_Iter>)
203 return __lhs._M_curr() == __rhs._M_curr()
204 && __lhs._M_buf_index == __rhs._M_buf_index;
205 else if (__lhs._M_curr() != __rhs._M_curr())
206 return false;
207 else if (__lhs._M_buf_index == __rhs._M_buf_index
208 && __lhs._M_buf_last == __rhs._M_buf_last)
209 return true;
210 else
211 return __lhs._M_buf_index == __lhs._M_buf_last
212 && __rhs._M_buf_index == __rhs._M_buf_last;
213 }
214
215 [[nodiscard]]
216 friend constexpr bool
217 operator==(_Utf_iterator __lhs, _Sent __rhs)
218 {
219 if constexpr (forward_iterator<_Iter>)
220 return __lhs._M_curr() == __rhs;
221 else
222 return __lhs._M_curr() == __rhs
223 && __lhs._M_buf_index == __lhs._M_buf_last;
224 }
225
226 private:
227 constexpr void
228 _M_read()
229 {
230 if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
231 _M_read_utf8();
232 else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
233 _M_read_utf16();
234 else
235 {
236 static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
237 _M_read_utf32();
238 }
239 }
240
241 constexpr void
242 _M_read_reverse(); // TODO
243
244 template<typename>
245 struct _Guard
246 {
247 _Guard(void*, _Iter&) { }
248 };
249
250 template<typename _It> requires forward_iterator<_It>
251 struct _Guard<_It>
252 {
253 constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
254 _Utf_iterator* _M_this;
255 _It _M_orig;
256 };
257
258 constexpr void
259 _M_read_utf8()
260 {
261 _Guard<_Iter> __g{this, _M_curr()};
262 char32_t __c{};
263 uint8_t __u = *_M_curr()++;
264 const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
265 uint8_t __to_incr = 1;
266
267 if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
268 __c = __u;
269 else if (__u < 0xC2) [[unlikely]]
270 __c = _S_error();
271 else if (_M_curr() == _M_last) [[unlikely]]
272 __c = _S_error();
273 else if (__u <= 0xDF) // 0xC2 to 0xDF
274 {
275 __c = __u & 0x1F;
276 __u = *_M_curr();
277
278 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
279 __c = _S_error();
280 else
281 {
282 __c = (__c << 6) | (__u & 0x3F);
283 ++_M_curr();
284 ++__to_incr;
285 }
286 }
287 else if (__u <= 0xEF) // 0xE0 to 0xEF
288 {
289 const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
290 const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;
291
292 __c = __u & 0x0F;
293 __u = *_M_curr();
294
295 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
296 __c = _S_error();
297 else if (++_M_curr() == _M_last) [[unlikely]]
298 __c = _S_error();
299 else
300 {
301 ++__to_incr;
302 __c = (__c << 6) | (__u & 0x3F);
303 __u = *_M_curr();
304
305 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
306 __c = _S_error();
307 else
308 {
309 __c = (__c << 6) | (__u & 0x3F);
310 ++_M_curr();
311 ++__to_incr;
312 }
313 }
314 }
315 else if (__u <= 0xF4) // 0xF0 to 0xF4
316 {
317 const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
318 const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;
319
320 __c = __u & 0x07;
321 __u = *_M_curr();
322
323 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
324 __c = _S_error();
325 else if (++_M_curr() == _M_last) [[unlikely]]
326 __c = _S_error();
327 else
328 {
329 ++__to_incr;
330 __c = (__c << 6) | (__u & 0x3F);
331 __u = *_M_curr();
332
333 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
334 __c = _S_error();
335 else if (++_M_curr() == _M_last) [[unlikely]]
336 __c = _S_error();
337 else
338 {
339 ++__to_incr;
340 __c = (__c << 6) | (__u & 0x3F);
341 __u = *_M_curr();
342
343 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
344 __c = _S_error();
345 else
346 {
347 __c = (__c << 6) | (__u & 0x3F);
348 ++_M_curr();
349 ++__to_incr;
350 }
351 }
352 }
353 }
354 else [[unlikely]]
355 __c = _S_error();
356
357 _M_update(__c, __to_incr);
358 }
359
360 constexpr void
361 _M_read_utf16()
362 {
363 _Guard<_Iter> __g{this, _M_curr()};
364 char32_t __c{};
365 uint16_t __u = *_M_curr()++;
366 uint8_t __to_incr = 1;
367
368 if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
369 __c = __u;
370 else if (__u < 0xDC00 && _M_curr() != _M_last)
371 {
372 uint16_t __u2 = *_M_curr();
373 if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
374 __c = _S_error();
375 else
376 {
377 ++_M_curr();
378 __to_incr = 2;
379 uint32_t __x = (__u & 0x3F) << 10 | __u2 & 0x3FF;
380 uint32_t __w = (__u >> 6) & 0x1F;
381 __c = (__w + 1) << 16 | __x;
382 }
383 }
384 else
385 __c = _S_error();
386
387 _M_update(__c, __to_incr);
388 }
389
390 constexpr void
391 _M_read_utf32()
392 {
393 _Guard<_Iter> __g{this, _M_curr()};
394 char32_t __c = *_M_curr()++;
395 if (!__is_scalar_value(__c)) [[unlikely]]
396 __c = _S_error();
397 _M_update(__c, 1);
398 }
399
400 // Encode the code point __c as one or more code units in _M_buf.
401 constexpr void
402 _M_update(char32_t __c, uint8_t __to_incr)
403 {
404 _M_to_increment = __to_incr;
405 _M_buf_index = 0;
406 if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
407 {
408 _M_buf[0] = __c;
409 _M_buf_last = 1;
410 }
411 else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
412 {
413 if (__is_single_code_unit<_ToFmt>(__c))
414 {
415 _M_buf[0] = __c;
416 _M_buf[1] = 0;
417 _M_buf_last = 1;
418 }
419 else
420 {
421 // From http://www.unicode.org/faq/utf_bom.html#utf16-4
422 const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
423 char16_t __lead = __lead_offset + (__c >> 10);
424 char16_t __trail = 0xDC00 + (__c & 0x3FF);
425 _M_buf[0] = __lead;
426 _M_buf[1] = __trail;
427 _M_buf_last = 2;
428 }
429 }
430 else
431 {
432 static_assert(sizeof(_ToFmt) == 1);
433 int __bits = std::bit_width((uint32_t)__c);
434 if (__bits <= 7) [[likely]]
435 {
436 _M_buf[0] = __c;
437 _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
438 _M_buf_last = 1;
439 }
440 else if (__bits <= 11)
441 {
442 _M_buf[0] = 0xC0 | (__c >> 6);
443 _M_buf[1] = 0x80 | (__c & 0x3F);
444 _M_buf[2] = _M_buf[3] = 0;
445 _M_buf_last = 2;
446 }
447 else if (__bits <= 16)
448 {
449 _M_buf[0] = 0xE0 | (__c >> 12);
450 _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
451 _M_buf[2] = 0x80 | (__c & 0x3F);
452 _M_buf[3] = 0;
453 _M_buf_last = 3;
454 }
455 else
456 {
457 _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
458 _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
459 _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
460 _M_buf[3] = 0x80 | (__c & 0x3F);
461 _M_buf_last = 4;
462 }
463 }
464 }
465
466 constexpr char32_t
467 _S_error()
468 {
469 char32_t __c = _ErrorHandler()();
470 __glibcxx_assert(__is_scalar_value(__c));
471 return __c;
472 }
473
474 constexpr _Iter
475 _M_first() const requires bidirectional_iterator<_Iter>
476 { return _M_first_and_curr._M_first; }
477
478 constexpr _Iter&
479 _M_curr() { return _M_first_and_curr._M_curr; }
480
481 constexpr _Iter
482 _M_curr() const { return _M_first_and_curr._M_curr; }
483
484 array<value_type, 4 / sizeof(_ToFmt)> _M_buf;
485
486 template<typename _It>
487 struct _First_and_curr
488 {
489 _First_and_curr() = default;
490
491 constexpr
492 _First_and_curr(_It __curr) : _M_curr(__curr) { }
493
494 template<convertible_to<_It> _It2>
495 constexpr
496 _First_and_curr(const _First_and_curr<_It2>& __other)
497 : _M_curr(__other._M_curr) { }
498
499 _It _M_curr;
500 };
501
502 template<typename _It> requires bidirectional_iterator<_It>
503 struct _First_and_curr<_It>
504 {
505 _First_and_curr() = default;
506
507 constexpr
508 _First_and_curr(_It __first, _It __curr)
509 : _M_first(__first), _M_curr(__curr) { }
510
511 template<convertible_to<_It> _It2>
512 constexpr
513 _First_and_curr(const _First_and_curr<_It2>& __other)
514 : _M_first(__other._M_first), _M_curr(__other._M_curr) { }
515
516 _It _M_first;
517 _It _M_curr;
518 };
519
520 _First_and_curr<_Iter> _M_first_and_curr;
521
522 uint8_t _M_buf_index = 0;
523 uint8_t _M_buf_last = 0;
524 uint8_t _M_to_increment = 0;
525
526 [[no_unique_address]] _Sent _M_last;
527
528 template<typename _FromFmt2, typename _ToFmt2,
529 input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
530 typename _ErrHandler>
531 requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
532 friend class _Utf_iterator;
533 };
534
535 template<typename _ToFormat, ranges::input_range _Range>
536 class _Utf_view
537 : public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
538 {
539 using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
540 _ToFormat, ranges::iterator_t<_Range>,
541 ranges::sentinel_t<_Range>>;
542
543 template<typename _Iter, typename _Sent>
544 constexpr auto
545 _M_begin(_Iter __first, _Sent __last)
546 {
547 if constexpr (bidirectional_iterator<_Iter>)
548 return _Iterator(__first, __first, __last);
549 else
550 return _Iterator(__first, __last);
551 }
552
553 template<typename _Iter, typename _Sent>
554 constexpr auto
555 _M_end(_Iter __first, _Sent __last)
556 {
557 if constexpr (!is_same_v<_Iter, _Sent>)
558 return __last;
559 else if constexpr (bidirectional_iterator<_Iter>)
560 return _Iterator(__first, __last, __last);
561 else
562 return _Iterator(__last, __last);
563 }
564
565 _Range _M_base;
566
567 public:
568 constexpr explicit
569 _Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }
570
571 constexpr auto begin()
572 { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
573
574 constexpr auto end()
575 { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
576
577 constexpr bool empty() const { return ranges::empty(_M_base); }
578 };
579
580 template<typename _View>
581 using _Utf8_view = _Utf_view<char8_t, _View>;
582 template<typename _View>
583 using _Utf16_view = _Utf_view<char16_t, _View>;
584 template<typename _View>
585 using _Utf32_view = _Utf_view<char32_t, _View>;
586
587inline namespace __v15_1_0
588{
589#define _GLIBCXX_GET_UNICODE_DATA 150100
590#include "unicode-data.h"
591#ifdef _GLIBCXX_GET_UNICODE_DATA
592# error "Invalid unicode data"
593#endif
594
595 // The field width of a code point.
596 constexpr int
597 __field_width(char32_t __c) noexcept
598 {
599 if (__c < __width_edges[0]) [[likely]]
600 return 1;
601
602 auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
603 return (__p - __width_edges) % 2 + 1;
604 }
605
606 // @pre c <= 0x10FFFF
607 constexpr _Gcb_property
608 __grapheme_cluster_break_property(char32_t __c) noexcept
609 {
610 constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
611 auto* __end = std::end(__gcb_edges);
612 auto* __p = std::lower_bound(__gcb_edges, __end,
613 (__c << __gcb_shift_bits) | __mask);
614 return _Gcb_property(__p[-1] & __mask);
615 }
616
617 constexpr bool
618 __is_incb_linker(char32_t __c) noexcept
619 {
620 const auto __end = std::end(__incb_linkers);
621 // Array is small enough that linear search is faster than binary search.
622 return std::find(__incb_linkers, __end, __c) != __end;
623 }
624
625 // @pre c <= 0x10FFFF
626 constexpr _InCB
627 __incb_property(char32_t __c) noexcept
628 {
629 if ((__c << 2) < __incb_edges[0]) [[likely]]
630 return _InCB(0);
631
632 constexpr uint32_t __mask = 0x3;
633 auto* __end = std::end(__incb_edges);
634 auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
635 return _InCB(__p[-1] & __mask);
636 }
637
638 constexpr bool
639 __is_extended_pictographic(char32_t __c)
640 {
641 if (__c < __xpicto_edges[0]) [[likely]]
642 return 0;
643
644 auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
645 return (__p - __xpicto_edges) % 2;
646 }
647
648 struct _Grapheme_cluster_iterator_base
649 {
650 char32_t _M_c; // First code point in the cluster.
651 _Gcb_property _M_prop; // GCB property of _M_c.
652 enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
653 _XPicto _M_xpicto_seq_state = _XPicto::_Init;
654 unsigned char _M_RI_count = 0;
655 bool _M_incb_linker_seen = false;
656
657 constexpr void
658 _M_reset(char32_t __c, _Gcb_property __p)
659 {
660 _M_c = __c;
661 _M_prop = __p;
662 _M_xpicto_seq_state = _XPicto::_Init;
663 _M_RI_count = 0;
664 _M_incb_linker_seen = false;
665 }
666
667 constexpr void
668 _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
669 {
670 if (_M_xpicto_seq_state == _XPicto::_Failed)
671 return;
672
673 auto __next_state = _XPicto::_Failed;
674 if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
675 {
676 if (__p == _Gcb_property::_Gcb_ZWJ)
677 {
678 if (_M_xpicto_seq_state == _XPicto::_Matched)
679 __next_state = _XPicto::_Zwj;
680 // We check _M_c here so that we do the lookup at most once,
681 // and only for clusters containing at least one ZWJ.
682 else if (__is_extended_pictographic(_M_c))
683 __next_state = _XPicto::_Zwj;
684 }
685 else if (__p == _Gcb_property::_Gcb_Extend)
686 __next_state = _M_xpicto_seq_state; // no change
687 }
688 else // Zwj
689 {
690 // This assumes that all \p{Extended_Pictographic} emoji have
691 // Grapheme_Cluster_Break=Other.
692 if (__p == _Gcb_property::_Gcb_Other
693 && __is_extended_pictographic(__c))
694 __next_state = _XPicto::_Matched;
695 }
696 _M_xpicto_seq_state = __next_state;
697 }
698
699 constexpr void
700 _M_update_ri_count(_Gcb_property __p)
701 {
702 if (__p == _Gcb_property::_Gcb_Regional_Indicator)
703 ++_M_RI_count;
704 else
705 _M_RI_count = 0;
706 }
707
708 constexpr void
709 _M_update_incb_state(char32_t __c, _Gcb_property)
710 {
711 if (__is_incb_linker(__c))
712 _M_incb_linker_seen = true;
713 }
714 };
715
716 // Split a range into extended grapheme clusters.
717 template<ranges::forward_range _View>
718 class _Grapheme_cluster_view
719 : public ranges::view_interface<_Grapheme_cluster_view<_View>>
720 {
721 public:
722
723 constexpr
724 _Grapheme_cluster_view(const _View& __v)
725 : _M_begin(_Utf32_view(__v).begin())
726 { }
727
728 constexpr auto begin() const { return _M_begin; }
729 constexpr auto end() const { return _M_begin.end(); }
730
731 private:
732 struct _Iterator : private _Grapheme_cluster_iterator_base
733 {
734 private:
735 // Iterator over the underlying code points.
736 using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
737
738 public:
739 // TODO: Change value_type to be subrange<_U32_iterator> instead?
740 // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
741 // That would be the whole cluster, not just the first code point.
742 // Would need to store two iterators and find end of current cluster
743 // on increment, so operator* returns value_type(_M_base, _M_next).
744 using value_type = char32_t;
745 using iterator_concept = forward_iterator_tag;
746 using difference_type = ptrdiff_t;
747
748 constexpr
749 _Iterator(_U32_iterator __i)
750 : _M_base(__i)
751 {
752 if (__i != __i.end())
753 {
754 _M_c = *__i;
755 _M_prop = __grapheme_cluster_break_property(_M_c);
756 }
757 }
758
759 // The first code point of the current extended grapheme cluster.
760 constexpr value_type
761 operator*() const
762 { return _M_c; }
763
764 constexpr auto
765 operator->() const
766 { return &_M_c; }
767
768 // Move to the next extended grapheme cluster.
769 constexpr _Iterator&
770 operator++()
771 {
772 const auto __end = _M_base.end();
773 if (_M_base != __end)
774 {
775 auto __p_prev = _M_prop;
776 auto __it = _M_base;
777 while (++__it != __end)
778 {
779 char32_t __c = *__it;
780 auto __p = __grapheme_cluster_break_property(*__it);
781 _M_update_xpicto_seq_state(__c, __p);
782 _M_update_ri_count(__p);
783 _M_update_incb_state(__c, __p);
784 if (_M_is_break(__p_prev, __p, __it))
785 {
786 // Found a grapheme cluster break
787 _M_reset(__c, __p);
788 break;
789 }
790 __p_prev = __p;
791 }
792 _M_base = __it;
793 }
794 return *this;
795 }
796
797 constexpr _Iterator
798 operator++(int)
799 {
800 auto __tmp = *this;
801 ++this;
802 return __tmp;
803 }
804
805 constexpr bool
806 operator==(const _Iterator& __i) const
807 { return _M_base == __i._M_base; }
808
809 // This supports iter != iter.end()
810 constexpr bool
811 operator==(const ranges::sentinel_t<_View>& __i) const
812 { return _M_base == __i; }
813
814 // Iterator to the start of the current cluster.
815 constexpr auto base() const { return _M_base.base(); }
816
817 // The end of the underlying view (not the end of the current cluster!)
818 constexpr auto end() const { return _M_base.end(); }
819
820 // Field width of the first code point in the cluster.
821 constexpr int
822 width() const noexcept
823 { return __field_width(_M_c); }
824
825 private:
826 _U32_iterator _M_base;
827
828 // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
829 // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
830 // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
831 // Return true if there is a break between code point with property p1
832 // and code point with property p2.
833 constexpr bool
834 _M_is_break(_Gcb_property __p1, _Gcb_property __p2,
835 _U32_iterator __curr) const
836 {
837 using enum _Gcb_property;
838
839 if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
840 return true; // Break after Control or LF.
841
842 if (__p1 == _Gcb_CR)
843 return __p2 != _Gcb_LF; // Do not break between a CR and LF.
844
845 // Rule GB5
846 if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
847 return true; // Break before Control, CR or LF.
848
849 // Rule GB6
850 if (__p1 == _Gcb_L)
851 switch (__p2)
852 {
853 case _Gcb_L:
854 case _Gcb_V:
855 case _Gcb_LV:
856 case _Gcb_LVT:
857 return false; // Do not break Hangul syllable sequences.
858 default:
859 return true;
860 }
861
862 // Rule GB7
863 if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
864 switch (__p2)
865 {
866 case _Gcb_V:
867 case _Gcb_T:
868 return false; // Do not break Hangul syllable sequences.
869 default:
870 return true;
871 }
872
873 // Rule GB8
874 if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
875 return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
876
877 // Rule GB9
878 if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
879 return false; // Do not break before extending characters or ZWJ.
880
881 // The following GB9x rules only apply to extended grapheme clusters,
882 // which is what the C++ standard uses (not legacy grapheme clusters).
883
884 // Rule GB9a
885 if (__p2 == _Gcb_SpacingMark)
886 return false; // Do not break before SpacingMarks,
887 // Rule GB9b
888 if (__p1 == _Gcb_Prepend)
889 return false; // or after Prepend characters.
890
891 // Rule GB9c (Unicode 15.1.0)
892 // Do not break within certain combinations with
893 // Indic_Conjunct_Break (InCB)=Linker.
894 if (_M_incb_linker_seen
895 && __incb_property(_M_c) == _InCB::_Consonant
896 && __incb_property(*__curr) == _InCB::_Consonant)
897 {
898 // Match [_M_base, __curr] against regular expression
899 // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
900 bool __have_linker = false;
901 auto __it = _M_base;
902 while (++__it != __curr)
903 {
904 if (__is_incb_linker(*__it))
905 __have_linker = true;
906 else
907 {
908 auto __incb = __incb_property(*__it);
909 if (__incb == _InCB::_Consonant)
910 __have_linker = false;
911 else if (__incb != _InCB::_Extend)
912 break;
913 }
914 }
915 if (__it == __curr && __have_linker)
916 return false;
917 }
918
919 // Rule GB11
920 // Do not break within emoji modifier sequences
921 // or emoji zwj sequences.
922 if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
923 return false;
924
925 // Rules GB12 and GB13
926 // Do not break within emoji flag sequences. That is, do not break
927 // between regional indicator (RI) symbols if there is an odd number
928 // of RI characters before the break point.
929 if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
930 return (_M_RI_count & 1) == 0;
931
932 // Rule GB999
933 return true; // Otherwise, break everywhere.
934 }
935 };
936
937 _Iterator _M_begin;
938 };
939
940} // namespace __v15_1_0
941
942 // Return the field width of a string.
943 template<typename _CharT>
944 constexpr size_t
945 __field_width(basic_string_view<_CharT> __s)
946 {
947 if (__s.empty()) [[unlikely]]
948 return 0;
949 _Grapheme_cluster_view __gc(__s);
950 auto __it = __gc.begin();
951 const auto __end = __gc.end();
952 size_t __n = __it.width();
953 while (++__it != __end)
954 __n += __it.width();
955 return __n;
956 }
957
958 // Truncate a string to at most `__max` field width units, and return the
959 // resulting field width.
960 template<typename _CharT>
961 constexpr size_t
962 __truncate(basic_string_view<_CharT>& __s, size_t __max)
963 {
964 if (__s.empty()) [[unlikely]]
965 return 0;
966
967 _Grapheme_cluster_view __gc(__s);
968 auto __it = __gc.begin();
969 const auto __end = __gc.end();
970 size_t __n = __it.width();
971 if (__n > __max)
972 {
973 __s = {};
974 return 0;
975 }
976 while (++__it != __end)
977 {
978 size_t __n2 = __n + __it.width();
979 if (__n2 > __max)
980 {
981 __s = basic_string_view<_CharT>(__s.begin(), __it.base());
982 return __n;
983 }
984 __n = __n2;
985 }
986 return __n;
987 }
988
989 template<typename _CharT>
990 consteval bool
991 __literal_encoding_is_unicode()
992 {
993 if constexpr (is_same_v<_CharT, char8_t>)
994 return true;
995 else if constexpr (is_same_v<_CharT, char16_t>)
996 return true;
997 else if constexpr (is_same_v<_CharT, char32_t>)
998 return true;
999
1000 const char* __enc = "";
1001
1002#ifdef __GNUC_EXECUTION_CHARSET_NAME
1003 auto __remove_iso10646_prefix = [](const char* __s) {
1004 // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1005 if (__s[0] == 'I' || __s[0] == 'i')
1006 if (__s[1] == 'S' || __s[1] == 's')
1007 if (__s[2] == 'O' || __s[2] == 'o')
1008 if (string_view(__s + 3).starts_with("-10646/"))
1009 return __s + 10;
1010 return __s;
1011 };
1012
1013 if constexpr (is_same_v<_CharT, char>)
1014 __enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
1015# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1016 else
1017 __enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
1018# endif
1019
1020 if ((__enc[0] == 'U' || __enc[0] == 'u')
1021 && (__enc[1] == 'T' || __enc[1] == 't')
1022 && (__enc[2] == 'F' || __enc[2] == 'f'))
1023 {
1024 __enc += 3;
1025 if (__enc[0] == '-')
1026 ++__enc;
1027 if (__enc[0] == '8')
1028 return __enc[1] == '\0' || string_view(__enc + 1) == "//";
1029 else if constexpr (!is_same_v<_CharT, char>)
1030 {
1031 string_view __s(__enc);
1032 if (__s.ends_with("//"))
1033 __s.remove_suffix(2);
1034 return __s == "16" || __s == "32";
1035 }
1036 }
1037#elif defined __clang_literal_encoding__
1038 if constexpr (is_same_v<_CharT, char>)
1039 __enc = __clang_literal_encoding__;
1040# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1041 else
1042 __enc = __clang_wide_literal_encoding__;
1043# endif
1044 // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1045 string_view __s(__enc);
1046 if (__s == "UTF-8")
1047 return true;
1048 else if constexpr (!is_same_v<_CharT, char>)
1049 return __s == "UTF-16" || __s == "UTF-32";
1050#endif
1051
1052 return false;
1053 }
1054
1055 consteval bool
1056 __literal_encoding_is_utf8()
1057 { return __literal_encoding_is_unicode<char>(); }
1058
1059} // namespace __unicode
1060
1061_GLIBCXX_END_NAMESPACE_VERSION
1062} // namespace std
1063#endif // C++20
1064#endif // _GLIBCXX_UNICODE_H
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition move.h:126
constexpr _Tp && forward(typename std::remove_reference< _Tp >::type &__t) noexcept
Forward an lvalue.
Definition move.h:70
_Tp * end(valarray< _Tp > &__va) noexcept
Return an iterator pointing to one past the last element of the valarray.
Definition valarray:1243
ISO C++ entities toplevel namespace is std.
constexpr void advance(_InputIterator &__i, _Distance __n)
A generalization of pointer arithmetic.
__numeric_traits_integer< _Tp > __int_traits
Convenience alias for __numeric_traits<integer-type>.