libstdc++
text_encoding
Go to the documentation of this file.
1 // <text_encoding> -*- C++ -*-
2 
3 // Copyright The GNU Toolchain Authors.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /** @file include/text_encoding
26  * This is a Standard C++ Library header.
27  */
28 
29 #ifndef _GLIBCXX_TEXT_ENCODING
30 #define _GLIBCXX_TEXT_ENCODING
31 
32 #pragma GCC system_header
33 
34 #include <bits/requires_hosted.h>
35 
36 #define __glibcxx_want_text_encoding
37 #include <bits/version.h>
38 
39 #ifdef __cpp_lib_text_encoding
40 #include <compare>
41 #include <string_view>
42 #include <bits/functional_hash.h> // hash
43 #include <bits/ranges_util.h> // view_interface
44 #include <bits/unicode.h> // __charset_alias_match
45 #include <ext/numeric_traits.h> // __int_traits
46 
47 namespace std _GLIBCXX_VISIBILITY(default)
48 {
49 _GLIBCXX_BEGIN_NAMESPACE_VERSION
50 
51  /**
52  * @brief An interface for accessing the IANA Character Sets registry.
53  * @ingroup locales
54  * @since C++23
55  */
56  struct text_encoding
57  {
58  private:
59  struct _Rep
60  {
61  using id = __INT_LEAST32_TYPE__;
62  id _M_id;
63  const char* _M_name;
64 
65  friend constexpr bool
66  operator<(const _Rep& __r, id __m) noexcept
67  { return __r._M_id < __m; }
68 
69  friend constexpr bool
70  operator==(const _Rep& __r, string_view __name) noexcept
71  { return __r._M_name == __name; }
72  };
73 
74  public:
// Largest number of characters of an encoding name that is stored.  The
// _M_name buffer is declared with max_name_length + 1 elements and the
// constructors copy at most max_name_length characters into it.
75  static constexpr size_t max_name_length = 63;
76 
// Identifier of a character encoding, as returned by mib().  The underlying
// type is _Rep::id so that enumerator values can be compared directly with
// the identifiers stored in the _S_reps table.
// NOTE(review): the values are presumably the MIBenum numbers of the IANA
// Character Sets registry -- confirm against the registry.
// The values are not contiguous and the enumerators are not listed in
// strictly increasing order (see the notes below).
77  enum class id : _Rep::id
78  {
// The first two rows of _S_reps carry these two identifiers with empty names.
79  other = 1,
80  unknown = 2,
81  ASCII = 3,
82  ISOLatin1 = 4,
83  ISOLatin2 = 5,
84  ISOLatin3 = 6,
85  ISOLatin4 = 7,
86  ISOLatinCyrillic = 8,
87  ISOLatinArabic = 9,
88  ISOLatinGreek = 10,
89  ISOLatinHebrew = 11,
90  ISOLatin5 = 12,
91  ISOLatin6 = 13,
92  ISOTextComm = 14,
93  HalfWidthKatakana = 15,
94  JISEncoding = 16,
95  ShiftJIS = 17,
96  EUCPkdFmtJapanese = 18,
97  EUCFixWidJapanese = 19,
98  ISO4UnitedKingdom = 20,
99  ISO11SwedishForNames = 21,
100  ISO15Italian = 22,
101  ISO17Spanish = 23,
102  ISO21German = 24,
103  ISO60DanishNorwegian = 25,
104  ISO69French = 26,
105  ISO10646UTF1 = 27,
106  ISO646basic1983 = 28,
107  INVARIANT = 29,
108  ISO2IntlRefVersion = 30,
109  NATSSEFI = 31,
110  NATSSEFIADD = 32,
// Values 33 and 34 have no enumerator.
111  ISO10Swedish = 35,
112  KSC56011987 = 36,
113  ISO2022KR = 37,
114  EUCKR = 38,
115  ISO2022JP = 39,
116  ISO2022JP2 = 40,
117  ISO13JISC6220jp = 41,
118  ISO14JISC6220ro = 42,
119  ISO16Portuguese = 43,
120  ISO18Greek7Old = 44,
121  ISO19LatinGreek = 45,
122  ISO25French = 46,
123  ISO27LatinGreek1 = 47,
124  ISO5427Cyrillic = 48,
125  ISO42JISC62261978 = 49,
126  ISO47BSViewdata = 50,
127  ISO49INIS = 51,
128  ISO50INIS8 = 52,
129  ISO51INISCyrillic = 53,
130  ISO54271981 = 54,
131  ISO5428Greek = 55,
132  ISO57GB1988 = 56,
133  ISO58GB231280 = 57,
134  ISO61Norwegian2 = 58,
135  ISO70VideotexSupp1 = 59,
136  ISO84Portuguese2 = 60,
137  ISO85Spanish2 = 61,
138  ISO86Hungarian = 62,
139  ISO87JISX0208 = 63,
140  ISO88Greek7 = 64,
141  ISO89ASMO449 = 65,
142  ISO90 = 66,
143  ISO91JISC62291984a = 67,
144  ISO92JISC62991984b = 68,
145  ISO93JIS62291984badd = 69,
146  ISO94JIS62291984hand = 70,
147  ISO95JIS62291984handadd = 71,
148  ISO96JISC62291984kana = 72,
149  ISO2033 = 73,
150  ISO99NAPLPS = 74,
151  ISO102T617bit = 75,
152  ISO103T618bit = 76,
153  ISO111ECMACyrillic = 77,
154  ISO121Canadian1 = 78,
155  ISO122Canadian2 = 79,
156  ISO123CSAZ24341985gr = 80,
157  ISO88596E = 81,
158  ISO88596I = 82,
159  ISO128T101G2 = 83,
160  ISO88598E = 84,
161  ISO88598I = 85,
162  ISO139CSN369103 = 86,
163  ISO141JUSIB1002 = 87,
164  ISO143IECP271 = 88,
165  ISO146Serbian = 89,
166  ISO147Macedonian = 90,
167  ISO150 = 91,
168  ISO151Cuba = 92,
169  ISO6937Add = 93,
170  ISO153GOST1976874 = 94,
171  ISO8859Supp = 95,
172  ISO10367Box = 96,
173  ISO158Lap = 97,
174  ISO159JISX02121990 = 98,
175  ISO646Danish = 99,
176  USDK = 100,
177  DKUS = 101,
178  KSC5636 = 102,
179  Unicode11UTF7 = 103,
180  ISO2022CN = 104,
181  ISO2022CNEXT = 105,
182  UTF8 = 106,
// Values 107 and 108 have no enumerator.
183  ISO885913 = 109,
184  ISO885914 = 110,
185  ISO885915 = 111,
186  ISO885916 = 112,
187  GBK = 113,
188  GB18030 = 114,
189  OSDEBCDICDF0415 = 115,
190  OSDEBCDICDF03IRV = 116,
191  OSDEBCDICDF041 = 117,
192  ISO115481 = 118,
193  KZ1048 = 119,
// The next group of values starts at 1000.
194  UCS2 = 1000,
195  UCS4 = 1001,
196  UnicodeASCII = 1002,
197  UnicodeLatin1 = 1003,
198  UnicodeJapanese = 1004,
199  UnicodeIBM1261 = 1005,
200  UnicodeIBM1268 = 1006,
201  UnicodeIBM1276 = 1007,
202  UnicodeIBM1264 = 1008,
203  UnicodeIBM1265 = 1009,
204  Unicode11 = 1010,
205  SCSU = 1011,
206  UTF7 = 1012,
207  UTF16BE = 1013,
208  UTF16LE = 1014,
209  UTF16 = 1015,
210  CESU8 = 1016,
211  UTF32 = 1017,
212  UTF32BE = 1018,
213  UTF32LE = 1019,
214  BOCU1 = 1020,
215  UTF7IMAP = 1021,
// The next group of values starts at 2000.
216  Windows30Latin1 = 2000,
217  Windows31Latin1 = 2001,
218  Windows31Latin2 = 2002,
219  Windows31Latin5 = 2003,
220  HPRoman8 = 2004,
221  AdobeStandardEncoding = 2005,
222  VenturaUS = 2006,
223  VenturaInternational = 2007,
224  DECMCS = 2008,
225  PC850Multilingual = 2009,
// 2010 (PCp852) and 2011 (PC8CodePage437) are declared further down,
// out of numeric order.
226  PC8DanishNorwegian = 2012,
227  PC862LatinHebrew = 2013,
228  PC8Turkish = 2014,
229  IBMSymbols = 2015,
230  IBMThai = 2016,
231  HPLegal = 2017,
232  HPPiFont = 2018,
233  HPMath8 = 2019,
234  HPPSMath = 2020,
235  HPDesktop = 2021,
236  VenturaMath = 2022,
237  MicrosoftPublishing = 2023,
238  Windows31J = 2024,
239  GB2312 = 2025,
240  Big5 = 2026,
241  Macintosh = 2027,
242  IBM037 = 2028,
243  IBM038 = 2029,
244  IBM273 = 2030,
245  IBM274 = 2031,
246  IBM275 = 2032,
247  IBM277 = 2033,
248  IBM278 = 2034,
249  IBM280 = 2035,
250  IBM281 = 2036,
251  IBM284 = 2037,
252  IBM285 = 2038,
253  IBM290 = 2039,
254  IBM297 = 2040,
255  IBM420 = 2041,
256  IBM423 = 2042,
257  IBM424 = 2043,
// Out of numeric order: value 2011 is declared here.
258  PC8CodePage437 = 2011,
259  IBM500 = 2044,
260  IBM851 = 2045,
// Out of numeric order: value 2010 is declared here.
261  PCp852 = 2010,
262  IBM855 = 2046,
263  IBM857 = 2047,
264  IBM860 = 2048,
265  IBM861 = 2049,
266  IBM863 = 2050,
267  IBM864 = 2051,
268  IBM865 = 2052,
269  IBM868 = 2053,
270  IBM869 = 2054,
271  IBM870 = 2055,
272  IBM871 = 2056,
273  IBM880 = 2057,
274  IBM891 = 2058,
275  IBM903 = 2059,
276  IBM904 = 2060,
277  IBM905 = 2061,
278  IBM918 = 2062,
279  IBM1026 = 2063,
280  IBMEBCDICATDE = 2064,
281  EBCDICATDEA = 2065,
282  EBCDICCAFR = 2066,
283  EBCDICDKNO = 2067,
284  EBCDICDKNOA = 2068,
285  EBCDICFISE = 2069,
286  EBCDICFISEA = 2070,
287  EBCDICFR = 2071,
288  EBCDICIT = 2072,
289  EBCDICPT = 2073,
290  EBCDICES = 2074,
291  EBCDICESA = 2075,
292  EBCDICESS = 2076,
293  EBCDICUK = 2077,
294  EBCDICUS = 2078,
295  Unknown8BiT = 2079,
296  Mnemonic = 2080,
297  Mnem = 2081,
298  VISCII = 2082,
299  VIQR = 2083,
300  KOI8R = 2084,
301  HZGB2312 = 2085,
302  IBM866 = 2086,
303  PC775Baltic = 2087,
304  KOI8U = 2088,
305  IBM00858 = 2089,
306  IBM00924 = 2090,
307  IBM01140 = 2091,
308  IBM01141 = 2092,
309  IBM01142 = 2093,
310  IBM01143 = 2094,
311  IBM01144 = 2095,
312  IBM01145 = 2096,
313  IBM01146 = 2097,
314  IBM01147 = 2098,
315  IBM01148 = 2099,
316  IBM01149 = 2100,
317  Big5HKSCS = 2101,
318  IBM1047 = 2102,
319  PTCP154 = 2103,
320  Amiga1251 = 2104,
321  KOI7switched = 2105,
322  BRF = 2106,
323  TSCII = 2107,
324  CP51932 = 2108,
325  windows874 = 2109,
// Values 2110 through 2249 have no enumerator.
326  windows1250 = 2250,
327  windows1251 = 2251,
328  windows1252 = 2252,
329  windows1253 = 2253,
330  windows1254 = 2254,
331  windows1255 = 2255,
332  windows1256 = 2256,
333  windows1257 = 2257,
334  windows1258 = 2258,
335  TIS620 = 2259,
336  CP50220 = 2260
337  };
// Make every enumerator nameable as a member of text_encoding itself,
// e.g. text_encoding::UTF8 as well as text_encoding::id::UTF8.
338  using enum id;
339 
340  constexpr text_encoding() = default;
341 
342  constexpr explicit
343  text_encoding(string_view __enc) noexcept
344  : _M_rep(_S_find_name(__enc))
345  {
346  __enc.copy(_M_name, max_name_length);
347  }
348 
349  // @pre i has the value of one of the enumerators of id.
350  constexpr
351  text_encoding(id __i) noexcept
352  : _M_rep(_S_find_id(__i))
353  {
354  if (string_view __name(_M_rep->_M_name); !__name.empty())
355  __name.copy(_M_name, max_name_length);
356  }
357 
358  constexpr id mib() const noexcept { return id(_M_rep->_M_id); }
359 
// Return the encoding name held by this object.
// The pointer refers to the _M_name array inside *this, so it is only
// usable while this object exists.  The array is zero-initialized with
// max_name_length + 1 elements and the constructors copy at most
// max_name_length characters into it, so the string is null-terminated.
360  constexpr const char* name() const noexcept { return _M_name; }
361 
362  struct aliases_view : ranges::view_interface<aliases_view>
363  {
364  private:
365  class _Iterator;
366  struct _Sentinel { };
367 
368  public:
369  constexpr _Iterator begin() const noexcept;
370  constexpr _Sentinel end() const noexcept { return {}; }
371 
372  private:
373  friend struct text_encoding;
374 
375  constexpr explicit aliases_view(const _Rep* __r) : _M_begin(__r) { }
376 
377  const _Rep* _M_begin = nullptr;
378  };
379 
380  constexpr aliases_view
381  aliases() const noexcept
382  {
383  return _M_rep->_M_name[0] ? aliases_view(_M_rep) : aliases_view{nullptr};
384  }
385 
386  friend constexpr bool
387  operator==(const text_encoding& __a,
388  const text_encoding& __b) noexcept
389  {
390  if (__a.mib() == id::other && __b.mib() == id::other) [[unlikely]]
391  return _S_comp(__a._M_name, __b._M_name);
392  else
393  return __a.mib() == __b.mib();
394  }
395 
396  friend constexpr bool
397  operator==(const text_encoding& __encoding, id __i) noexcept
398  { return __encoding.mib() == __i; }
399 
400 #if __CHAR_BIT__ == 8
401  static consteval text_encoding
402  literal() noexcept
403  {
404 #ifdef __GNUC_EXECUTION_CHARSET_NAME
405  return text_encoding(__GNUC_EXECUTION_CHARSET_NAME);
406 #elif defined __clang_literal_encoding__
407  return text_encoding(__clang_literal_encoding__);
408 #else
409  return text_encoding();
410 #endif
411  }
412 
413  static text_encoding
414  environment();
415 
416  template<id _Id>
417  static bool
418  environment_is()
419  { return text_encoding(_Id)._M_is_environment(); }
420 #else
421  static text_encoding literal() = delete;
422  static text_encoding environment() = delete;
423  template<id> static bool environment_is() = delete;
424 #endif
425 
426  private:
// Row of _S_reps describing this encoding.  By default it points at the
// second row, whose identifier is 2 (id::unknown).
427  const _Rep* _M_rep = _S_reps + 1; // id::unknown
// Storage for the name returned by name(): room for max_name_length
// characters plus a terminating null, zero-initialized so that the
// default name is the empty string.
428  char _M_name[max_name_length + 1] = {0};
429 
// Helper called by environment_is(); only declared in this header.
430  bool
431  _M_is_environment() const;
432 
// Table of encodings that every lookup in this class works on.
// Layout: row 0 is id::other and row 1 is id::unknown, both with empty
// names; then come the rows supplied by <bits/text_encoding-data.h>; the
// last row is a sentinel with identifier 9999 and a null name.
// _S_find_id runs std::lower_bound over this table, so the rows are
// expected to be ordered by identifier.  _S_find_name and the alias
// iterator treat consecutive rows with the same identifier as the names
// of one encoding.
433  static inline constexpr _Rep _S_reps[] = {
434  { 1, "" }, { 2, "" },
// Defining this macro before the include asks the data header for its
// rows.  NOTE(review): the header presumably undefines the macro once it
// has emitted them; the #error below fires if it is still defined.
435 #define _GLIBCXX_GET_ENCODING_DATA
436 #include <bits/text_encoding-data.h>
437 #ifdef _GLIBCXX_GET_ENCODING_DATA
438 # error "Invalid text_encoding data"
439 #endif
440  { 9999, nullptr }, // sentinel
441  };
442 
443  static constexpr bool
444  _S_comp(string_view __a, string_view __b)
445  { return __unicode::__charset_alias_match(__a, __b); }
446 
447  static constexpr const _Rep*
448  _S_find_name(string_view __name) noexcept
449  {
450 #ifdef _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET
451  // Optimize the common UTF-8 case to avoid a linear search through all
452  // strings in the table using the _S_comp function.
453  if (__name == "UTF-8")
454  return _S_reps + 2 + _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET;
455 #endif
456 
457  // The first two array elements (other and unknown) don't have names.
458  // The last element is a sentinel that can never match anything.
459  const auto __first = _S_reps + 2, __end = std::end(_S_reps) - 1;
460  for (auto __r = __first; __r != __end; ++__r)
461  if (_S_comp(__r->_M_name, __name))
462  {
463  // Might have matched an alias. Find the first entry for this ID.
464  const auto __id = __r->_M_id;
465  while (__r[-1]._M_id == __id)
466  --__r;
467  return __r;
468  }
469  return _S_reps; // id::other
470  }
471 
472  static constexpr const _Rep*
473  _S_find_id(id __id) noexcept
474  {
475  const auto __i = (_Rep::id)__id;
476  const auto __r = std::lower_bound(_S_reps, std::end(_S_reps) - 1, __i);
477  if (__r->_M_id == __i) [[likely]]
478  return __r;
479  else
480  {
481  // Preconditions: i has the value of one of the enumerators of id.
482  __glibcxx_assert(__r->_M_id == __i);
483  return _S_reps + 1; // id::unknown
484  }
485  }
486  };
487 
488  template<>
489  struct hash<text_encoding>
490  {
491  size_t
492  operator()(const text_encoding& __enc) const noexcept
493  { return std::hash<text_encoding::id>()(__enc.mib()); }
494  };
495 
496  class text_encoding::aliases_view::_Iterator
497  {
498  public:
499  using value_type = const char*;
500  using reference = const char*;
501  using difference_type = int;
502 
503  constexpr _Iterator() = default;
504 
505  constexpr value_type
506  operator*() const
507  {
508  if (_M_dereferenceable()) [[likely]]
509  return _M_rep->_M_name;
510  else
511  {
512  __glibcxx_assert(_M_dereferenceable());
513  return "";
514  }
515  }
516 
517  constexpr _Iterator&
518  operator++()
519  {
520  if (_M_dereferenceable()) [[likely]]
521  ++_M_rep;
522  else
523  {
524  __glibcxx_assert(_M_dereferenceable());
525  *this = _Iterator{};
526  }
527  return *this;
528  }
529 
530  constexpr _Iterator&
531  operator--()
532  {
533  const bool __decrementable
534  = _M_rep != nullptr && _M_rep[-1]._M_id == _M_id;
535  if (__decrementable) [[likely]]
536  --_M_rep;
537  else
538  {
539  __glibcxx_assert(__decrementable);
540  *this = _Iterator{};
541  }
542  return *this;
543  }
544 
545  constexpr _Iterator
546  operator++(int)
547  {
548  auto __it = *this;
549  ++*this;
550  return __it;
551  }
552 
553  constexpr _Iterator
554  operator--(int)
555  {
556  auto __it = *this;
557  --*this;
558  return __it;
559  }
560 
561  constexpr value_type
562  operator[](difference_type __n) const
563  { return *(*this + __n); }
564 
565  constexpr _Iterator&
566  operator+=(difference_type __n)
567  {
568  if (_M_rep != nullptr)
569  {
570  if (__n > 0)
571  {
572  if (__n < (std::end(_S_reps) - _M_rep)
573  && _M_rep[__n - 1]._M_id == _M_id) [[likely]]
574  _M_rep += __n;
575  else
576  *this = _Iterator{};
577  }
578  else if (__n < 0)
579  {
580  if (__n > (_S_reps - _M_rep)
581  && _M_rep[__n]._M_id == _M_id) [[likely]]
582  _M_rep += __n;
583  else
584  *this = _Iterator{};
585  }
586  }
587  if (__n != 0)
588  __glibcxx_assert(_M_rep != nullptr);
589  return *this;
590  }
591 
592  constexpr _Iterator&
593  operator-=(difference_type __n)
594  {
595  using _Traits = __gnu_cxx::__int_traits<difference_type>;
596  if (__n == _Traits::__min) [[unlikely]]
597  return operator+=(_Traits::__max);
598  return operator+=(-__n);
599  }
600 
601  constexpr difference_type
602  operator-(const _Iterator& __i) const
603  {
604  if (_M_id == __i._M_id)
605  return _M_rep - __i._M_rep;
606  __glibcxx_assert(_M_id == __i._M_id);
607  return __gnu_cxx::__int_traits<difference_type>::__max;
608  }
609 
610  constexpr bool
611  operator==(const _Iterator&) const = default;
612 
613  constexpr bool
614  operator==(_Sentinel) const noexcept
615  { return !_M_dereferenceable(); }
616 
617  constexpr strong_ordering
618  operator<=>(const _Iterator& __i) const
619  {
620  __glibcxx_assert(_M_id == __i._M_id);
621  return _M_rep <=> __i._M_rep;
622  }
623 
624  friend constexpr _Iterator
625  operator+(_Iterator __i, difference_type __n)
626  {
627  __i += __n;
628  return __i;
629  }
630 
631  friend constexpr _Iterator
632  operator+(difference_type __n, _Iterator __i)
633  {
634  __i += __n;
635  return __i;
636  }
637 
638  friend constexpr _Iterator
639  operator-(_Iterator __i, difference_type __n)
640  {
641  __i -= __n;
642  return __i;
643  }
644 
645  private:
646  friend struct text_encoding;
647 
648  constexpr explicit
649  _Iterator(const _Rep* __r) noexcept
650  : _M_rep(__r), _M_id(__r ? __r->_M_id : 0)
651  { }
652 
653  constexpr bool
654  _M_dereferenceable() const noexcept
655  { return _M_rep != nullptr && _M_rep->_M_id == _M_id; }
656 
657  const _Rep* _M_rep = nullptr;
658  _Rep::id _M_id = 0;
659  };
660 
661  constexpr auto
662  text_encoding::aliases_view::begin() const noexcept
663  -> _Iterator
664  { return _Iterator(_M_begin); }
665 
666 namespace ranges
667 {
668  // Opt-in to borrowed_range concept
669  template<>
670  inline constexpr bool
671  enable_borrowed_range<std::text_encoding::aliases_view> = true;
672 }
673 
674 _GLIBCXX_END_NAMESPACE_VERSION
675 } // namespace std
676 
677 #endif // __cpp_lib_text_encoding
678 #endif // _GLIBCXX_TEXT_ENCODING