libstdc++-v3/ChangeLog:
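* src/c++20/format.cc (__encoding): Do not call iconv_open for
ISO-8859-1 and ISO-8859-15.
(__encoding::conv): Convert ISO-8859-1 and ISO-8859-15 directly
without using iconv. Likewise for Windows-1252 input that has no
characters in the range [0x80,0xa0). Check if iconv can be avoided
for some extended ASCII encodings.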
---
Tested x86_64-linux.
libstdc++-v3/src/c++20/format.cc | 119 +++++++++++++++++++++++++++++++
1 file changed, 119 insertions(+)
diff --git a/libstdc++-v3/src/c++20/format.cc b/libstdc++-v3/src/c++20/format.cc
index 1a24fcab7f7..ee91c291d38 100644
--- a/libstdc++-v3/src/c++20/format.cc
+++ b/libstdc++-v3/src/c++20/format.cc
@@ -74,6 +74,8 @@ struct __encoding : locale::facet
{
case UTF8:
case ASCII:
+ case ISOLatin1:
+ case ISO885915:
break;
default:
_M_cd = ::iconv_open("UTF-8", _M_enc.name());
@@ -102,6 +104,123 @@ struct __encoding : locale::facet
if (input.empty()) [[unlikely]]
return codecvt_base::noconv;
+ using enum text_encoding::id;
+ switch (_M_enc.mib())
+ {
+ case UTF8:
+ case ASCII:
+ return codecvt_base::noconv;
+ case ISOLatin1:
+ case ISO885915:
+ case windows1252:
+ {
+ auto next = input.begin();
+ const auto end = input.end();
+ do
+ {
+ if ((unsigned char)*next & 0x80)
+ break;
+ }
+ while (++next != end);
+
+ if (next == end) // No 8-bit chars that need conversion to UTF-8.
+ return codecvt_base::noconv;
+
+ out.assign(input.begin(), next);
+ do
+ {
+ if (uint16_t c = (unsigned char)*next; c & 0x80) // 8-bit char
+ {
+ if ((c & 0xe0) == 0xa0 && _M_enc.mib() == ISO885915)
+ {
+ // For ISO-8859-15 some characters do not map directly
+ // to the Unicode code point with the same value.
+ switch (c & 0xbf)
+ {
+ case 0xa4:
+ // Euro symbol requires three UTF-8 code units,
+ // so deal with it differently:
+ out += "\u20AC";
+ continue;
+ case 0xbc:
+ c = 0x0152;
+ break;
+ case 0xbd:
+ c = 0x0153;
+ break;
+ case 0xa6:
+ c = 0x0160;
+ break;
+ case 0xa8:
+ c = 0x0161;
+ break;
+ case 0xbe:
+ c = 0x0178;
+ break;
+ case 0xb4:
+ c = 0x017d;
+ break;
+ case 0xb8:
+ c = 0x017e;
+ break;
+ default:
+ // Everything else is the same as ISO-8859-1
+ break;
+ }
+ }
+ else if (c < 0xa0 && _M_enc.mib() == windows1252)
+ {
+ // For Windows-1252 most chars in range [0x80,0xa0) do not map to
+ // the Unicode code point with the same value (some need three
+ // UTF-8 code units). We could handle them here, but just use iconv for now.
+ goto use_iconv;
+ }
+
+ // Convert code point to two UTF-8 code units:
+ char units[2];
+ units[0] = 0xc0 | (c >> 6);
+ units[1] = 0x80 | (c & 0x3f);
+ out.append(units, 2);
+ }
+ else // 7-bit chars map directly to a single UTF-8 code unit:
+ out += c;
+ }
+ while (++next < input.end());
+
+ return codecvt_base::ok;
+ }
+
+ case ISOLatin2:
+ case ISOLatin3:
+ case ISOLatin4:
+ case ISOLatin5:
+ case ISOLatinCyrillic:
+ case ISOLatinGreek:
+ case windows1250:
+ case windows1251:
+ case windows1253:
+ case windows1254:
+ case windows1255:
+ case windows1256:
+ case windows1257:
+ case windows1258:
+ {
+ bool ascii = true;
+ for (unsigned char c : input)
+ if (c & 0x80)
+ {
+ ascii = false;
+ break;
+ }
+ if (ascii)
+ return codecvt_base::noconv;
+ break;
+ }
+ default:
+ break;
+ }
+
+use_iconv:
#ifdef _GLIBCXX_HAVE_ICONV
if (_M_cd == (::iconv_t)-1)
return codecvt_base::error;
--
2.47.0