UTF-8 characters in diagnostic output (such as the warning emoji ⚠️ used by -fanalyzer) display as mojibake on Windows unless the UTF-8 code page is in use.
This patch adds UTF-8 to UTF-16 conversion when outputting to a console on Windows. gcc/ChangeLog: * pretty-print.cc (decode_utf8_char): Move forward declaration. (utf8_to_utf16): New function to convert UTF-8 to UTF-16. (is_console_handle): New function to detect Windows console handles. (write_all): Add UTF-8 to UTF-16 conversion for console output, falling back to WriteFile for ASCII strings and regular files. Signed-off-by: Peter Damianov <peter0...@disroot.org> --- gcc/pretty-print.cc | 132 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 3 deletions(-) diff --git a/gcc/pretty-print.cc b/gcc/pretty-print.cc index d79a8282cfb..8ff97d8f1eb 100644 --- a/gcc/pretty-print.cc +++ b/gcc/pretty-print.cc @@ -43,6 +43,10 @@ along with GCC; see the file COPYING3. If not see /* Replacement for fputs() that handles ANSI escape codes on Windows NT. Contributed by: Liu Hao (lh_mouse at 126 dot com) + Extended by: Peter Damianov + Converts UTF-8 to UTF-16 if outputting to a console, so that emoji and + other non-ASCII characters are not displayed as mojibake. + XXX: This file is compiled into libcommon.a that will be self-contained. It looks like that these functions can be put nowhere else. */ @@ -50,11 +54,136 @@ along with GCC; see the file COPYING3. If not see #define WIN32_LEAN_AND_MEAN 1 #include <windows.h> +static int +decode_utf8_char (const unsigned char *, size_t len, unsigned int *); + +/* Convert the UTF8_LEN bytes of UTF-8 at UTF8_STR to UTF-16. + Returns true if a conversion was performed, false otherwise. + + Returns false without allocating if the input is empty, pure ASCII or not + valid UTF-8. On success *UTF16_STR is a buffer from xmalloc of *UTF16_LEN + code units (not NUL-terminated) that the caller must free. */ +static bool +utf8_to_utf16 (const char *utf8_str, size_t utf8_len, wchar_t **utf16_str, + size_t *utf16_len) +{ + if (utf8_len == 0) + { + *utf16_str = NULL; + *utf16_len = 0; + return false; /* No conversion needed for empty string. 
*/ + } + + /* First pass: scan for non-ASCII and count UTF-16 code units needed. */ + size_t utf16_count = 0; + const unsigned char *p = (const unsigned char *) utf8_str; + const unsigned char *end = p + utf8_len; + bool found_non_ascii = false; + + while (p < end) + { + if (*p <= 127) + { + /* ASCII character - count as 1 UTF-16 unit and advance. */ + utf16_count++; + p++; + } + else + { + /* Non-ASCII character - decode UTF-8 sequence. */ + found_non_ascii = true; + unsigned int codepoint; + int utf8_char_len = decode_utf8_char (p, end - p, &codepoint); + + if (utf8_char_len == 0) + return false; /* Invalid UTF-8. */ + + if (codepoint <= 0xFFFF) + utf16_count += 1; /* Single UTF-16 unit. */ + else + utf16_count += 2; /* Surrogate pair. */ + + p += utf8_char_len; + } + } + + /* If string is pure ASCII, no conversion needed. */ + if (!found_non_ascii) + return false; + + *utf16_str = (wchar_t *) xmalloc (utf16_count * sizeof (wchar_t)); + *utf16_len = utf16_count; + + /* Second pass: convert UTF-8 to UTF-16. */ + wchar_t *out = *utf16_str; + p = (const unsigned char *) utf8_str; + + while (p < end) + { + if (*p <= 127) + { + /* ASCII character. */ + *out++ = (wchar_t) *p++; + } + else + { + /* Non-ASCII character - decode and convert. */ + unsigned int codepoint; + int utf8_char_len = decode_utf8_char (p, end - p, &codepoint); + + if (codepoint <= 0xFFFF) + { + *out++ = (wchar_t) codepoint; + } + else + { + /* Convert to a surrogate pair; assumes codepoint <= 0x10FFFF (TODO confirm). */ + codepoint -= 0x10000; + *out++ = (wchar_t) (0xD800 + (codepoint >> 10)); + *out++ = (wchar_t) (0xDC00 + (codepoint & 0x3FF)); + } + + p += utf8_char_len; + } + } + + return true; +} + +/* Return true if H is a console handle, i.e. GetConsoleMode succeeds on it, + and false otherwise. */ +static bool +is_console_handle (HANDLE h) +{ + DWORD mode; + return GetConsoleMode (h, &mode); +} + /* Write all bytes in [s,s+n) into the specified stream. + If outputting to a Windows console, convert UTF-8 to UTF-16 if needed. Errors are ignored. 
*/ static void write_all (HANDLE h, const char *s, size_t n) { + /* If writing to console, try to convert from UTF-8 to UTF-16 and use + WriteConsoleW. utf8_to_utf16 returns false if the string is empty, pure + ASCII or invalid UTF-8, in which case we fall back to WriteFile. */ + if (is_console_handle (h)) + { + wchar_t *utf16_str; + size_t utf16_len; + + if (utf8_to_utf16 (s, n, &utf16_str, &utf16_len)) + { + DWORD written; + WriteConsoleW (h, utf16_str, utf16_len, &written, NULL); + free (utf16_str); + return; + } + /* If UTF-8 conversion returned false, fall back to WriteFile. */ + } + + /* WriteFile for regular files or when conversion to UTF-16 is not needed. */ size_t rem = n; DWORD step; @@ -712,8 +841,6 @@ mingw_ansi_fputs (const char *str, FILE *fp) #endif /* __MINGW32__ */ -static int -decode_utf8_char (const unsigned char *, size_t len, unsigned int *); static void pp_quoted_string (pretty_printer *, const char *, size_t = -1); extern void @@ -2790,7 +2917,6 @@ pp_end_quote (pretty_printer *pp, bool show_color) pp_string (pp, close_quote); } - /* The string starting at P has LEN (at least 1) bytes left; if they start with a valid UTF-8 sequence, return the length of that sequence and set *VALUE to the value of that sequence, and -- 2.39.5