Hi, It has been known for a long time that on native Windows, the wchar_t[] encoding of strings is UTF-16. [1] Now, Corinna Vinschen has confirmed that it is the same for Cygwin >= 1.7. [2]
Other platforms have either a 32-bit wchar_t (such as glibc, Solaris, *BSD, and many others), or have a 16-bit wchar_t that, in UTF-8 locales, uses the UCS-2 encoding (namely AIX).[3] What consequences does this have? 1) All code that uses the functions from <wctype.h> (wide character classification and mapping) or wcwidth() malfunctions on strings that contain Unicode characters outside the BMP, i.e. outside the range U+0000..U+FFFF. 2) Code that uses mbrtowc() or wcrtomb() is also likely to malfunction. On Cygwin >= 1.7 mbrtowc() and wcrtomb() are implemented in an intelligent but somewhat surprising way: wcrtomb() may return 0, that is, produce no output bytes when it consumes a wchar_t. On native Windows, I could not test it (I could not enable any UTF-8 or GB18030 locale on Windows XP), but due to the behaviour of the functions MultiByteToWideChar and WideCharToMultiByte [4] it looks like the implementations of mbrtowc() and wcrtomb() will not be able to cope with characters outside the BMP. 
Examples of such code are: - In gnulib, the files file uses exclude.c towlower fnmatch.c towlower mbchar.h isw* mbmemcasecoll.c towlower mbscasestr.c towlower mbswidth.c iswcntrl, wcwidth quotearg.c iswprint regcomp.c towlower regex_internal.h iswalnum, iswlower regex_internal.c towupper strftime.c towlower, towupper strtol.c iswalpha, iswspace, towupper - In coreutils, the program 'wc': The correct behaviour is: $ echo 'a b' | wc -w -m 2 4 Now with a U+2002 space: $ printf 'a\xe2\x80\x82b\n' | wc -w -m 2 4 Now with a Chinese character from the BMP: $ printf 'a\xe3\x91\x96b\n' | wc -w -m 1 4 $ printf 'a \xe3\x91\x96 b\n' | wc -w -m 3 6 Now with a Chinese character outside the BMP: $ printf 'a\xf0\xa1\x88\xb4b\n' | wc -w -m 1 4 $ printf 'a \xf0\xa1\x88\xb4 b\n' | wc -w -m 3 6 On Cygwin 1.7.5 (with LANG=C.UTF-8 and 'wc' from GNU coreutils 8.5): $ printf 'a\xf0\xa1\x88\xb4b\n' | wc -w -m 1 5 $ printf 'a \xf0\xa1\x88\xb4 b\n' | wc -w -m 2 7 So both the number of characters and the number of words are counted wrong as soon as non-BMP characters occur. What can we do about it? Adding lots of conditional code to the above listed gnulib, coreutils, gettext etc. source files? That would be an endless amount of work. I'm more in favour of overriding wchar_t and all functions that depend on it - like we did successfully for the socket functions. In practice, this would mean that on Windows (both native Windows and Cygwin >= 1.7) the use of a 'wchar_t' module will - override wchar_t to be 32 bits, like in glibc, - cause functions from mbrtowc() to wcwidth() to be overridden. Since the corresponding system functions are unusable, the replacements will use the modules from libunistring (such as unictype/ctype-alnum and uniwidth/width). It also means that we will have separate modules for 'iswalnum', ..., 'towupper', which are currently all in the module 'wctype'. How does that sound? Other thoughts? 
Bruno [1] http://msdn.microsoft.com/en-us/library/dd319072%28v=vs.85%29.aspx [2] http://cygwin.com/ml/cygwin/2011-01/msg00410.html [3] Found by running the attached program multibyte-utf16-unix.c [4] See the attached program multibyte-utf16-win32.c
#include <limits.h>
#include <locale.h>
#include <stdlib.h>
#include <wchar.h>
#include <stdio.h>

/* Test program: feeds the UTF-8 encoding of a character outside the BMP
   through mbrtowc() and wcrtomb() in a UTF-8 locale and prints what comes
   back, which reveals how the platform encodes wchar_t[] strings.  */

/* Print the first N bytes of RBUF in hexadecimal on one "rbuf =" line.  */
static void
print_rbuf (const char *rbuf, size_t n)
{
  size_t i;

  printf ("rbuf =");
  for (i = 0; i < n; i++)
    printf (" %02X", (unsigned char) rbuf[i]);
  printf ("\n");
}

/* Returns 1 if the locale cannot be set, 0 otherwise.  */
int
main (void)
{
  // U+21234; in CJK Ideograph Extension B
  // in UTF-16 form: 0xD844 0xDE34
  const char buf[4] = { 0xF0, 0xA1, 0x88, 0xB4 };
  wchar_t wbuf[4] = { 0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE };
  const char *inptr = buf;
  /* Two wcrtomb() results are stored back to back; each call may write up
     to MB_CUR_MAX (<= MB_LEN_MAX) bytes.  */
  char rbuf[2 * MB_LEN_MAX];
  size_t ret, ret1, ret2;

#ifdef _AIX
  if (setlocale (LC_ALL, "ZH_CN.UTF-8") == NULL)
#else
  if (setlocale (LC_ALL, "zh_CN.UTF-8") == NULL)
#endif
    {
      printf ("setlocale failed\n");
      return 1;
    }

  /* Test mbrtowc */
  ret = mbrtowc (&wbuf[0], inptr, buf + sizeof (buf) - inptr, NULL);
  printf ("ret = %d, wbuf[0] = %x\n", (int) ret, (unsigned int) wbuf[0]);

  if (ret > 0 && ret < sizeof (buf))
    {
      /* Only part of the input was consumed: the character is being
         delivered in more than one wchar_t unit.  Fetch the second unit.  */
      inptr += ret;
      ret = mbrtowc (&wbuf[1], inptr, buf + sizeof (buf) - inptr, NULL);
      printf ("ret = %d, wbuf[1] = %x\n", (int) ret, (unsigned int) wbuf[1]);

      /* ret is unsigned, so the error returns (size_t) -1 and (size_t) -2
         must be excluded explicitly; "ret > 0" alone would accept them.  */
      if (ret != 0 && ret != (size_t) -1 && ret != (size_t) -2)
        {
          inptr += ret;

          /* Test wcrtomb */
          ret1 = wcrtomb (rbuf, wbuf[0], NULL);
          printf ("ret1 = %d\n", (int) ret1);
          if (ret1 != (size_t) -1)
            {
              print_rbuf (rbuf, ret1);

              ret2 = wcrtomb (rbuf + ret1, wbuf[1], NULL);
              printf ("ret2 = %d\n", (int) ret2);
              if (ret2 != (size_t) -1)
                print_rbuf (rbuf, ret1 + ret2);
            }
        }
    }
  else
    {
      /* The whole character went into wbuf[0], or mbrtowc() failed.
         Either way, convert wbuf[0] back.  */
      /* Test wcrtomb */
      ret1 = wcrtomb (rbuf, wbuf[0], NULL);
      printf ("ret1 = %d\n", (int) ret1);
      if (ret1 != (size_t) -1)
        print_rbuf (rbuf, ret1);
    }

  return 0;
}

/* Result on glibc: wchar_t[] is UCS-4

ret = 4, wbuf[0] = 21234
ret1 = 4
rbuf = F0 A1 88 B4

Result on Cygwin 1.7.5: wchar_t[] is UTF-16

ret = 3, wbuf[0] = d844
ret = 1, wbuf[1] = de34
ret1 = 0
rbuf =
ret2 = 4
rbuf = F0 A1 88 B4

Result on AIX 5.1:

ret = -1, wbuf[0] = fffe

Result on AIX 5.2, 5.3, 6.1: wchar_t[] is UCS-2

ret = -1, wbuf[0] = fffd

*/
#include <locale.h> #include <stdlib.h> #include <wchar.h> #include <stdio.h> #include <errno.h> #include <windows.h> // Valid locales for setlocale: // German_Germany.1252 // Chinese_Taiwan // Chinese_China // etc. // but none supports an encoding that goes further than the BMP // (at least not in Windows XP). // So use the Win32 functions instead of the C89 functions. #define codepage 65001 // UTF-8 //#define codepage 54936 // GB18030 // unsupported in Windows XP, // despite what <http://msdn.microsoft.com/en-us/library/dd317756.aspx> says #define mbrtowc my_mbrtowc static size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) { unsigned char c; size_t i; if (n == 0) return (size_t)-2; /* MultiByteToWideChar fails with ERROR_NO_UNICODE_TRANSLATION if the input is too short. We need to distinguish this case from an invalid input sequence. */ c = (unsigned char) s[0]; if (c < 0xc0) i = 1; else if (c < 0xe0) i = 2; else if (c < 0xf0) i = 3; else if (c < 0xf8) i = 4; else { errno = EILSEQ; return (size_t)-1; } if (MultiByteToWideChar (codepage, MB_ERR_INVALID_CHARS /* | MB_PRECOMPOSED does not work */, s, i, pwc, pwc != NULL)) return i; switch (GetLastError ()) { case ERROR_INSUFFICIENT_BUFFER: break; case ERROR_NO_UNICODE_TRANSLATION: errno = EILSEQ; return (size_t)-1; case ERROR_INVALID_FLAGS: case ERROR_INVALID_PARAMETER: default: fprintf (stderr, "last error: %x\n", GetLastError ()); fflush (stderr); abort (); } return (size_t)-2; } #define wcrtomb my_wcrtomb static size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps) { BOOL invalid_conversion; int bytes; if (s == NULL) { static char buf[6]; s = buf; wc = 0; } bytes = WideCharToMultiByte (codepage, 0, &wc, 1, s, 6, NULL, codepage == 65001 ? 
NULL : &invalid_conversion); if (bytes) { if (codepage == 65001 || !invalid_conversion) return bytes; else { errno = EILSEQ; return (size_t)-1; } } else switch (GetLastError ()) { case ERROR_NO_UNICODE_TRANSLATION: errno = EILSEQ; return (size_t)-1; case ERROR_INVALID_PARAMETER: errno = EINVAL; return (size_t)-1; case ERROR_INVALID_FLAGS: case ERROR_INSUFFICIENT_BUFFER: default: fprintf (stderr, "last error: 0x%x\n", GetLastError ()); fflush (stderr); abort (); } } int main (int argc, char*argv[]) { // U+21234; in CJK Ideograph Extension B // in UTF-16 form: 0xD844 0xDE34 #if codepage == 65001 const char buf[4] = { 0xF0, 0xA1, 0x88, 0xB4 }; // fails //const char buf[4] = { 0xE3, 0xBF, 0xA3, 0x66 }; // OK //const char buf[4] = { 0xC4, 0xB4, 0x55, 0x66 }; // OK //const char buf[4] = { 0x22, 0x33, 0x55, 0x66 }; // OK #endif #if codepage == 54936 //const char buf[4] = { 0x95, 0x35, 0xDA, 0x36 }; //const char buf[4] = { 0x82, 0x32, 0xA3, 0x38 }; //const char buf[4] = { 0x81, 0x30, 0x90, 0x36 }; const char buf[4] = { 0x22, 0x33, 0x55, 0x66 }; #endif wchar_t wbuf[4] = { 0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE }; const char *inptr = buf; char rbuf[6]; size_t ret, ret1, ret2; BOOL invalid_conversion; size_t i; /* Test mbrtowc */ ret = mbrtowc (&wbuf[0], inptr, buf + sizeof (buf) - inptr, NULL); printf ("ret = %d, wbuf[0] = %x\n", (int) ret, (unsigned int) wbuf[0]); /* Test MultiByteToWideChar, converting 2 wchar_t units at once. */ if (MultiByteToWideChar (codepage, MB_ERR_INVALID_CHARS /* | MB_PRECOMPOSED does not work */, inptr, buf + sizeof (buf) - inptr, wbuf, 2)) { printf ("wbuf[0] = %x, wbuf[1] = %x\n", (unsigned int) wbuf[0], (unsigned int) wbuf[1]); /* Test wcrtomb */ ret1 = wcrtomb (rbuf, wbuf[0], NULL); printf ("ret1 = %d\n", (int) ret1); if (ret1 > 0) { printf ("rbuf ="); for (i = 0; i < ret1; i++) printf (" %02X", (unsigned char) rbuf[i]); printf ("\n"); } /* Test WideCharToMultiByte, converting 2 wchar_t units at once. 
*/ ret2 = WideCharToMultiByte (codepage, 0, wbuf, 2, rbuf, sizeof (rbuf), NULL, codepage == 65001 ? NULL : &invalid_conversion); printf ("ret2 = %d\n", ret2); if (ret2 > 0) { printf ("rbuf ="); for (i = 0; i < ret2; i++) printf (" %02X", (unsigned char) rbuf[i]); printf ("\n"); } } return 0; } /* Result on Windows XP: ret = -1, wbuf[0] = 0 wbuf[0] = d844, wbuf[1] = de34 ret1 = 3 rbuf = ED A1 84 ret2 = 4 rbuf = F0 A1 88 B4 */