Hello, I was investigating the wc*tomb* and mb*towc* functions in the CRT and comparing their behavior to other implementations.
Take the following example:

```
mbrtowc (NULL, s, 1, ps)
mbrtowc (NULL, s + 1, 1, ps)
```

Here, `s` points to a multibyte (DBCS) character, but since n == 1 the first call to mbrtowc returns (size_t)-2 and updates ps. The next call completes the conversion of the multibyte character. What is its return value? CRT returns 2 while glibc returns 1.

It seems to me that ISO C and POSIX specify different behavior for this case. cppreference[1] has the following:

```
the number of bytes [1...n] of the multibyte character successfully converted from s.
```

In this case, CRT seems good. We converted a DBCS character with a length of 2. POSIX[2] has the following:

```
between 1 and n inclusive
    If the next n or fewer bytes complete a valid character (which is the value stored); the value returned shall be the number of bytes that complete the character.
(size_t)-2
    If the next n bytes contribute to an incomplete but potentially valid character, and all n bytes have been processed (no value is stored).
```

glibc seems good here. The first call *consumed* n (1) bytes. The second call consumed 1 byte to complete the character, so the return value is 1.

Any ideas here? Or should I try my luck on the gnulib list? I guess consistency with CRT would be preferred on Windows.

I also attached a simple program which you can compile with CRT or glibc to observe this.

- Kirill Makurin

[1] https://en.cppreference.com/w/c/string/multibyte/mbrtowc
[2] https://pubs.opengroup.org/onlinepubs/9799919799/functions/mbrtowc.html
/*
 * Checks what mbrtowc returns when the two bytes of a double-byte character
 * are supplied one byte per call, and when both bytes are supplied at once.
 *
 * For the call that completes a character started by an earlier call, this
 * program expects 2 on Windows and 1 everywhere else.
 */
#define __USE_MINGW_ANSI_STDIO 0

#include <assert.h>
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

/**
 * With msvcrt.dll, define EMU to use mingw-w64's replacement for mbrtowc
 */

typedef size_t (* mbrtowc_t) (wchar_t *, const char *, size_t, mbstate_t *);

/*
 * Expected return value of the call that supplies the second byte of a
 * character whose first byte was consumed by the previous call.
 */
#ifdef _WIN32
#define COMPLETING_CALL_RESULT ((size_t) 2)
#else
#define COMPLETING_CALL_RESULT ((size_t) 1)
#endif

int main (void) {
  /*
   * Select the Japanese locale first. Without it the conversions below fail
   * for reasons unrelated to what this program checks, so report that
   * explicitly instead of tripping the first assertion.
   */
#ifdef _WIN32
  _setmode (_fileno (stdout), _O_U8TEXT);
  _setmode (_fileno (stderr), _O_U8TEXT);

  const int have_locale = _wsetlocale (LC_ALL, L"Japanese_Japan.20932") != NULL;
#else
  const int have_locale = setlocale (LC_ALL, "ja_JP") != NULL;
#endif

  if (!have_locale) {
    fwprintf (stderr, L"required Japanese locale is not available\n");
    return EXIT_FAILURE;
  }

  mbrtowc_t func = NULL;

#if defined (_UCRT) || defined (EMU) || !defined (_WIN32)
  func = mbrtowc;
#else
  /* Call msvcrt.dll's own mbrtowc rather than whatever the toolchain links. */
  HMODULE msvcrt = LoadLibraryW (L"msvcrt.dll");

  if (msvcrt == NULL) {
    abort ();
  }

  func = (mbrtowc_t) GetProcAddress (msvcrt, "mbrtowc");

  if (func == NULL) {
    abort ();
  }
#endif

  /* EUC-JP (code page 20932) */
  char lead1[] = {(char) 161};
  char lead2[] = {(char) 162};
  char valid1[] = {(char) 161, (char) 162};
  char valid2[] = {(char) 162, (char) 161};

  mbstate_t state = {0};
  wchar_t wc = WEOF;
  size_t ret;

  /*
   * Every call is made outside assert() so that it still runs, and still
   * updates `state` and `wc`, when the program is built with NDEBUG.
   */

  /* Bytes 161, 162 supplied one per call. */
  ret = func (&wc, lead1, 1, &state);
  assert (ret == (size_t) -2);
  ret = func (&wc, lead2, 1, &state);
  assert (ret == COMPLETING_CALL_RESULT);
  wprintf (L"%lc\n", (wint_t) wc);

  /* Bytes 162, 161 supplied one per call. */
  ret = func (&wc, lead2, 1, &state);
  assert (ret == (size_t) -2);
  ret = func (&wc, lead1, 1, &state);
  assert (ret == COMPLETING_CALL_RESULT);
  wprintf (L"%lc\n", (wint_t) wc);

  /* Both bytes supplied in a single call. */
  ret = func (&wc, valid1, 2, &state);
  assert (ret == 2);
  wprintf (L"%lc\n", (wint_t) wc);

  ret = func (&wc, valid2, 2, &state);
  assert (ret == 2);
  wprintf (L"%lc\n", (wint_t) wc);

  /* Keeps `ret` used when assert() expands to nothing. */
  (void) ret;

  return 0;
}
_______________________________________________ Mingw-w64-public mailing list Mingw-w64-public@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/mingw-w64-public