16-bit wchar_t on Windows and Cygwin

Bruno Haible Sun, 30 Jan 2011 18:06:14 -0800

Hi,

It has been known for a long time that on native Windows, the wchar_t[]
encoding of strings is UTF-16. [1] Now, Corinna Vinschen has confirmed that it
is the same for Cygwin >= 1.7. [2]


Other platforms have either a 32-bit wchar_t (such as glibc, Solaris, *BSD,
and many others), or have a 16-bit wchar_t that, in UTF-8 locales, uses the
UCS-2 encoding (namely AIX).[3]

What consequences does this have?

  1) All code that uses the functions from <wctype.h> (wide character
     classification and mapping) or wcwidth() malfunctions on strings that
     contain Unicode characters outside the BMP, i.e. outside the range
     U+0000..U+FFFF.

  2) Code that uses mbrtowc() or wcrtomb() is also likely to malfunction.
     On Cygwin >= 1.7 mbrtowc() and wcrtomb() are implemented in an intelligent
     but somewhat surprising way: wcrtomb() may return 0, that is, produce no
     output bytes when it consumes a wchar_t.
     On native Windows, I could not test it (I could not enable any UTF-8 or
     GB18030 locale on Windows XP), but due to the behaviour of the functions
     MultiByteToWideChar and WideCharToMultiByte [4] it looks like the
     implementations of mbrtowc() and wcrtomb() will not be able to cope
     with characters outside the BMP.

Examples of such code are:

- In gnulib, the files

  file              uses

  exclude.c         towlower
  fnmatch.c         towlower
  mbchar.h          isw*
  mbmemcasecoll.c   towlower
  mbscasestr.c      towlower
  mbswidth.c        iswcntrl, wcwidth
  quotearg.c        iswprint
  regcomp.c         towlower
  regex_internal.h  iswalnum, iswlower
  regex_internal.c  towupper
  strftime.c        towlower, towupper
  strtol.c          iswalpha, iswspace, towupper

- In coreutils, the program 'wc':

  The correct behaviour is:

  $ echo 'a b' | wc -w -m
        2       4

  Now with a U+2002 space:
  $ printf 'a\xe2\x80\x82b\n' | wc -w -m
        2       4

  Now with a Chinese character from the BMP:
  $ printf 'a\xe3\x91\x96b\n' | wc -w -m
        1       4
  $ printf 'a \xe3\x91\x96 b\n' | wc -w -m
        3       6

  Now with a Chinese character outside the BMP:
  $ printf 'a\xf0\xa1\x88\xb4b\n' | wc -w -m
        1       4
  $ printf 'a \xf0\xa1\x88\xb4 b\n' | wc -w -m
        3       6

  On Cygwin 1.7.5 (with LANG=C.UTF-8 and 'wc' from GNU coreutils 8.5):

  $ printf 'a\xf0\xa1\x88\xb4b\n' | wc -w -m
        1       5
  $ printf 'a \xf0\xa1\x88\xb4 b\n' | wc -w -m
        2       7

  So both the number of characters and the number of words are counted
  wrong as soon as non-BMP characters occur.


What can we do about it?

Adding lots of conditional code to the above listed gnulib, coreutils, gettext
etc. source files? That would be an endless amount of work.

I'm more in favour of overriding wchar_t and all functions that depend on it -
like we did successfully for the socket functions.

In practice, this would mean that on Windows (both native Windows and
Cygwin >= 1.7) the use of a 'wchar_t' module will
  - override wchar_t to be 32 bits, like in glibc,
  - cause functions from mbrtowc() to wcwidth() to be overridden. Since the
    corresponding system functions are unusable, the replacements will use the
    modules from libunistring (such as unictype/ctype-alnum and uniwidth/width).

It also means that we will have separate modules for 'iswalnum', ...,
'towupper', which are currently all in the module 'wctype'.

How does that sound? Other thoughts?

Bruno


[1] http://msdn.microsoft.com/en-us/library/dd319072%28v=vs.85%29.aspx
[2] http://cygwin.com/ml/cygwin/2011-01/msg00410.html
[3] Found by running the attached program multibyte-utf16-unix.c
[4] See the attached program multibyte-utf16-win32.c

#include <locale.h>
#include <stdlib.h>
#include <wchar.h>
#include <stdio.h>

/* Probe how the C library's mbrtowc() and wcrtomb() treat a character
   outside the BMP (U+21234, UTF-8 bytes F0 A1 88 B4) in a Chinese UTF-8
   locale, printing every intermediate result on stdout.

   If the first mbrtowc() call consumes only part of the 4 input bytes
   (1..3), a second call is made for the remainder and both resulting
   wchar_t units are converted back with wcrtomb().  Otherwise only
   wbuf[0] is converted back.

   Returns 0 after the probes have run, 1 if the locale cannot be set.  */
int main (void)
{
  // U+21234; in CJK Ideograph Extension B
  // in UTF-16 form: 0xD844 0xDE34
  const char buf[4] = { 0xF0, 0xA1, 0x88, 0xB4 };
  /* Pre-filled with a marker value so that untouched slots are recognizable
     in the output.  */
  wchar_t wbuf[4] = { 0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE };
  const char *inptr = buf;
  char rbuf[6];
  size_t ret, ret1, ret2;
  size_t i;

#ifdef _AIX
  if (setlocale (LC_ALL, "ZH_CN.UTF-8") == NULL)
#else
  if (setlocale (LC_ALL, "zh_CN.UTF-8") == NULL)
#endif
    {
      printf ("setlocale failed\n");
      return 1;
    }

  /* Test mbrtowc */
  ret = mbrtowc (&wbuf[0], inptr, buf + sizeof (buf) - inptr, NULL);
  printf ("ret = %d, wbuf[0] = %x\n", (int) ret, (unsigned int) wbuf[0]);
  if ((int) ret > 0 && (int) ret < 4)
    {
      /* Only part of the input was consumed: convert the rest as well.  */
      inptr += ret;
      ret = mbrtowc (&wbuf[1], inptr, buf + sizeof (buf) - inptr, NULL);
      printf ("ret = %d, wbuf[1] = %x\n", (int) ret, (unsigned int) wbuf[1]);
      /* ret is a size_t, so the error returns (size_t)-1 and (size_t)-2 must
         be excluded through a signed comparison, like in the test above.  */
      if ((int) ret > 0)
        {
          inptr += ret;

          /* Test wcrtomb */
          ret1 = wcrtomb (rbuf, wbuf[0], NULL);
          printf ("ret1 = %d\n", (int) ret1);
          if ((int) ret1 >= 0)
            {
              printf ("rbuf =");
              for (i = 0; i < ret1; i++)
                printf (" %02X", (unsigned char) rbuf[i]);
              printf ("\n");

              /* Append the bytes of the second unit after those of the
                 first one.  */
              ret2 = wcrtomb (rbuf + ret1, wbuf[1], NULL);
              printf ("ret2 = %d\n", (int) ret2);
              if ((int) ret2 >= 0)
                {
                  printf ("rbuf =");
                  for (i = 0; i < ret1 + ret2; i++)
                    printf (" %02X", (unsigned char) rbuf[i]);
                  printf ("\n");
                }
            }
        }
    }
  else
    {
      /* Test wcrtomb */
      ret1 = wcrtomb (rbuf, wbuf[0], NULL);
      printf ("ret1 = %d\n", (int) ret1);
      if ((int) ret1 >= 0)
        {
          printf ("rbuf =");
          for (i = 0; i < ret1; i++)
            printf (" %02X", (unsigned char) rbuf[i]);
          printf ("\n");
        }
    }

  return 0;
}

/*
Result on glibc: wchar_t[] is UCS-4
ret = 4, wbuf[0] = 21234
ret1 = 4
rbuf = F0 A1 88 B4

Result on Cygwin 1.7.5: wchar_t[] is UTF-16
ret = 3, wbuf[0] = d844
ret = 1, wbuf[1] = de34
ret1 = 0
rbuf =
ret2 = 4
rbuf = F0 A1 88 B4

Result on AIX 5.1:
ret = -1, wbuf[0] = fffe

Result on AIX 5.2, 5.3, 6.1: wchar_t[] is UCS-2
ret = -1, wbuf[0] = fffd

*/

#include <locale.h>
#include <stdlib.h>
#include <wchar.h>
#include <stdio.h>
#include <errno.h>
#include <windows.h>

  // Valid locales for setlocale:
  // German_Germany.1252
  // Chinese_Taiwan
  // Chinese_China
  // etc.
  // but none supports an encoding that goes further than the BMP
  // (at least not in Windows XP).

// So use the Win32 functions instead of the C89 functions.

// Windows code page identifier that this file passes as the first argument
// of MultiByteToWideChar and WideCharToMultiByte, and tests in #if.
#define codepage 65001 // UTF-8
//#define codepage 54936 // GB18030 // unsupported in Windows XP,
// despite what <http://msdn.microsoft.com/en-us/library/dd317756.aspx> says

#define mbrtowc my_mbrtowc
static size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  unsigned char c;
  size_t i;
  if (n == 0)
    return (size_t)-2;
  /* MultiByteToWideChar fails with ERROR_NO_UNICODE_TRANSLATION if the input
     is too short. We need to distinguish this case from an invalid input sequence. */
  c = (unsigned char) s[0];
  if (c < 0xc0)
    i = 1;
  else if (c < 0xe0)
    i = 2;
  else if (c < 0xf0)
    i = 3;
  else if (c < 0xf8)
    i = 4;
  else
    { errno = EILSEQ; return (size_t)-1; }
  if (MultiByteToWideChar (codepage, MB_ERR_INVALID_CHARS /* | MB_PRECOMPOSED does not work */, s, i, pwc, pwc != NULL))
    return i;
  switch (GetLastError ())
    {
    case ERROR_INSUFFICIENT_BUFFER:
      break;
    case ERROR_NO_UNICODE_TRANSLATION:
      errno = EILSEQ; return (size_t)-1;
    case ERROR_INVALID_FLAGS:
    case ERROR_INVALID_PARAMETER:
    default:
      fprintf (stderr, "last error: %x\n", GetLastError ());
      fflush (stderr);
      abort ();
    }
  return (size_t)-2;
}

#define wcrtomb my_wcrtomb
static size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
{
  BOOL invalid_conversion;
  int bytes;
  if (s == NULL)
    {
      static char buf[6];
      s = buf;
      wc = 0;
    }
  bytes = WideCharToMultiByte (codepage, 0, &wc, 1, s, 6, NULL, codepage == 65001 ? NULL : &invalid_conversion);
  if (bytes)
    {
      if (codepage == 65001 || !invalid_conversion)
        return bytes;
      else
        {
          errno = EILSEQ;
          return (size_t)-1;
        }
    }
  else
    switch (GetLastError ())
      {
      case ERROR_NO_UNICODE_TRANSLATION:
        errno = EILSEQ;
        return (size_t)-1;
      case ERROR_INVALID_PARAMETER:
        errno = EINVAL;
        return (size_t)-1;
      case ERROR_INVALID_FLAGS:
      case ERROR_INSUFFICIENT_BUFFER:
      default:
        fprintf (stderr, "last error: 0x%x\n", GetLastError ());
        fflush (stderr);
        abort ();
      }

}

/* Exercise the mbrtowc/wcrtomb replacements defined earlier in this file and
   the underlying Win32 conversion functions on one multibyte character,
   printing every intermediate result on stdout.

   The input is converted first through mbrtowc (one wchar_t unit), then
   through MultiByteToWideChar with room for two units.  If the latter
   succeeds, the units are converted back both singly through wcrtomb and
   as a pair through WideCharToMultiByte.

   ARGC and ARGV are unused.  Always returns 0.  */
int main (int argc, char*argv[])
{
  // U+21234; in CJK Ideograph Extension B
  // in UTF-16 form: 0xD844 0xDE34
#if codepage == 65001
  const char buf[4] = { 0xF0, 0xA1, 0x88, 0xB4 }; // fails
  //const char buf[4] = { 0xE3, 0xBF, 0xA3, 0x66 }; // OK
  //const char buf[4] = { 0xC4, 0xB4, 0x55, 0x66 }; // OK
  //const char buf[4] = { 0x22, 0x33, 0x55, 0x66 }; // OK
#endif
#if codepage == 54936
  //const char buf[4] = { 0x95, 0x35, 0xDA, 0x36 };
  //const char buf[4] = { 0x82, 0x32, 0xA3, 0x38 };
  //const char buf[4] = { 0x81, 0x30, 0x90, 0x36 };
  const char buf[4] = { 0x22, 0x33, 0x55, 0x66 };
#endif
  /* Pre-filled with a marker value so that untouched slots are recognizable
     in the output.  */
  wchar_t wbuf[4] = { 0xFFFE, 0xFFFE, 0xFFFE, 0xFFFE };
  const char *inptr = buf;
  char rbuf[6];
  size_t ret, ret1, ret2;
  BOOL invalid_conversion;
  size_t i;

  /* Test mbrtowc */
  ret = mbrtowc (&wbuf[0], inptr, buf + sizeof (buf) - inptr, NULL);
  printf ("ret = %d, wbuf[0] = %x\n", (int) ret, (unsigned int) wbuf[0]);

  /* Test MultiByteToWideChar, converting 2 wchar_t units at once. */
  if (MultiByteToWideChar (codepage, MB_ERR_INVALID_CHARS /* | MB_PRECOMPOSED does not work */, inptr, (int) (buf + sizeof (buf) - inptr), wbuf, 2))
    {
      printf ("wbuf[0] = %x, wbuf[1] = %x\n", (unsigned int) wbuf[0], (unsigned int) wbuf[1]);

      /* Test wcrtomb */
      ret1 = wcrtomb (rbuf, wbuf[0], NULL);
      printf ("ret1 = %d\n", (int) ret1);
      /* ret1 is a size_t: compare it as an int so that the error return
         (size_t)-1 is not taken for a byte count, which would make the loop
         below read far beyond rbuf.  */
      if ((int) ret1 > 0)
        {
          printf ("rbuf =");
          for (i = 0; i < ret1; i++)
            printf (" %02X", (unsigned char) rbuf[i]);
          printf ("\n");
        }

      /* Test WideCharToMultiByte, converting 2 wchar_t units at once. */
      ret2 = WideCharToMultiByte (codepage, 0, wbuf, 2, rbuf, sizeof (rbuf), NULL, codepage == 65001 ? NULL : &invalid_conversion);
      /* ret2 is a size_t; convert it to match the %d directive.  */
      printf ("ret2 = %d\n", (int) ret2);
      if (ret2 > 0)
        {
          printf ("rbuf =");
          for (i = 0; i < ret2; i++)
            printf (" %02X", (unsigned char) rbuf[i]);
          printf ("\n");
        }
    }

  return 0;
}

/*
Result on Windows XP:
ret = -1, wbuf[0] = 0
wbuf[0] = d844, wbuf[1] = de34
ret1 = 3
rbuf = ED A1 84
ret2 = 4
rbuf = F0 A1 88 B4

*/

16-bit wchar_t on Windows and Cygwin

Reply via email to