The 'mbrtowc' module emulation, based on the system's mbtowc(), is more interesting:
- sizeof(mbstate_t) has to be >= 4 for UTF-8, but on IRIX 6.5 mbstate_t is defined as 'char'. (Thanks to Paul for the idea to verify this assumption :-)). So we have to override this type. - mbtowc() does not report to the caller a distinction between incomplete and invalid multibyte sequences. But mbrtowc() has to do so. How to do so is tricky. - What should the return value of mbrtowc(&wc, s, 0) be? According to the spec, I would think (size_t)(-2). But glibc returns 0. Hmm... - glibc's mbtowc() implementation for UTF-8 is buggy. 2008-12-18 Bruno Haible <br...@clisp.org> New module 'mbrtowc'. * lib/wchar.in.h (mbstate_t): Override when the system does not have mbsinit and mbrtowc. (mbrtowc): New declaration. * lib/mbrtowc.c: New file. * m4/mbrtowc.m4 (gl_FUNC_MBRTOWC, gl_PREREQ_MBRTOWC): New macros. * modules/mbrtowc: New file. * m4/wchar.m4 (gl_WCHAR_H_DEFAULTS): Initialize GNULIB_MBRTOWC and HAVE_MBRTOWC. * modules/wchar (Makefile.am): Substitute GNULIB_MBRTOWC and HAVE_MBRTOWC. * doc/posix-functions/mbrtowc.texi: Document the new module. ================================ lib/mbrtowc.c ================================ /* Convert multibyte character to wide character. Copyright (C) 1999-2002, 2005-2008 Free Software Foundation, Inc. Written by Bruno Haible <br...@clisp.org>, 2008. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <config.h> /* Specification. 
*/ #include <wchar.h> #include <errno.h> #include <stdlib.h> #include "localcharset.h" #include "streq.h" static char internal_state[4]; size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) { char *pstate = (char *)ps; if (pstate == NULL) pstate = internal_state; if (s == NULL) { pwc = NULL; s = ""; n = 1; } if (n == 0) return (size_t)(-2); /* Here n > 0. */ { size_t nstate = pstate[0]; char buf[4]; const char *p; size_t m; switch (nstate) { case 0: p = s; m = n; break; case 3: buf[2] = pstate[3]; /*FALLTHROUGH*/ case 2: buf[1] = pstate[2]; /*FALLTHROUGH*/ case 1: buf[0] = pstate[1]; p = buf; m = nstate; buf[m++] = s[0]; if (n >= 2 && m < 4) { buf[m++] = s[1]; if (n >= 3 && m < 4) buf[m++] = s[2]; } break; default: errno = EINVAL; return (size_t)(-1); } /* Here 0 < m ≤ 4. */ #if __GLIBC__ /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */ mbtowc (NULL, NULL, 0); #endif { int res = mbtowc (pwc, p, m); if (res >= 0) { if (pwc != NULL && ((*pwc == 0) != (res == 0))) abort (); if (nstate >= (res > 0 ? res : 1)) abort (); res -= nstate; pstate[0] = 0; return res; } /* mbtowc does not distinguish between invalid and incomplete multibyte sequences. But mbrtowc needs to make this distinction. There are two possible approaches: - Use iconv() and its return value. - Use built-in knowledge about the possible encodings. Given the low quality of implementation of iconv() on the systems that lack mbrtowc(), we use the second approach. The possible encodings are: - 8-bit encodings, - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, SJIS, - UTF-8. Use specialized code for each. */ if (m >= 4 || m >= MB_CUR_MAX) goto invalid; /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ { const char *encoding = locale_charset (); if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) { /* Cf. unistr/u8-mblen.c. 
*/ unsigned char c = (unsigned char) p[0]; if (c >= 0xc2) { if (c < 0xe0) { if (m == 1) goto incomplete; } else if (c < 0xf0) { if (m == 1) goto incomplete; if (m == 2) { unsigned char c2 = (unsigned char) p[1]; if ((c2 ^ 0x80) < 0x40 && (c >= 0xe1 || c2 >= 0xa0) && (c != 0xed || c2 < 0xa0)) goto incomplete; } } else if (c <= 0xf4) { if (m == 1) goto incomplete; else /* m == 2 || m == 3 */ { unsigned char c2 = (unsigned char) p[1]; if ((c2 ^ 0x80) < 0x40 && (c >= 0xf1 || c2 >= 0x90) && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) { if (m == 2) goto incomplete; else /* m == 3 */ { unsigned char c3 = (unsigned char) p[2]; if ((c3 ^ 0x80) < 0x40) goto incomplete; } } } } } goto invalid; } /* As a reference for this code, you can use the GNU libiconv implementation. Look for uses of the RET_TOOFEW macro. */ if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) { if (m == 1) { unsigned char c = (unsigned char) p[0]; if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) goto incomplete; } if (m == 2) { unsigned char c = (unsigned char) p[0]; if (c == 0x8f) { unsigned char c2 = (unsigned char) p[1]; if (c2 >= 0xa1 && c2 < 0xff) goto incomplete; } } goto invalid; } if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) { if (m == 1) { unsigned char c = (unsigned char) p[0]; if (c >= 0xa1 && c < 0xff) goto incomplete; } goto invalid; } if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) { if (m == 1) { unsigned char c = (unsigned char) p[0]; if ((c >= 0xa1 && c < 0xff) || c == 0x8e) goto incomplete; } else /* m == 2 || m == 3 */ { unsigned char c = (unsigned char) p[0]; if (c == 0x8e) goto incomplete; } goto invalid; } if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) { if (m == 1) { unsigned char c = (unsigned char) p[0]; if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) || 
(c >= 0xf0 && c <= 0xf9)) goto incomplete; } goto invalid; } /* An unknown multibyte encoding. */ goto incomplete; } incomplete: { size_t k = nstate; /* Here 0 < k < m < 4. */ pstate[++k] = s[0]; if (k < m) pstate[++k] = s[1]; if (k != m) abort (); } pstate[0] = m; return (size_t)(-2); invalid: errno = EILSEQ; /* The conversion state is undefined, says POSIX. */ return (size_t)(-1); } } } =============================== modules/mbrtowc =============================== Description: mbrtowc() function: convert multibyte character to wide character. Files: lib/mbrtowc.c m4/mbrtowc.m4 m4/mbstate_t.m4 Depends-on: wchar localcharset streq configure.ac: gl_FUNC_MBRTOWC gl_WCHAR_MODULE_INDICATOR([mbrtowc]) Makefile.am: Include: <wchar.h> License: LGPL Maintainer: Bruno Haible =============================================================================== --- lib/wchar.in.h.orig 2008-12-19 03:15:03.000000000 +0100 +++ lib/wchar.in.h 2008-12-19 03:13:37.000000000 +0100 @@ -71,6 +71,16 @@ #endif +/* Override mbstate_t if it is too small. + On IRIX 6.5, sizeof (mbstate_t) == 1, which is not sufficient for + implementing mbrtowc for encodings like UTF-8. */ +#if !(@HAVE_MBSINIT@ && @HAVE_MBRTOWC@) +typedef int rpl_mbstate_t; +# undef mbstate_t +# define mbstate_t rpl_mbstate_t +#endif + + /* Convert a single-byte character to a wide character. */ #if @GNULIB_BTOWC@ # if !...@have_btowc@ @@ -113,6 +123,20 @@ #endif +/* Convert a multibyte character to a wide character. */ +#if @GNULIB_MBRTOWC@ +# if !...@have_mbrtowc@ +extern size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps); +# endif +#elif defined GNULIB_POSIXCHECK +# undef mbrtowc +# define mbrtowc(w,s,n,p) \ + (GL_LINK_WARNING ("mbrtowc is unportable - " \ + "use gnulib module mbrtowc for portability"), \ + mbrtowc (w, s, n, p)) +#endif + + /* Return the number of screen columns needed for WC. 
*/ #if @GNULIB_WCWIDTH@ # if @REPLACE_WCWIDTH@ --- m4/mbrtowc.m4.orig 2008-12-19 03:15:03.000000000 +0100 +++ m4/mbrtowc.m4 2008-12-19 03:13:37.000000000 +0100 @@ -1,9 +1,29 @@ -# mbrtowc.m4 serial 9 +# mbrtowc.m4 serial 10 dnl Copyright (C) 2001-2002, 2004-2005, 2008 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, dnl with or without modifications, as long as this notice is preserved. +AC_DEFUN([gl_FUNC_MBRTOWC], +[ + AC_REQUIRE([gl_WCHAR_H_DEFAULTS]) + + AC_REQUIRE([AC_TYPE_MBSTATE_T]) + AC_CHECK_FUNCS_ONCE([mbrtowc]) + if test $ac_cv_func_mbrtowc = no; then + HAVE_MBRTOWC=0 + gl_REPLACE_WCHAR_H + AC_LIBOBJ([mbrtowc]) + gl_PREREQ_MBRTOWC + fi +]) + +# Prerequisites of lib/mbrtowc.c. +AC_DEFUN([gl_PREREQ_MBRTOWC], [ + : +]) + + dnl From Paul Eggert dnl This override of an autoconf macro can be removed when autoconf 2.60 or --- m4/wchar.m4.orig 2008-12-19 03:15:03.000000000 +0100 +++ m4/wchar.m4 2008-12-19 03:13:37.000000000 +0100 @@ -64,11 +64,13 @@ GNULIB_BTOWC=0; AC_SUBST([GNULIB_BTOWC]) GNULIB_WCTOB=0; AC_SUBST([GNULIB_WCTOB]) GNULIB_MBSINIT=0; AC_SUBST([GNULIB_MBSINIT]) + GNULIB_MBRTOWC=0; AC_SUBST([GNULIB_MBRTOWC]) GNULIB_WCWIDTH=0; AC_SUBST([GNULIB_WCWIDTH]) dnl Assume proper GNU behavior unless another module says otherwise. 
HAVE_BTOWC=1; AC_SUBST([HAVE_BTOWC]) HAVE_WCTOB=1; AC_SUBST([HAVE_WCTOB]) HAVE_MBSINIT=1; AC_SUBST([HAVE_MBSINIT]) + HAVE_MBRTOWC=1; AC_SUBST([HAVE_MBRTOWC]) HAVE_DECL_WCWIDTH=1; AC_SUBST([HAVE_DECL_WCWIDTH]) REPLACE_WCWIDTH=0; AC_SUBST([REPLACE_WCWIDTH]) WCHAR_H=''; AC_SUBST([WCHAR_H]) --- modules/wchar.orig 2008-12-19 03:15:03.000000000 +0100 +++ modules/wchar 2008-12-19 03:13:37.000000000 +0100 @@ -28,11 +28,13 @@ -e 's|@''GNULIB_BTOWC''@|$(GNULIB_BTOWC)|g' \ -e 's|@''GNULIB_WCTOB''@|$(GNULIB_WCTOB)|g' \ -e 's|@''GNULIB_MBSINIT''@|$(GNULIB_MBSINIT)|g' \ + -e 's|@''GNULIB_MBRTOWC''@|$(GNULIB_MBRTOWC)|g' \ -e 's|@''GNULIB_WCWIDTH''@|$(GNULIB_WCWIDTH)|g' \ -e 's|@''HAVE_WINT_T''@|$(HAVE_WINT_T)|g' \ -e 's|@''HAVE_BTOWC''@|$(HAVE_BTOWC)|g' \ -e 's|@''HAVE_WCTOB''@|$(HAVE_WCTOB)|g' \ -e 's|@''HAVE_MBSINIT''@|$(HAVE_MBSINIT)|g' \ + -e 's|@''HAVE_MBRTOWC''@|$(HAVE_MBRTOWC)|g' \ -e 's|@''HAVE_DECL_WCWIDTH''@|$(HAVE_DECL_WCWIDTH)|g' \ -e 's|@''REPLACE_WCWIDTH''@|$(REPLACE_WCWIDTH)|g' \ -e '/definition of GL_LINK_WARNING/r $(LINK_WARNING_H)' \ --- doc/posix-functions/mbrtowc.texi.orig 2008-12-19 03:15:03.000000000 +0100 +++ doc/posix-functions/mbrtowc.texi 2008-12-19 03:13:37.000000000 +0100 @@ -4,18 +4,18 @@ POSIX specification: @url{http://www.opengroup.org/onlinepubs/9699919799/functions/mbrtowc.html} -Gnulib module: --- +Gnulib module: mbrtowc Portability problems fixed by Gnulib: @itemize +@item +This function is missing on some platforms: +HP-UX 11, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5. @end itemize Portability problems not fixed by Gnulib: @itemize @item -This function is missing on some platforms: -HP-UX 11, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5. -@item On Windows platforms, @code{wchar_t} is a 16-bit type and therefore cannot accommodate all Unicode characters. @end itemize