Pádraig Brady wrote: > I'm going to apply my whar-single module to gnulib to tweak > it so the main bottleneck of calling locale_charset repeatedly > is removed from wcwidth() and mbrtowc(), in a simple manner, > without the need for another API.
This patch has two problems: 1) It introduces a test suite failure: A gnulib testdir ./gnulib-tool --create-testdir --dir=... --single-configure mbrtowc wcwidth wchar-single fails the test-wcwidth test (test-wcwidth.c:59) on Mac OS X. 2) It introduces a multithread-safety bug: You added 'static' variables also to the normal (non-singlethreaded) code path. Now, when you have two threads which operate in different locales (via uselocale()) and 1. thread1 does encoding = locale_charset (); 2. thread2 does encoding = locale_charset (); 3. thread1 does utf8_charset = STREQ_OPT (encoding, ...); you've got a bug. To eliminate such kinds of bugs, it is better to structure the code as follows: - A function that returns that value that can be cached. - A function that does the caching AND NOTHING ELSE. So that it can be #ifdefed out easily. - Code that uses the caching function. Do not add complexity here! 2018-06-24 Bruno Haible <br...@clisp.org> wchar-single: Fix test failure in wcwidth tests. * tests/test-wcwidth.c (main): If the wchar-single module is present, skip the tests in the C locale. diff --git a/tests/test-wcwidth.c b/tests/test-wcwidth.c index f0eb7ab..4aa6ab7 100644 --- a/tests/test-wcwidth.c +++ b/tests/test-wcwidth.c @@ -35,10 +35,12 @@ main () { wchar_t wc; -#ifdef C_CTYPE_ASCII +#if !GNULIB_WCHAR_SINGLE +# ifdef C_CTYPE_ASCII /* Test width of ASCII characters. */ for (wc = 0x20; wc < 0x7F; wc++) ASSERT (wcwidth (wc) == 1); +# endif #endif /* Switch to an UTF-8 locale. */ 2018-06-24 Bruno Haible <br...@clisp.org> mbrtowc, wcwidth: Fix MT-safety bug (regression from 2018-06-23). * lib/mbrtowc.c (enc_t): New enum type. (locale_enc, locale_enc_cached): New functions. (mbrtowc): Eliminate static variables. Use locale_enc_cached instead. * lib/wcwidth.c (is_locale_utf8, is_locale_utf8_cached): New functions. (wcwidth): Eliminate static variables. Use is_locale_utf8_cached instead. * m4/mbrtowc.m4 (gl_PREREQ_MBRTOWC): Require AC_C_INLINE. * m4/wcwidth.m4 (gl_PREREQ_WCWIDTH): New macro. * modules/wcwidth (configure.ac): Invoke it. diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c index dc3597f..2f6df28 100644 --- a/lib/mbrtowc.c +++ b/lib/mbrtowc.c @@ -35,12 +35,60 @@ # include "streq.h" # include "verify.h" -#ifndef FALLTHROUGH -# if __GNUC__ < 7 -# define FALLTHROUGH ((void) 0) -# else -# define FALLTHROUGH __attribute__ ((__fallthrough__)) +# ifndef FALLTHROUGH +# if __GNUC__ < 7 +# define FALLTHROUGH ((void) 0) +# else +# define FALLTHROUGH __attribute__ ((__fallthrough__)) +# endif # endif + +/* Returns a classification of special values of the encoding of the current + locale. */ +typedef enum { + enc_other, /* other */ + enc_utf8, /* UTF-8 */ + enc_eucjp, /* EUC-JP */ + enc_94, /* EUC-KR, GB2312, BIG5 */ + enc_euctw, /* EUC-TW */ + enc_gb18030, /* GB18030 */ + enc_sjis /* SJIS */ +} enc_t; +static inline enc_t +locale_enc (void) +{ + const char *encoding = locale_charset (); + if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) + return enc_utf8; + if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) + return enc_eucjp; + if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) + || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) + || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) + return enc_94; + if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) + return enc_euctw; + if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) + return enc_gb18030; + if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) + return enc_sjis; + return enc_other; +} + +#if GNULIB_WCHAR_SINGLE +/* When we know that the locale does not change, provide a speedup by + caching the value of locale_enc. */ +static int cached_locale_enc = -1; +static inline enc_t +locale_enc_cached (void) +{ + if (cached_locale_enc < 0) + cached_locale_enc = locale_enc (); + return cached_locale_enc; +} +#else +/* By default, don't make assumptions, hence no caching. */ +# define locale_enc_cached locale_enc #endif verify (sizeof (mbstate_t) >= 4); @@ -137,20 +185,9 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) if (m >= 4 || m >= MB_CUR_MAX) goto invalid; /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ - { - static int utf8_charset = -1; - static const char *encoding; - -# if GNULIB_WCHAR_SINGLE - if (utf8_charset == -1) -# endif - { - encoding = locale_charset (); - utf8_charset = STREQ_OPT (encoding, - "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0); - } - - if (utf8_charset) + switch (locale_enc_cached ()) + { + case enc_utf8: /* UTF-8 */ { /* Cf. unistr/u8-mblen.c. */ unsigned char c = (unsigned char) p[0]; @@ -207,8 +244,7 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) /* As a reference for this code, you can use the GNU libiconv implementation. Look for uses of the RET_TOOFEW macro. */ - if (STREQ_OPT (encoding, - "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) + case enc_eucjp: /* EUC-JP */ { if (m == 1) { @@ -231,12 +267,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) } goto invalid; } - if (STREQ_OPT (encoding, - "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) - || STREQ_OPT (encoding, - "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) - || STREQ_OPT (encoding, - "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) + + case enc_94: /* EUC-KR, GB2312, BIG5 */ { if (m == 1) { @@ -247,8 +279,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) } goto invalid; } - if (STREQ_OPT (encoding, - "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) + + case enc_euctw: /* EUC-TW */ { if (m == 1) { @@ -266,8 +298,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) } goto invalid; } - if (STREQ_OPT (encoding, - "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) + + case enc_gb18030: /* GB18030 */ { if (m == 1) { @@ -300,7 +332,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) } goto invalid; } - if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) + + case enc_sjis: /* SJIS */ { if (m == 1) { @@ -313,9 +346,10 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) goto invalid; } - /* An unknown multibyte encoding. */ - goto incomplete; - } + default: + /* An unknown multibyte encoding. */ + goto incomplete; + } incomplete: { diff --git a/lib/wcwidth.c b/lib/wcwidth.c index d03a50f..d33b6a9 100644 --- a/lib/wcwidth.c +++ b/lib/wcwidth.c @@ -26,28 +26,40 @@ #include "streq.h" #include "uniwidth.h" -int -wcwidth (wchar_t wc) -#undef wcwidth +/* Returns 1 if the current locale is an UTF-8 locale, 0 otherwise. */ +static inline int +is_locale_utf8 (void) { - static int utf8_charset = -1; - static const char *encoding; + const char *encoding = locale_charset (); + return STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0); +} #if GNULIB_WCHAR_SINGLE - if (utf8_charset == -1) +/* When we know that the locale does not change, provide a speedup by + caching the value of is_locale_utf8. */ +static int cached_is_locale_utf8 = -1; +static inline int +is_locale_utf8_cached (void) +{ + if (cached_is_locale_utf8 < 0) + cached_is_locale_utf8 = is_locale_utf8 (); + return cached_is_locale_utf8; +} +#else +/* By default, don't make assumptions, hence no caching. */ +# define is_locale_utf8_cached is_locale_utf8 #endif - { - encoding = locale_charset (); - utf8_charset = STREQ_OPT (encoding, - "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0); - } +int +wcwidth (wchar_t wc) +#undef wcwidth +{ /* In UTF-8 locales, use a Unicode aware width function. */ - if (utf8_charset) + if (is_locale_utf8_cached ()) { /* We assume that in a UTF-8 locale, a wide character is the same as a Unicode character. */ - return uc_width (wc, encoding); + return uc_width (wc, "UTF-8"); } else { diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4 index f789875..c706d04 100644 --- a/m4/mbrtowc.m4 +++ b/m4/mbrtowc.m4 @@ -1,4 +1,4 @@ -# mbrtowc.m4 serial 30 -*- coding: utf-8 -*- +# mbrtowc.m4 serial 31 -*- coding: utf-8 -*- dnl Copyright (C) 2001-2002, 2004-2005, 2008-2018 Free Software Foundation, dnl Inc. dnl This file is free software; the Free Software Foundation @@ -634,6 +634,7 @@ AC_DEFUN([gl_MBRTOWC_C_LOCALE], # Prerequisites of lib/mbrtowc.c. AC_DEFUN([gl_PREREQ_MBRTOWC], [ + AC_REQUIRE([AC_C_INLINE]) : ]) diff --git a/m4/wcwidth.m4 b/m4/wcwidth.m4 index 0605ce8..1cd489b 100644 --- a/m4/wcwidth.m4 +++ b/m4/wcwidth.m4 @@ -1,4 +1,4 @@ -# wcwidth.m4 serial 26 +# wcwidth.m4 serial 27 dnl Copyright (C) 2006-2018 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -115,3 +115,9 @@ changequote([,])dnl dnl We don't substitute HAVE_WCWIDTH. We assume that if the system does not dnl have the wcwidth function, then it does not declare it. ]) + +# Prerequisites of lib/wcwidth.c. +AC_DEFUN([gl_PREREQ_WCWIDTH], [ + AC_REQUIRE([AC_C_INLINE]) + : +]) diff --git a/modules/wcwidth b/modules/wcwidth index 5a27713..372c210 100644 --- a/modules/wcwidth +++ b/modules/wcwidth @@ -19,6 +19,7 @@ configure.ac: gl_FUNC_WCWIDTH if test $HAVE_WCWIDTH = 0 || test $REPLACE_WCWIDTH = 1; then AC_LIBOBJ([wcwidth]) + gl_PREREQ_WCWIDTH fi gl_WCHAR_MODULE_INDICATOR([wcwidth])