Re: speeding up `wc -m`

Bruno Haible Sun, 24 Jun 2018 07:25:36 -0700

Pádraig Brady wrote:
> I'm going to apply my whar-single module to gnulib to tweak
> it so the main bottleneck of calling locale_charset repeatedly
> is removed from wcwidth() and mbrtowc(), in a simple manner,
> without the need for another API.


This patch has two problems:

1) It introduces a test suite failure: A gnulib testdir
   ./gnulib-tool --create-testdir --dir=... --single-configure mbrtowc wcwidth 
wchar-single
   fails the test-wcwidth test (test-wcwidth.c:59) on Mac OS X.

2) It introduces a multithread-safety bug: You added 'static' variables
   also to the normal (non-singlethreaded) code path. Now, when you have
   two threads which operate in different locales (via uselocale()) and
     1. thread1 does encoding = locale_charset ();
     2. thread2 does encoding = locale_charset ();
     3. thread1 does utf8_charset = STREQ_OPT (encoding, ...);
   you've got a bug.
   To eliminate such kinds of bugs, it is better to structure the code as 
follows:
     - A function that returns that value that can be cached.
     - A function that does the caching AND NOTHING ELSE. So that it can be 
#ifdefed
       out easily.
     - Code that uses the caching function. Do not add complexity here!


2018-06-24  Bruno Haible  <br...@clisp.org>

        wchar-single: Fix test failure in wcwidth tests.
        * tests/test-wcwidth.c (main): If the wchar-single module is present,
        skip the tests in the C locale.

diff --git a/tests/test-wcwidth.c b/tests/test-wcwidth.c
index f0eb7ab..4aa6ab7 100644
--- a/tests/test-wcwidth.c
+++ b/tests/test-wcwidth.c
@@ -35,10 +35,12 @@ main ()
 {
   wchar_t wc;
 
-#ifdef C_CTYPE_ASCII
+#if !GNULIB_WCHAR_SINGLE
+# ifdef C_CTYPE_ASCII
   /* Test width of ASCII characters.  */
   for (wc = 0x20; wc < 0x7F; wc++)
     ASSERT (wcwidth (wc) == 1);
+# endif
 #endif
 
   /* Switch to an UTF-8 locale.  */


2018-06-24  Bruno Haible  <br...@clisp.org>

        mbrtowc, wcwidth: Fix MT-safety bug (regression from 2018-06-23).
        * lib/mbrtowc.c (enc_t): New enum type.
        (locale_enc, locale_enc_cached): New functions.
        (mbrtowc): Eliminate static variables. Use locale_enc_cached instead.
        * lib/wcwidth.c (is_locale_utf8, is_locale_utf8_cached): New functions.
        (wcwidth): Eliminate static variables. Use is_locale_utf8_cached
        instead.
        * m4/mbrtowc.m4 (gl_PREREQ_MBRTOWC): Require AC_C_INLINE.
        * m4/wcwidth.m4 (gl_PREREQ_WCWIDTH): New macro.
        * modules/wcwidth (configure.ac): Invoke it.

diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c
index dc3597f..2f6df28 100644
--- a/lib/mbrtowc.c
+++ b/lib/mbrtowc.c
@@ -35,12 +35,60 @@
 # include "streq.h"
 # include "verify.h"
 
-#ifndef FALLTHROUGH
-# if __GNUC__ < 7
-#  define FALLTHROUGH ((void) 0)
-# else
-#  define FALLTHROUGH __attribute__ ((__fallthrough__))
+# ifndef FALLTHROUGH
+#  if __GNUC__ < 7
+#   define FALLTHROUGH ((void) 0)
+#  else
+#   define FALLTHROUGH __attribute__ ((__fallthrough__))
+#  endif
 # endif
+
+/* Returns a classification of special values of the encoding of the current
+   locale.  */
+typedef enum {
+  enc_other,      /* other */
+  enc_utf8,       /* UTF-8 */
+  enc_eucjp,      /* EUC-JP */
+  enc_94,         /* EUC-KR, GB2312, BIG5 */
+  enc_euctw,      /* EUC-TW */
+  enc_gb18030,    /* GB18030 */
+  enc_sjis        /* SJIS */
+} enc_t;
+static inline enc_t
+locale_enc (void)
+{
+  const char *encoding = locale_charset ();
+  if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
+    return enc_utf8;
+  if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
+    return enc_eucjp;
+  if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
+      || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
+      || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
+    return enc_94;
+  if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
+    return enc_euctw;
+  if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
+    return enc_gb18030;
+  if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
+    return enc_sjis;
+  return enc_other;
+}
+
+#if GNULIB_WCHAR_SINGLE
+/* When we know that the locale does not change, provide a speedup by
+   caching the value of locale_enc.  */
+static int cached_locale_enc = -1;
+static inline enc_t
+locale_enc_cached (void)
+{
+  if (cached_locale_enc < 0)
+    cached_locale_enc = locale_enc ();
+  return cached_locale_enc;
+}
+#else
+/* By default, don't make assumptions, hence no caching.  */
+# define locale_enc_cached locale_enc
 #endif
 
 verify (sizeof (mbstate_t) >= 4);
@@ -137,20 +185,9 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t 
*ps)
       if (m >= 4 || m >= MB_CUR_MAX)
         goto invalid;
       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
-      {
-        static int utf8_charset = -1;
-        static const char *encoding;
-
-# if GNULIB_WCHAR_SINGLE
-        if (utf8_charset == -1)
-# endif
-          {
-            encoding = locale_charset ();
-            utf8_charset = STREQ_OPT (encoding,
-                                      "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 
0 ,0);
-          }
-
-        if (utf8_charset)
+      switch (locale_enc_cached ())
+        {
+        case enc_utf8: /* UTF-8 */
           {
             /* Cf. unistr/u8-mblen.c.  */
             unsigned char c = (unsigned char) p[0];
@@ -207,8 +244,7 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t 
*ps)
         /* As a reference for this code, you can use the GNU libiconv
            implementation.  Look for uses of the RET_TOOFEW macro.  */
 
-        if (STREQ_OPT (encoding,
-                       "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
+        case enc_eucjp: /* EUC-JP */
           {
             if (m == 1)
               {
@@ -231,12 +267,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t 
*ps)
               }
             goto invalid;
           }
-        if (STREQ_OPT (encoding,
-                       "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
-            || STREQ_OPT (encoding,
-                          "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
-            || STREQ_OPT (encoding,
-                          "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
+
+        case enc_94: /* EUC-KR, GB2312, BIG5 */
           {
             if (m == 1)
               {
@@ -247,8 +279,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t 
*ps)
               }
             goto invalid;
           }
-        if (STREQ_OPT (encoding,
-                       "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
+
+        case enc_euctw: /* EUC-TW */
           {
             if (m == 1)
               {
@@ -266,8 +298,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t 
*ps)
               }
             goto invalid;
           }
-        if (STREQ_OPT (encoding,
-                       "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
+
+        case enc_gb18030: /* GB18030 */
           {
             if (m == 1)
               {
@@ -300,7 +332,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t 
*ps)
               }
             goto invalid;
           }
-        if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
+
+        case enc_sjis: /* SJIS */
           {
             if (m == 1)
               {
@@ -313,9 +346,10 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t 
*ps)
             goto invalid;
           }
 
-        /* An unknown multibyte encoding.  */
-        goto incomplete;
-      }
+        default:
+          /* An unknown multibyte encoding.  */
+          goto incomplete;
+        }
 
      incomplete:
       {
diff --git a/lib/wcwidth.c b/lib/wcwidth.c
index d03a50f..d33b6a9 100644
--- a/lib/wcwidth.c
+++ b/lib/wcwidth.c
@@ -26,28 +26,40 @@
 #include "streq.h"
 #include "uniwidth.h"
 
-int
-wcwidth (wchar_t wc)
-#undef wcwidth
+/* Returns 1 if the current locale is an UTF-8 locale, 0 otherwise.  */
+static inline int
+is_locale_utf8 (void)
 {
-  static int utf8_charset = -1;
-  static const char *encoding;
+  const char *encoding = locale_charset ();
+  return STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0);
+}
 
 #if GNULIB_WCHAR_SINGLE
-  if (utf8_charset == -1)
+/* When we know that the locale does not change, provide a speedup by
+   caching the value of is_locale_utf8.  */
+static int cached_is_locale_utf8 = -1;
+static inline int
+is_locale_utf8_cached (void)
+{
+  if (cached_is_locale_utf8 < 0)
+    cached_is_locale_utf8 = is_locale_utf8 ();
+  return cached_is_locale_utf8;
+}
+#else
+/* By default, don't make assumptions, hence no caching.  */
+# define is_locale_utf8_cached is_locale_utf8
 #endif
-    {
-      encoding = locale_charset ();
-      utf8_charset = STREQ_OPT (encoding,
-                                "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0);
-    }
 
+int
+wcwidth (wchar_t wc)
+#undef wcwidth
+{
   /* In UTF-8 locales, use a Unicode aware width function.  */
-  if (utf8_charset)
+  if (is_locale_utf8_cached ())
     {
       /* We assume that in a UTF-8 locale, a wide character is the same as a
          Unicode character.  */
-      return uc_width (wc, encoding);
+      return uc_width (wc, "UTF-8");
     }
   else
     {
diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4
index f789875..c706d04 100644
--- a/m4/mbrtowc.m4
+++ b/m4/mbrtowc.m4
@@ -1,4 +1,4 @@
-# mbrtowc.m4 serial 30  -*- coding: utf-8 -*-
+# mbrtowc.m4 serial 31  -*- coding: utf-8 -*-
 dnl Copyright (C) 2001-2002, 2004-2005, 2008-2018 Free Software Foundation,
 dnl Inc.
 dnl This file is free software; the Free Software Foundation
@@ -634,6 +634,7 @@ AC_DEFUN([gl_MBRTOWC_C_LOCALE],
 
 # Prerequisites of lib/mbrtowc.c.
 AC_DEFUN([gl_PREREQ_MBRTOWC], [
+  AC_REQUIRE([AC_C_INLINE])
   :
 ])
 
diff --git a/m4/wcwidth.m4 b/m4/wcwidth.m4
index 0605ce8..1cd489b 100644
--- a/m4/wcwidth.m4
+++ b/m4/wcwidth.m4
@@ -1,4 +1,4 @@
-# wcwidth.m4 serial 26
+# wcwidth.m4 serial 27
 dnl Copyright (C) 2006-2018 Free Software Foundation, Inc.
 dnl This file is free software; the Free Software Foundation
 dnl gives unlimited permission to copy and/or distribute it,
@@ -115,3 +115,9 @@ changequote([,])dnl
   dnl We don't substitute HAVE_WCWIDTH. We assume that if the system does not
   dnl have the wcwidth function, then it does not declare it.
 ])
+
+# Prerequisites of lib/wcwidth.c.
+AC_DEFUN([gl_PREREQ_WCWIDTH], [
+  AC_REQUIRE([AC_C_INLINE])
+  :
+])
diff --git a/modules/wcwidth b/modules/wcwidth
index 5a27713..372c210 100644
--- a/modules/wcwidth
+++ b/modules/wcwidth
@@ -19,6 +19,7 @@ configure.ac:
 gl_FUNC_WCWIDTH
 if test $HAVE_WCWIDTH = 0 || test $REPLACE_WCWIDTH = 1; then
   AC_LIBOBJ([wcwidth])
+  gl_PREREQ_WCWIDTH
 fi
 gl_WCHAR_MODULE_INDICATOR([wcwidth])

Re: speeding up `wc -m`

Reply via email to