On 12/15/19 4:43 AM, arn...@skeeve.com wrote:
> On the assumption that setlocale is the only blocker, I would rather
> see an additional `char *locale_name' parameter added to dfa_syntax.

Thanks, this is a good suggestion. Running with it, we can improve it further by
putting this new flag into struct localeinfo (so no need for a new dfasyntax
parameter), and also I think I can initialize it without using setlocale (so no
need to worry about setlocale's lack of thread-safety). I installed the attached
patches and they work with 'grep'. This way, we shouldn't need to pull in any
more Gnulib code into Gawk.

I plan to take a look at the more serious crashes soon.
>From 74324f05db859cb125fe7ec2f33b80a6cbd40697 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 16 Dec 2019 00:27:15 -0800
Subject: [PATCH 1/2] localeinfo: record whether locale is simple
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* lib/localeinfo.c (using_simple_locale): New function,
copied here from lib/dfa.c but with a change: it uses
strcoll for its heuristic, instead of using setlocale.
This lets it be thread-safe.
* lib/localeinfo.h (struct localeinfo): New member ‘simple’.
---
 ChangeLog        |  9 +++++++++
 lib/localeinfo.c | 44 +++++++++++++++++++++++++++++++++++++++++---
 lib/localeinfo.h |  6 ++++++
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4f46f01a4..e59323a3e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2019-12-16  Paul Eggert  <egg...@cs.ucla.edu>
+
+	localeinfo: record whether locale is simple
+	* lib/localeinfo.c (using_simple_locale): New function,
+	copied here from lib/dfa.c but with a change: it uses
+	strcoll for its heuristic, instead of using setlocale.
+	This lets it be thread-safe.
+	* lib/localeinfo.h (struct localeinfo): New member ‘simple’.
+
 2019-12-15  Bruno Haible  <br...@clisp.org>
 
 	duplocale: Fix multithread-safety bug on AIX.
diff --git a/lib/localeinfo.c b/lib/localeinfo.c
index 65b6c5e6d..372530e01 100644
--- a/lib/localeinfo.c
+++ b/lib/localeinfo.c
@@ -44,17 +44,55 @@ is_using_utf8 (void)
   return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
 }
 
+/* Return true if the locale is compatible enough with the C locale so
+   that the locale is single-byte, bytes are in collating-sequence
+   order, and there are no multi-character collating elements.  */
+
+static bool
+using_simple_locale (bool multibyte)
+{
+  /* The native character set is known to be compatible with
+     the C locale.  The following test isn't perfect, but it's good
+     enough in practice, as only ASCII and EBCDIC are in common use
+     and this test correctly accepts ASCII and rejects EBCDIC.  */
+  enum { native_c_charset =
+    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+     && '}' == 125 && '~' == 126)
+  };
+
+  if (!native_c_charset || multibyte)
+    return false;
+
+  /* As a heuristic, use strcoll to compare native character order.
+     If this agrees with byte order the locale should be simple.
+     This heuristic should work for all known practical locales,
+     although it would be invalid for artificially-constructed locales
+     where the native order is the collating-sequence order but there
+     are multi-character collating elements.  */
+  for (int i = 0; i < UCHAR_MAX; i++)
+    if (strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})) <= 0)
+      return false;
+
+  return true;
+}
+
 /* Initialize *LOCALEINFO from the current locale.  */
 
 void
 init_localeinfo (struct localeinfo *localeinfo)
 {
-  int i;
-
   localeinfo->multibyte = MB_CUR_MAX > 1;
+  localeinfo->simple = using_simple_locale (localeinfo->multibyte);
   localeinfo->using_utf8 = is_using_utf8 ();
 
-  for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+  for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
     {
       char c = i;
       unsigned char uc = i;
diff --git a/lib/localeinfo.h b/lib/localeinfo.h
index a5140164f..c827a2bfd 100644
--- a/lib/localeinfo.h
+++ b/lib/localeinfo.h
@@ -28,6 +28,12 @@ struct localeinfo
   /* MB_CUR_MAX > 1.  */
   bool multibyte;
 
+  /* The locale is simple, like the C locale.  These locales can be
+     processed more efficiently, as they are single-byte, their native
+     character set is in collating-sequence order, and they do not
+     have multi-character collating elements.  */
+  bool simple;
+
   /* The locale uses UTF-8.  */
   bool using_utf8;
 
-- 
2.17.1

>From ecb700c1c897e01f1c97f2f23c12ecb0d3658eba Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 16 Dec 2019 00:32:01 -0800
Subject: [PATCH 2/2] dfa: make dfasyntax thread-safe

Problem reported by Bruno Haible in:
https://lists.gnu.org/r/bug-gnulib/2019-12/msg00099.html
* lib/dfa.c: Do not include locale.h.
(struct dfa): Remove simple_locale member.
All uses replaced by localeinfo.simple.
(using_simple_locale): Remove; now present (with some
changes) in localeinfo.c.
(dfasyntax): No need to initialize removed member.
---
 ChangeLog | 10 ++++++++++
 lib/dfa.c | 46 +++-------------------------------------------
 2 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e59323a3e..8766995f6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
 2019-12-16  Paul Eggert  <egg...@cs.ucla.edu>
 
+	dfa: make dfasyntax thread-safe
+	Problem reported by Bruno Haible in:
+	https://lists.gnu.org/r/bug-gnulib/2019-12/msg00099.html
+	* lib/dfa.c: Do not include locale.h.
+	(struct dfa): Remove simple_locale member.
+	All uses replaced by localeinfo.simple.
+	(using_simple_locale): Remove; now present (with some
+	changes) in localeinfo.c.
+	(dfasyntax): No need to initialize removed member.
+
 	localeinfo: record whether locale is simple
 	* lib/localeinfo.c (using_simple_locale): New function,
 	copied here from lib/dfa.c but with a change: it uses
diff --git a/lib/dfa.c b/lib/dfa.c
index 8c88c9d36..6b89613a9 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -31,7 +31,6 @@
 #include <stdlib.h>
 #include <limits.h>
 #include <string.h>
-#include <locale.h>
 
 /* Another name for ptrdiff_t, for sizes of objects and nonnegative
    indexes into objects.  It is signed to help catch integer overflow.
@@ -573,12 +572,6 @@ struct dfa
   char *(*dfaexec) (struct dfa *, char const *, char *,
                     bool, ptrdiff_t *, bool *);
 
-  /* The locale is simple, like the C locale.  These locales can be
-     processed more efficiently, as they are single-byte, their native
-     character set is in collating-sequence order, and they do not
-     have multi-character collating elements.  */
-  bool simple_locale;
-
   /* Other cached information derived from the locale.  */
   struct localeinfo localeinfo;
 };
@@ -917,38 +910,6 @@ setbit_case_fold_c (int b, charclass *c)
       setbit (i, c);
 }
 
-/* Return true if the locale compatible with the C locale.  */
-
-static bool
-using_simple_locale (bool multibyte)
-{
-  /* The native character set is known to be compatible with
-     the C locale.  The following test isn't perfect, but it's good
-     enough in practice, as only ASCII and EBCDIC are in common use
-     and this test correctly accepts ASCII and rejects EBCDIC.  */
-  enum { native_c_charset =
-    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
-     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
-     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
-     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
-     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
-     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
-     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
-     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
-     && '}' == 125 && '~' == 126)
-  };
-
-  if (!native_c_charset || multibyte)
-    return false;
-  else
-    {
-      /* Treat C and POSIX locales as being compatible.  Also, treat
-         errors as compatible, as these are invariably from stubs.  */
-      char const *loc = setlocale (LC_ALL, NULL);
-      return !loc || streq (loc, "C") || streq (loc, "POSIX");
-    }
-}
-
 /* Fetch the next lexical input character from the pattern.  There
    must at least one byte of pattern input.  Set DFA->lex.wctok to the
    value of the character or to WEOF depending on whether the input is
@@ -1039,7 +1000,7 @@ parse_bracket_exp (struct dfa *dfa)
   if (invert)
     {
       c = bracket_fetch_wc (dfa);
-      known_bracket_exp = dfa->simple_locale;
+      known_bracket_exp = dfa->localeinfo.simple;
     }
   wint_t wc = dfa->lex.wctok;
   int c1;
@@ -1169,7 +1130,7 @@ parse_bracket_exp (struct dfa *dfa)
               /* Treat [x-y] as a range if x != y.  */
               if (wc != wc2 || wc == WEOF)
                 {
-                  if (dfa->simple_locale
+                  if (dfa->localeinfo.simple
                       || (isasciidigit (c) & isasciidigit (c2)))
                     {
                       for (int ci = c; ci <= c2; ci++)
@@ -3348,7 +3309,7 @@ skip_remains_mb (struct dfa *d, unsigned char const *p,
     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
     - back-reference: (.)\1
     - word-delimiter in multibyte locale: \<, \>, \b, \B
-   See using_simple_locale for the definition of "simple locale".  */
+   See struct localeinfo.simple for the definition of "simple locale".  */
 
 static inline char *
 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
@@ -4311,7 +4272,6 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
 {
   memset (dfa, 0, offsetof (struct dfa, dfaexec));
   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
-  dfa->simple_locale = using_simple_locale (linfo->multibyte);
   dfa->localeinfo = *linfo;
 
   dfa->fast = !dfa->localeinfo.multibyte;
-- 
2.17.1

Reply via email to