In the regex code, use the char32_t functions instead of the
wchar_t functions, so that regex stays in sync with dfa.
This should fix a bug in GNU grep reported by Dennis Clarke for
OpenBSD <https://debbugs.gnu.org/cgi/bugreport.cgi?bug=80774#47>.
A similar bug occurs in some macOS releases.
* modules/regex (Depends-on): Replace btowc, iswctype, mbrtowc,
wchar-h, wcrtomb, wctype-h, and wctype with btoc32,
c32_apply_type_test, c32_get_type_test, c32isalnum, c32rtomb,
c32tolower, c32toupper, mbrtoc32-regular, uchar-h.
* lib/regex_internal.h [!_LIBC]: Do not include <wchar.h>, <wctype.h>.
Instead, include <uchar.h> and #define wchar_t, wctype_t,
__wctype, __iswalnum, __iswctype, __towlower, __towupper, __btowc,
__mbrtowc, and __wcrtomb to their char32_t counterparts.
---
 ChangeLog            | 15 +++++++++++++++
 lib/regex_internal.h | 32 ++++++++++++++++++++------------
 modules/regex        | 20 +++++++++++++-------
 3 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 13cdd0d1fb..c39c3cc04d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2026-04-24  Paul Eggert  <[email protected]>
 
+       regex: be consistent with dfa
+       In the regex code, use the char32_t functions instead of the
+       wchar_t functions, so that regex stays in sync with dfa.
+       This should fix a bug in GNU grep reported by Dennis Clarke for
+       OpenBSD <https://debbugs.gnu.org/cgi/bugreport.cgi?bug=80774#47>.
+       A similar bug occurs in some macOS releases.
+       * modules/regex (Depends-on): Replace btowc, iswctype, mbrtowc,
+       wchar-h, wcrtomb, wctype-h, and wctype with btoc32,
+       c32_apply_type_test, c32_get_type_test, c32isalnum, c32rtomb,
+       c32tolower, c32toupper, mbrtoc32-regular, uchar-h.
+       * lib/regex_internal.h [!_LIBC]: Do not include <wchar.h>, <wctype.h>.
+       Instead, include <uchar.h> and #define wchar_t, wctype_t,
+       __wctype, __iswalnum, __iswctype, __towlower, __towupper, __btowc,
+       __mbrtowc, and __wcrtomb to their char32_t counterparts.
+
        dfa: always use char32_t not wchar_t
        This should help merges changes from Gawk, which always uses the
        char32_t API though that’s sometimes implemented with the wchar_t
diff --git a/lib/regex_internal.h b/lib/regex_internal.h
index fb6c1f373d..87ed00b87a 100644
--- a/lib/regex_internal.h
+++ b/lib/regex_internal.h
@@ -27,8 +27,6 @@
 
 #include <langinfo.h>
 #include <locale.h>
-#include <wchar.h>
-#include <wctype.h>
 #include <stdckdint.h>
 #include <stdcountof.h>
 #include <stdint.h>
@@ -121,21 +119,31 @@
 #define NEWLINE_CHAR '\n'
 #define WIDE_NEWLINE_CHAR L'\n'
 
-/* Rename to standard API for using out of glibc.  */
-#ifndef _LIBC
+/* Use Gnulib <uchar.h> when outside of glibc.  */
+#ifdef _LIBC
+# include <wchar.h>
+# include <wctype.h>
+#else
+# include <uchar.h>
 # undef __wctype
 # undef __iswalnum
 # undef __iswctype
 # undef __towlower
 # undef __towupper
-# define __wctype wctype
-# define __iswalnum iswalnum
-# define __iswctype iswctype
-# define __towlower towlower
-# define __towupper towupper
-# define __btowc btowc
-# define __mbrtowc mbrtowc
-# define __wcrtomb wcrtomb
+# undef __btowc
+# undef __mbrtowc
+# undef __wcrtomb
+# undef __regfree
+# define wchar_t char32_t
+# define wctype_t c32_type_test_t
+# define __wctype c32_get_type_test
+# define __iswalnum c32isalnum
+# define __iswctype c32_apply_type_test
+# define __towlower c32tolower
+# define __towupper c32toupper
+# define __btowc btoc32
+# define __mbrtowc mbrtoc32
+# define __wcrtomb c32rtomb
 # define __regfree regfree
 #endif /* not _LIBC */
 
diff --git a/modules/regex b/modules/regex
index a1176b8e90..9cb277618b 100644
--- a/modules/regex
+++ b/modules/regex
@@ -20,21 +20,26 @@ extensions
 ssize_t
 vararrays
 attribute               [test $ac_use_included_regex = yes]
-btowc                   [test $ac_use_included_regex = yes]
+btoc32                  [test $ac_use_included_regex = yes]
 builtin-expect          [test $ac_use_included_regex = yes]
+c32_apply_type_test     [test $ac_use_included_regex = yes]
+c32_get_type_test       [test $ac_use_included_regex = yes]
+c32isalnum              [test $ac_use_included_regex = yes]
+c32rtomb                [test $ac_use_included_regex = yes]
+c32tolower              [test $ac_use_included_regex = yes]
+c32toupper              [test $ac_use_included_regex = yes]
 gettext-h               [test $ac_use_included_regex = yes]
 glibc-internal/dynarray [test $ac_use_included_regex = yes]
 gnulib-i18n             [test $ac_use_included_regex = yes]
 intprops                [test $ac_use_included_regex = yes]
-iswctype                [test $ac_use_included_regex = yes]
 langinfo-h              [test $ac_use_included_regex = yes]
 libc-config             [test $ac_use_included_regex = yes]
 limits-h                [test $ac_use_included_regex = yes]
 lock                    [test $ac_use_included_regex = yes]
 malloc-gnu              [test $ac_use_included_regex = yes]
+mbrtoc32-regular        [test $ac_use_included_regex = yes]
 memcmp                  [test $ac_use_included_regex = yes]
 memmove                 [test $ac_use_included_regex = yes]
-mbrtowc                 [test $ac_use_included_regex = yes]
 mbsinit                 [test $ac_use_included_regex = yes]
 nl_langinfo             [test $ac_use_included_regex = yes]
 bool                    [test $ac_use_included_regex = yes]
@@ -43,10 +48,11 @@ stdcountof-h            [test $ac_use_included_regex = yes]
 stdint-h                [test $ac_use_included_regex = yes]
 strncpy                 [test $ac_use_included_regex = yes]
 verify                  [test $ac_use_included_regex = yes]
-wchar-h                 [test $ac_use_included_regex = yes]
-wcrtomb                 [test $ac_use_included_regex = yes]
-wctype-h                [test $ac_use_included_regex = yes]
-wctype                  [test $ac_use_included_regex = yes]
+uchar-h                 [test $ac_use_included_regex = yes]
+# The lonesome_lower array requires ISO C 23 semantics for char32_t.
+# But uchar-h-c23 has a global effect, therefore leave it to each package
+# to enable it.
+#uchar-h-c23            [test $ac_use_included_regex = yes]
 
 configure.ac:
 gl_REGEX
-- 
2.53.0


Reply via email to