In the regex code, use the char32_t functions instead of the wchar_t functions, so that regex stays in sync with dfa. This should fix a bug in GNU grep reported by Dennis Clarke for OpenBSD <https://debbugs.gnu.org/cgi/bugreport.cgi?bug=80774#47>. A similar bug occurs in some macOS releases. * modules/regex (Depends-on): Replace btowc, iswctype, mbrtowc, wchar-h, wcrtomb, wctype-h, and wctype with btoc32, c32_apply_type_test, c32_get_type_test, c32isalnum, c32rtomb, c32tolower, c32toupper, mbrtoc32-regular, uchar-h. * lib/regex_internal.h [!_LIBC]: Do not include <wchar.h>, <wctype.h>. Instead, include <uchar.h> and #define wchar_t, wctype_t, __wctype, __iswalnum, __iswctype, __towlower, __towupper, __btowc, __mbrtowc, and __wcrtomb to their char32_t counterparts. --- ChangeLog | 15 +++++++++++++++ lib/regex_internal.h | 32 ++++++++++++++++++++------------ modules/regex | 20 +++++++++++++------- 3 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/ChangeLog b/ChangeLog index 13cdd0d1fb..c39c3cc04d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,20 @@ 2026-04-24 Paul Eggert <[email protected]> + regex: be consistent with dfa + In the regex code, use the char32_t functions instead of the + wchar_t functions, so that regex stays in sync with dfa. + This should fix a bug in Gnu grep reported by Dennis Clarke for + OpenBSD <https://debbugs.gnu.org/cgi/bugreport.cgi?bug=80774#47>. + A similar bug occurs in some macOS releases. + * modules/regex (Depends-on): Replace btowc, iswctype, mbrtowc, + wchar-h, wcrtomb, wctype-h, and wctype with btoc32, + c32_apply_type_test, c32_get_type_test, c32isalnum, c32rtomb, + c32tolower, c32toupper, mbrtoc32-regular, uchar-h. + * lib/regex_internal.h [!_LIBC]: Do not include <wchar.h>, <wctype.h>. + Instead, include <uchar.h> and #define wchar_t, wctype_t, + __wctype, __iswalnum, __iswctype, __towlower, __towupper, __btowc, + __mbrtowc, and __wcrtomb to their char32_t counterparts. + dfa: always use char32_t not wchar_t This should help merges changes from Gawk, which always uses the char32_t API though that’s sometimes implemented with the wchar_t diff --git a/lib/regex_internal.h b/lib/regex_internal.h index fb6c1f373d..87ed00b87a 100644 --- a/lib/regex_internal.h +++ b/lib/regex_internal.h @@ -27,8 +27,6 @@ #include <langinfo.h> #include <locale.h> -#include <wchar.h> -#include <wctype.h> #include <stdckdint.h> #include <stdcountof.h> #include <stdint.h> @@ -121,21 +119,31 @@ #define NEWLINE_CHAR '\n' #define WIDE_NEWLINE_CHAR L'\n' -/* Rename to standard API for using out of glibc. */ -#ifndef _LIBC +/* Use Gnulib <uchar.h> when outside of glibc. 
*/ +#ifdef _LIBC +# include <wchar.h> +# include <wctype.h> +#else +# include <uchar.h> # undef __wctype # undef __iswalnum # undef __iswctype # undef __towlower # undef __towupper -# define __wctype wctype -# define __iswalnum iswalnum -# define __iswctype iswctype -# define __towlower towlower -# define __towupper towupper -# define __btowc btowc -# define __mbrtowc mbrtowc -# define __wcrtomb wcrtomb +# undef __btowc +# undef __mbrtowc +# undef __wcrtomb +# undef __regfree +# define wchar_t char32_t +# define wctype_t c32_type_test_t +# define __wctype c32_get_type_test +# define __iswalnum c32isalnum +# define __iswctype c32_apply_type_test +# define __towlower c32tolower +# define __towupper c32toupper +# define __btowc btoc32 +# define __mbrtowc mbrtoc32 +# define __wcrtomb c32rtomb # define __regfree regfree #endif /* not _LIBC */ diff --git a/modules/regex b/modules/regex index a1176b8e90..9cb277618b 100644 --- a/modules/regex +++ b/modules/regex @@ -20,21 +20,26 @@ extensions ssize_t vararrays attribute [test $ac_use_included_regex = yes] -btowc [test $ac_use_included_regex = yes] +btoc32 [test $ac_use_included_regex = yes] builtin-expect [test $ac_use_included_regex = yes] +c32_apply_type_test [test $ac_use_included_regex = yes] +c32_get_type_test [test $ac_use_included_regex = yes] +c32isalnum [test $ac_use_included_regex = yes] +c32rtomb [test $ac_use_included_regex = yes] +c32tolower [test $ac_use_included_regex = yes] +c32toupper [test $ac_use_included_regex = yes] gettext-h [test $ac_use_included_regex = yes] glibc-internal/dynarray [test $ac_use_included_regex = yes] gnulib-i18n [test $ac_use_included_regex = yes] intprops [test $ac_use_included_regex = yes] -iswctype [test $ac_use_included_regex = yes] langinfo-h [test $ac_use_included_regex = yes] libc-config [test $ac_use_included_regex = yes] limits-h [test $ac_use_included_regex = yes] lock [test $ac_use_included_regex = yes] malloc-gnu [test $ac_use_included_regex = yes] +mbrtoc32-regular 
[test $ac_use_included_regex = yes] memcmp [test $ac_use_included_regex = yes] memmove [test $ac_use_included_regex = yes] -mbrtowc [test $ac_use_included_regex = yes] mbsinit [test $ac_use_included_regex = yes] nl_langinfo [test $ac_use_included_regex = yes] bool [test $ac_use_included_regex = yes] @@ -43,10 +48,11 @@ stdcountof-h [test $ac_use_included_regex = yes] stdint-h [test $ac_use_included_regex = yes] strncpy [test $ac_use_included_regex = yes] verify [test $ac_use_included_regex = yes] -wchar-h [test $ac_use_included_regex = yes] -wcrtomb [test $ac_use_included_regex = yes] -wctype-h [test $ac_use_included_regex = yes] -wctype [test $ac_use_included_regex = yes] +uchar-h [test $ac_use_included_regex = yes] +# The lonesome_lower array requires ISO C 23 semantics for char32_t. +# But uchar-h-c23 has a global effect, therefore leave it to each package +# to enable it. +#uchar-h-c23 [test $ac_use_included_regex = yes] configure.ac: gl_REGEX -- 2.53.0
