The patch below seems like a cheap fix for UTF-8 locales. Because Bash falls back to the single-byte glob matching functions when presented with invalid multibyte strings, this patch makes the glob code avoid calling the ctype functions or strcoll when handling individual bytes greater than 0x7F in a UTF-8 locale.
This makes the following no longer evaluate to true on macOS:
[[ $'\xC0' == [[:upper:]] ]]
[[ $'\xC0' == [[=A=]] ]]
[[ $'\xC0' == $'\xE0' ]] # with nocasematch
And on Linux with glibc (tested on Ubuntu 22.04) in en_US.UTF-8, strcoll returns 0 for any two invalid bytes, so the following is also no longer true:
x=$'\x80'; [[ $'\xC0' == [[=$x=]] ]]
The locale_setblanks change is for the macOS issue with 0xA0 being treated as a blank (as U+00A0). There's no other code that changes CSHBRK in sh_syntaxtab, so I think the simplifications are OK.
---
diff --git a/lib/glob/smatch.c b/lib/glob/smatch.c index 12eb9d27..1c6b0229 100644 --- a/lib/glob/smatch.c +++ b/lib/glob/smatch.c @@ -141,6 +141,9 @@ rangecmp (int c1, int c2, int forcecoll) static int collseqcmp (int c, int equiv) { + if (locale_utf8locale && (UTF8_SINGLEBYTE (c) == 0 || UTF8_SINGLEBYTE (equiv) == 0)) + return (c == equiv); + if (charcmp (c, equiv, 1) == 0) return 1; @@ -281,6 +284,9 @@ is_cclass (int c, const char *name) enum char_class char_class; int result; + if (locale_utf8locale && UTF8_SINGLEBYTE(c) == 0) + return -1; + char_class = is_valid_cclass (name); if (char_class == CC_NO_CLASS) return -1; @@ -291,7 +297,8 @@ is_cclass (int c, const char *name) /* Now include `sm_loop.c' for single-byte characters. */ /* The result of FOLD is an `unsigned char' */ -# define FOLD(c) ((flags & FNM_CASEFOLD) \ +# define FOLD(c) (((flags & FNM_CASEFOLD) && \ + (locale_utf8locale == 0 || UTF8_SINGLEBYTE (c))) \ ? 
TOLOWER ((unsigned char)c) \ : ((unsigned char)c)) diff --git a/locale.c b/locale.c index eb24a517..b918db37 100644 --- a/locale.c +++ b/locale.c @@ -584,15 +584,10 @@ locale_setblanks (void) for (x = 0; x < sh_syntabsiz; x++) { - if (isblank ((unsigned char)x)) - sh_syntaxtab[x] |= CSHBRK|CBLANK; - else if (member (x, shell_break_chars)) - { - sh_syntaxtab[x] |= CSHBRK; - sh_syntaxtab[x] &= ~CBLANK; - } + if ((locale_utf8locale == 0 || (x & 0x80) == 0) && isblank ((unsigned char)x)) + sh_syntaxtab[x] |= CBLANK; else - sh_syntaxtab[x] &= ~(CSHBRK|CBLANK); + sh_syntaxtab[x] &= ~CBLANK; } }