* lib/regex_internal.c (build_wcs_upper_buffer): Do not assume that converting single-byte character to upper yields a single-byte character. This is not true for Turkish, where towupper (L'i') yields L'İ', which is not single-byte. * tests/test-regex.c (main): Test for this bug. --- ChangeLog | 7 +++++++ lib/regex_internal.c | 19 ++++++++++--------- tests/test-regex.c | 41 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 53 insertions(+), 14 deletions(-)
diff --git a/ChangeLog b/ChangeLog index d15f158ab..5c4d8f849 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,12 @@ 2020-09-23 Paul Eggert <egg...@cs.ucla.edu> + regex: fix ignore-case Turkish bug + * lib/regex_internal.c (build_wcs_upper_buffer): + Do not assume that converting single-byte character to upper + yields a single-byte character. This is not true for Turkish, + where towupper (L'i') yields L'İ', which is not single-byte. + * tests/test-regex.c (main): Test for this bug. + regex: port to weird isascii platforms * lib/regex_internal.h (isascii) [!_LIBC]: Supply glibc version. diff --git a/lib/regex_internal.c b/lib/regex_internal.c index e1b6b4d5a..ed0a13461 100644 --- a/lib/regex_internal.c +++ b/lib/regex_internal.c @@ -300,18 +300,20 @@ build_wcs_upper_buffer (re_string_t *pstr) while (byte_idx < end_idx) { wchar_t wc; + unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; - if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) - && mbsinit (&pstr->cur_state)) + if (isascii (ch) && mbsinit (&pstr->cur_state)) { - /* In case of a singlebyte character. */ - pstr->mbs[byte_idx] - = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); /* The next step uses the assumption that wchar_t is encoded ASCII-safe: all ASCII values can be converted like this. */ - pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; - ++byte_idx; - continue; + wchar_t wcu = __towupper (ch); + if (isascii (wcu)) + { + pstr->mbs[byte_idx] = wcu; + pstr->wcs[byte_idx] = wcu; + byte_idx++; + continue; + } } remain_len = end_idx - byte_idx; @@ -348,7 +350,6 @@ build_wcs_upper_buffer (re_string_t *pstr) { /* It is an invalid character, an incomplete character at the end of the string, or '\0'. Just use the byte. */ - int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; pstr->mbs[byte_idx] = ch; /* And also cast it to wide char. 
*/ pstr->wcs[byte_idx++] = (wchar_t) ch; diff --git a/tests/test-regex.c b/tests/test-regex.c index d3f429aeb..b4e23c8c8 100644 --- a/tests/test-regex.c +++ b/tests/test-regex.c @@ -29,6 +29,15 @@ #include "localcharset.h" +/* Check whether it's really a UTF-8 locale. + On mingw, setlocale (LC_ALL, "en_US.UTF-8") succeeds but returns + "English_United States.1252", with locale_charset () returning "CP1252". */ +static int +really_utf8 (void) +{ + return strcmp (locale_charset (), "UTF-8") == 0; +} + int main (void) { @@ -75,11 +84,7 @@ main (void) } } - /* Check whether it's really a UTF-8 locale. - On mingw, the setlocale call succeeds but returns - "English_United States.1252", with locale_charset() returning - "CP1252". */ - if (strcmp (locale_charset (), "UTF-8") == 0) + if (really_utf8 ()) { /* This test is from glibc bug 15078. The test case is from Andreas Schwab in @@ -119,6 +124,32 @@ main (void) return 1; } + if (setlocale (LC_ALL, "tr_TR.UTF-8") && really_utf8 ()) + { + re_set_syntax (RE_SYNTAX_GREP | RE_ICASE); + if (re_compile_pattern ("i", 1, &regex)) + result |= 1; + else + { + /* UTF-8 encoding of U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE. + In Turkish, this is the upper-case equivalent of ASCII "i". + Older versions of Gnulib failed to match "i" to U+0130 when + ignoring case in Turkish <https://bugs.gnu.org/43577>. */ + static char const data[] = "\xc4\xb0"; + + memset (&regs, 0, sizeof regs); + if (re_search (&regex, data, sizeof data - 1, 0, sizeof data - 1, + &regs)) + result |= 1; + regfree (&regex); + free (regs.start); + free (regs.end); + + if (! setlocale (LC_ALL, "C")) + return 1; + } + } + /* This test is from glibc bug 3957, reported by Andrew Mackey. */ re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE); memset (&regex, 0, sizeof regex); -- 2.25.4