I applied the three attached patches to sync with glibc and fix two relatively serious bugs.
Paolo
>From d9491838d50536edcf30e219a3ab96791aeb1d5d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <bonz...@gnu.org> Date: Fri, 9 Jan 2009 09:00:58 +0100 Subject: [PATCH] merge regex from glibc: replace mbrtowc with __mbrtowc. 2009-01-09 Paolo Bonzini <bonz...@gnu.org> * lib/regcomp.c (re_compile_fastmap_iter): Use __mbrtowc. * lib/regex_internal.c (build_wcs_buffer, build_wcs_upper_buffer, re_string_skip_chars, re_string_reconstruct): Likewise. * lib/regex_internal.h [!_LIBC] (__mbrtowc): New #define. --- ChangeLog | 8 ++++++++ lib/regcomp.c | 7 ++++--- lib/regex_internal.c | 22 +++++++++++----------- lib/regex_internal.h | 4 +++- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/ChangeLog b/ChangeLog index 74954d5..2059b03 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2009-01-09 Paolo Bonzini <bonz...@gnu.org> + + regex: merge glibc changes + * lib/regcomp.c (re_compile_fastmap_iter): Use __mbrtowc. + * lib/regex_internal.c (build_wcs_buffer, build_wcs_upper_buffer, + re_string_skip_chars, re_string_reconstruct): Likewise. + * lib/regex_internal.h [!_LIBC] (__mbrtowc): New #define. + 2009-01-07 Jim Meyering <meyer...@redhat.com> poll: filter through cppi diff --git a/lib/regcomp.c b/lib/regcomp.c index a3a745d..fc3cf98 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -1,5 +1,6 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Free Software Foundation, Inc. + Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa <is...@yamato.ibm.com>. 
@@ -333,8 +334,8 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, && dfa->nodes[node].mb_partial) *p++ = dfa->nodes[node].opr.c; memset (&state, '\0', sizeof (state)); - if (mbrtowc (&wc, (const char *) buf, p - buf, - &state) == p - buf + if (__mbrtowc (&wc, (const char *) buf, p - buf, + &state) == p - buf && (__wcrtomb ((char *) buf, towlower (wc), &state) != (size_t) -1)) re_set_fastmap (fastmap, false, buf[0]); diff --git a/lib/regex_internal.c b/lib/regex_internal.c index 977b15a..904b88e 100644 --- a/lib/regex_internal.c +++ b/lib/regex_internal.c @@ -1,6 +1,6 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software - Foundation, Inc. + Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa <is...@yamato.ibm.com>. @@ -236,7 +236,7 @@ build_wcs_buffer (re_string_t *pstr) } else p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; - mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); + mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); if (BE (mbclen == (size_t) -2, 0)) { /* The buffer doesn't have enough space, finish to build. 
*/ @@ -306,9 +306,9 @@ build_wcs_upper_buffer (re_string_t *pstr) remain_len = end_idx - byte_idx; prev_st = pstr->cur_state; - mbclen = mbrtowc (&wc, - ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx - + byte_idx), remain_len, &pstr->cur_state); + mbclen = __mbrtowc (&wc, + ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx + + byte_idx), remain_len, &pstr->cur_state); if (BE (mbclen < (size_t) -2, 1)) { wchar_t wcu = wc; @@ -376,7 +376,7 @@ build_wcs_upper_buffer (re_string_t *pstr) } else p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; - mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); + mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); if (BE (mbclen < (size_t) -2, 1)) { wchar_t wcu = wc; @@ -499,8 +499,8 @@ re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) Idx remain_len; remain_len = pstr->len - rawbuf_idx; prev_st = pstr->cur_state; - mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, - remain_len, &pstr->cur_state); + mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, + remain_len, &pstr->cur_state); if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) { /* We treat these cases as a single byte character. */ @@ -745,8 +745,8 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) /* XXX Don't use mbrtowc, we know which conversion to use (UTF-8 -> UCS4). */ memset (&cur_state, 0, sizeof (cur_state)); - mbclen = mbrtowc (&wc2, (const char *) p, mlen, - &cur_state); + mbclen = __mbrtowc (&wc2, (const char *) p, mlen, + &cur_state); if (raw + offset - p <= mbclen && mbclen < (size_t) -2) { diff --git a/lib/regex_internal.h b/lib/regex_internal.h index 47b9e13..d3d58e8 100644 --- a/lib/regex_internal.h +++ b/lib/regex_internal.h @@ -1,5 +1,6 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. 
+ Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa <is...@yamato.ibm.com>. @@ -115,6 +116,7 @@ # define __iswctype iswctype # define __btowc btowc # define __wcrtomb wcrtomb +# define __mbrtowc mbrtowc # define __regfree regfree # define attribute_hidden #endif /* not _LIBC */ -- 1.5.5
>From f9fb3bb5a348aa2381edbe4cbc7eecc3894a1f42 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <bonz...@gnu.org> Date: Fri, 9 Jan 2009 09:04:55 +0100 Subject: [PATCH] merge regex from glibc: fix glibc bug 697 2009-01-09 Paolo Bonzini <bonz...@gnu.org> * lib/regexec.c (prune_impossible_nodes): Handle sifted_states[0] being NULL also if there are no backreferences. --- ChangeLog | 6 ++++++ lib/regexec.c | 9 +++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2059b03..d76e1e2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,10 @@ 2009-01-09 Paolo Bonzini <bonz...@gnu.org> + + regex: fix glibc bug 697 + * lib/regexec.c (prune_impossible_nodes): Handle sifted_states[0] + being NULL also if there are no backreferences. + +2009-01-09 Paolo Bonzini <bonz...@gnu.org> regex: merge glibc changes * lib/regcomp.c (re_compile_fastmap_iter): Use __mbrtowc. diff --git a/lib/regexec.c b/lib/regexec.c index 2afa5b3..21a8166 100644 --- a/lib/regexec.c +++ b/lib/regexec.c @@ -1,6 +1,6 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, - Inc. + Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa <is...@yamato.ibm.com>. @@ -1045,6 +1045,11 @@ prune_impossible_nodes (re_match_context_t *mctx) re_node_set_free (&sctx.limits); if (BE (ret != REG_NOERROR, 0)) goto free_return; + if (sifted_states[0] == NULL) + { + ret = REG_NOMATCH; + goto free_return; + } } re_free (mctx->state_log); mctx->state_log = sifted_states; -- 1.5.5
>From fa02d58b87e0a4eb2cbf45adb8bf9fe576ae5339 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini <bonz...@gnu.org> Date: Fri, 9 Jan 2009 09:10:36 +0100 Subject: [PATCH] merge regex from glibc: fix glibc bug 9697 2009-01-09 Paolo Bonzini <bonz...@gnu.org> * lib/regcomp.c (re_compile_fastmap_iter): Rewrite COMPLEX_BRACKET handling. --- ChangeLog | 6 ++++ lib/regcomp.c | 74 ++++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/ChangeLog b/ChangeLog index d76e1e2..f7483f4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,10 @@ 2009-01-09 Paolo Bonzini <bonz...@gnu.org> + + regex: fix glibc bug 9697 + * lib/regcomp.c (re_compile_fastmap_iter): Rewrite COMPLEX_BRACKET + handling. + +2009-01-09 Paolo Bonzini <bonz...@gnu.org> regex: fix glibc bug 697 * lib/regexec.c (prune_impossible_nodes): Handle sifted_states[0] diff --git a/lib/regcomp.c b/lib/regcomp.c index fc3cf98..6472ff6 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -357,45 +357,65 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, #ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) { - Idx i; re_charset_t *cset = dfa->nodes[node].opr.mbcset; - if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes - || cset->nranges || cset->nchar_classes) - { + Idx i; + # ifdef _LIBC - if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) + /* See if we have to try all bytes which start multiple collation + elements. + e.g. In da_DK, we want to catch 'a' since "aa" is a valid + collation element, and don't catch 'b' since 'b' is + the only collation element which starts from 'b' (and + it is caught by SIMPLE_BRACKET). */ + if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 + && (cset->ncoll_syms || cset->nranges)) { - /* In this case we want to catch the bytes which are - the first byte of any collation elements. - e.g. 
In da_DK, we want to catch 'a' since "aa" - is a valid collation element, and don't catch - 'b' since 'b' is the only collation element - which starts from 'b'. */ const int32_t *table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); for (i = 0; i < SBC_MAX; ++i) if (table[i] < 0) re_set_fastmap (fastmap, icase, i); } -# else - if (dfa->mb_cur_max > 1) - for (i = 0; i < SBC_MAX; ++i) - if (__btowc (i) == WEOF) - re_set_fastmap (fastmap, icase, i); -# endif /* not _LIBC */ +# endif /* _LIBC */ + + /* See if we have to start the match at all multibyte characters, + i.e. where we would not find an invalid sequence. This only + applies to multibyte character sets; for single byte character + sets, the SIMPLE_BRACKET again suffices. */ + if (dfa->mb_cur_max > 1 + && (cset->nchar_classes || cset->non_match +# ifdef _LIBC + || cset->nequiv_classes +# endif /* _LIBC */ + )) + { + unsigned char c = 0; + do + { + mbstate_t mbs; + memset (&mbs, 0, sizeof (mbs)); + if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) + re_set_fastmap (fastmap, false, (int) c); + } + while (++c != 0); } - for (i = 0; i < cset->nmbchars; ++i) + + else { - char buf[256]; - mbstate_t state; - memset (&state, '\0', sizeof (state)); - if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) - re_set_fastmap (fastmap, icase, *(unsigned char *) buf); - if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + /* ... Else catch all bytes which can start the mbchars. 
*/ + for (i = 0; i < cset->nmbchars; ++i) { - if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) - != (size_t) -1) - re_set_fastmap (fastmap, false, *(unsigned char *) buf); + char buf[256]; + mbstate_t state; + memset (&state, '\0', sizeof (state)); + if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) + re_set_fastmap (fastmap, icase, *(unsigned char *) buf); + if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + { + if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + != (size_t) -1) + re_set_fastmap (fastmap, false, *(unsigned char *) buf); + } } } } -- 1.5.5