Hi Paul, Trying to sync gnulib with glibc code, this patch trigger some regression on glibc testcases:
FAIL: posix/tst-boost FAIL: posix/tst-pcre FAIL: posix/tst-rxspencer FAIL: posix/tst-rxspencer-no-utf8 $ grep -n "FAIL rm" posix/tst-boost.out 445:FAIL rm[1] 3..-1 != expected 2..3 448:FAIL rm[1] 3..-1 != expected 2..3 451:FAIL rm[1] 3..-1 != expected 2..3 454:FAIL rm[1] 3..-1 != expected 2..3 $ cat posix/tst-pcre.out 1168: /([a]*)*/ on a unexpectedly failed to match register 1 1..-1 1171: /([a]*)*/ on aaaaa unexpectedly failed to match register 1 5..-1 1176: /([ab]*)*/ on a unexpectedly failed to match register 1 1..-1 1179: /([ab]*)*/ on b unexpectedly failed to match register 1 1..-1 1182: /([ab]*)*/ on ababab unexpectedly failed to match register 1 6..-1 1185: /([ab]*)*/ on aaaabcde unexpectedly failed to match register 1 5..-1 1188: /([ab]*)*/ on bbbb unexpectedly failed to match register 1 4..-1 1193: /([^a]*)*/ on b unexpectedly failed to match register 1 1..-1 1196: /([^a]*)*/ on bbbb unexpectedly failed to match register 1 4..-1 1203: /([^ab]*)*/ on cccc unexpectedly failed to match register 1 4..-1 $ cat posix/tst-rxspencer.out | grep FAIL FAIL rm[1] unexpectedly did not match FAIL rm[1] unexpectedly did not match $ cat posix/tst-rxspencer-no-utf8.out | grep FAIL FAIL rm[1] unexpectedly did not match FAIL rm[1] unexpectedly did not match On 05/02/2021 22:25, Paul Eggert wrote: > This fixes a longstanding glibc bug concerning backreferences > <https://sourceware.org/11053> (2009-12-04). > * lib/regexec.c (proceed_next_node, push_fail_stack) > (pop_fail_stack): Push and pop the previous registers > as well as the current ones. All callers changed. > (set_regs): Also pop if CUR_NODE has already been checked, > so that it does not get added as a duplicate set entry. > (update_regs): Fix comment location. > * tests/test-regex.c (tests): New constant. > (bug_regex11): New test function. > (main): Bump alarm value. Call new test function. > --- > ChangeLog | 13 ++++++ > lib/regexec.c | 26 +++++++---- > tests/test-regex.c | 113 ++++++++++++++++++++++++++++++++++++++++++++- > 3 files changed, 141 insertions(+), 11 deletions(-) > > diff --git a/ChangeLog b/ChangeLog > index 74304474b..bd7d1fa16 100644 > --- a/ChangeLog > +++ b/ChangeLog > @@ -1,5 +1,18 @@ > 2021-02-05 Paul Eggert <egg...@cs.ucla.edu> > > + regex: fix longstanding backref match bug > + This fixes a longstanding glibc bug concerning backreferences > + <https://sourceware.org/11053> (2009-12-04). > + * lib/regexec.c (proceed_next_node, push_fail_stack) > + (pop_fail_stack): Push and pop the previous registers > + as well as the current ones. All callers changed. > + (set_regs): Also pop if CUR_NODE has already been checked, > + so that it does not get added as a duplicate set entry. > + (update_regs): Fix comment location. > + * tests/test-regex.c (tests): New constant. > + (bug_regex11): New test function. > + (main): Bump alarm value. Call new test function. > + > regex: avoid duplicate in espilon closure > * lib/regcomp.c (calc_eclosure_iter): Insert NODE into epsilon > closure first rather than last. Otherwise, the epsilon closure > diff --git a/lib/regexec.c b/lib/regexec.c > index fdd2e373e..424bc8d15 100644 > --- a/lib/regexec.c > +++ b/lib/regexec.c > @@ -59,7 +59,7 @@ static void update_regs (const re_dfa_t *dfa, regmatch_t > *pmatch, > Idx cur_idx, Idx nmatch); > static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, > Idx str_idx, Idx dest_node, Idx nregs, > - regmatch_t *regs, > + regmatch_t *regs, regmatch_t *prevregs, > re_node_set *eps_via_nodes); > static reg_errcode_t set_regs (const regex_t *preg, > const re_match_context_t *mctx, > @@ -1211,6 +1211,7 @@ check_halt_state_context (const re_match_context_t > *mctx, > > static Idx > proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t > *regs, > + regmatch_t *prevregs, > Idx *pidx, Idx node, re_node_set *eps_via_nodes, > struct re_fail_stack_t *fs) > { > @@ -1243,7 +1244,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx > nregs, regmatch_t *regs, > /* Otherwise, push the second epsilon-transition on the fail > stack. */ > else if (fs != NULL > && push_fail_stack (fs, *pidx, candidate, nregs, regs, > - eps_via_nodes)) > + prevregs, eps_via_nodes)) > return -2; > > /* We know we are going to exit. */ > @@ -1316,7 +1317,8 @@ proceed_next_node (const re_match_context_t *mctx, Idx > nregs, regmatch_t *regs, > static reg_errcode_t > __attribute_warn_unused_result__ > push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node, > - Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes) > + Idx nregs, regmatch_t *regs, regmatch_t *prevregs, > + re_node_set *eps_via_nodes) > { > reg_errcode_t err; > Idx num = fs->num++; > @@ -1332,23 +1334,26 @@ push_fail_stack (struct re_fail_stack_t *fs, Idx > str_idx, Idx dest_node, > } > fs->stack[num].idx = str_idx; > fs->stack[num].node = dest_node; > - fs->stack[num].regs = re_malloc (regmatch_t, nregs); > + fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs); > if (fs->stack[num].regs == NULL) > return REG_ESPACE; > memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); > + memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * > nregs); > err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); > return err; > } > > static Idx > pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs, > - regmatch_t *regs, re_node_set *eps_via_nodes) > + regmatch_t *regs, regmatch_t *prevregs, > + re_node_set *eps_via_nodes) > { > if (fs == NULL || fs->num == 0) > return -1; > Idx num = --fs->num; > *pidx = fs->stack[num].idx; > memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); > + memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * > nregs); > re_node_set_free (eps_via_nodes); > re_free (fs->stack[num].regs); > *eps_via_nodes = fs->stack[num].eps_via_nodes; > @@ -1408,7 +1413,8 @@ set_regs (const regex_t *preg, const re_match_context_t > *mctx, size_t nmatch, > { > update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); > > - if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) > + if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node) > + || re_node_set_contains (&eps_via_nodes, cur_node)) > { > Idx reg_idx; > cur_node = -1; > @@ -1418,7 +1424,7 @@ set_regs (const regex_t *preg, const re_match_context_t > *mctx, size_t nmatch, > if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) > { > cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, > - &eps_via_nodes); > + prev_idx_match, &eps_via_nodes); > break; > } > } > @@ -1431,7 +1437,8 @@ set_regs (const regex_t *preg, const re_match_context_t > *mctx, size_t nmatch, > } > > /* Proceed to next node. */ > - cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, > + cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match, > + &idx, cur_node, > &eps_via_nodes, fs); > > if (__glibc_unlikely (cur_node < 0)) > @@ -1443,7 +1450,8 @@ set_regs (const regex_t *preg, const re_match_context_t > *mctx, size_t nmatch, > free_fail_stack_return (fs); > return REG_ESPACE; > } > - cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, &eps_via_nodes); > + cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, > + prev_idx_match, &eps_via_nodes); > if (cur_node < 0) > { > re_node_set_free (&eps_via_nodes); > diff --git a/tests/test-regex.c b/tests/test-regex.c > index 58f8200f2..a14619805 100644 > --- a/tests/test-regex.c > +++ b/tests/test-regex.c > @@ -55,6 +55,111 @@ really_utf8 (void) > return strcmp (locale_charset (), "UTF-8") == 0; > } > > +/* Tests supposed to match; copied from glibc posix/bug-regex11.c. */ > +static struct > +{ > + const char *pattern; > + const char *string; > + int flags, nmatch; > + regmatch_t rm[5]; > +} const tests[] = { > + /* Test for newline handling in regex. */ > + { "[^~]*~", "\nx~y", 0, 2, { { 0, 3 }, { -1, -1 } } }, > + /* Other tests. */ > + { "a(.*)b", "a b", REG_EXTENDED, 2, { { 0, 3 }, { 1, 2 } } }, > + { ".*|\\([KIO]\\)\\([^|]*\\).*|?[KIO]", "10~.~|P|K0|I10|O16|?KSb", 0, 3, > + { { 0, 21 }, { 15, 16 }, { 16, 18 } } }, > + { ".*|\\([KIO]\\)\\([^|]*\\).*|?\\1", "10~.~|P|K0|I10|O16|?KSb", 0, 3, > + { { 0, 21 }, { 8, 9 }, { 9, 10 } } }, > + { "^\\(a*\\)\\1\\{9\\}\\(a\\{0,9\\}\\)\\([0-9]*;.*[^a]\\2\\([0-9]\\)\\)", > + "a1;;0a1aa2aaa3aaaa4aaaaa5aaaaaa6aaaaaaa7aaaaaaaa8aaaaaaaaa9aa2aa1a0", 0, > + 5, { { 0, 67 }, { 0, 0 }, { 0, 1 }, { 1, 67 }, { 66, 67 } } }, > + /* Test for BRE expression anchoring. POSIX says just that this may match; > + in glibc regex it always matched, so avoid changing it. */ > + { "\\(^\\|foo\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } }, > + { "\\(foo\\|^\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } }, > + /* In ERE this must be treated as an anchor. */ > + { "(^|foo)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } }, > + { "(foo|^)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } }, > + /* Here ^ cannot be treated as an anchor according to POSIX. */ > + { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } }, > + { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } }, > + /* More tests on backreferences. */ > + { "()\\1", "x", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } }, > + { "()x\\1", "x", REG_EXTENDED, 2, { { 0, 1 }, { 0, 0 } } }, > + { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } }, > + { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, > 4 } } }, > + { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, > 4 } } }, > + { "(b)()c\\1", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 1 }, { 1, 1 } } }, > + { "()(b)c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } }, > + { "a(b)()c\\1", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 2 }, { 2, 2 } } > }, > + { "a()(b)c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } > }, > + { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } > }, > + { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } > }, > + { "a()(b)\\1c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } > } }, > + { "a()d(b)\\1c\\2", "adbcb", REG_EXTENDED, 3, { { 0, 5 }, { 1, 1 }, { 2, 3 > } } }, > + { "a(b())\\2\\1", "abbbb", REG_EXTENDED, 3, { { 0, 3 }, { 1, 2 }, { 2, 2 } > } }, > + { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } > } }, > + { "^([^,]*),\\1,\\1$", "a,a,a", REG_EXTENDED, 2, { { 0, 5 }, { 0, 1 } } }, > + { "^([^,]*),\\1,\\1$", "ab,ab,ab", REG_EXTENDED, 2, { { 0, 8 }, { 0, 2 } } > }, > + { "^([^,]*),\\1,\\1,\\1$", "abc,abc,abc,abc", REG_EXTENDED, 2, > + { { 0, 15 }, { 0, 3 } } }, > + { "^(.?)(.?)(.?)(.?)(.?).?\\5\\4\\3\\2\\1$", > + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } }, > + { > "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$", > + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } }, > + { > "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$", > + "abcdedcba", REG_EXTENDED, 1, { { 0, 9 } } }, > + /* XXX Not used since they fail so far. */ > + { > "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$", > + "ababababa", REG_EXTENDED, 1, { { 0, 9 } } }, > + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$", > + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } }, > + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$", > + "ababababa", REG_EXTENDED, 1, { { 0, 9 } } }, > +}; > + > +static void > +bug_regex11 (void) > +{ > + regex_t re; > + regmatch_t rm[5]; > + size_t i; > + int n; > + > + for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i) > + { > + n = regcomp (&re, tests[i].pattern, tests[i].flags); > + if (n != 0) > + { > + char buf[500]; > + regerror (n, &re, buf, sizeof (buf)); > + report_error ("%s: regcomp %zd failed: %s", tests[i].pattern, i, buf); > + continue; > + } > + > + if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0)) > + { > + report_error ("%s: regexec %zd failed", tests[i].pattern, i); > + regfree (&re); > + continue; > + } > + > + for (n = 0; n < tests[i].nmatch; ++n) > + if (rm[n].rm_so != tests[i].rm[n].rm_so > + || rm[n].rm_eo != tests[i].rm[n].rm_eo) > + { > + if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1) > + break; > + report_error ("%s: regexec %zd match failure rm[%d] %d..%d", > + tests[i].pattern, i, n, rm[n].rm_so, rm[n].rm_eo); > + break; > + } > + > + regfree (&re); > + } > +} > + > int > main (void) > { > @@ -65,11 +170,15 @@ main (void) > struct re_registers regs; > > #if HAVE_DECL_ALARM > - /* Some builds of glibc go into an infinite loop on this test. */ > - int alarm_value = 2; > + /* In case a bug causes glibc to go into an infinite loop. > + The tests should take less than 10 s on a reasonably modern CPU. */ > + int alarm_value = 1000; > signal (SIGALRM, SIG_DFL); > alarm (alarm_value); > #endif > + > + bug_regex11 (); > + > if (setlocale (LC_ALL, "en_US.UTF-8")) > { > { >