This fixes a longstanding glibc bug concerning backreferences <https://sourceware.org/11053> (2009-12-04). * lib/regexec.c (proceed_next_node, push_fail_stack) (pop_fail_stack): Push and pop the previous registers as well as the current ones. All callers changed. (set_regs): Also pop if CUR_NODE has already been checked, so that it does not get added as a duplicate set entry. (update_regs): Fix comment location. * tests/test-regex.c (tests): New constant. (bug_regex11): New test function. (main): Bump alarm value. Call new test function. --- ChangeLog | 13 ++++++ lib/regexec.c | 26 +++++++---- tests/test-regex.c | 113 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 11 deletions(-)
diff --git a/ChangeLog b/ChangeLog index 74304474b..bd7d1fa16 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,18 @@ 2021-02-05 Paul Eggert <egg...@cs.ucla.edu> + regex: fix longstanding backref match bug + This fixes a longstanding glibc bug concerning backreferences + <https://sourceware.org/11053> (2009-12-04). + * lib/regexec.c (proceed_next_node, push_fail_stack) + (pop_fail_stack): Push and pop the previous registers + as well as the current ones. All callers changed. + (set_regs): Also pop if CUR_NODE has already been checked, + so that it does not get added as a duplicate set entry. + (update_regs): Fix comment location. + * tests/test-regex.c (tests): New constant. + (bug_regex11): New test function. + (main): Bump alarm value. Call new test function. + regex: avoid duplicate in espilon closure * lib/regcomp.c (calc_eclosure_iter): Insert NODE into epsilon closure first rather than last. Otherwise, the epsilon closure diff --git a/lib/regexec.c b/lib/regexec.c index fdd2e373e..424bc8d15 100644 --- a/lib/regexec.c +++ b/lib/regexec.c @@ -59,7 +59,7 @@ static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, Idx cur_idx, Idx nmatch); static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node, Idx nregs, - regmatch_t *regs, + regmatch_t *regs, regmatch_t *prevregs, re_node_set *eps_via_nodes); static reg_errcode_t set_regs (const regex_t *preg, const re_match_context_t *mctx, @@ -1211,6 +1211,7 @@ check_halt_state_context (const re_match_context_t *mctx, static Idx proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs, + regmatch_t *prevregs, Idx *pidx, Idx node, re_node_set *eps_via_nodes, struct re_fail_stack_t *fs) { @@ -1243,7 +1244,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs, /* Otherwise, push the second epsilon-transition on the fail stack. */ else if (fs != NULL && push_fail_stack (fs, *pidx, candidate, nregs, regs, - eps_via_nodes)) + prevregs, eps_via_nodes)) return -2; /* We know we are going to exit. */ @@ -1316,7 +1317,8 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs, static reg_errcode_t __attribute_warn_unused_result__ push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node, - Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes) + Idx nregs, regmatch_t *regs, regmatch_t *prevregs, + re_node_set *eps_via_nodes) { reg_errcode_t err; Idx num = fs->num++; @@ -1332,23 +1334,26 @@ push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node, } fs->stack[num].idx = str_idx; fs->stack[num].node = dest_node; - fs->stack[num].regs = re_malloc (regmatch_t, nregs); + fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs); if (fs->stack[num].regs == NULL) return REG_ESPACE; memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); + memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs); err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); return err; } static Idx pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs, - regmatch_t *regs, re_node_set *eps_via_nodes) + regmatch_t *regs, regmatch_t *prevregs, + re_node_set *eps_via_nodes) { if (fs == NULL || fs->num == 0) return -1; Idx num = --fs->num; *pidx = fs->stack[num].idx; memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); + memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs); re_node_set_free (eps_via_nodes); re_free (fs->stack[num].regs); *eps_via_nodes = fs->stack[num].eps_via_nodes; @@ -1408,7 +1413,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, { update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); - if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) + if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node) + || re_node_set_contains (&eps_via_nodes, cur_node)) { Idx reg_idx; cur_node = -1; @@ -1418,7 +1424,7 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) { cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, - &eps_via_nodes); + prev_idx_match, &eps_via_nodes); break; } } @@ -1431,7 +1437,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, } /* Proceed to next node. */ - cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, + cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match, + &idx, cur_node, &eps_via_nodes, fs); if (__glibc_unlikely (cur_node < 0)) @@ -1443,7 +1450,8 @@ set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, free_fail_stack_return (fs); return REG_ESPACE; } - cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, &eps_via_nodes); + cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, + prev_idx_match, &eps_via_nodes); if (cur_node < 0) { re_node_set_free (&eps_via_nodes); diff --git a/tests/test-regex.c b/tests/test-regex.c index 58f8200f2..a14619805 100644 --- a/tests/test-regex.c +++ b/tests/test-regex.c @@ -55,6 +55,111 @@ really_utf8 (void) return strcmp (locale_charset (), "UTF-8") == 0; } +/* Tests supposed to match; copied from glibc posix/bug-regex11.c. */ +static struct +{ + const char *pattern; + const char *string; + int flags, nmatch; + regmatch_t rm[5]; +} const tests[] = { + /* Test for newline handling in regex. */ + { "[^~]*~", "\nx~y", 0, 2, { { 0, 3 }, { -1, -1 } } }, + /* Other tests. */ + { "a(.*)b", "a b", REG_EXTENDED, 2, { { 0, 3 }, { 1, 2 } } }, + { ".*|\\([KIO]\\)\\([^|]*\\).*|?[KIO]", "10~.~|P|K0|I10|O16|?KSb", 0, 3, + { { 0, 21 }, { 15, 16 }, { 16, 18 } } }, + { ".*|\\([KIO]\\)\\([^|]*\\).*|?\\1", "10~.~|P|K0|I10|O16|?KSb", 0, 3, + { { 0, 21 }, { 8, 9 }, { 9, 10 } } }, + { "^\\(a*\\)\\1\\{9\\}\\(a\\{0,9\\}\\)\\([0-9]*;.*[^a]\\2\\([0-9]\\)\\)", + "a1;;0a1aa2aaa3aaaa4aaaaa5aaaaaa6aaaaaaa7aaaaaaaa8aaaaaaaaa9aa2aa1a0", 0, + 5, { { 0, 67 }, { 0, 0 }, { 0, 1 }, { 1, 67 }, { 66, 67 } } }, + /* Test for BRE expression anchoring. POSIX says just that this may match; + in glibc regex it always matched, so avoid changing it. */ + { "\\(^\\|foo\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } }, + { "\\(foo\\|^\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } }, + /* In ERE this must be treated as an anchor. */ + { "(^|foo)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } }, + { "(foo|^)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } }, + /* Here ^ cannot be treated as an anchor according to POSIX. */ + { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } }, + { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } }, + /* More tests on backreferences. */ + { "()\\1", "x", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } }, + { "()x\\1", "x", REG_EXTENDED, 2, { { 0, 1 }, { 0, 0 } } }, + { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } }, + { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } }, + { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } }, + { "(b)()c\\1", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 1 }, { 1, 1 } } }, + { "()(b)c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } }, + { "a(b)()c\\1", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 2 }, { 2, 2 } } }, + { "a()(b)c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } }, + { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } }, + { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } }, + { "a()(b)\\1c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } }, + { "a()d(b)\\1c\\2", "adbcb", REG_EXTENDED, 3, { { 0, 5 }, { 1, 1 }, { 2, 3 } } }, + { "a(b())\\2\\1", "abbbb", REG_EXTENDED, 3, { { 0, 3 }, { 1, 2 }, { 2, 2 } } }, + { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } } }, + { "^([^,]*),\\1,\\1$", "a,a,a", REG_EXTENDED, 2, { { 0, 5 }, { 0, 1 } } }, + { "^([^,]*),\\1,\\1$", "ab,ab,ab", REG_EXTENDED, 2, { { 0, 8 }, { 0, 2 } } }, + { "^([^,]*),\\1,\\1,\\1$", "abc,abc,abc,abc", REG_EXTENDED, 2, + { { 0, 15 }, { 0, 3 } } }, + { "^(.?)(.?)(.?)(.?)(.?).?\\5\\4\\3\\2\\1$", + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } }, + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$", + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } }, + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$", + "abcdedcba", REG_EXTENDED, 1, { { 0, 9 } } }, + /* XXX Not used since they fail so far. */ + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$", + "ababababa", REG_EXTENDED, 1, { { 0, 9 } } }, + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$", + "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } }, + { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$", + "ababababa", REG_EXTENDED, 1, { { 0, 9 } } }, +}; + +static void +bug_regex11 (void) +{ + regex_t re; + regmatch_t rm[5]; + size_t i; + int n; + + for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i) + { + n = regcomp (&re, tests[i].pattern, tests[i].flags); + if (n != 0) + { + char buf[500]; + regerror (n, &re, buf, sizeof (buf)); + report_error ("%s: regcomp %zd failed: %s", tests[i].pattern, i, buf); + continue; + } + + if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0)) + { + report_error ("%s: regexec %zd failed", tests[i].pattern, i); + regfree (&re); + continue; + } + + for (n = 0; n < tests[i].nmatch; ++n) + if (rm[n].rm_so != tests[i].rm[n].rm_so + || rm[n].rm_eo != tests[i].rm[n].rm_eo) + { + if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1) + break; + report_error ("%s: regexec %zd match failure rm[%d] %d..%d", + tests[i].pattern, i, n, rm[n].rm_so, rm[n].rm_eo); + break; + } + + regfree (&re); + } +} + int main (void) { @@ -65,11 +170,15 @@ main (void) struct re_registers regs; #if HAVE_DECL_ALARM - /* Some builds of glibc go into an infinite loop on this test. */ - int alarm_value = 2; + /* In case a bug causes glibc to go into an infinite loop. + The tests should take less than 10 s on a reasonably modern CPU. */ + int alarm_value = 1000; signal (SIGALRM, SIG_DFL); alarm (alarm_value); #endif + + bug_regex11 (); + if (setlocale (LC_ALL, "en_US.UTF-8")) { { -- 2.27.0