Patch updated. Paul, thanks for the previous comments. As you suggested, the attached patch does not copy the buffer; instead, it splits the input wherever it finds an invalid character.
For the moment, I don't see a cleaner way to avoid the pcre internals. Regards, Santiago
From d58b53f86bb3f4b97137f708c159b4a3bc40c543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Santiago=20Ruano=20Rinc=C3=B3n?= <santi...@debian.org> Date: Tue, 9 Sep 2014 00:02:52 +0200 Subject: [PATCH] Workaround to avoid aborting on invalid UTF8 input * src/pcresearch.c (Pexecute): If pcre_exec returns an invalid UTF8 character error, evaluate the valid characters only, iteratively dividing line_buf in two sections, before and after each invalid character it finds. * tests/pcre-infloop: Exit status is 1 again. * tests/pcre-invalid-utf8-input: Check again if grep doesn't abort. Also check for a match after a second invalid character in the same line. * tests/foad1: Add simple --color tests with -P matcher. Closes http://debbugs.gnu.org/18266 --- src/pcresearch.c | 31 +++++++++++++++++++++++++++++++ tests/foad1 | 2 ++ tests/pcre-infloop | 2 +- tests/pcre-invalid-utf8-input | 16 +++++++++++++--- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/pcresearch.c b/src/pcresearch.c index 820dd00..e542d48 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -166,6 +166,37 @@ Pexecute (char const *buf, size_t size, size_t *match_size, sub, sizeof sub / sizeof *sub); } + if (e == PCRE_ERROR_BADUTF8) + { + ptrdiff_t clean_offset = start_ptr ? 
start_ptr - line_buf : 0; + const char * clean_sect_beg = line_buf + clean_offset; + + while (e == PCRE_ERROR_BADUTF8) + { + if (line_buf < clean_sect_beg + sub[0]) + { + /* Evaluate the buffer section previous to the invalid character */ + e = pcre_exec (cre, extra, clean_sect_beg, sub[0], + 0, 0, sub, sizeof sub / sizeof *sub); + } + if (e == 1) + continue; /* Match */ + else if (clean_sect_beg + sub[0] + 1 < line_end) + { + clean_sect_beg += sub[0] + 1; + + /* Evaluate the remaining buffer section, after the invalid + character */ + e = pcre_exec (cre, extra, clean_sect_beg, line_end - clean_sect_beg, + 0, 0, sub, sizeof sub / sizeof *sub); + } + } + + /* Fix offsets */ + sub[0] += clean_sect_beg - line_buf; + sub[1] += clean_sect_beg - line_buf; + } + if (e <= 0) { switch (e) diff --git a/tests/foad1 b/tests/foad1 index eeab51a..fe9d0f5 100755 --- a/tests/foad1 +++ b/tests/foad1 @@ -134,8 +134,10 @@ grep_test "$x1" "$y1" -E -w --color=always -e ccc -e bb grep_test "$x1" "$y1" -F -w --color=always -e ccc -e bb grep_test "$x2" "$y2" -E -w --color=always bc grep_test "$x2" "$y2" -F -w --color=always bc +grep_test "$x2" "$y2" -P -w --color=always bc grep_test "$x3" "$y3" -E -w --color=always bc grep_test "$x3" "$y3" -F -w --color=always bc +grep_test "$x3" "$y3" -P -w --color=always bc # Skip the rest of the tests - known to fail. TAA. Exit $failures diff --git a/tests/pcre-infloop b/tests/pcre-infloop index 1b33e72..b92f8e1 100755 --- a/tests/pcre-infloop +++ b/tests/pcre-infloop @@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_ fail=0 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in -test $? = 2 || fail_ "libpcre's match function appears to infloop" +test $? 
= 1 || fail_ "libpcre's match function appears to infloop" Exit $fail diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input index 913e8ee..a5ae7bc 100755 --- a/tests/pcre-invalid-utf8-input +++ b/tests/pcre-invalid-utf8-input @@ -13,9 +13,19 @@ require_en_utf8_locale_ fail=0 -printf 'j\202\nj\n' > in || framework_failure_ +printf 'j\202j\202x\njx\n' > in || framework_failure_ -LC_ALL=en_US.UTF-8 grep -P j in -test $? -eq 2 || fail=1 +LC_ALL=en_US.UTF-8 grep -P j in > out 2>&1 || fail=1 +compare in out || fail=1 +compare /dev/null err || fail=1 +# Match after a second invalid UTF-8 character +#LC_ALL=en_US.UTF-8 grep -P x in > out 2>&1 || fail=1 +#compare in out || fail=1 +#compare /dev/null err || fail=1 + +printf '\202xj\n' > in || framework_failure_ +LC_ALL=en_US.UTF-8 grep -P x in > out 2>&1 || fail=1 +compare in out || fail=1 +compare /dev/null err || fail=1 Exit $fail -- 1.7.10.4
signature.asc
Description: Digital signature