El 16/08/14 a las 11:36, Paul Eggert escribió: > Santiago wrote: > >Another solution would be not to check whether binary files are valid > >(passing PCRE_NO_UTF8_CHECK to pcre_exec), but I don't know if that'd > >avoid security holes > > It wouldn't. (We already tried it.) >
Another try. This patch is far more efficient than the previous one. With the previous patch (#1): % time grep -P faz /usr/bin/* ... grep: /usr/bin/X11: Es un directorio grep -P faz /usr/bin/* 519,78s user 0,32s system 99% cpu 8:41,19 total With this one: % time src/grep -P faz /usr/bin/* src/grep -P faz /usr/bin/* 7,36s user 0,33s system 99% cpu 7,695 total Cheers, Santiago
From 1f8aa0f711f1954b688a790c54b0cadbde165e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Santiago=20Ruano=20Rinc=C3=B3n?= <santi...@debian.org> Date: Thu, 28 Aug 2014 22:39:51 -0700 Subject: [PATCH] Work around aborting on invalid UTF-8 input * src/pcresearch.c (Pexecute): When pcre_exec returns an invalid UTF-8 character error, copy line_buf to an auxiliary buffer, replace invalid bytes with '\0' and evaluate against it. * tests/pcre-infloop: Exit status is 1 again. * tests/pcre-invalid-utf8-input: Check again that grep doesn't abort. Also check for a match after a second invalid character in the same line. Closes http://debbugs.gnu.org/18266 --- src/pcresearch.c | 21 +++++++++++++++++++++ tests/pcre-infloop | 2 +- tests/pcre-invalid-utf8-input | 12 +++++++++--- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/pcresearch.c b/src/pcresearch.c index 820dd00..31661f9 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -164,6 +164,27 @@ Pexecute (char const *buf, size_t size, size_t *match_size, e = pcre_exec (cre, extra, line_buf, line_end - line_buf, start_ofs < 0 ? 0 : start_ofs, 0, sub, sizeof sub / sizeof *sub); + + if (e == PCRE_ERROR_BADUTF8){ + char *line_utf8_clean = xmemdup (line_buf, line_end - line_buf); + long invalid_pos = 0; + + /* Change invalid UTF-8 characters (according to pcre_exec) to '\0' */ + while (e == PCRE_ERROR_BADUTF8){ + line_utf8_clean[sub[0]+invalid_pos] = '\0'; + invalid_pos += sub[0]; + + /* Evaluate the remaining line_utf8_clean section */ + e = pcre_exec (cre, extra, line_utf8_clean + invalid_pos, line_end - line_buf - invalid_pos, + start_ofs < 0 ? 0 : start_ofs, 0, + sub, sizeof sub / sizeof *sub); + } + + /* Evaluate the cleaned line_utf8_clean */ + e = pcre_exec (cre, extra, line_utf8_clean, line_end - line_buf, + start_ofs < 0 ? 
0 : start_ofs, 0, + sub, sizeof sub / sizeof *sub); + } } if (e <= 0) diff --git a/tests/pcre-infloop b/tests/pcre-infloop index 1b33e72..b92f8e1 100755 --- a/tests/pcre-infloop +++ b/tests/pcre-infloop @@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_ fail=0 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in -test $? = 2 || fail_ "libpcre's match function appears to infloop" +test $? = 1 || fail_ "libpcre's match function appears to infloop" Exit $fail diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input index 913e8ee..2c6aadb 100755 --- a/tests/pcre-invalid-utf8-input +++ b/tests/pcre-invalid-utf8-input @@ -13,9 +13,15 @@ require_en_utf8_locale_ fail=0 -printf 'j\202\nj\n' > in || framework_failure_ +printf 'j\202j\202\x\njx\n' > in || framework_failure_ -LC_ALL=en_US.UTF-8 grep -P j in -test $? -eq 2 || fail=1 +LC_ALL=en_US.UTF-8 grep -P j in > out 2>&1 || fail=1 +compare in out || fail=1 +compare /dev/null err || fail=1 + +# Match after a second invalid UTF-8 character +LC_ALL=en_US.UTF-8 grep -P x in > out 2>&1 || fail=1 +compare in out || fail=1 +compare /dev/null err || fail=1 Exit $fail -- 1.7.10.4
signature.asc
Description: Digital signature