El 16/08/14 a las 11:36, Paul Eggert escribió:
> Santiago wrote:
> >Another solution would be to don't check if binary files are valid
> >(passing PCRE_NO_UTF8_CHECK to pcre_exec), but I don't know if that'd
> >avoid security holes
> 
> It wouldn't.  (We already tried it.)
> 

Here is another try. This patch is far more efficient than the previous one.
With the previous patch #1:

 % time grep -P faz /usr/bin/*                                            
 ...
 grep: /usr/bin/X11: Es un directorio
 grep -P faz /usr/bin/*  519,78s user 0,32s system 99% cpu 8:41,19 total

 With this one:

  % time src/grep -P faz /usr/bin/*
  src/grep -P faz /usr/bin/*  7,36s user 0,33s system 99% cpu 7,695 total

Cheers,

Santiago
From 1f8aa0f711f1954b688a790c54b0cadbde165e5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Santiago=20Ruano=20Rinc=C3=B3n?= <santi...@debian.org>
Date: Thu, 28 Aug 2014 22:39:51 -0700
Subject: [PATCH] Work around abort on invalid UTF-8 input

* src/pcresearch.c (Pexecute): When pcre_exec returns an invalid
UTF-8 character error, copy line_buf to an auxiliary buffer,
replace the invalid bytes with '\0' and match against that copy.
* tests/pcre-infloop: Exit status is 1 again.
* tests/pcre-invalid-utf8-input: Check again that grep doesn't
abort.  Also check for a match after a second invalid character
in the same line.

Closes http://debbugs.gnu.org/18266
---
 src/pcresearch.c              |   21 +++++++++++++++++++++
 tests/pcre-infloop            |    2 +-
 tests/pcre-invalid-utf8-input |   12 +++++++++---
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/src/pcresearch.c b/src/pcresearch.c
index 820dd00..31661f9 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -164,6 +164,27 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
       e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
                      start_ofs < 0 ? 0 : start_ofs, 0,
                      sub, sizeof sub / sizeof *sub);
+
+      if (e == PCRE_ERROR_BADUTF8){
+          char *line_utf8_clean = xmemdup (line_buf, line_end - line_buf);
+          long invalid_pos = 0;
+
+          /* Change invalid UTF-8 characters (according to pcre_exec) to '\0' */
+          while (e == PCRE_ERROR_BADUTF8){
+            line_utf8_clean[sub[0]+invalid_pos] = '\0';
+            invalid_pos += sub[0];
+
+            /* Evaluate the remaining line_utf8_clean section */
+            e = pcre_exec (cre, extra, line_utf8_clean + invalid_pos, line_end - line_buf - invalid_pos,
+                         start_ofs < 0 ? 0 : start_ofs, 0,
+                         sub, sizeof sub / sizeof *sub);
+          }
+
+          /* Evaluate the cleaned line_utf8_clean */
+          e = pcre_exec (cre, extra, line_utf8_clean, line_end - line_buf,
+                         start_ofs < 0 ? 0 : start_ofs, 0,
+                         sub, sizeof sub / sizeof *sub);
+      }
     }
 
   if (e <= 0)
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
index 1b33e72..b92f8e1 100755
--- a/tests/pcre-infloop
+++ b/tests/pcre-infloop
@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_
 fail=0
 
 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
-test $? = 2 || fail_ "libpcre's match function appears to infloop"
+test $? = 1 || fail_ "libpcre's match function appears to infloop"
 
 Exit $fail
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
index 913e8ee..2c6aadb 100755
--- a/tests/pcre-invalid-utf8-input
+++ b/tests/pcre-invalid-utf8-input
@@ -13,9 +13,15 @@ require_en_utf8_locale_
 
 fail=0
 
-printf 'j\202\nj\n' > in || framework_failure_
+printf 'j\202j\202\x\njx\n' > in || framework_failure_
 
-LC_ALL=en_US.UTF-8 grep -P j in
-test $? -eq 2 || fail=1
+LC_ALL=en_US.UTF-8 grep -P j in > out 2>&1 || fail=1
+compare in out || fail=1
+compare /dev/null err || fail=1
+
+# Match after a second invalid UTF-8 character
+LC_ALL=en_US.UTF-8 grep -P x in > out 2>&1 || fail=1
+compare in out || fail=1
+compare /dev/null err || fail=1
 
 Exit $fail
-- 
1.7.10.4

Attachment: signature.asc
Description: Digital signature

Reply via email to