Patch updated.  Paul, thanks for the previous comments. As you
suggested, the attached patch doesn't copy the buffer and splits the
input when it finds an invalid character.

For the moment, I don't see a cleaner way to avoid the pcre internals.

Regards,

Santiago
From d58b53f86bb3f4b97137f708c159b4a3bc40c543 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Santiago=20Ruano=20Rinc=C3=B3n?= <santi...@debian.org>
Date: Tue, 9 Sep 2014 00:02:52 +0200
Subject: [PATCH] Workaround to avoid aborting on invalid UTF-8 input

* src/pcresearch.c (Pexecute): If pcre_exec returns an invalid
UTF-8 character error, evaluate only the valid characters, iteratively
dividing line_buf into two sections, before and after each invalid
character it finds.
* tests/pcre-infloop: Exit status is 1 again.
* tests/pcre-invalid-utf8-input: Check again that grep doesn't
abort.  Also check for a match after a second invalid character
in the same line.
* tests/foad1: Add simple --color tests with the -P matcher.

Closes http://debbugs.gnu.org/18266
---
 src/pcresearch.c              |   31 +++++++++++++++++++++++++++++++
 tests/foad1                   |    2 ++
 tests/pcre-infloop            |    2 +-
 tests/pcre-invalid-utf8-input |   16 +++++++++++++---
 4 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/src/pcresearch.c b/src/pcresearch.c
index 820dd00..e542d48 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -166,6 +166,37 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
                      sub, sizeof sub / sizeof *sub);
     }
 
+  if (e == PCRE_ERROR_BADUTF8)
+    {
+      ptrdiff_t clean_offset = start_ptr ? start_ptr - line_buf : 0;
+      const char * clean_sect_beg = line_buf + clean_offset;
+
+      while (e == PCRE_ERROR_BADUTF8)
+        {
+          if (line_buf < clean_sect_beg + sub[0])
+            {
+              /* Evaluate the buffer section previous to the invalid character */
+              e = pcre_exec (cre, extra, clean_sect_beg, sub[0],
+                           0, 0, sub, sizeof sub / sizeof *sub);
+            }
+          if (e == 1)
+            continue; /* Match */
+          else if (clean_sect_beg + sub[0] + 1 < line_end)
+            {
+              clean_sect_beg += sub[0] + 1;
+
+              /* Evaluate the remaining buffer section, after the invalid
+                 character */
+              e = pcre_exec (cre, extra, clean_sect_beg, line_end - clean_sect_beg,
+                           0, 0, sub, sizeof sub / sizeof *sub);
+            }
+        }
+
+      /* Fix offsets */
+      sub[0] += clean_sect_beg - line_buf;
+      sub[1] += clean_sect_beg - line_buf;
+    }
+
   if (e <= 0)
     {
       switch (e)
diff --git a/tests/foad1 b/tests/foad1
index eeab51a..fe9d0f5 100755
--- a/tests/foad1
+++ b/tests/foad1
@@ -134,8 +134,10 @@ grep_test "$x1" "$y1" -E -w --color=always -e ccc -e bb
 grep_test "$x1" "$y1" -F -w --color=always -e ccc -e bb
 grep_test "$x2" "$y2" -E -w --color=always bc
 grep_test "$x2" "$y2" -F -w --color=always bc
+grep_test "$x2" "$y2" -P -w --color=always bc
 grep_test "$x3" "$y3" -E -w --color=always bc
 grep_test "$x3" "$y3" -F -w --color=always bc
+grep_test "$x3" "$y3" -P -w --color=always bc
 
 # Skip the rest of the tests - known to fail. TAA.
 Exit $failures
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
index 1b33e72..b92f8e1 100755
--- a/tests/pcre-infloop
+++ b/tests/pcre-infloop
@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_
 fail=0
 
 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
-test $? = 2 || fail_ "libpcre's match function appears to infloop"
+test $? = 1 || fail_ "libpcre's match function appears to infloop"
 
 Exit $fail
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
index 913e8ee..a5ae7bc 100755
--- a/tests/pcre-invalid-utf8-input
+++ b/tests/pcre-invalid-utf8-input
@@ -13,9 +13,19 @@ require_en_utf8_locale_
 
 fail=0
 
-printf 'j\202\nj\n' > in || framework_failure_
+printf 'j\202j\202x\njx\n' > in || framework_failure_
 
-LC_ALL=en_US.UTF-8 grep -P j in
-test $? -eq 2 || fail=1
+LC_ALL=en_US.UTF-8 grep -P j in > out 2>&1 || fail=1
+compare in out || fail=1
+compare /dev/null err || fail=1
 
+# Match after a second invalid UTF-8 character
+#LC_ALL=en_US.UTF-8 grep -P x in > out 2>&1 || fail=1
+#compare in out || fail=1
+#compare /dev/null err || fail=1
+
+printf '\202xj\n' > in || framework_failure_
+LC_ALL=en_US.UTF-8 grep -P x in > out 2>&1 || fail=1
+compare in out || fail=1
+compare /dev/null err || fail=1
 Exit $fail
-- 
1.7.10.4

Attachment: signature.asc
Description: Digital signature

Reply via email to