Paul Eggert wrote:
> Attached are some proposed patches which should improve the performance of grep -P when applied to binary files, among other things. I have some other ideas for boosting performance further but thought I'd publish these first.
I pushed those patches, along with the attached further patches to fix up some porting glitches and bugs I encountered in subsequent testing. I plan to follow up soon on Bug#18454 with more performance-related patches in this area.
From 53c5d9fd50b6895b886c1d19d0851562fc03e00c Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 16 Sep 2014 17:29:40 -0700
Subject: [PATCH 07/10] grep: avoid false alarms for mb_clen and to_uchar

* cfg.mk (_gl_TS_unmarked_extern_functions): New var, to bypass
the tight_scope false alarms on mb_clen and to_uchar.
---
 cfg.mk | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cfg.mk b/cfg.mk
index 947d184..3316b5d 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -28,6 +28,10 @@ local-checks-to-skip = \
 # Tools used to bootstrap this package, used for "announcement".
 bootstrap-tools = autoconf,automake,gnulib
 
+# The tight_scope test gets confused about inline functions.
+# like 'to_uchar'.
+_gl_TS_unmarked_extern_functions = main usage mb_clen to_uchar
+
 # Now that we have better tests, make this the default.
 export VERBOSE = yes
 
-- 
1.9.3
From 493ddec2e61d48953600575896a5d3ce1d1a582b Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 15 Sep 2014 22:25:21 -0700
Subject: [PATCH 08/10] grep: use mbclen cache in one more place

* src/grep.c (fgrep_to_grep_pattern): Use mb_clen here, too.
---
 src/grep.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index 72a811e..e4379bc 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1912,8 +1912,7 @@ fgrep_to_grep_pattern (size_t len, char const *keys,
 
   for (; len; keys += n, len -= n)
     {
-      wchar_t wc;
-      n = mbrtowc (&wc, keys, len, &mb_state);
+      n = mb_clen (keys, len, &mb_state);
       switch (n)
         {
         case (size_t) -2:
-- 
1.9.3
From 219f10596c17e38b2716673a140c2b3827549862 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 15 Sep 2014 17:27:58 -0700
Subject: [PATCH 09/10] grep: port -P speedup to hosts lacking PCRE_STUDY_JIT_COMPILE

* src/pcresearch.c (Pcompile): Do not assume that
PCRE_STUDY_JIT_COMPILE is defined.
(empty_match): Define on all platforms.
---
 src/pcresearch.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/pcresearch.c b/src/pcresearch.c
index 95877e3..ce65758 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -33,10 +33,6 @@ static pcre *cre;
 /* Additional information about the pattern. */
 static pcre_extra *extra;
 
-/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
-   string matches when that flag is used. */
-static int empty_match[2];
-
 # ifdef PCRE_STUDY_JIT_COMPILE
 static pcre_jit_stack *jit_stack;
 # else
@@ -44,6 +40,10 @@ static pcre_jit_stack *jit_stack;
 # endif
 #endif
 
+/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
+   string matches when that flag is used. */
+static int empty_match[2];
+
 void
 Pcompile (char const *pattern, size_t size)
 {
@@ -129,11 +129,11 @@ Pcompile (char const *pattern, size_t size)
       pcre_assign_jit_stack (extra, NULL, jit_stack);
     }
 
-  empty_match[false] = pcre_exec (cre, extra, "", 0, 0, PCRE_NOTBOL, NULL, 0);
-  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, NULL, 0);
-
 # endif
   free (re);
+
+  empty_match[false] = pcre_exec (cre, extra, "", 0, 0, PCRE_NOTBOL, NULL, 0);
+  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, NULL, 0);
 #endif /* HAVE_LIBPCRE */
 }
 
-- 
1.9.3
From 530fd765922b16643c78652ef036024fc4dd72eb Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 15 Sep 2014 18:33:19 -0700
Subject: [PATCH 10/10] grep: fix -P speedup bug with empty match

* src/pcresearch.c (NSUB): New top-level constant, replacing
'nsub' within Pexecute.
(Pcompile, Pexecute): Use it.
(Pexecute): Don't assume sub[1] is zero after a PCRE_ERROR_BADUTF8
match failure.
* tests/pcre-invalid-utf8-input: Test for this bug.
---
 src/pcresearch.c              | 32 +++++++++++++++++++-------------
 tests/pcre-invalid-utf8-input |  5 +++++
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/src/pcresearch.c b/src/pcresearch.c
index ce65758..c41f7ef 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -44,6 +44,10 @@ static pcre_jit_stack *jit_stack;
    string matches when that flag is used. */
 static int empty_match[2];
 
+/* This must be at least 2; everything after that is for performance
+   in pcre_exec. */
+enum { NSUB = 300 };
+
 void
 Pcompile (char const *pattern, size_t size)
 {
@@ -132,8 +136,10 @@ Pcompile (char const *pattern, size_t size)
 # endif
   free (re);
 
-  empty_match[false] = pcre_exec (cre, extra, "", 0, 0, PCRE_NOTBOL, NULL, 0);
-  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, NULL, 0);
+  int sub[NSUB];
+  empty_match[false] = pcre_exec (cre, extra, "", 0, 0,
+                                  PCRE_NOTBOL, sub, NSUB);
+  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, sub, NSUB);
 #endif /* HAVE_LIBPCRE */
 }
 
@@ -146,11 +152,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
   error (EXIT_TROUBLE, 0, _("internal error"));
   return -1;
 #else
-  /* This array must have at least two elements; everything after that
-     is just for performance improvement in pcre_exec. */
-  enum { nsub = 300 };
-  int sub[nsub];
-
+  int sub[NSUB];
   char const *p = start_ptr ? start_ptr : buf;
   bool bol = p[-1] == eolbyte;
   char const *line_start = buf;
@@ -174,15 +176,19 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
         {
           int options = bol ? 0 : PCRE_NOTBOL;
           int valid_bytes;
-          e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
+          e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, NSUB);
           if (e != PCRE_ERROR_BADUTF8)
             break;
           valid_bytes = sub[0];
-          e = (valid_bytes == 0
-               ? empty_match[bol]
-               : pcre_exec (cre, extra, p, valid_bytes, 0,
-                            options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
-                            sub, nsub));
+          if (valid_bytes == 0)
+            {
+              sub[1] = 0;
+              e = empty_match[bol];
+            }
+          else
+            e = pcre_exec (cre, extra, p, valid_bytes, 0,
+                           options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
+                           sub, NSUB);
           if (e != PCRE_ERROR_NOMATCH)
             break;
           p += valid_bytes + 1;
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
index 9da4b18..78bd1cf 100755
--- a/tests/pcre-invalid-utf8-input
+++ b/tests/pcre-invalid-utf8-input
@@ -21,4 +21,9 @@ test $? -eq 0 || fail=1
 LC_ALL=en_US.UTF-8 grep -P 'k$' in
 test $? -eq 1 || fail=1
 
+echo k >exp
+
+LC_ALL=en_US.UTF-8 grep -aoP 'k*' in >out || fail=1
+compare exp out || fail=1
+
 Exit $fail
-- 
1.9.3