-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 According to Eric Blake on 1/10/2008 8:33 PM: | should I go ahead and prepare a patch for strcasestr, c-strcasestr, and | memcasecmp to utilize str-two-way.h?
How's this for strcasestr? Glibc's strcasestr is quadratic (no surprise there, given our track record); cygwin 1.5.x does not have strcasestr but CVS cygwin does, and it is quadratic (as of today, although I've posted a similar patch there). We could add a strcasestr-simple, to parallel memmem-simple, if desired. - -- Don't work too hard, make some time for fun as well! Eric Blake [EMAIL PROTECTED] -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.5 (Cygwin) Comment: Public key at home.comcast.net/~ericblake/eblake.gpg Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org iD8DBQFHhv3Z84KuGfSFAYARArsNAJ95wB3UAQCayfXnRM2qew5tk5If2gCgoxt8 PiaqTo9MdzmclcKbWjkxuIc= =0ic5 -----END PGP SIGNATURE-----
From 7051f3d9bab2bed584317204ce41c5dedc1f1b60 Mon Sep 17 00:00:00 2001 From: Eric Blake <[EMAIL PROTECTED]> Date: Thu, 10 Jan 2008 22:22:51 -0700 Subject: [PATCH] Convert strcasestr module to use Two-Way algorithm. * modules/strcasestr (Depends-on): Remove malloca, strnlen. (Files): Use str-two-way.h, not str-kmp.h. * lib/string.in.h (rpl_strcasestr): Declare. * m4/strcasestr.m4 (gl_FUNC_STRCASESTR): Check for linear performance. * lib/strcasestr.c (strcasestr): Simplify, and avoid malloc. * modules/string (Makefile.am): Support strcasestr. * m4/string_h.m4 (gl_HEADER_STRING_H_DEFAULTS): Likewise. * modules/strcasestr-tests (Depends-on): Check for alarm. * tests/test-strcasestr.c: Augment test. * lib/str-two-way.h: Clean up stray macro. Signed-off-by: Eric Blake <[EMAIL PROTECTED]> --- ChangeLog | 15 +++++ lib/str-two-way.h | 1 + lib/strcasestr.c | 140 ++++++++++++++------------------------------- lib/string.in.h | 10 ++- m4/strcasestr.m4 | 40 ++++++++++++- m4/string_h.m4 | 1 + modules/strcasestr | 4 +- modules/strcasestr-tests | 1 + modules/string | 1 + tests/test-strcasestr.c | 4 +- 10 files changed, 109 insertions(+), 108 deletions(-) diff --git a/ChangeLog b/ChangeLog index 1f1cb43..b36c3e8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2008-01-11 Eric Blake <[EMAIL PROTECTED]> + + Convert strcasestr module to use Two-Way algorithm. + * modules/strcasestr (Depends-on): Remove malloca, strnlen. + (Files): Use str-two-way.h, not str-kmp.h. + * lib/string.in.h (rpl_strcasestr): Declare. + * m4/strcasestr.m4 (gl_FUNC_STRCASESTR): Check for linear + performance. + * lib/strcasestr.c (strcasestr): Simplify, and avoid malloc. + * modules/string (Makefile.am): Support strcasestr. + * m4/string_h.m4 (gl_HEADER_STRING_H_DEFAULTS): Likewise. + * modules/strcasestr-tests (Depends-on): Check for alarm. + * tests/test-strcasestr.c: Augment test. + * lib/str-two-way.h: Clean up stray macro. 
+ 2008-01-10 Eric Blake <[EMAIL PROTECTED]> * m4/strstr.m4: Delete cruft from copy-n-paste. diff --git a/lib/str-two-way.h b/lib/str-two-way.h index 3aa3a1b..d144ac9 100644 --- a/lib/str-two-way.h +++ b/lib/str-two-way.h @@ -422,5 +422,6 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, #undef AVAILABLE #undef CANON_ELEMENT +#undef CMP_FUNC #undef MAX #undef RETURN_TYPE diff --git a/lib/strcasestr.c b/lib/strcasestr.c index 34f36a7..9a2db9b 100644 --- a/lib/strcasestr.c +++ b/lib/strcasestr.c @@ -1,5 +1,5 @@ /* Case-insensitive searching in a string. - Copyright (C) 2005-2007 Free Software Foundation, Inc. + Copyright (C) 2005-2008 Free Software Foundation, Inc. Written by Bruno Haible <[EMAIL PROTECTED]>, 2005. This program is free software; you can redistribute it and/or modify @@ -23,109 +23,57 @@ #include <ctype.h> #include <stdbool.h> -#include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */ - -#include "malloca.h" #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) -/* Knuth-Morris-Pratt algorithm. */ +/* Two-Way algorithm. */ +#define RETURN_TYPE char * +#define AVAILABLE(h, h_l, j, n_l) \ + (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ + && ((h_l) = (j) + (n_l))) #define CANON_ELEMENT(c) TOLOWER (c) -#include "str-kmp.h" +#define CMP_FUNC strncasecmp +#include "str-two-way.h" -/* Find the first occurrence of NEEDLE in HAYSTACK, using case-insensitive - comparison. - Note: This function may, in multibyte locales, return success even if - strlen (haystack) < strlen (needle) ! */ +/* Find the first occurrence of NEEDLE in HAYSTACK, using + case-insensitive comparison. This function gives unspecified + results in multibyte locales. */ char * -strcasestr (const char *haystack, const char *needle) +strcasestr (const char *haystack_start, const char *needle_start) { - if (*needle != '\0') - { - /* Minimizing the worst-case complexity: - Let n = strlen(haystack), m = strlen(needle). 
- The naïve algorithm is O(n*m) worst-case. - The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a - memory allocation. - To achieve linear complexity and yet amortize the cost of the memory - allocation, we activate the Knuth-Morris-Pratt algorithm only once - the naïve algorithm has already run for some time; more precisely, - when - - the outer loop count is >= 10, - - the average number of comparisons per outer loop is >= 5, - - the total number of comparisons is >= m. - But we try it only once. If the memory allocation attempt failed, - we don't retry it. */ - bool try_kmp = true; - size_t outer_loop_count = 0; - size_t comparison_count = 0; - size_t last_ccount = 0; /* last comparison count */ - const char *needle_last_ccount = needle; /* = needle + last_ccount */ - - /* Speed up the following searches of needle by caching its first - character. */ - unsigned char b = TOLOWER ((unsigned char) *needle); + const char *haystack = haystack_start; + const char *needle = needle_start; + size_t needle_len; /* Length of NEEDLE. */ + size_t haystack_len; /* Known minimum length of HAYSTACK. */ + bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ + /* Determine length of NEEDLE, and in the process, make sure + HAYSTACK is at least as long (no point processing all of a long + NEEDLE if HAYSTACK is too short). */ + while (*haystack && *needle) + { + ok &= TOLOWER (*haystack) == TOLOWER (*needle); + haystack++; needle++; - for (;; haystack++) - { - if (*haystack == '\0') - /* No match. */ - return NULL; - - /* See whether it's advisable to use an asymptotically faster - algorithm. */ - if (try_kmp - && outer_loop_count >= 10 - && comparison_count >= 5 * outer_loop_count) - { - /* See if needle + comparison_count now reaches the end of - needle. 
*/ - if (needle_last_ccount != NULL) - { - needle_last_ccount += - strnlen (needle_last_ccount, comparison_count - last_ccount); - if (*needle_last_ccount == '\0') - needle_last_ccount = NULL; - last_ccount = comparison_count; - } - if (needle_last_ccount == NULL) - { - /* Try the Knuth-Morris-Pratt algorithm. */ - const char *result; - bool success = - knuth_morris_pratt_unibyte (haystack, needle - 1, &result); - if (success) - return (char *) result; - try_kmp = false; - } - } - - outer_loop_count++; - comparison_count++; - if (TOLOWER ((unsigned char) *haystack) == b) - /* The first character matches. */ - { - const char *rhaystack = haystack + 1; - const char *rneedle = needle; - - for (;; rhaystack++, rneedle++) - { - if (*rneedle == '\0') - /* Found a match. */ - return (char *) haystack; - if (*rhaystack == '\0') - /* No match. */ - return NULL; - comparison_count++; - if (TOLOWER ((unsigned char) *rhaystack) - != TOLOWER ((unsigned char) *rneedle)) - /* Nothing in this round. */ - break; - } - } - } } - else - return (char *) haystack; + if (*needle) + return NULL; + if (ok) + return (char *) haystack_start; + needle_len = needle - needle_start; + needle -= needle_len; + haystack = haystack_start + 1; + haystack_len = needle_len - 1; + + /* Perform the search. Abstract memory is considered to be an array + of 'unsigned char' values, not an array of 'char' values. See + ISO C 99 section 6.2.6.1. 
*/ + if (needle_len < LONG_NEEDLE_THRESHOLD) + return two_way_short_needle ((const unsigned char *) haystack, + haystack_len, + (const unsigned char *) needle, needle_len); + return two_way_long_needle ((const unsigned char *) haystack, haystack_len, + (const unsigned char *) needle, needle_len); } + +#undef LONG_NEEDLE_THRESHOLD diff --git a/lib/string.in.h b/lib/string.in.h index 4d68cd9..e34645d 100644 --- a/lib/string.in.h +++ b/lib/string.in.h @@ -313,11 +313,15 @@ char *strstr (const char *haystack, const char *needle) /* Find the first occurrence of NEEDLE in HAYSTACK, using case-insensitive comparison. */ -#if ! @HAVE_STRCASESTR@ +#if @GNULIB_STRCASESTR@ +# if @REPLACE_STRCASESTR@ +# define strcasestr rpl_strcasestr +# endif +# if ! @HAVE_STRCASESTR@ || @REPLACE_STRCASESTR@ extern char *strcasestr (const char *haystack, const char *needle) __attribute__ ((__pure__)); -#endif -#if defined GNULIB_POSIXCHECK +# endif +#elif defined GNULIB_POSIXCHECK /* strcasestr() does not work with multibyte strings: It is a glibc extension, and glibc implements it only for unibyte locales. */ diff --git a/m4/strcasestr.m4 b/m4/strcasestr.m4 index 52f3a58..9583b14 100644 --- a/m4/strcasestr.m4 +++ b/m4/strcasestr.m4 @@ -1,5 +1,5 @@ -# strcasestr.m4 serial 6 -dnl Copyright (C) 2005, 2007 Free Software Foundation, Inc. +# strcasestr.m4 serial 7 +dnl Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, dnl with or without modifications, as long as this notice is preserved. @@ -7,10 +7,44 @@ dnl with or without modifications, as long as this notice is preserved. 
AC_DEFUN([gl_FUNC_STRCASESTR], [ AC_REQUIRE([gl_HEADER_STRING_H_DEFAULTS]) - AC_REPLACE_FUNCS(strcasestr) + AC_REPLACE_FUNCS([strcasestr]) if test $ac_cv_func_strcasestr = no; then HAVE_STRCASESTR=0 gl_PREREQ_STRCASESTR + else + AC_CACHE_CHECK([whether strcasestr works in linear time], + [gl_cv_func_strcasestr_linear], + [AC_RUN_IFELSE([AC_LANG_PROGRAM([ +#include <string.h> /* for strcasestr */ +#include <stdlib.h> /* for malloc */ +#include <unistd.h> /* for alarm */ +], [[size_t m = 1000000; + char *haystack = (char *) malloc (2 * m + 2); + char *needle = (char *) malloc (m + 2); + void *result = 0; + /* Failure to compile this test due to missing alarm is okay, + since all such platforms (mingw) also lack strcasestr. */ + alarm (5); + /* Check for quadratic performance. */ + if (haystack && needle) + { + memset (haystack, 'A', 2 * m); + haystack[2 * m] = 'B'; + haystack[2 * m + 1] = 0; + memset (needle, 'A', m); + needle[m] = 'B'; + needle[m + 1] = 0; + result = strcasestr (haystack, needle); + } + return !result;]])], + [gl_cv_func_strcasestr_linear=yes], [gl_cv_func_strcasestr_linear=no], + [dnl pessimistically assume the worst, since even glibc 2.6.1 + dnl has quadratic complexity in its strcasestr + gl_cv_func_strcasestr_linear="guessing no"])]) + if test "$gl_cv_func_strcasestr_linear" != yes; then + REPLACE_STRCASESTR=1 + AC_LIBOBJ([strcasestr]) + fi fi ]) diff --git a/m4/string_h.m4 b/m4/string_h.m4 index 1598c39..a50b3e3 100644 --- a/m4/string_h.m4 +++ b/m4/string_h.m4 @@ -79,5 +79,6 @@ AC_DEFUN([gl_HEADER_STRING_H_DEFAULTS], HAVE_DECL_STRERROR=1; AC_SUBST([HAVE_DECL_STRERROR]) REPLACE_STRERROR=0; AC_SUBST([REPLACE_STRERROR]) REPLACE_MEMMEM=0; AC_SUBST([REPLACE_MEMMEM]) + REPLACE_STRCASESTR=0; AC_SUBST([REPLACE_STRCASESTR]) REPLACE_STRSTR=0; AC_SUBST([REPLACE_STRSTR]) ]) diff --git a/modules/strcasestr b/modules/strcasestr index 884edfd..5be4ad9 100644 --- a/modules/strcasestr +++ b/modules/strcasestr @@ -3,14 +3,12 @@ strcasestr() function: 
case-insensitive search for a substring in a string. Files: lib/strcasestr.c -lib/str-kmp.h +lib/str-two-way.h m4/strcasestr.m4 Depends-on: string stdbool -malloca -strnlen configure.ac: gl_FUNC_STRCASESTR diff --git a/modules/strcasestr-tests b/modules/strcasestr-tests index e472d5b..e5262cc 100644 --- a/modules/strcasestr-tests +++ b/modules/strcasestr-tests @@ -4,6 +4,7 @@ tests/test-strcasestr.c Depends-on: configure.ac: +AC_CHECK_DECLS_ONCE([alarm]) Makefile.am: TESTS += test-strcasestr diff --git a/modules/string b/modules/string index 431a322..3e82431 100644 --- a/modules/string +++ b/modules/string @@ -68,6 +68,7 @@ string.h: string.in.h -e 's|@''HAVE_DECL_STRTOK_R''@|$(HAVE_DECL_STRTOK_R)|g' \ -e 's|@''HAVE_DECL_STRERROR''@|$(HAVE_DECL_STRERROR)|g' \ -e 's|@''REPLACE_MEMMEM''@|$(REPLACE_MEMMEM)|g' \ + -e 's|@''REPLACE_STRCASESTR''@|$(REPLACE_STRCASESTR)|g' \ -e 's|@''REPLACE_STRSTR''@|$(REPLACE_STRSTR)|g' \ -e 's|@''REPLACE_STRERROR''@|$(REPLACE_STRERROR)|g' \ -e '/definition of GL_LINK_WARNING/r $(LINK_WARNING_H)' \ diff --git a/tests/test-strcasestr.c b/tests/test-strcasestr.c index ceaab79..64a09f2 100644 --- a/tests/test-strcasestr.c +++ b/tests/test-strcasestr.c @@ -1,5 +1,5 @@ /* Test of case-insensitive searching in a string. - Copyright (C) 2007 Free Software Foundation, Inc. + Copyright (C) 2007, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -110,7 +110,6 @@ main () } /* Check that the asymptotic worst-case complexity is not quadratic. */ -#if !HAVE_STRCASESTR /* The system's strcasestr() function fails this test. */ { size_t m = 1000000; char *haystack = (char *) malloc (2 * m + 2); @@ -135,7 +134,6 @@ main () if (haystack != NULL) free (haystack); } -#endif return 0; } -- 1.5.3.5