Eric Blake wrote: > I'm committing this series to add a strstr module similar to memmem > ... > I'm also thinking that we do not need the c_strstr module - aside from its > comments on when it is safe to use a bytewise search even in a multibyte > locale, it behaves no differently than the POSIX specification of strstr.
We need the c-strstr module (see the other mail). But you are right regarding its implementation. Now that strstr is worst-case linear, c-strstr can use it. I'm applying the attached patch. > However, I can see having all three of strcasestr, c-strcasestr, and > mbscasestr, since it is conceivable to want single-byte locale-dependent case > insensitivity in strcasestr which differs from the locale-independent case- > insensitivity of c_strstr or the multibyte-safe mbscasestr. I dislike strcasestr because it _appears_ to be locale dependent if you test it with some old locales - but it does not work with the majority of locales in use today. The only reason to support it in gnulib is that BSD and glibc systems have it. 2008-01-10 Bruno Haible <[EMAIL PROTECTED]> Make c-strstr rely on strstr. * lib/c-strstr.c: Don't include str-kmp.h. (c_strstr): Define in terms of strstr. * modules/c-strstr (Files): Remove lib/str-kmp.h. (Depends-on): Remove stdbool, malloca, strnlen. Add strstr. *** lib/c-strstr.c.orig 2008-01-11 03:53:57.000000000 +0100 --- lib/c-strstr.c 2008-01-11 03:47:36.000000000 +0100 *************** *** 1,5 **** /* c-strstr.c -- substring search in C locale ! Copyright (C) 2005-2007 Free Software Foundation, Inc. Written by Bruno Haible <[EMAIL PROTECTED]>, 2005, 2007. This program is free software: you can redistribute it and/or modify --- 1,5 ---- /* c-strstr.c -- substring search in C locale ! Copyright (C) 2005-2008 Free Software Foundation, Inc. Written by Bruno Haible <[EMAIL PROTECTED]>, 2005, 2007. This program is free software: you can redistribute it and/or modify *************** *** 20,129 **** /* Specification. */ #include "c-strstr.h" - #include <stdbool.h> - #include <stdlib.h> #include <string.h> - #include "malloca.h" - - /* Knuth-Morris-Pratt algorithm. */ - #define CANON_ELEMENT(c) c - #include "str-kmp.h" - /* Find the first occurrence of NEEDLE in HAYSTACK. */ char * c_strstr (const char *haystack, const char *needle) { ! /* Be careful not to look at the entire extent of haystack or needle ! until needed. This is useful because of these two cases: ! - haystack may be very long, and a match of needle found early, ! - needle may be very long, and not even a short initial segment of ! needle may be found in haystack. */ ! if (*needle != '\0') ! { ! /* Minimizing the worst-case complexity: ! Let n = strlen(haystack), m = strlen(needle). ! The naïve algorithm is O(n*m) worst-case. ! The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a ! memory allocation. ! To achieve linear complexity and yet amortize the cost of the memory ! allocation, we activate the Knuth-Morris-Pratt algorithm only once ! the naïve algorithm has already run for some time; more precisely, ! when ! - the outer loop count is >= 10, ! - the average number of comparisons per outer loop is >= 5, ! - the total number of comparisons is >= m. ! But we try it only once. If the memory allocation attempt failed, ! we don't retry it. */ ! bool try_kmp = true; ! size_t outer_loop_count = 0; ! size_t comparison_count = 0; ! size_t last_ccount = 0; /* last comparison count */ ! const char *needle_last_ccount = needle; /* = needle + last_ccount */ ! ! /* Speed up the following searches of needle by caching its first ! character. */ ! unsigned char b = (unsigned char) *needle; ! ! needle++; ! for (;; haystack++) ! { ! if (*haystack == '\0') ! /* No match. */ ! return NULL; ! ! /* See whether it's advisable to use an asymptotically faster ! algorithm. */ ! if (try_kmp ! && outer_loop_count >= 10 ! && comparison_count >= 5 * outer_loop_count) ! { ! /* See if needle + comparison_count now reaches the end of ! needle. */ ! if (needle_last_ccount != NULL) ! { ! needle_last_ccount += ! strnlen (needle_last_ccount, comparison_count - last_ccount); ! if (*needle_last_ccount == '\0') ! needle_last_ccount = NULL; ! last_ccount = comparison_count; ! } ! if (needle_last_ccount == NULL) ! { ! /* Try the Knuth-Morris-Pratt algorithm. */ ! const char *result; ! bool success = ! knuth_morris_pratt_unibyte (haystack, needle - 1, &result); ! if (success) ! return (char *) result; ! try_kmp = false; ! } ! } ! ! outer_loop_count++; ! comparison_count++; ! if ((unsigned char) *haystack == b) ! /* The first character matches. */ ! { ! const char *rhaystack = haystack + 1; ! const char *rneedle = needle; ! ! for (;; rhaystack++, rneedle++) ! { ! if (*rneedle == '\0') ! /* Found a match. */ ! return (char *) haystack; ! if (*rhaystack == '\0') ! /* No match. */ ! return NULL; ! comparison_count++; ! if ((unsigned char) *rhaystack != (unsigned char) *rneedle) ! /* Nothing in this round. */ ! break; ! } ! } ! } ! } ! else ! return (char *) haystack; } --- 20,32 ---- /* Specification. */ #include "c-strstr.h" #include <string.h> /* Find the first occurrence of NEEDLE in HAYSTACK. */ char * c_strstr (const char *haystack, const char *needle) { ! /* POSIX says that strstr() interprets the strings as byte sequences, not ! as character sequences in the current locale. */ ! return strstr (haystack, needle); } *** modules/c-strstr.orig 2008-01-11 03:53:57.000000000 +0100 --- modules/c-strstr 2008-01-11 03:45:45.000000000 +0100 *************** *** 4,15 **** Files: lib/c-strstr.h lib/c-strstr.c - lib/str-kmp.h Depends-on: ! stdbool ! malloca ! strnlen configure.ac: --- 4,12 ---- Files: lib/c-strstr.h lib/c-strstr.c Depends-on: ! strstr configure.ac: