> > Thus, wouldn't it be wise to fix strcasecmp itself, and include it > > unconditionally in GnuLib ? > > Yes. Here's a patch to that effect. (Completely untested so far. Also > quite inefficient. One can do better with mbrtowc().)
Well, that code was broken. And similar code I wrote based on mbrtowc() was broken as well: It did not fulfill the transitivity rule cmp (s1, s2) < 0 && cmp (s2, s3) < 0 ==> cmp (s1, s3) < 0 when some of s1, s2, s3 were strings containing invalid multibyte characters. The lesson to be learned from this is: you cannot write good multibyte support if you don't have a "multibyte character" abstraction. If you use the naked C99 functions, the code will be so complicated that it will never behave well and consistently. So I dug out an earlier 'mbchar' abstraction, and rewrote strcasecmp() as follows. With this patch, a simple test program shows that it works fine in a Turkish locale: ======================================================== #include <locale.h> #include <stdio.h> int main (int argc, char * argv[]) { setlocale (LC_ALL, ""); printf ("cmp %s %s -> %d\n", argv[1], argv[2], rpl_strcasecmp (argv[1], argv[2])); return 0; } ========================================================= $ ./a.out abi ABI cmp abi ABI -> 0 $ LC_ALL=tr_TR.UTF-8 ./a.out abi ABI cmp abi ABI -> -148 Bruno Index: modules/strcase =================================================================== RCS file: /cvsroot/gnulib/gnulib/modules/strcase,v retrieving revision 1.5 diff -c -3 -r1.5 strcase *** modules/strcase 22 Sep 2004 15:11:04 -0000 1.5 --- modules/strcase 16 Aug 2005 12:13:35 -0000 *************** *** 6,13 **** --- 6,16 ---- lib/strcasecmp.c lib/strncasecmp.c m4/strcase.m4 + m4/mbrtowc.m4 Depends-on: + strnlen1 + mbchar configure.ac: gl_STRCASE Index: lib/strcase.h =================================================================== RCS file: /cvsroot/gnulib/gnulib/lib/strcase.h,v retrieving revision 1.4 diff -c -3 -r1.4 strcase.h *** lib/strcase.h 14 May 2005 06:03:58 -0000 1.4 --- lib/strcase.h 16 Aug 2005 12:13:35 -0000 *************** *** 1,5 **** /* Case-insensitive string comparison functions. ! Copyright (C) 1995-1996, 2001, 2003 Free Software Foundation, Inc. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by --- 1,5 ---- /* Case-insensitive string comparison functions. ! Copyright (C) 1995-1996, 2001, 2003, 2005 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by *************** *** 29,35 **** /* Compare strings S1 and S2, ignoring case, returning less than, equal to or greater than zero if S1 is lexicographically less than, equal to or greater than S2. ! Note: This function does not work correctly in multibyte locales. */ extern int strcasecmp (const char *s1, const char *s2); /* Compare no more than N characters of strings S1 and S2, ignoring case, --- 29,36 ---- /* Compare strings S1 and S2, ignoring case, returning less than, equal to or greater than zero if S1 is lexicographically less than, equal to or greater than S2. ! Note: This function may, in multibyte locales, return 0 for strings of ! different lengths! */ extern int strcasecmp (const char *s1, const char *s2); /* Compare no more than N characters of strings S1 and S2, ignoring case, Index: lib/strncasecmp.c =================================================================== RCS file: /cvsroot/gnulib/gnulib/lib/strncasecmp.c,v retrieving revision 1.4 diff -c -3 -r1.4 strncasecmp.c *** lib/strncasecmp.c 2 Jan 1999 15:55:44 -0000 1.4 --- lib/strncasecmp.c 16 Aug 2005 12:13:35 -0000 *************** *** 1,2 **** ! #define LENGTH_LIMIT ! #include "strcasecmp.c" --- 1,58 ---- ! /* strncasecmp.c -- case insensitive string comparator ! Copyright (C) 1998, 1999 Free Software Foundation, Inc. ! ! This program is free software; you can redistribute it and/or modify ! it under the terms of the GNU General Public License as published by ! the Free Software Foundation; either version 2, or (at your option) ! any later version. ! ! 
This program is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU General Public License for more details. ! ! You should have received a copy of the GNU General Public License ! along with this program; if not, write to the Free Software Foundation, ! Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ ! ! #if HAVE_CONFIG_H ! # include <config.h> ! #endif ! ! /* Specification. */ ! #include "strcase.h" ! ! #include <ctype.h> ! ! #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) ! ! /* Compare no more than N bytes of strings S1 and S2, ! ignoring case, returning less than, equal to or ! greater than zero if S1 is lexicographically less ! than, equal to or greater than S2. */ ! ! int ! strncasecmp (const char *s1, const char *s2, size_t n) ! { ! register const unsigned char *p1 = (const unsigned char *) s1; ! register const unsigned char *p2 = (const unsigned char *) s2; ! unsigned char c1, c2; ! ! if (p1 == p2 || n == 0) ! return 0; ! ! do ! { ! c1 = TOLOWER (*p1); ! c2 = TOLOWER (*p2); ! ! if (--n == 0 || c1 == '\0') ! break; ! ! ++p1; ! ++p2; ! } ! while (c1 == c2); ! ! return c1 - c2; ! } Index: lib/strcasecmp.c =================================================================== RCS file: /cvsroot/gnulib/gnulib/lib/strcasecmp.c,v retrieving revision 1.6 diff -c -3 -r1.6 strcasecmp.c *** lib/strcasecmp.c 14 May 2005 06:03:58 -0000 1.6 --- lib/strcasecmp.c 16 Aug 2005 12:13:35 -0000 *************** *** 1,5 **** ! /* strcasecmp.c -- case insensitive string comparator ! Copyright (C) 1998, 1999 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by --- 1,7 ---- ! /* Case-insensitive string comparison function. ! Copyright (C) 1998, 1999, 2005 Free Software Foundation, Inc. ! 
Written by Bruno Haible <[EMAIL PROTECTED]>, 2005, ! based on earlier glibc code. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by *************** *** 15,66 **** along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ ! #if HAVE_CONFIG_H # include <config.h> #endif ! #ifdef LENGTH_LIMIT ! # define STRXCASECMP_FUNCTION strncasecmp ! # define STRXCASECMP_DECLARE_N , size_t n ! # define LENGTH_LIMIT_EXPR(Expr) Expr ! #else ! # define STRXCASECMP_FUNCTION strcasecmp ! # define STRXCASECMP_DECLARE_N /* empty */ ! # define LENGTH_LIMIT_EXPR(Expr) 0 ! #endif - #include <stddef.h> #include <ctype.h> ! #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) ! /* Compare {{no more than N characters of }}strings S1 and S2, ! ignoring case, returning less than, equal to or ! greater than zero if S1 is lexicographically less ! than, equal to or greater than S2. */ ! int ! STRXCASECMP_FUNCTION (const char *s1, const char *s2 STRXCASECMP_DECLARE_N) { ! register const unsigned char *p1 = (const unsigned char *) s1; ! register const unsigned char *p2 = (const unsigned char *) s2; ! unsigned char c1, c2; ! if (p1 == p2 || LENGTH_LIMIT_EXPR (n == 0)) return 0; ! do { ! c1 = TOLOWER (*p1); ! c2 = TOLOWER (*p2); ! if (LENGTH_LIMIT_EXPR (--n == 0) || c1 == '\0') ! break; ! ++p1; ! ++p2; } ! while (c1 == c2); ! return c1 - c2; } --- 17,241 ---- along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ ! #ifdef HAVE_CONFIG_H # include <config.h> #endif ! /* Specification. */ ! #include "strcase.h" #include <ctype.h> ! #if HAVE_MBRTOWC ! #include "strnlen1.h" ! /* Like mbiter.h, except it doesn't look at the entire string. */ ! ! #include "mbchar.h" ! ! #include <assert.h> ! #include <stdbool.h> ! #include <stdlib.h> ! 
#include <wchar.h> ! #include <wctype.h> ! ! struct mbiter_multi ! { ! bool at_end; /* true if the end of the string has been reached */ ! bool in_shift; /* true if next byte may not be interpreted as ASCII */ ! mbstate_t state; /* if in_shift: current shift state */ ! bool next_done; /* true if mbi_avail has already filled the following */ ! struct mbchar cur; /* the current character: ! const char *cur.ptr pointer to current character ! The following are only valid after mbi_avail. ! size_t cur.bytes number of bytes of current character ! bool cur.wc_valid true if wc is a valid wide character ! wchar_t cur.wc if wc_valid: the current character ! */ ! }; ! ! static inline void ! mbiter_multi_next (struct mbiter_multi *iter) ! { ! if (iter->next_done) ! return; ! if (iter->in_shift) ! goto with_shift; ! /* Handle most ASCII characters quickly, without calling mbrtowc(). */ ! if (is_basic (*iter->cur.ptr)) ! { ! /* These characters are part of the basic character set. ISO C 99 ! guarantees that their wide character code is identical to their ! char code. */ ! iter->cur.bytes = 1; ! iter->cur.wc = *iter->cur.ptr; ! iter->cur.wc_valid = true; ! } ! else ! { ! assert (mbsinit (&iter->state)); ! iter->in_shift = true; ! with_shift: ! iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr, ! strnlen1 (iter->cur.ptr, MB_CUR_MAX), ! &iter->state); ! if (iter->cur.bytes == (size_t) -1) ! { ! /* An invalid multibyte sequence was encountered. */ ! iter->cur.bytes = 1; ! iter->cur.wc_valid = false; ! /* Whether to set iter->in_shift = false and reset iter->state ! or not is not very important; the string is bogus anyway. */ ! } ! else if (iter->cur.bytes == (size_t) -2) ! { ! /* An incomplete multibyte character at the end. */ ! iter->cur.bytes = strlen (iter->cur.ptr) + 1; ! iter->cur.wc_valid = false; ! /* Whether to set iter->in_shift = false and reset iter->state ! or not is not important; the string end is reached anyway. */ ! } ! else ! { ! if (iter->cur.bytes == 0) ! 
{ ! /* A null wide character was encountered. */ ! iter->cur.bytes = 1; ! assert (*iter->cur.ptr == '\0'); ! assert (iter->cur.wc == 0); ! } ! iter->cur.wc_valid = true; ! ! /* When in the initial state, we can go back treating ASCII ! characters more quickly. */ ! if (mbsinit (&iter->state)) ! iter->in_shift = false; ! } ! } ! iter->next_done = true; ! } ! ! static inline void ! mbiter_multi_reloc (struct mbiter_multi *iter, ptrdiff_t ptrdiff) { ! iter->cur.ptr += ptrdiff; ! } ! ! /* Iteration macros. */ ! typedef struct mbiter_multi mbi_iterator_t; ! #define mbi_init(iter, startptr) \ ! ((iter).cur.ptr = (startptr), (iter).at_end = false, \ ! (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \ ! (iter).next_done = false) ! #define mbi_avail(iter) \ ! (!(iter).at_end && (mbiter_multi_next (&(iter)), true)) ! #define mbi_advance(iter) \ ! ((mb_isnul ((iter).cur) ? ((iter).at_end = true) : 0), \ ! (iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false) ! ! /* Access to the current character. */ ! #define mbi_cur(iter) (iter).cur ! #define mbi_cur_ptr(iter) (iter).cur.ptr ! ! #endif ! ! #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) ! /* Compare strings S1 and S2, ignoring case, returning less than, equal to or ! greater than zero if S1 is lexicographically less than, equal to or greater ! than S2. ! Note: This function may, in multibyte locales, return 0 for strings of ! different lengths! */ ! int ! strcasecmp (const char *s1, const char *s2) ! { ! if (s1 == s2) return 0; ! /* Be careful not to look at the entire extent of s1 or s2 until needed. ! This is useful because when two strings differ, the difference is ! most often already in the very few first characters. */ ! #if HAVE_MBRTOWC ! if (MB_CUR_MAX > 1) { ! mbi_iterator_t iter1; ! mbi_iterator_t iter2; ! mbi_init (iter1, s1); ! mbi_init (iter2, s2); ! while (mbi_avail (iter1) && mbi_avail (iter2)) ! { ! /* Sort invalid characters after all valid ones. */ ! 
if (!mbi_cur (iter1).wc_valid) ! { ! if (!mbi_cur (iter2).wc_valid) ! { ! /* Compare two invalid characters. */ ! int cmp; ! ! if (mbi_cur (iter1).bytes > mbi_cur (iter2).bytes) ! return 1; ! if (mbi_cur (iter1).bytes < mbi_cur (iter2).bytes) ! return -1; ! cmp = memcmp (mbi_cur_ptr (iter1), mbi_cur_ptr (iter2), ! mbi_cur (iter1).bytes); ! if (cmp != 0) ! return cmp; ! } ! else ! /* mbi_cur (iter1) invalid, mbi_cur (iter2) valid. */ ! return 1; ! } ! else ! { ! if (!mbi_cur (iter2).wc_valid) ! /* mbi_cur (iter1) valid, mbi_cur (iter2) invalid. */ ! return -1; ! else ! { ! /* Compare two valid characters. */ ! wchar_t c1 = towlower (mbi_cur (iter1).wc); ! wchar_t c2 = towlower (mbi_cur (iter2).wc); ! ! if (c1 > c2) ! return 1; ! if (c1 < c2) ! return -1; ! } ! } ! mbi_advance (iter1); ! mbi_advance (iter2); ! } ! if (mbi_avail (iter1)) ! /* s2 terminated before s1. */ ! return 1; ! if (mbi_avail (iter2)) ! /* s1 terminated before s2. */ ! return -1; ! return 0; } ! else ! #endif ! { ! const unsigned char *p1 = (const unsigned char *) s1; ! const unsigned char *p2 = (const unsigned char *) s2; ! unsigned char c1, c2; ! ! do ! { ! c1 = TOLOWER (*p1); ! c2 = TOLOWER (*p2); ! ! if (c1 == '\0') ! break; ! ! ++p1; ! ++p2; ! } ! while (c1 == c2); ! return c1 - c2; ! } } Index: m4/strcase.m4 =================================================================== RCS file: /cvsroot/gnulib/gnulib/m4/strcase.m4,v retrieving revision 1.2 diff -c -3 -r1.2 strcase.m4 *** m4/strcase.m4 18 Jan 2005 13:07:56 -0000 1.2 --- m4/strcase.m4 16 Aug 2005 12:13:35 -0000 *************** *** 1,5 **** ! # strcase.m4 serial 1 ! dnl Copyright (C) 2002 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, dnl with or without modifications, as long as this notice is preserved. --- 1,5 ---- ! # strcase.m4 serial 2 ! dnl Copyright (C) 2002, 2005 Free Software Foundation, Inc. 
dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, dnl with or without modifications, as long as this notice is preserved. *************** *** 12,21 **** AC_DEFUN([gl_FUNC_STRCASECMP], [ ! AC_REPLACE_FUNCS(strcasecmp) ! if test $ac_cv_func_strcasecmp = no; then ! gl_PREREQ_STRCASECMP ! fi ]) AC_DEFUN([gl_FUNC_STRNCASECMP], --- 12,22 ---- AC_DEFUN([gl_FUNC_STRCASECMP], [ ! dnl No known system has a strcasecmp() function that works correctly in ! dnl multibyte locales. Therefore we use our version always. ! AC_LIBOBJ(strcasecmp) ! AC_DEFINE(strcasecmp, rpl_strcasecmp, [Define to rpl_strcasecmp always.]) ! gl_PREREQ_STRCASECMP ]) AC_DEFUN([gl_FUNC_STRNCASECMP], *************** *** 28,34 **** # Prerequisites of lib/strcasecmp.c. AC_DEFUN([gl_PREREQ_STRCASECMP], [ ! : ]) # Prerequisites of lib/strncasecmp.c. --- 29,35 ---- # Prerequisites of lib/strcasecmp.c. AC_DEFUN([gl_PREREQ_STRCASECMP], [ ! gl_FUNC_MBRTOWC ]) # Prerequisites of lib/strncasecmp.c. _______________________________________________ bug-gnulib mailing list bug-gnulib@gnu.org http://lists.gnu.org/mailman/listinfo/bug-gnulib