Hi, Mike Gran wrote: > As far as I can tell, the ISO C towupper will probably work correctly on > UCS-4 characters created by Gnulib functions like u32_conv_from_enc. But, > it seems that it isn't guaranteed to do so, since wint_t is not required to > be UCS-4. (I don't have a counterexample: I'm just going from what I've > read.)
This is correct: wchar_t is not required to be Unicode, and is not Unicode, for example on Solaris and FreeBSD. So, you cannot use towupper() on Unicode characters in portable code. Even on systems where wchar_t is UCS-4 (such as glibc systems), the towupper() function supports only those characters that map to multibyte characters in the current locale, and not all Unicode characters. (Example: It will not handle the Polish ł character when operating in a German ISO-8859-15 locale.) > If that is true, then it would be neat to have u32_toupper and u32_tolower > functions. You are right. I'm adding the functions uc_toupper, uc_tolower, and uc_totitle. These are the right functions to use for Unicode characters. Note, however, that it is better to use case mapping functions that operate on an entire string; this is the only way to handle German or Lithuanian specialities correctly. These functions are already declared in gnulib's "unicase.h", but not yet implemented as of today. 2009-02-08 Bruno Haible <br...@clisp.org> New module 'unicase/totitle'. * modules/unicase/totitle: New file. * lib/unicase/totitle.c: New file. New module 'unicase/tolower'. * modules/unicase/tolower: New file. * lib/unicase/tolower.c: New file. New module 'unicase/toupper'. * modules/unicase/toupper: New file. * lib/unicase/toupper.c: New file. * lib/unicase/simple-mapping.h: New file. * lib/gen-uni-tables.c (output_simple_mapping_test): New function. (mapping_table): New structure. (output_simple_mapping): New function. (main): Invoke output_simple_mapping_test and output_simple_mapping. * modules/gen-uni-tables (Description): Update. * lib/unicase/toupper.h: New file, automatically generated by gen-uni-tables. * lib/unicase/tolower.h: New file, automatically generated by gen-uni-tables. * lib/unicase/totitle.h: New file, automatically generated by gen-uni-tables. * tests/unicase/test-uc_toupper.c: New file, automatically generated by gen-uni-tables. 
* tests/unicase/test-uc_tolower.c: New file, automatically generated by gen-uni-tables. * tests/unicase/test-uc_totitle.c: New file, automatically generated by gen-uni-tables. New module 'unicase/base'. * modules/unicase/base: New file. * lib/unicase.h: New file.
================================ lib/unicase.h ================================
/* Unicode character case mappings.
   Copyright (C) 2002, 2009 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#ifndef _UNICASE_H
#define _UNICASE_H

/* NOTE(review): presumably the source of ucs4_t and of the uint8_t /
   uint16_t / uint32_t types used below; no other header that could
   declare them is included here.  Confirm against unitypes.h.  */
#include "unitypes.h"

/* Get size_t.  */
#include <stddef.h>

/* Give the declarations C linkage when this header is compiled as C++.  */
#ifdef __cplusplus
extern "C" {
#endif

/* ========================================================================= */

/* Character case mappings.
   These mappings are locale and context independent.
   Each function takes one Unicode character and returns one Unicode
   character (ucs4_t in, ucs4_t out).
   WARNING! These functions are not sufficient for languages such as German.
   Prefer the string functions declared further down in this header: they
   treat an entire string at once and are language aware.  */

/* Return the uppercase mapping of the Unicode character UC.  */
extern ucs4_t uc_toupper (ucs4_t uc);

/* Return the lowercase mapping of the Unicode character UC.  */
extern ucs4_t uc_tolower (ucs4_t uc);

/* Return the titlecase mapping of the Unicode character UC.  */
extern ucs4_t uc_totitle (ucs4_t uc);

/* ========================================================================= */

/* String case mappings.
   Every string function comes in three variants, one per unit type:
   u8_* operates on uint8_t, u16_* on uint16_t and u32_* on uint32_t
   (presumably UTF-8, UTF-16 and UTF-32 strings respectively -- TODO confirm,
   this header does not say).  */

/* These functions are locale dependent.
   The iso639_language argument identifies the language (e.g. "tr" for
   Turkish).  NULL means to use locale independent case mappings.

   NOTE(review): this header does not document the remaining parameters.
   Presumably S/N are the input string and its length in units, RESULTBUF
   is an optional caller-supplied output buffer and *LENGTHP carries the
   buffer size in and the result length out -- confirm against the
   implementation before relying on it.  */

/* Return the ISO 639 language code of the current locale.
   Return "" if it is unknown, or in the "C" locale.
   The result is returned as 'const char *', so callers must not modify it;
   it has the same type as the iso639_language parameter of the functions
   below.  */
extern const char *
       uc_locale_language (void);

/* Return the uppercase mapping of a string.  */
extern uint8_t *
       u8_toupper (const uint8_t *s, size_t n, const char *iso639_language,
                   uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_toupper (const uint16_t *s, size_t n, const char *iso639_language,
                    uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_toupper (const uint32_t *s, size_t n, const char *iso639_language,
                    uint32_t *resultbuf, size_t *lengthp);

/* Return the lowercase mapping of a string.  */
extern uint8_t *
       u8_tolower (const uint8_t *s, size_t n, const char *iso639_language,
                   uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_tolower (const uint16_t *s, size_t n, const char *iso639_language,
                    uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_tolower (const uint32_t *s, size_t n, const char *iso639_language,
                    uint32_t *resultbuf, size_t *lengthp);

/* Return the titlecase mapping of a string.  */
extern uint8_t *
       u8_totitle (const uint8_t *s, size_t n, const char *iso639_language,
                   uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_totitle (const uint16_t *s, size_t n, const char *iso639_language,
                    uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_totitle (const uint32_t *s, size_t n, const char *iso639_language,
                    uint32_t *resultbuf, size_t *lengthp);

/* Return the case folded string.
   Unlike the toupper/tolower/totitle string functions, these take no
   iso639_language argument.  */
extern uint8_t *
       u8_casefold (const uint8_t *s, size_t n,
                    uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_casefold (const uint16_t *s, size_t n,
                     uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_casefold (const uint32_t *s, size_t n,
                     uint32_t *resultbuf, size_t *lengthp);

/* Compare S1 and S2, ignoring case.
   Return -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2.
   Each string is passed as a pointer plus a size_t count (S1/N1, S2/N2).
   These functions take no iso639_language argument.  */
extern int
       u8_casecmp (const uint8_t *s1, size_t n1,
                   const uint8_t *s2, size_t n2);
extern int
       u16_casecmp (const uint16_t *s1, size_t n1,
                    const uint16_t *s2, size_t n2);
extern int
       u32_casecmp (const uint32_t *s1, size_t n1,
                    const uint32_t *s2, size_t n2);

/* Compare S1 and S2 using the collation rules of the current locale,
   ignoring case.
   Return -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2.
   Upon failure, set errno and return any value.
   NOTE(review): since the return value on failure is unspecified, a caller
   that needs to detect failure presumably has to clear errno before the
   call and inspect it afterwards -- confirm against the implementation.  */
extern int
       u8_casecoll (const uint8_t *s1, size_t n1,
                    const uint8_t *s2, size_t n2);
extern int
       u16_casecoll (const uint16_t *s1, size_t n1,
                     const uint16_t *s2, size_t n2);
extern int
       u32_casecoll (const uint32_t *s1, size_t n1,
                     const uint32_t *s2, size_t n2);

/* ========================================================================= */

#ifdef __cplusplus
}
#endif

#endif /* _UNICASE_H */