The function mbrtoc32 transforms a multibyte character to a char32_t, thus solving the "wchar_t mess", explained in https://www.gnu.org/software/libunistring/manual/html_node/The-wchar_005ft-mess.html .
But while ISO C 11..23 has upper/lowercase mapping functions for wchar_t (towlower, towupper), it does not have such functions for char32_t values. With these patches, I'm adding towlower / towupper workalike functions for char32_t. 2023-04-09 Bruno Haible <br...@clisp.org> c32toupper: Add tests. * tests/test-c32toupper.sh: New file. * tests/test-c32toupper.c: New file, based on tests/test-c32islower.c. * modules/c32toupper-tests: New file. c32toupper: New module. * lib/uchar.in.h (c32toupper): New declaration. * lib/c32toupper.c: New file. * modules/c32toupper: New file. * m4/uchar_h.m4 (gl_UCHAR_H_REQUIRE_DEFAULTS): Initialize GNULIB_C32TOUPPER. * modules/uchar (Makefile.am): Substitute GNULIB_C32TOUPPER. 2023-04-09 Bruno Haible <br...@clisp.org> c32tolower: Add tests. * tests/test-c32tolower.sh: New file. * tests/test-c32tolower.c: New file, based on tests/test-c32isupper.c. * modules/c32tolower-tests: New file. c32tolower: New module. * lib/uchar.in.h (c32tolower): New declaration. * lib/c32tolower.c: New file. * lib/c32to-impl.h: New file, based on lib/c32is-impl.h. * modules/c32tolower: New file. * m4/uchar_h.m4 (gl_UCHAR_H_REQUIRE_DEFAULTS): Initialize GNULIB_C32TOLOWER. * modules/uchar (Makefile.am): Substitute GNULIB_C32TOLOWER.
>From 444ebd4da72a6d959563c9ac29927c2ccc11e003 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Tue, 4 Apr 2023 23:21:04 +0200 Subject: [PATCH 1/4] c32tolower: New module. * lib/uchar.in.h (c32tolower): New declaration. * lib/c32tolower.c: New file. * lib/c32to-impl.h: New file, based on lib/c32is-impl.h. * modules/c32tolower: New file. * m4/uchar_h.m4 (gl_UCHAR_H_REQUIRE_DEFAULTS): Initialize GNULIB_C32TOLOWER. * modules/uchar (Makefile.am): Substitute GNULIB_C32TOLOWER. --- ChangeLog | 11 ++++++ lib/c32to-impl.h | 92 ++++++++++++++++++++++++++++++++++++++++++++++ lib/c32tolower.c | 34 +++++++++++++++++ lib/uchar.in.h | 18 +++++++++ m4/uchar_h.m4 | 3 +- modules/c32tolower | 44 ++++++++++++++++++++++ modules/uchar | 1 + 7 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 lib/c32to-impl.h create mode 100644 lib/c32tolower.c create mode 100644 modules/c32tolower diff --git a/ChangeLog b/ChangeLog index ea2857a38c..79a570e891 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2023-04-09 Bruno Haible <br...@clisp.org> + + c32tolower: New module. + * lib/uchar.in.h (c32tolower): New declaration. + * lib/c32tolower.c: New file. + * lib/c32to-impl.h: New file, based on lib/c32is-impl.h. + * modules/c32tolower: New file. + * m4/uchar_h.m4 (gl_UCHAR_H_REQUIRE_DEFAULTS): Initialize + GNULIB_C32TOLOWER. + * modules/uchar (Makefile.am): Substitute GNULIB_C32TOLOWER. + 2023-04-09 Bruno Haible <br...@clisp.org> c32is*: Ensure GNULIB_defined_mbstate_t is defined on AIX. diff --git a/lib/c32to-impl.h b/lib/c32to-impl.h new file mode 100644 index 0000000000..724691a116 --- /dev/null +++ b/lib/c32to-impl.h @@ -0,0 +1,92 @@ +/* Case mapping of a 32-bit wide character. + Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 
+ You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <br...@clisp.org>, 2023. */ + +#include <wchar.h> +#include <wctype.h> + +#if GNULIB_defined_mbstate_t +# include "localcharset.h" +# include "streq.h" +#endif + +#include "unicase.h" + +#if _GL_WCHAR_T_IS_UCS4 && !GNULIB_defined_mbstate_t +_GL_EXTERN_INLINE +#endif +wint_t +FUNC (wint_t wc) +{ + /* The char32_t encoding of a multibyte character is defined by the way + mbrtoc32() is defined. */ + +#if GNULIB_defined_mbstate_t /* AIX, IRIX */ + /* mbrtoc32() is defined on top of mbtowc() for the non-UTF-8 locales + and directly for the UTF-8 locales. */ + if (wc != WEOF) + { + const char *encoding = locale_charset (); + if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) + return UCS_FUNC (wc); + else + return WCHAR_FUNC (wc); + } + else + return wc; + +#elif HAVE_WORKING_MBRTOC32 /* glibc */ + /* mbrtoc32() is essentially defined by the system libc. */ + +# if _GL_WCHAR_T_IS_UCS4 + /* The char32_t encoding of a multibyte character is known to be the same as + the wchar_t encoding. 
*/ + return WCHAR_FUNC (wc); +# else + /* The char32_t encoding of a multibyte character is known to be UCS-4, + different from the wchar_t encoding. */ + if (wc != WEOF) + return UCS_FUNC (wc); + else + return wc; +# endif + +#elif _GL_SMALL_WCHAR_T /* Cygwin, mingw, MSVC */ + /* The wchar_t encoding is UTF-16. + The char32_t encoding is UCS-4. */ + + if (wc == WEOF || wc == (wchar_t) wc) + /* wc is in the range for the tow* functions. */ + return WCHAR_FUNC (wc); + else + return UCS_FUNC (wc); + +#else /* macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Minix, Android */ + /* char32_t and wchar_t are equivalent. */ + static_assert (sizeof (char32_t) == sizeof (wchar_t)); + + return WCHAR_FUNC (wc); +#endif +} diff --git a/lib/c32tolower.c b/lib/c32tolower.c new file mode 100644 index 0000000000..447016214e --- /dev/null +++ b/lib/c32tolower.c @@ -0,0 +1,34 @@ +/* Map a 32-bit wide character to lowercase. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". + You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. 
*/ + +#include <config.h> + +#define IN_C32TOLOWER +/* Specification. */ +#include <uchar.h> + +#define FUNC c32tolower +#define WCHAR_FUNC towlower +#define UCS_FUNC uc_tolower +#include "c32to-impl.h" diff --git a/lib/uchar.in.h b/lib/uchar.in.h index 4461a35901..c6795f7e91 100644 --- a/lib/uchar.in.h +++ b/lib/uchar.in.h @@ -343,6 +343,24 @@ _GL_CXXALIASWARN (c32isxdigit); #endif +/* Case mapping of a 32-bit wide character. */ +#if @GNULIB_C32TOLOWER@ +# if (_GL_WCHAR_T_IS_UCS4 && !GNULIB_defined_mbstate_t) && !defined IN_C32TOLOWER +_GL_BEGIN_C_LINKAGE +_GL_INLINE wint_t +c32tolower (wint_t wc) +{ + return towlower (wc); +} +_GL_END_C_LINKAGE +# else +_GL_FUNCDECL_SYS (c32tolower, wint_t, (wint_t wc)); +# endif +_GL_CXXALIAS_SYS (c32tolower, wint_t, (wint_t wc)); +_GL_CXXALIASWARN (c32tolower); +#endif + + /* Converts a 32-bit wide character to a multibyte character. */ #if @GNULIB_C32RTOMB@ # if @REPLACE_C32RTOMB@ diff --git a/m4/uchar_h.m4 b/m4/uchar_h.m4 index 3af24cf3e4..6bb83adb6a 100644 --- a/m4/uchar_h.m4 +++ b/m4/uchar_h.m4 @@ -1,4 +1,4 @@ -# uchar_h.m4 serial 22 +# uchar_h.m4 serial 23 dnl Copyright (C) 2019-2023 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -214,6 +214,7 @@ AC_DEFUN([gl_UCHAR_H_REQUIRE_DEFAULTS] gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32ISSPACE]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32ISUPPER]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32ISXDIGIT]) + gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32TOLOWER]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32RTOMB]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32SNRTOMBS]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32SRTOMBS]) diff --git a/modules/c32tolower b/modules/c32tolower new file mode 100644 index 0000000000..b6ed633aec --- /dev/null +++ b/modules/c32tolower @@ -0,0 +1,44 @@ +Description: +c32tolower() function: map 32-bit wide character to lowercase. 
+ +Files: +lib/c32tolower.c +lib/c32to-impl.h +m4/mbrtoc32.m4 +m4/mbrtowc.m4 +m4/locale-fr.m4 +m4/locale-ja.m4 +m4/locale-zh.m4 +m4/codeset.m4 + +Depends-on: +uchar +wchar +wctype-h +localcharset [test $REPLACE_MBSTATE_T = 1] +streq [test $REPLACE_MBSTATE_T = 1] +unicase/tolower +assert-h + +configure.ac: +AC_REQUIRE([gl_UCHAR_H]) +dnl Determine REPLACE_MBSTATE_T, from which GNULIB_defined_mbstate_t is +dnl determined. It describes how mbrtoc32 is implemented. +AC_REQUIRE([gl_MBSTATE_T_BROKEN]) +AC_REQUIRE([gl_MBRTOC32_SANITYCHECK]) +gl_UCHAR_MODULE_INDICATOR([c32tolower]) + +Makefile.am: +lib_SOURCES += c32tolower.c + +Include: +<uchar.h> + +Link: +$(LTLIBUNISTRING) when linking with libtool, $(LIBUNISTRING) otherwise + +License: +LGPLv3+ or GPLv2+ + +Maintainer: +Bruno Haible diff --git a/modules/uchar b/modules/uchar index 985b58206f..f49db7d42c 100644 --- a/modules/uchar +++ b/modules/uchar @@ -51,6 +51,7 @@ uchar.h: uchar.in.h $(top_builddir)/config.status $(CXXDEFS_H) -e 's/@''GNULIB_C32ISSPACE''@/$(GNULIB_C32ISSPACE)/g' \ -e 's/@''GNULIB_C32ISUPPER''@/$(GNULIB_C32ISUPPER)/g' \ -e 's/@''GNULIB_C32ISXDIGIT''@/$(GNULIB_C32ISXDIGIT)/g' \ + -e 's/@''GNULIB_C32TOLOWER''@/$(GNULIB_C32TOLOWER)/g' \ -e 's/@''GNULIB_C32RTOMB''@/$(GNULIB_C32RTOMB)/g' \ -e 's/@''GNULIB_C32SNRTOMBS''@/$(GNULIB_C32SNRTOMBS)/g' \ -e 's/@''GNULIB_C32SRTOMBS''@/$(GNULIB_C32SRTOMBS)/g' \ -- 2.34.1
>From cba07f77a7579c8d4f2d80783cb3b5acdab24bc5 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Mon, 10 Apr 2023 01:46:37 +0200 Subject: [PATCH 2/4] c32tolower: Add tests. * tests/test-c32tolower.sh: New file. * tests/test-c32tolower.c: New file, based on tests/test-c32isupper.c. * modules/c32tolower-tests: New file. --- ChangeLog | 5 + modules/c32tolower-tests | 30 +++ tests/test-c32tolower.c | 436 +++++++++++++++++++++++++++++++++++++++ tests/test-c32tolower.sh | 42 ++++ 4 files changed, 513 insertions(+) create mode 100644 modules/c32tolower-tests create mode 100644 tests/test-c32tolower.c create mode 100755 tests/test-c32tolower.sh diff --git a/ChangeLog b/ChangeLog index 79a570e891..defa123445 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2023-04-09 Bruno Haible <br...@clisp.org> + c32tolower: Add tests. + * tests/test-c32tolower.sh: New file. + * tests/test-c32tolower.c: New file, based on tests/test-c32isupper.c. + * modules/c32tolower-tests: New file. + c32tolower: New module. * lib/uchar.in.h (c32tolower): New declaration. * lib/c32tolower.c: New file. 
diff --git a/modules/c32tolower-tests b/modules/c32tolower-tests new file mode 100644 index 0000000000..1153eabadc --- /dev/null +++ b/modules/c32tolower-tests @@ -0,0 +1,30 @@ +Files: +tests/test-c32tolower.sh +tests/test-c32tolower.c +tests/signature.h +tests/macros.h +m4/locale-fr.m4 +m4/locale-ja.m4 +m4/locale-zh.m4 +m4/codeset.m4 + +Depends-on: +mbrtoc32 +c32rtomb +setlocale + +configure.ac: +gt_LOCALE_FR +gt_LOCALE_FR_UTF8 +gt_LOCALE_JA +gt_LOCALE_ZH_CN + +Makefile.am: +TESTS += test-c32tolower.sh +TESTS_ENVIRONMENT += \ + LOCALE_FR='@LOCALE_FR@' \ + LOCALE_FR_UTF8='@LOCALE_FR_UTF8@' \ + LOCALE_JA='@LOCALE_JA@' \ + LOCALE_ZH_CN='@LOCALE_ZH_CN@' +check_PROGRAMS += test-c32tolower +test_c32tolower_LDADD = $(LDADD) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBUNISTRING) diff --git a/tests/test-c32tolower.c b/tests/test-c32tolower.c new file mode 100644 index 0000000000..30fe9b5848 --- /dev/null +++ b/tests/test-c32tolower.c @@ -0,0 +1,436 @@ +/* Test of c32tolower() function. + Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <uchar.h> + +#include "signature.h" +SIGNATURE_CHECK (c32tolower, wint_t, (wint_t)); + +#include <locale.h> +#include <stdlib.h> +#include <string.h> +#include <wchar.h> + +#include "macros.h" + +/* Representation of a multibyte character. 
*/ +#define MBCHAR_BUF_SIZE 6 +struct multibyte +{ + size_t nbytes; /* number of bytes of current character, > 0 */ + char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */ +}; + +/* Returns the value of c32tolower for the multibyte character s[0..n-1], + as a multibyte character. */ +static struct multibyte +for_character (const char *s, size_t n) +{ + mbstate_t state; + char32_t wc; + size_t ret; + struct multibyte result; + + memset (&state, '\0', sizeof (mbstate_t)); + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, s, n, &state); + ASSERT (ret == n); + + wc = c32tolower (wc); + ASSERT (wc != WEOF); + + memset (&state, '\0', sizeof (mbstate_t)); + ret = c32rtomb (result.buf, wc, &state); + ASSERT (ret != 0); + if (ret == (size_t)(-1)) + /* wc cannot be converted back to multibyte. */ + result.nbytes = 0; + else + { + ASSERT (ret <= MBCHAR_BUF_SIZE); + result.nbytes = ret; + } + return result; +} + +int +main (int argc, char *argv[]) +{ + wint_t wc; + struct multibyte mb; + char buf[4]; + + /* configure should already have checked that the locale is supported. */ + if (setlocale (LC_ALL, "") == NULL) + return 1; + + /* Test WEOF. */ + wc = c32tolower (WEOF); + ASSERT (wc == WEOF); + + /* Test single-byte characters. + POSIX specifies in + <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html> + that + - in all locales, the uppercase characters include the A ... Z + characters, and the corresponding characters a ... z (if not in a + Turkish locale) are lowercase, + - in the "POSIX" locale (which is usually the same as the "C" locale), + the uppercase characters include only the ASCII A ... Z characters, + and the corresponding characters a ... z are lowercase. + */ +#if defined __NetBSD__ + /* towlower is broken in the zh_CN.GB18030 locale on NetBSD 9.0. + See <https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=57339>. 
*/ + if (!(argc > 1 && argv[1][0] == '4')) +#endif + { + int c; + + for (c = 0; c < 0x100; c++) + switch (c) + { + case '\t': case '\v': case '\f': + case ' ': case '!': case '"': case '#': case '%': + case '&': case '\'': case '(': case ')': case '*': + case '+': case ',': case '-': case '.': case '/': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case ':': case ';': case '<': case '=': case '>': + case '?': + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case '[': case '\\': case ']': case '^': case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': case '{': case '|': case '}': case '~': + /* c is in the ISO C "basic character set". */ + buf[0] = (unsigned char) c; + mb = for_character (buf, 1); + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + ASSERT (mb.nbytes == 1); + ASSERT ((unsigned char) mb.buf[0] == (unsigned char) c - 'A' + 'a'); + break; + default: + ASSERT (mb.nbytes == 1); + ASSERT ((unsigned char) mb.buf[0] == c); + break; + } + break; + } + } + + if (argc > 1) + switch (argv[1][0]) + { + case '0': + /* C locale; tested above. */ + return 0; + + case '1': + /* Locale encoding is ISO-8859-1 or ISO-8859-15. 
*/ + { + /* U+00B2 SUPERSCRIPT TWO */ + mb = for_character ("\262", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\262", 1) == 0); + /* U+00B5 MICRO SIGN */ + mb = for_character ("\265", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\265", 1) == 0); + /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ + mb = for_character ("\311", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\351", 1) == 0); + /* U+00DF LATIN SMALL LETTER SHARP S */ + mb = for_character ("\337", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\337", 1) == 0); + /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ + mb = for_character ("\351", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\351", 1) == 0); + /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ + mb = for_character ("\377", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\377", 1) == 0); + } + return 0; + + case '2': + /* Locale encoding is EUC-JP. */ + { + #if !((defined __APPLE__ && defined __MACH__) || defined __DragonFly__) + /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ + mb = for_character ("\217\252\261", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\253\261", 3) == 0); + #endif + /* U+00DF LATIN SMALL LETTER SHARP S */ + mb = for_character ("\217\251\316", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\251\316", 3) == 0); + /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ + mb = for_character ("\217\253\261", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\253\261", 3) == 0); + /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ + mb = for_character ("\217\253\363", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\253\363", 3) == 0); + #if !((defined __APPLE__ && defined __MACH__) || defined __DragonFly__) + /* U+0141 LATIN CAPITAL LETTER L WITH STROKE */ + mb = for_character ("\217\251\250", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\251\310", 3) == 0); + #endif + /* U+0142 LATIN SMALL LETTER L WITH 
STROKE */ + mb = for_character ("\217\251\310", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\251\310", 3) == 0); + #if !defined __DragonFly__ + /* U+0429 CYRILLIC CAPITAL LETTER SHCHA */ + mb = for_character ("\247\273", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\247\353", 2) == 0); + #endif + /* U+0449 CYRILLIC SMALL LETTER SHCHA */ + mb = for_character ("\247\353", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\247\353", 2) == 0); + /* U+3073 HIRAGANA LETTER BI */ + mb = for_character ("\244\323", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\244\323", 2) == 0); + #if !defined __DragonFly__ + /* U+FF27 FULLWIDTH LATIN CAPITAL LETTER G */ + mb = for_character ("\243\307", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\243\347", 2) == 0); + #endif + /* U+FF47 FULLWIDTH LATIN SMALL LETTER G */ + mb = for_character ("\243\347", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\243\347", 2) == 0); + } + return 0; + + case '3': + /* Locale encoding is UTF-8. 
*/ + { + /* U+00B2 SUPERSCRIPT TWO */ + mb = for_character ("\302\262", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\302\262", 2) == 0); + /* U+00B5 MICRO SIGN */ + mb = for_character ("\302\265", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\302\265", 2) == 0); + #if !(defined _WIN32 && !defined __CYGWIN__) + /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ + mb = for_character ("\303\211", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\303\251", 2) == 0); + #endif + /* U+00DF LATIN SMALL LETTER SHARP S */ + mb = for_character ("\303\237", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\303\237", 2) == 0); + /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ + mb = for_character ("\303\251", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\303\251", 2) == 0); + /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ + mb = for_character ("\303\277", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\303\277", 2) == 0); + /* U+0141 LATIN CAPITAL LETTER L WITH STROKE */ + mb = for_character ("\305\201", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\305\202", 2) == 0); + /* U+0142 LATIN SMALL LETTER L WITH STROKE */ + mb = for_character ("\305\202", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\305\202", 2) == 0); + /* U+0429 CYRILLIC CAPITAL LETTER SHCHA */ + mb = for_character ("\320\251", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\321\211", 2) == 0); + /* U+0449 CYRILLIC SMALL LETTER SHCHA */ + mb = for_character ("\321\211", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\321\211", 2) == 0); + /* U+05D5 HEBREW LETTER VAV */ + mb = for_character ("\327\225", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\327\225", 2) == 0); + /* U+3073 HIRAGANA LETTER BI */ + mb = for_character ("\343\201\263", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\343\201\263", 3) == 0); + /* U+3162 HANGUL LETTER YI */ + mb = for_character ("\343\205\242", 3); 
+ ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\343\205\242", 3) == 0); + /* U+FF27 FULLWIDTH LATIN CAPITAL LETTER G */ + mb = for_character ("\357\274\247", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\357\275\207", 3) == 0); + /* U+FF47 FULLWIDTH LATIN SMALL LETTER G */ + mb = for_character ("\357\275\207", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\357\275\207", 3) == 0); + /* U+FFDB HALFWIDTH HANGUL LETTER YI */ + mb = for_character ("\357\277\233", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\357\277\233", 3) == 0); + #if !(defined __DragonFly__ || defined __sun) + /* U+10419 DESERET CAPITAL LETTER EF */ + mb = for_character ("\360\220\220\231", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\360\220\221\201", 4) == 0); + #endif + /* U+10441 DESERET SMALL LETTER EF */ + mb = for_character ("\360\220\221\201", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\360\220\221\201", 4) == 0); + /* U+E0041 TAG LATIN CAPITAL LETTER A */ + mb = for_character ("\363\240\201\201", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\363\240\201\201", 4) == 0); + /* U+E0061 TAG LATIN SMALL LETTER A */ + mb = for_character ("\363\240\201\241", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\363\240\201\241", 4) == 0); + } + return 0; + + case '4': + /* Locale encoding is GB18030. 
*/ + { + /* U+00B2 SUPERSCRIPT TWO */ + mb = for_character ("\201\060\205\065", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\205\065", 4) == 0); + /* U+00B5 MICRO SIGN */ + mb = for_character ("\201\060\205\070", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\205\070", 4) == 0); + #if !(defined __FreeBSD__ || defined __DragonFly__ || defined __sun) + /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ + mb = for_character ("\201\060\207\067", 4); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\250\246", 2) == 0); + #endif + /* U+00DF LATIN SMALL LETTER SHARP S */ + mb = for_character ("\201\060\211\070", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\211\070", 4) == 0); + /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ + mb = for_character ("\250\246", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\250\246", 2) == 0); + /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ + mb = for_character ("\201\060\213\067", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\213\067", 4) == 0); + #if !(defined __FreeBSD__ || defined __DragonFly__ || defined __sun) + /* U+0141 LATIN CAPITAL LETTER L WITH STROKE */ + mb = for_character ("\201\060\221\071", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\222\060", 4) == 0); + #endif + /* U+0142 LATIN SMALL LETTER L WITH STROKE */ + mb = for_character ("\201\060\222\060", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\222\060", 4) == 0); + #if !(defined __FreeBSD__ || defined __DragonFly__) + /* U+0429 CYRILLIC CAPITAL LETTER SHCHA */ + mb = for_character ("\247\273", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\247\353", 2) == 0); + #endif + /* U+0449 CYRILLIC SMALL LETTER SHCHA */ + mb = for_character ("\247\353", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\247\353", 2) == 0); + /* U+05D5 HEBREW LETTER VAV */ + mb = for_character ("\201\060\371\067", 4); + ASSERT (mb.nbytes == 
4); + ASSERT (memcmp (mb.buf, "\201\060\371\067", 4) == 0); + /* U+3073 HIRAGANA LETTER BI */ + mb = for_character ("\244\323", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\244\323", 2) == 0); + /* U+3162 HANGUL LETTER YI */ + mb = for_character ("\201\071\256\062", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\071\256\062", 4) == 0); + #if !defined __DragonFly__ + /* U+FF27 FULLWIDTH LATIN CAPITAL LETTER G */ + mb = for_character ("\243\307", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\243\347", 2) == 0); + #endif + /* U+FF47 FULLWIDTH LATIN SMALL LETTER G */ + mb = for_character ("\243\347", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\243\347", 2) == 0); + /* U+FFDB HALFWIDTH HANGUL LETTER YI */ + mb = for_character ("\204\061\241\071", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\204\061\241\071", 4) == 0); + #if !((defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined __sun) + /* U+10419 DESERET CAPITAL LETTER EF */ + mb = for_character ("\220\060\351\071", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\220\060\355\071", 4) == 0); + #endif + /* U+10441 DESERET SMALL LETTER EF */ + mb = for_character ("\220\060\355\071", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\220\060\355\071", 4) == 0); + /* U+E0041 TAG LATIN CAPITAL LETTER A */ + mb = for_character ("\323\066\234\063", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\323\066\234\063", 4) == 0); + /* U+E0061 TAG LATIN SMALL LETTER A */ + mb = for_character ("\323\066\237\065", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\323\066\237\065", 4) == 0); + } + return 0; + + } + + return 1; +} diff --git a/tests/test-c32tolower.sh b/tests/test-c32tolower.sh new file mode 100755 index 0000000000..a0859a900e --- /dev/null +++ b/tests/test-c32tolower.sh @@ -0,0 +1,42 @@ +#!/bin/sh + +# Allow distinguishing the various 
invocations in the .log file. +set -x + +# Test in the POSIX locale. +LC_ALL=C ${CHECKER} ./test-c32tolower${EXEEXT} 0 || exit 1 +LC_ALL=POSIX ${CHECKER} ./test-c32tolower${EXEEXT} 0 || exit 1 + +# Test in an ISO-8859-1 or ISO-8859-15 locale. +: "${LOCALE_FR=fr_FR}" +if test $LOCALE_FR != none; then + LC_ALL=$LOCALE_FR \ + ${CHECKER} ./test-c32tolower${EXEEXT} 1 \ + || exit 1 +fi + +# Test whether a specific EUC-JP locale is installed. +: "${LOCALE_JA=ja_JP}" +if test $LOCALE_JA != none; then + LC_ALL=$LOCALE_JA \ + ${CHECKER} ./test-c32tolower${EXEEXT} 2 \ + || exit 1 +fi + +# Test whether a specific UTF-8 locale is installed. +: "${LOCALE_FR_UTF8=fr_FR.UTF-8}" +if test $LOCALE_FR_UTF8 != none; then + LC_ALL=$LOCALE_FR_UTF8 \ + ${CHECKER} ./test-c32tolower${EXEEXT} 3 \ + || exit 1 +fi + +# Test whether a specific GB18030 locale is installed. +: "${LOCALE_ZH_CN=zh_CN.GB18030}" +if test $LOCALE_ZH_CN != none; then + LC_ALL=$LOCALE_ZH_CN \ + ${CHECKER} ./test-c32tolower${EXEEXT} 4 \ + || exit 1 +fi + +exit 0 -- 2.34.1
>From d47ae89803ca1cde69b8a312101421322f4259f6 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Mon, 10 Apr 2023 01:50:39 +0200 Subject: [PATCH 3/4] c32toupper: New module. * lib/uchar.in.h (c32toupper): New declaration. * lib/c32toupper.c: New file. * modules/c32toupper: New file. * m4/uchar_h.m4 (gl_UCHAR_H_REQUIRE_DEFAULTS): Initialize GNULIB_C32TOUPPER. * modules/uchar (Makefile.am): Substitute GNULIB_C32TOUPPER. --- ChangeLog | 10 ++++++++++ lib/c32toupper.c | 34 ++++++++++++++++++++++++++++++++++ lib/uchar.in.h | 15 +++++++++++++++ m4/uchar_h.m4 | 3 ++- modules/c32toupper | 44 ++++++++++++++++++++++++++++++++++++++++++++ modules/uchar | 1 + 6 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 lib/c32toupper.c create mode 100644 modules/c32toupper diff --git a/ChangeLog b/ChangeLog index defa123445..9b7487dd01 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2023-04-09 Bruno Haible <br...@clisp.org> + + c32toupper: New module. + * lib/uchar.in.h (c32toupper): New declaration. + * lib/c32toupper.c: New file. + * modules/c32toupper: New file. + * m4/uchar_h.m4 (gl_UCHAR_H_REQUIRE_DEFAULTS): Initialize + GNULIB_C32TOUPPER. + * modules/uchar (Makefile.am): Substitute GNULIB_C32TOUPPER. + 2023-04-09 Bruno Haible <br...@clisp.org> c32tolower: Add tests. diff --git a/lib/c32toupper.c b/lib/c32toupper.c new file mode 100644 index 0000000000..a3ca5ecf19 --- /dev/null +++ b/lib/c32toupper.c @@ -0,0 +1,34 @@ +/* Map a 32-bit wide character to uppercase. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 
+ You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. */ + +#include <config.h> + +#define IN_C32TOUPPER +/* Specification. */ +#include <uchar.h> + +#define FUNC c32toupper +#define WCHAR_FUNC towupper +#define UCS_FUNC uc_toupper +#include "c32to-impl.h" diff --git a/lib/uchar.in.h b/lib/uchar.in.h index c6795f7e91..27b91c5b58 100644 --- a/lib/uchar.in.h +++ b/lib/uchar.in.h @@ -359,6 +359,21 @@ _GL_FUNCDECL_SYS (c32tolower, wint_t, (wint_t wc)); _GL_CXXALIAS_SYS (c32tolower, wint_t, (wint_t wc)); _GL_CXXALIASWARN (c32tolower); #endif +#if @GNULIB_C32TOUPPER@ +# if (_GL_WCHAR_T_IS_UCS4 && !GNULIB_defined_mbstate_t) && !defined IN_C32TOUPPER +_GL_BEGIN_C_LINKAGE +_GL_INLINE wint_t +c32toupper (wint_t wc) +{ + return towupper (wc); +} +_GL_END_C_LINKAGE +# else +_GL_FUNCDECL_SYS (c32toupper, wint_t, (wint_t wc)); +# endif +_GL_CXXALIAS_SYS (c32toupper, wint_t, (wint_t wc)); +_GL_CXXALIASWARN (c32toupper); +#endif /* Converts a 32-bit wide character to a multibyte character. 
*/ diff --git a/m4/uchar_h.m4 b/m4/uchar_h.m4 index 6bb83adb6a..2679371716 100644 --- a/m4/uchar_h.m4 +++ b/m4/uchar_h.m4 @@ -1,4 +1,4 @@ -# uchar_h.m4 serial 23 +# uchar_h.m4 serial 24 dnl Copyright (C) 2019-2023 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -215,6 +215,7 @@ AC_DEFUN([gl_UCHAR_H_REQUIRE_DEFAULTS] gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32ISUPPER]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32ISXDIGIT]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32TOLOWER]) + gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32TOUPPER]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32RTOMB]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32SNRTOMBS]) gl_MODULE_INDICATOR_INIT_VARIABLE([GNULIB_C32SRTOMBS]) diff --git a/modules/c32toupper b/modules/c32toupper new file mode 100644 index 0000000000..364e011818 --- /dev/null +++ b/modules/c32toupper @@ -0,0 +1,44 @@ +Description: +c32toupper() function: map 32-bit wide character to uppercase. + +Files: +lib/c32toupper.c +lib/c32to-impl.h +m4/mbrtoc32.m4 +m4/mbrtowc.m4 +m4/locale-fr.m4 +m4/locale-ja.m4 +m4/locale-zh.m4 +m4/codeset.m4 + +Depends-on: +uchar +wchar +wctype-h +localcharset [test $REPLACE_MBSTATE_T = 1] +streq [test $REPLACE_MBSTATE_T = 1] +unicase/toupper +assert-h + +configure.ac: +AC_REQUIRE([gl_UCHAR_H]) +dnl Determine REPLACE_MBSTATE_T, from which GNULIB_defined_mbstate_t is +dnl determined. It describes how mbrtoc32 is implemented. 
+AC_REQUIRE([gl_MBSTATE_T_BROKEN]) +AC_REQUIRE([gl_MBRTOC32_SANITYCHECK]) +gl_UCHAR_MODULE_INDICATOR([c32toupper]) + +Makefile.am: +lib_SOURCES += c32toupper.c + +Include: +<uchar.h> + +Link: +$(LTLIBUNISTRING) when linking with libtool, $(LIBUNISTRING) otherwise + +License: +LGPLv3+ or GPLv2+ + +Maintainer: +Bruno Haible diff --git a/modules/uchar b/modules/uchar index f49db7d42c..6363d543d9 100644 --- a/modules/uchar +++ b/modules/uchar @@ -52,6 +52,7 @@ uchar.h: uchar.in.h $(top_builddir)/config.status $(CXXDEFS_H) -e 's/@''GNULIB_C32ISUPPER''@/$(GNULIB_C32ISUPPER)/g' \ -e 's/@''GNULIB_C32ISXDIGIT''@/$(GNULIB_C32ISXDIGIT)/g' \ -e 's/@''GNULIB_C32TOLOWER''@/$(GNULIB_C32TOLOWER)/g' \ + -e 's/@''GNULIB_C32TOUPPER''@/$(GNULIB_C32TOUPPER)/g' \ -e 's/@''GNULIB_C32RTOMB''@/$(GNULIB_C32RTOMB)/g' \ -e 's/@''GNULIB_C32SNRTOMBS''@/$(GNULIB_C32SNRTOMBS)/g' \ -e 's/@''GNULIB_C32SRTOMBS''@/$(GNULIB_C32SRTOMBS)/g' \ -- 2.34.1
>From 99fc85c723f3e393be47b1cbcf2711a03ceaf6ef Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Mon, 10 Apr 2023 01:51:54 +0200 Subject: [PATCH 4/4] c32toupper: Add tests. * tests/test-c32toupper.sh: New file. * tests/test-c32toupper.c: New file, based on tests/test-c32islower.c. * modules/c32toupper-tests: New file. --- ChangeLog | 5 + modules/c32toupper-tests | 30 +++ tests/test-c32toupper.c | 448 +++++++++++++++++++++++++++++++++++++++ tests/test-c32toupper.sh | 42 ++++ 4 files changed, 525 insertions(+) create mode 100644 modules/c32toupper-tests create mode 100644 tests/test-c32toupper.c create mode 100755 tests/test-c32toupper.sh diff --git a/ChangeLog b/ChangeLog index 9b7487dd01..7a17b548ce 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2023-04-09 Bruno Haible <br...@clisp.org> + c32toupper: Add tests. + * tests/test-c32toupper.sh: New file. + * tests/test-c32toupper.c: New file, based on tests/test-c32islower.c. + * modules/c32toupper-tests: New file. + c32toupper: New module. * lib/uchar.in.h (c32toupper): New declaration. * lib/c32toupper.c: New file. 
diff --git a/modules/c32toupper-tests b/modules/c32toupper-tests new file mode 100644 index 0000000000..eeedc9df1e --- /dev/null +++ b/modules/c32toupper-tests @@ -0,0 +1,30 @@ +Files: +tests/test-c32toupper.sh +tests/test-c32toupper.c +tests/signature.h +tests/macros.h +m4/locale-fr.m4 +m4/locale-ja.m4 +m4/locale-zh.m4 +m4/codeset.m4 + +Depends-on: +mbrtoc32 +c32rtomb +setlocale + +configure.ac: +gt_LOCALE_FR +gt_LOCALE_FR_UTF8 +gt_LOCALE_JA +gt_LOCALE_ZH_CN + +Makefile.am: +TESTS += test-c32toupper.sh +TESTS_ENVIRONMENT += \ + LOCALE_FR='@LOCALE_FR@' \ + LOCALE_FR_UTF8='@LOCALE_FR_UTF8@' \ + LOCALE_JA='@LOCALE_JA@' \ + LOCALE_ZH_CN='@LOCALE_ZH_CN@' +check_PROGRAMS += test-c32toupper +test_c32toupper_LDADD = $(LDADD) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBUNISTRING) diff --git a/tests/test-c32toupper.c b/tests/test-c32toupper.c new file mode 100644 index 0000000000..37f0134de8 --- /dev/null +++ b/tests/test-c32toupper.c @@ -0,0 +1,448 @@ +/* Test of c32toupper() function. + Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <uchar.h> + +#include "signature.h" +SIGNATURE_CHECK (c32toupper, wint_t, (wint_t)); + +#include <locale.h> +#include <stdlib.h> +#include <string.h> +#include <wchar.h> + +#include "macros.h" + +/* Representation of a multibyte character. 
*/ +#define MBCHAR_BUF_SIZE 6 +struct multibyte +{ + size_t nbytes; /* number of bytes of current character; 0 if not representable */ + char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */ +}; + +/* Returns the value of c32toupper for the multibyte character s[0..n-1], + as a multibyte character. */ +static struct multibyte +for_character (const char *s, size_t n) +{ + mbstate_t state; + char32_t wc; + size_t ret; + struct multibyte result; + + memset (&state, '\0', sizeof (mbstate_t)); + wc = (char32_t) 0xBADFACE; + ret = mbrtoc32 (&wc, s, n, &state); + ASSERT (ret == n); + + wc = c32toupper (wc); + ASSERT (wc != WEOF); + + memset (&state, '\0', sizeof (mbstate_t)); + ret = c32rtomb (result.buf, wc, &state); + ASSERT (ret != 0); + if (ret == (size_t)(-1)) + /* wc cannot be converted back to multibyte. */ + result.nbytes = 0; + else + { + ASSERT (ret <= MBCHAR_BUF_SIZE); + result.nbytes = ret; + } + return result; +} + +int +main (int argc, char *argv[]) +{ + wint_t wc; + struct multibyte mb; + char buf[4]; + + /* configure should already have checked that the locale is supported. */ + if (setlocale (LC_ALL, "") == NULL) + return 1; + + /* Test WEOF. */ + wc = c32toupper (WEOF); + ASSERT (wc == WEOF); + + /* Test single-byte characters. + POSIX specifies in + <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html> + that + - in all locales, the lowercase characters include the a ... z + characters, and the corresponding characters A ... Z (if not in a + Turkish locale) are uppercase, + - in the "POSIX" locale (which is usually the same as the "C" locale), + the lowercase characters include only the ASCII a ... z characters, + and the corresponding characters A ... Z are uppercase. + */ +#if defined __NetBSD__ + /* towupper is broken in the zh_CN.GB18030 locale on NetBSD 9.0. + See <https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=57339>. 
*/ + if (!(argc > 1 && argv[1][0] == '4')) +#endif + { + int c; + + for (c = 0; c < 0x100; c++) + switch (c) + { + case '\t': case '\v': case '\f': + case ' ': case '!': case '"': case '#': case '%': + case '&': case '\'': case '(': case ')': case '*': + case '+': case ',': case '-': case '.': case '/': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case ':': case ';': case '<': case '=': case '>': + case '?': + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case '[': case '\\': case ']': case '^': case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': case '{': case '|': case '}': case '~': + /* c is in the ISO C "basic character set". */ + buf[0] = (unsigned char) c; + mb = for_character (buf, 1); + switch (c) + { + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + ASSERT (mb.nbytes == 1); + ASSERT ((unsigned char) mb.buf[0] == (unsigned char) c - 'a' + 'A'); + break; + default: + ASSERT (mb.nbytes == 1); + ASSERT ((unsigned char) mb.buf[0] == c); + break; + } + break; + } + } + + if (argc > 1) + switch (argv[1][0]) + { + case '0': + /* C locale; tested above. */ + return 0; + + case '1': + /* Locale encoding is ISO-8859-1 or ISO-8859-15. 
*/ + { + /* U+00B2 SUPERSCRIPT TWO */ + mb = for_character ("\262", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\262", 1) == 0); + #if !(defined __GLIBC__ || defined __sun || defined __CYGWIN__) + /* U+00B5 MICRO SIGN */ + mb = for_character ("\265", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\265", 1) == 0); + #endif + /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ + mb = for_character ("\311", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\311", 1) == 0); + /* U+00DF LATIN SMALL LETTER SHARP S */ + mb = for_character ("\337", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\337", 1) == 0); + /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ + mb = for_character ("\351", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\311", 1) == 0); + #if !(defined __GLIBC__ || defined __DragonFly__ || defined __sun || defined __CYGWIN__ || (defined _WIN32 && !defined __CYGWIN__)) + /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ + mb = for_character ("\377", 1); + ASSERT (mb.nbytes == 1); + ASSERT (memcmp (mb.buf, "\377", 1) == 0); + #endif + } + return 0; + + case '2': + /* Locale encoding is EUC-JP. 
*/ + { + /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ + mb = for_character ("\217\252\261", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\252\261", 3) == 0); + #if !defined __NetBSD__ + /* U+00DF LATIN SMALL LETTER SHARP S */ + mb = for_character ("\217\251\316", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\251\316", 3) == 0); + #endif + #if !((defined __APPLE__ && defined __MACH__) || defined __DragonFly__) + /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ + mb = for_character ("\217\253\261", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\252\261", 3) == 0); + #endif + #if !((defined __APPLE__ && defined __MACH__) || defined __DragonFly__ || defined __NetBSD__) + /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ + mb = for_character ("\217\253\363", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\252\363", 3) == 0); + #endif + /* U+0141 LATIN CAPITAL LETTER L WITH STROKE */ + mb = for_character ("\217\251\250", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\251\250", 3) == 0); + #if !((defined __APPLE__ && defined __MACH__) || defined __DragonFly__) + /* U+0142 LATIN SMALL LETTER L WITH STROKE */ + mb = for_character ("\217\251\310", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\217\251\250", 3) == 0); + #endif + /* U+0429 CYRILLIC CAPITAL LETTER SHCHA */ + mb = for_character ("\247\273", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\247\273", 2) == 0); + #if !defined __DragonFly__ + /* U+0449 CYRILLIC SMALL LETTER SHCHA */ + mb = for_character ("\247\353", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\247\273", 2) == 0); + #endif + /* U+3073 HIRAGANA LETTER BI */ + mb = for_character ("\244\323", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\244\323", 2) == 0); + /* U+FF27 FULLWIDTH LATIN CAPITAL LETTER G */ + mb = for_character ("\243\307", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\243\307", 2) == 0); + #if 
!defined __DragonFly__ + /* U+FF47 FULLWIDTH LATIN SMALL LETTER G */ + mb = for_character ("\243\347", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\243\307", 2) == 0); + #endif + } + return 0; + + case '3': + /* Locale encoding is UTF-8. */ + { + /* U+00B2 SUPERSCRIPT TWO */ + mb = for_character ("\302\262", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\302\262", 2) == 0); + #if !(defined __GLIBC__ || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined _AIX || defined __sun || defined __CYGWIN__) + /* U+00B5 MICRO SIGN */ + mb = for_character ("\302\265", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\302\265", 2) == 0); + #endif + /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ + mb = for_character ("\303\211", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\303\211", 2) == 0); + /* U+00DF LATIN SMALL LETTER SHARP S */ + mb = for_character ("\303\237", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\303\237", 2) == 0); + #if !(defined _WIN32 && !defined __CYGWIN__) + /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ + mb = for_character ("\303\251", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\303\211", 2) == 0); + /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ + mb = for_character ("\303\277", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\305\270", 2) == 0); + #endif + /* U+0141 LATIN CAPITAL LETTER L WITH STROKE */ + mb = for_character ("\305\201", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\305\201", 2) == 0); + /* U+0142 LATIN SMALL LETTER L WITH STROKE */ + mb = for_character ("\305\202", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\305\201", 2) == 0); + /* U+0429 CYRILLIC CAPITAL LETTER SHCHA */ + mb = for_character ("\320\251", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\320\251", 2) == 0); + /* U+0449 CYRILLIC SMALL LETTER SHCHA */ + mb = for_character 
("\321\211", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\320\251", 2) == 0); + /* U+05D5 HEBREW LETTER VAV */ + mb = for_character ("\327\225", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\327\225", 2) == 0); + /* U+3073 HIRAGANA LETTER BI */ + mb = for_character ("\343\201\263", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\343\201\263", 3) == 0); + /* U+3162 HANGUL LETTER YI */ + mb = for_character ("\343\205\242", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\343\205\242", 3) == 0); + /* U+FF27 FULLWIDTH LATIN CAPITAL LETTER G */ + mb = for_character ("\357\274\247", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\357\274\247", 3) == 0); + /* U+FF47 FULLWIDTH LATIN SMALL LETTER G */ + mb = for_character ("\357\275\207", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\357\274\247", 3) == 0); + /* U+FFDB HALFWIDTH HANGUL LETTER YI */ + mb = for_character ("\357\277\233", 3); + ASSERT (mb.nbytes == 3); + ASSERT (memcmp (mb.buf, "\357\277\233", 3) == 0); + /* U+10419 DESERET CAPITAL LETTER EF */ + mb = for_character ("\360\220\220\231", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\360\220\220\231", 4) == 0); + #if !(defined __DragonFly__ || defined __sun) + /* U+10441 DESERET SMALL LETTER EF */ + mb = for_character ("\360\220\221\201", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\360\220\220\231", 4) == 0); + #endif + /* U+E0041 TAG LATIN CAPITAL LETTER A */ + mb = for_character ("\363\240\201\201", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\363\240\201\201", 4) == 0); + /* U+E0061 TAG LATIN SMALL LETTER A */ + mb = for_character ("\363\240\201\241", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\363\240\201\241", 4) == 0); + } + return 0; + + case '4': + /* Locale encoding is GB18030. 
*/ + { + /* U+00B2 SUPERSCRIPT TWO */ + mb = for_character ("\201\060\205\065", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\205\065", 4) == 0); + #if !(defined __GLIBC__ || (defined __APPLE__ && defined __MACH__) || defined __NetBSD__) + /* U+00B5 MICRO SIGN */ + mb = for_character ("\201\060\205\070", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\205\070", 4) == 0); + #endif + /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */ + mb = for_character ("\201\060\207\067", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\207\067", 4) == 0); + /* U+00DF LATIN SMALL LETTER SHARP S */ + mb = for_character ("\201\060\211\070", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\211\070", 4) == 0); + #if !(defined __FreeBSD__ || defined __DragonFly__ || defined __sun) + /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */ + mb = for_character ("\250\246", 2); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\207\067", 4) == 0); + /* U+00FF LATIN SMALL LETTER Y WITH DIAERESIS */ + mb = for_character ("\201\060\213\067", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\227\060", 4) == 0); + #endif + /* U+0141 LATIN CAPITAL LETTER L WITH STROKE */ + mb = for_character ("\201\060\221\071", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\221\071", 4) == 0); + #if !(defined __FreeBSD__ || defined __DragonFly__ || defined __sun) + /* U+0142 LATIN SMALL LETTER L WITH STROKE */ + mb = for_character ("\201\060\222\060", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\221\071", 4) == 0); + #endif + /* U+0429 CYRILLIC CAPITAL LETTER SHCHA */ + mb = for_character ("\247\273", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\247\273", 2) == 0); + #if !(defined __FreeBSD__ || defined __DragonFly__) + /* U+0449 CYRILLIC SMALL LETTER SHCHA */ + mb = for_character ("\247\353", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\247\273", 2) == 
0); + #endif + /* U+05D5 HEBREW LETTER VAV */ + mb = for_character ("\201\060\371\067", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\060\371\067", 4) == 0); + /* U+3073 HIRAGANA LETTER BI */ + mb = for_character ("\244\323", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\244\323", 2) == 0); + /* U+3162 HANGUL LETTER YI */ + mb = for_character ("\201\071\256\062", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\201\071\256\062", 4) == 0); + /* U+FF27 FULLWIDTH LATIN CAPITAL LETTER G */ + mb = for_character ("\243\307", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\243\307", 2) == 0); + #if !defined __DragonFly__ + /* U+FF47 FULLWIDTH LATIN SMALL LETTER G */ + mb = for_character ("\243\347", 2); + ASSERT (mb.nbytes == 2); + ASSERT (memcmp (mb.buf, "\243\307", 2) == 0); + #endif + /* U+FFDB HALFWIDTH HANGUL LETTER YI */ + mb = for_character ("\204\061\241\071", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\204\061\241\071", 4) == 0); + /* U+10419 DESERET CAPITAL LETTER EF */ + mb = for_character ("\220\060\351\071", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\220\060\351\071", 4) == 0); + #if !((defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined __sun) + /* U+10441 DESERET SMALL LETTER EF */ + mb = for_character ("\220\060\355\071", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\220\060\351\071", 4) == 0); + #endif + /* U+E0041 TAG LATIN CAPITAL LETTER A */ + mb = for_character ("\323\066\234\063", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\323\066\234\063", 4) == 0); + /* U+E0061 TAG LATIN SMALL LETTER A */ + mb = for_character ("\323\066\237\065", 4); + ASSERT (mb.nbytes == 4); + ASSERT (memcmp (mb.buf, "\323\066\237\065", 4) == 0); + } + return 0; + + } + + return 1; +} diff --git a/tests/test-c32toupper.sh b/tests/test-c32toupper.sh new file mode 100755 index 0000000000..1c253e5460 --- 
/dev/null +++ b/tests/test-c32toupper.sh @@ -0,0 +1,42 @@ +#!/bin/sh + +# Allow distinguishing the various invocations in the .log file. +set -x + +# Test in the POSIX locale. +LC_ALL=C ${CHECKER} ./test-c32toupper${EXEEXT} 0 || exit 1 +LC_ALL=POSIX ${CHECKER} ./test-c32toupper${EXEEXT} 0 || exit 1 + +# Test in an ISO-8859-1 or ISO-8859-15 locale. +: "${LOCALE_FR=fr_FR}" +if test $LOCALE_FR != none; then + LC_ALL=$LOCALE_FR \ + ${CHECKER} ./test-c32toupper${EXEEXT} 1 \ + || exit 1 +fi + +# Test in an EUC-JP locale, if one is installed. +: "${LOCALE_JA=ja_JP}" +if test $LOCALE_JA != none; then + LC_ALL=$LOCALE_JA \ + ${CHECKER} ./test-c32toupper${EXEEXT} 2 \ + || exit 1 +fi + +# Test in a UTF-8 locale, if one is installed. +: "${LOCALE_FR_UTF8=fr_FR.UTF-8}" +if test $LOCALE_FR_UTF8 != none; then + LC_ALL=$LOCALE_FR_UTF8 \ + ${CHECKER} ./test-c32toupper${EXEEXT} 3 \ + || exit 1 +fi + +# Test in a GB18030 locale, if one is installed. +: "${LOCALE_ZH_CN=zh_CN.GB18030}" +if test $LOCALE_ZH_CN != none; then + LC_ALL=$LOCALE_ZH_CN \ + ${CHECKER} ./test-c32toupper${EXEEXT} 4 \ + || exit 1 +fi + +exit 0 -- 2.34.1