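In Unicode 16.0.0, there are composed characters whose constituents are Unicode characters with code point values >= 0x12000. As a result, the hardcoded bound in lib/uninorm/composition.c (the check `uc1 < 0x12000 && uc2 < 0x12000` in uc_composition) no longer works. This patch replaces it with bounds that are generated together with the composition table.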
2024-09-12 Bruno Haible <br...@clisp.org> uninorm/composition: Make more maintainable. * lib/gen-uni-tables.c (output_composition_tables): Add a filename2 parameter. Emit definitions of UNINORM_COMPOSE_MAX_ARG1 and UNINORM_COMPOSE_MAX_ARG2 to this file. (main): Invoke it with additional file name uninorm/composition-table-bounds.h. * uninorm/composition-table-bounds.h: New generated file. * lib/uninorm/composition.c: Include it. (uc_composition): Use UNINORM_COMPOSE_MAX_ARG1 and UNINORM_COMPOSE_MAX_ARG2 instead of hardcoded bounds. diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index b4f16da560..c42827ea25 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -10765,11 +10765,17 @@ debug_output_composition_tables (const char *filename) } static void -output_composition_tables (const char *filename, const char *version) +output_composition_tables (const char *filename, const char *filename2, + const char *version) { + unsigned int max_code1; + unsigned int max_code2; FILE *stream; unsigned int ch; + max_code1 = 0; + max_code2 = 0; + stream = fopen (filename, "w"); if (stream == NULL) { @@ -10844,6 +10850,11 @@ output_composition_tables (const char *filename, const char *version) Verify this. */ assert (strcmp (unicode_attributes[combined].combining, "0") == 0); + if (max_code1 < code1) + max_code1 = code1; + if (max_code2 < code2) + max_code2 = code2; + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff, (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff, @@ -10857,6 +10868,37 @@ output_composition_tables (const char *filename, const char *version) fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } + + stream = fopen (filename2, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename2); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! 
*/\n"); + fprintf (stream, "/* Canonical composition of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + fprintf (stream, "/* Copyright (C) 2009-2024 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + output_library_license (stream, true); + fprintf (stream, "\n"); + + fprintf (stream, "/* Maximum value of the first argument for which gl_uninorm_compose_lookup\n" + " can return a non-NULL value. */\n"); + fprintf (stream, "#define UNINORM_COMPOSE_MAX_ARG1 0x%x\n", max_code1); + fprintf (stream, "/* Maximum value of the second argument for which gl_uninorm_compose_lookup\n" + " can return a non-NULL value. */\n"); + fprintf (stream, "#define UNINORM_COMPOSE_MAX_ARG2 0x%x\n", max_code2); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename2); + exit (1); + } } /* ========================================================================= */ @@ -12031,7 +12073,7 @@ main (int argc, char * argv[]) output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version); debug_output_composition_tables ("uninorm/composition.txt"); - output_composition_tables ("uninorm/composition-table.gperf", version); + output_composition_tables ("uninorm/composition-table.gperf", "uninorm/composition-table-bounds.h", version); output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version); output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version); diff --git a/lib/uninorm/composition-table-bounds.h b/lib/uninorm/composition-table-bounds.h new file mode 100644 index 0000000000..5eafc478c0 --- /dev/null +++ b/lib/uninorm/composition-table-bounds.h @@ -0,0 +1,25 @@ +/* DO NOT EDIT! GENERATED AUTOMATICALLY! */ +/* Canonical composition of Unicode characters. 
*/ +/* Generated automatically by gen-uni-tables.c for Unicode 15.1.0. */ + +/* Copyright (C) 2009-2024 Free Software Foundation, Inc. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Maximum value of the first argument for which gl_uninorm_compose_lookup + can return a non-NULL value. */ +#define UNINORM_COMPOSE_MAX_ARG1 0x11935 +/* Maximum value of the second argument for which gl_uninorm_compose_lookup + can return a non-NULL value. */ +#define UNINORM_COMPOSE_MAX_ARG2 0x11930 diff --git a/lib/uninorm/composition.c b/lib/uninorm/composition.c index df662895df..ecc4ed21e0 100644 --- a/lib/uninorm/composition.c +++ b/lib/uninorm/composition.c @@ -25,11 +25,12 @@ struct composition_rule { char codes[6]; unsigned int combined; }; #include "composition-table.h" +#include "composition-table-bounds.h" ucs4_t uc_composition (ucs4_t uc1, ucs4_t uc2) { - if (uc1 < 0x12000 && uc2 < 0x12000) + if (uc1 <= UNINORM_COMPOSE_MAX_ARG1 && uc2 <= UNINORM_COMPOSE_MAX_ARG2) { if (uc2 >= 0x1161 && uc2 < 0x1161 + 21 && uc1 >= 0x1100 && uc1 < 0x1100 + 19)