This patch adds most test cases from Unicode.org's LineBreakTest.txt to the gnulib tests. I wish I had done this earlier: I would have noticed much sooner the several bugs that I fixed yesterday and today.
2024-09-15 Bruno Haible <br...@clisp.org> unilbrk: Strengthen tests. * lib/gen-uni-tables.c: Add shell commands for creating tests/unilbrk/LineBreakTest.txt. * tests/unilbrk/LineBreakTest.txt: New file, from unicode.org. * tests/unilbrk/test-uc-possible-linebreaks.c: New file, based on tests/uniwbrk/test-uc-wordbreaks.c. * tests/unilbrk/test-uc-possible-linebreaks.sh: New file, based on tests/uniwbrk/test-uc-wordbreaks.sh. * modules/unilbrk/u32-possible-linebreaks-tests (Files): Add them. (Makefile.am): Arrange to compile test-uc-possible-linebreaks.c and test test-uc-possible-linebreaks.sh. 2024-09-15 Bruno Haible <br...@clisp.org> uniwbrk tests: Modernize code. * tests/uniwbrk/test-uc-wordbreaks.c (main): Reduce the scope of local variables. Remove memset() calls. Align expected and actual output lines. Add comments. 2024-09-15 Bruno Haible <br...@clisp.org> unigbrk tests: Modernize code. * tests/unigbrk/test-uc-grapheme-breaks.c (main): Reduce the scope of local variables. Add comments. * tests/unigbrk/test-uc-is-grapheme-break.c (main): Likewise.
>From 60748cab5bc917a544b126e4fc1e04c074d86904 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Sun, 15 Sep 2024 15:16:52 +0200 Subject: [PATCH 1/3] unigbrk tests: Modernize code. * tests/unigbrk/test-uc-grapheme-breaks.c (main): Reduce the scope of local variables. Add comments. * tests/unigbrk/test-uc-is-grapheme-break.c (main): Likewise. --- ChangeLog | 7 +++++++ tests/unigbrk/test-uc-grapheme-breaks.c | 19 +++++++++-------- tests/unigbrk/test-uc-is-grapheme-break.c | 25 ++++++++++++----------- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/ChangeLog b/ChangeLog index d8396afb00..56ff30e8ef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2024-09-15 Bruno Haible <br...@clisp.org> + + unigbrk tests: Modernize code. + * tests/unigbrk/test-uc-grapheme-breaks.c (main): Reduce the scope of + local variables. Add comments. + * tests/unigbrk/test-uc-is-grapheme-break.c (main): Likewise. + 2024-09-15 Bruno Haible <br...@clisp.org> unilbrk: Fix conflicts between rules. diff --git a/tests/unigbrk/test-uc-grapheme-breaks.c b/tests/unigbrk/test-uc-grapheme-breaks.c index e3f0005899..49eed04c30 100644 --- a/tests/unigbrk/test-uc-grapheme-breaks.c +++ b/tests/unigbrk/test-uc-grapheme-breaks.c @@ -97,9 +97,9 @@ int main (int argc, char *argv[]) { const char *filename; - char line[1024]; FILE *stream; int lineno; + char line[1024]; if (argc != 2) { @@ -118,22 +118,23 @@ main (int argc, char *argv[]) } lineno = 0; - while (fgets (line, sizeof line, stream)) + while (fgets (line, sizeof (line), stream)) { - char *comment; - const char *p; - ucs4_t s[16]; - char breaks[16]; - size_t i = 0; - lineno++; - comment = strchr (line, '#'); + /* Cut off the trailing comment, if any. */ + char *comment = strchr (line, '#'); if (comment != NULL) *comment = '\0'; + /* Is the remaining line blank? 
*/ if (line[strspn (line, " \t\r\n")] == '\0') continue; + const char *p; + ucs4_t s[16]; + char breaks[16]; + size_t i = 0; + s[0] = 0; p = line; do diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c b/tests/unigbrk/test-uc-is-grapheme-break.c index 92674a07fb..939b09b3a9 100644 --- a/tests/unigbrk/test-uc-is-grapheme-break.c +++ b/tests/unigbrk/test-uc-is-grapheme-break.c @@ -59,10 +59,10 @@ int main (int argc, char *argv[]) { const char *filename; - char line[1024]; - int exit_code; FILE *stream; + int exit_code; int lineno; + char line[1024]; if (argc != 2) { @@ -82,9 +82,18 @@ main (int argc, char *argv[]) exit_code = 0; lineno = 0; - while (fgets (line, sizeof line, stream)) + while (fgets (line, sizeof (line), stream)) { - char *comment; + lineno++; + + /* Cut off the trailing comment, if any. */ + char *comment = strchr (line, '#'); + if (comment != NULL) + *comment = '\0'; + /* Is the remaining line blank? */ + if (line[strspn (line, " \t\r\n")] == '\0') + continue; + const char *p; ucs4_t prev; int last_char_prop; @@ -95,14 +104,6 @@ main (int argc, char *argv[]) bool emoji_modifier_sequence_before_last_char; size_t ri_count; - lineno++; - - comment = strchr (line, '#'); - if (comment != NULL) - *comment = '\0'; - if (line[strspn (line, " \t\r\n")] == '\0') - continue; - last_char_prop = -1; incb_consonant_extended = false; incb_consonant_extended_linker = false; -- 2.34.1
>From 54d66d912df487a31aa10c6c35df3ba556c9c43a Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Sun, 15 Sep 2024 15:20:08 +0200 Subject: [PATCH 2/3] uniwbrk tests: Modernize code. * tests/uniwbrk/test-uc-wordbreaks.c (main): Reduce the scope of local variables. Remove memset() calls. Align expected and actual output lines. Add comments. --- ChangeLog | 7 +++++++ tests/uniwbrk/test-uc-wordbreaks.c | 28 +++++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/ChangeLog b/ChangeLog index 56ff30e8ef..a31bdc0861 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2024-09-15 Bruno Haible <br...@clisp.org> + + uniwbrk tests: Modernize code. + * tests/uniwbrk/test-uc-wordbreaks.c (main): Reduce the scope of local + variables. Remove memset() calls. Align expected and actual output + lines. Add comments. + 2024-09-15 Bruno Haible <br...@clisp.org> unigbrk tests: Modernize code. diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c index 8213446f77..1c7bb1017e 100644 --- a/tests/uniwbrk/test-uc-wordbreaks.c +++ b/tests/uniwbrk/test-uc-wordbreaks.c @@ -65,10 +65,10 @@ int main (int argc, char *argv[]) { const char *filename; - char line[4096]; - int exit_code; FILE *stream; + int exit_code; int lineno; + char line[4096]; if (argc != 2) { @@ -88,26 +88,24 @@ main (int argc, char *argv[]) exit_code = 0; lineno = 0; - while (fgets (line, sizeof line, stream)) + while (fgets (line, sizeof (line), stream)) { - char *comment; - const char *p; - uint32_t input[100]; - char breaks[101]; - char breaks_expected[101]; - int i; - lineno++; - memset (breaks, 0, sizeof (breaks)); - memset (breaks_expected, 0, sizeof (breaks_expected)); - - comment = strchr (line, '#'); + /* Cut off the trailing comment, if any. */ + char *comment = strchr (line, '#'); if (comment != NULL) *comment = '\0'; + /* Is the remaining line blank?
*/ if (line[strspn (line, " \t\r\n")] == '\0') continue; + const char *p; + uint32_t input[100]; + char breaks[100]; + char breaks_expected[101]; + int i; + i = 0; p = line; do @@ -170,7 +168,7 @@ main (int argc, char *argv[]) input[j], wordbreakproperty_to_string (input_wbp)); } fprintf (stderr, "\n"); - fprintf (stderr, "%s:%d: actual: ", filename, lineno); + fprintf (stderr, "%s:%d: actual: ", filename, lineno); for (j = 0; j < i - 1; j++) { int input_wbp = uc_wordbreak_property (input[j]); -- 2.34.1
From 1aee1fe7d5a1d8a1f2067c4eede0a34bd1e3a655 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Sun, 15 Sep 2024 15:27:34 +0200 Subject: [PATCH 3/3] unilbrk: Strengthen tests. * lib/gen-uni-tables.c: Add shell commands for creating tests/unilbrk/LineBreakTest.txt. * tests/unilbrk/LineBreakTest.txt: New file, from unicode.org. * tests/unilbrk/test-uc-possible-linebreaks.c: New file, based on tests/uniwbrk/test-uc-wordbreaks.c. * tests/unilbrk/test-uc-possible-linebreaks.sh: New file, based on tests/uniwbrk/test-uc-wordbreaks.sh. * modules/unilbrk/u32-possible-linebreaks-tests (Files): Add them. (Makefile.am): Arrange to compile test-uc-possible-linebreaks.c and test test-uc-possible-linebreaks.sh. --- ChangeLog | 14 + lib/gen-uni-tables.c | 4 + modules/unilbrk/u32-possible-linebreaks-tests | 9 +- tests/unilbrk/LineBreakTest.txt | 16740 ++++++++++++++++ tests/unilbrk/test-uc-possible-linebreaks.c | 189 + tests/unilbrk/test-uc-possible-linebreaks.sh | 3 + 6 files changed, 16957 insertions(+), 2 deletions(-) create mode 100644 tests/unilbrk/LineBreakTest.txt create mode 100644 tests/unilbrk/test-uc-possible-linebreaks.c create mode 100755 tests/unilbrk/test-uc-possible-linebreaks.sh diff --git a/ChangeLog b/ChangeLog index a31bdc0861..54ad8d3eec 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +2024-09-15 Bruno Haible <br...@clisp.org> + + unilbrk: Strengthen tests. + * lib/gen-uni-tables.c: Add shell commands for creating + tests/unilbrk/LineBreakTest.txt. + * tests/unilbrk/LineBreakTest.txt: New file, from unicode.org. + * tests/unilbrk/test-uc-possible-linebreaks.c: New file, based on + tests/uniwbrk/test-uc-wordbreaks.c. + * tests/unilbrk/test-uc-possible-linebreaks.sh: New file, based on + tests/uniwbrk/test-uc-wordbreaks.sh. + * modules/unilbrk/u32-possible-linebreaks-tests (Files): Add them. + (Makefile.am): Arrange to compile test-uc-possible-linebreaks.c and test + test-uc-possible-linebreaks.sh. 
+ 2024-09-15 Bruno Haible <br...@clisp.org> uniwbrk tests: Modernize code. diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index f95a78dbf1..2ee7a686fc 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -12232,6 +12232,10 @@ main (int argc, char * argv[]) * > ../tests/unigbrk/GraphemeBreakTest.txt \\ * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\ * echo; \\ + * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/LineBreakTest.txt; } \\ + * > ../tests/unilbrk/LineBreakTest.txt \\ + * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\ + * echo; \\ * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/WordBreakTest.txt; } \\ * > ../tests/uniwbrk/WordBreakTest.txt" * End: diff --git a/modules/unilbrk/u32-possible-linebreaks-tests b/modules/unilbrk/u32-possible-linebreaks-tests index 7e5b50563a..319c86dd5c 100644 --- a/modules/unilbrk/u32-possible-linebreaks-tests +++ b/modules/unilbrk/u32-possible-linebreaks-tests @@ -1,5 +1,8 @@ Files: tests/unilbrk/test-u32-possible-linebreaks.c +tests/unilbrk/test-uc-possible-linebreaks.c +tests/unilbrk/test-uc-possible-linebreaks.sh +tests/unilbrk/LineBreakTest.txt tests/macros.h Depends-on: @@ -7,7 +10,9 @@ Depends-on: configure.ac: Makefile.am: -TESTS += test-u32-possible-linebreaks -check_PROGRAMS += test-u32-possible-linebreaks +TESTS += test-u32-possible-linebreaks unilbrk/test-uc-possible-linebreaks.sh +check_PROGRAMS += test-u32-possible-linebreaks test-uc-possible-linebreaks test_u32_possible_linebreaks_SOURCES = unilbrk/test-u32-possible-linebreaks.c test_u32_possible_linebreaks_LDADD = $(LDADD) $(LIBUNISTRING) +test_uc_possible_linebreaks_SOURCES = unilbrk/test-uc-possible-linebreaks.c +test_uc_possible_linebreaks_LDADD 
= $(LDADD) $(LIBUNISTRING) diff --git a/tests/unilbrk/test-uc-possible-linebreaks.c b/tests/unilbrk/test-uc-possible-linebreaks.c new file mode 100644 index 0000000000..cebead6723 --- /dev/null +++ b/tests/unilbrk/test-uc-possible-linebreaks.c @@ -0,0 +1,189 @@ +/* Line break function test, using test data from UCD. + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <br...@clisp.org>, 2024. */ + +#include <config.h> + +/* Specification. */ +#include <unilbrk.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int +main (int argc, char *argv[]) +{ + const char *filename; + FILE *stream; + int exit_code; + int lineno; + char line[16384]; + + if (argc != 2) + { + fprintf (stderr, "usage: %s FILENAME\n" + "where FILENAME is the location of the LineBreakTest.txt test file.\n", + argv[0]); + exit (1); + } + + filename = argv[1]; + stream = fopen (filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", filename); + exit (1); + } + + exit_code = 0; + lineno = 0; + while (fgets (line, sizeof (line), stream)) + { + lineno++; + + /* Cut off the trailing comment, if any. */ + char *comment = strchr (line, '#'); + if (comment != NULL) + *comment = '\0'; + /* Is the remaining line blank? 
*/ + if (line[strspn (line, " \t\r\n")] == '\0') + continue; + + const char *p; + uint32_t input[1024]; + char breaks[1024]; + char breaks_expected[1025]; + int i; + + i = 0; + p = line; + do + { + p += strspn (p, " \t\r\n"); + if (!strncmp (p, "\303\267" /* ÷ */, 2)) + { + breaks_expected[i] = 1; + p += 2; + } + else if (!strncmp (p, "\303\227" /* × */, 2)) + { + breaks_expected[i] = 0; + p += 2; + } + else + { + fprintf (stderr, "%s:%d.%d: syntax error expecting '÷' or '×'\n", + filename, lineno, (int) (p - line + 1)); + exit (1); + } + + p += strspn (p, " \t\r\n"); + if (*p != '\0') + { + unsigned int next_int; + int n; + + if (sscanf (p, "%x%n", &next_int, &n) != 1) + { + fprintf (stderr, "%s:%d.%d: syntax error at '%s' " + "expecting hexadecimal Unicode code point number\n", + filename, lineno, (int) (p - line + 1), p); + exit (1); + } + p += n; + + input[i] = next_int; + } + + p += strspn (p, " \t\r\n"); + i++; + } + while (*p != '\0'); + + u32_possible_linebreaks (input, i - 1, "UTF-8", breaks); + + int matches = 1; + { + int j; + for (j = 0; j < i - 1; j++) + { + /* The character U+FFFC has line break property CB, which according + to rule (LB1) is resolved "into other line breaking classes + depending on criteria outside the scope of this algorithm". + Thus it makes no sense to check the breaks[] entry before or + after such a character. */ + if (!(input[j] == 0xFFFC + || (j > 0 && input[j - 1] == 0xFFFC) + /* Also consider intervening characters with property LBP_CM + or LBP_ZWJ, per (LB9). */ + || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) + && input[j - 2] == 0xFFFC))) + /* A regional indicator with a combining character is nonsense, + because regional indicators are supposed to come in pairs. */ + if (!(j >= 2 && (input[0] >= 0x1F1E6 && input[0] <= 0x1F1FF) + && input[1] == 0x0308)) + /* There is a disagreement regarding whether to allow a line break + after a U+0020 SPACE character at the start of the text. 
+ We consider that the start of the text is equivalent to the + state after a newline was seen; hence the loop starts with + property LBP_BK. By the rules (LB4,LB5,LB6) an extra line + break after a mandatory line break is undesired, even with + intervening spaces (because these rules come before (LB18)). + Whereas the LineBreakTest.txt file allows a line break after + the space. + Similarly when the first two characters at the start of the + text have property LBP_CM and LBP_ZWJ, respectively. (LB9). */ + if (!(((j == 1 || (j > 1 && (input[j - 2] >= 0x000A && input[j - 2] <= 0x000D || input[j - 2] == 0x0085))) + && input[j - 1] == 0x0020) + || ((j == 2 || (j > 2 && (input[j - 3] >= 0x000A && input[j - 3] <= 0x000D || input[j - 3] == 0x0085))) + && ((input[j - 2] == 0x0020 && input[j - 1] == 0x0020) + || (input[j - 2] == 0x0308 && input[j - 1] == 0x200D) + || (input[j - 2] == 0x200D && input[j - 1] == 0x0308))))) + matches &= (!(breaks[j] == UC_BREAK_PROHIBITED + || breaks[j] == UC_BREAK_MANDATORY + || breaks[j] == UC_BREAK_CR_BEFORE_LF) + || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY)) + == breaks_expected[j]; + } + } + if (!matches) + { + int j; + + fprintf (stderr, "%s:%d: expected: ", filename, lineno); + for (j = 0; j < i - 1; j++) + fprintf (stderr, "%s U+%04X ", + breaks_expected[j] == 1 ? "\303\267" : "\303\227", + input[j]); + fprintf (stderr, "\n"); + fprintf (stderr, "%s:%d: actual: ", filename, lineno); + for (j = 0; j < i - 1; j++) + fprintf (stderr, "%s U+%04X ", + (!(breaks[j] == UC_BREAK_PROHIBITED + || breaks[j] == UC_BREAK_MANDATORY + || breaks[j] == UC_BREAK_CR_BEFORE_LF) + || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY)) + ? 
"\303\267" : "\303\227", + input[j]); + fprintf (stderr, "\n"); + exit_code = 1; + } + } + + return exit_code; +} diff --git a/tests/unilbrk/test-uc-possible-linebreaks.sh b/tests/unilbrk/test-uc-possible-linebreaks.sh new file mode 100755 index 0000000000..0df7bb915f --- /dev/null +++ b/tests/unilbrk/test-uc-possible-linebreaks.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +${CHECKER} ./test-uc-possible-linebreaks${EXEEXT} "${srcdir}/unilbrk/LineBreakTest.txt" diff --git a/tests/unilbrk/LineBreakTest.txt b/tests/unilbrk/LineBreakTest.txt new file mode 100644 index 0000000000..75044be059 --- /dev/null +++ b/tests/unilbrk/LineBreakTest.txt (omitted)