unilbrk: Strengthen tests

Bruno Haible Sun, 15 Sep 2024 06:36:57 -0700

This patch adds most test cases from Unicode.org's LineBreakTest.txt
to the gnulib tests. I wish I had done this earlier: I would have noticed
much sooner the several bugs that I fixed yesterday and today.



2024-09-15  Bruno Haible  <br...@clisp.org>

        unilbrk: Strengthen tests.
        * lib/gen-uni-tables.c: Add shell commands for creating
        tests/unilbrk/LineBreakTest.txt.
        * tests/unilbrk/LineBreakTest.txt: New file, from unicode.org.
        * tests/unilbrk/test-uc-possible-linebreaks.c: New file, based on
        tests/uniwbrk/test-uc-wordbreaks.c.
        * tests/unilbrk/test-uc-possible-linebreaks.sh: New file, based on
        tests/uniwbrk/test-uc-wordbreaks.sh.
        * modules/unilbrk/u32-possible-linebreaks-tests (Files): Add them.
        (Makefile.am): Arrange to compile test-uc-possible-linebreaks.c and test
        test-uc-possible-linebreaks.sh.

2024-09-15  Bruno Haible  <br...@clisp.org>

        uniwbrk tests: Modernize code.
        * tests/uniwbrk/test-uc-wordbreaks.c (main): Reduce the scope of local
        variables. Remove memset() calls. Align expected and actual output
        lines. Add comments.

2024-09-15  Bruno Haible  <br...@clisp.org>

        unigbrk tests: Modernize code.
        * tests/unigbrk/test-uc-grapheme-breaks.c (main): Reduce the scope of
        local variables. Add comments.
        * tests/unigbrk/test-uc-is-grapheme-break.c (main): Likewise.

From 60748cab5bc917a544b126e4fc1e04c074d86904 Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Sun, 15 Sep 2024 15:16:52 +0200
Subject: [PATCH 1/3] unigbrk tests: Modernize code.

* tests/unigbrk/test-uc-grapheme-breaks.c (main): Reduce the scope of
local variables. Add comments.
* tests/unigbrk/test-uc-is-grapheme-break.c (main): Likewise.
---
 ChangeLog                                 |  7 +++++++
 tests/unigbrk/test-uc-grapheme-breaks.c   | 19 +++++++++--------
 tests/unigbrk/test-uc-is-grapheme-break.c | 25 ++++++++++++-----------
 3 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index d8396afb00..56ff30e8ef 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2024-09-15  Bruno Haible  <br...@clisp.org>
+
+	unigbrk tests: Modernize code.
+	* tests/unigbrk/test-uc-grapheme-breaks.c (main): Reduce the scope of
+	local variables. Add comments.
+	* tests/unigbrk/test-uc-is-grapheme-break.c (main): Likewise.
+
 2024-09-15  Bruno Haible  <br...@clisp.org>
 
 	unilbrk: Fix conflicts between rules.
diff --git a/tests/unigbrk/test-uc-grapheme-breaks.c b/tests/unigbrk/test-uc-grapheme-breaks.c
index e3f0005899..49eed04c30 100644
--- a/tests/unigbrk/test-uc-grapheme-breaks.c
+++ b/tests/unigbrk/test-uc-grapheme-breaks.c
@@ -97,9 +97,9 @@ int
 main (int argc, char *argv[])
 {
   const char *filename;
-  char line[1024];
   FILE *stream;
   int lineno;
+  char line[1024];
 
   if (argc != 2)
     {
@@ -118,22 +118,23 @@ main (int argc, char *argv[])
     }
 
   lineno = 0;
-  while (fgets (line, sizeof line, stream))
+  while (fgets (line, sizeof (line), stream))
     {
-      char *comment;
-      const char *p;
-      ucs4_t s[16];
-      char breaks[16];
-      size_t i = 0;
-
       lineno++;
 
-      comment = strchr (line, '#');
+      /* Cut off the trailing comment, if any.  */
+      char *comment = strchr (line, '#');
       if (comment != NULL)
         *comment = '\0';
+      /* Is the remaining line blank?  */
       if (line[strspn (line, " \t\r\n")] == '\0')
         continue;
 
+      const char *p;
+      ucs4_t s[16];
+      char breaks[16];
+      size_t i = 0;
+
       s[0] = 0;
       p = line;
       do
diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c b/tests/unigbrk/test-uc-is-grapheme-break.c
index 92674a07fb..939b09b3a9 100644
--- a/tests/unigbrk/test-uc-is-grapheme-break.c
+++ b/tests/unigbrk/test-uc-is-grapheme-break.c
@@ -59,10 +59,10 @@ int
 main (int argc, char *argv[])
 {
   const char *filename;
-  char line[1024];
-  int exit_code;
   FILE *stream;
+  int exit_code;
   int lineno;
+  char line[1024];
 
   if (argc != 2)
     {
@@ -82,9 +82,18 @@ main (int argc, char *argv[])
 
   exit_code = 0;
   lineno = 0;
-  while (fgets (line, sizeof line, stream))
+  while (fgets (line, sizeof (line), stream))
     {
-      char *comment;
+      lineno++;
+
+      /* Cut off the trailing comment, if any.  */
+      char *comment = strchr (line, '#');
+      if (comment != NULL)
+        *comment = '\0';
+      /* Is the remaining line blank?  */
+      if (line[strspn (line, " \t\r\n")] == '\0')
+        continue;
+
       const char *p;
       ucs4_t prev;
       int last_char_prop;
@@ -95,14 +104,6 @@ main (int argc, char *argv[])
       bool emoji_modifier_sequence_before_last_char;
       size_t ri_count;
 
-      lineno++;
-
-      comment = strchr (line, '#');
-      if (comment != NULL)
-        *comment = '\0';
-      if (line[strspn (line, " \t\r\n")] == '\0')
-        continue;
-
       last_char_prop = -1;
       incb_consonant_extended = false;
       incb_consonant_extended_linker = false;
-- 
2.34.1

From 54d66d912df487a31aa10c6c35df3ba556c9c43a Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Sun, 15 Sep 2024 15:20:08 +0200
Subject: [PATCH 2/3] uniwbrk tests: Modernize code.

* tests/uniwbrk/test-uc-wordbreaks.c (main): Reduce the scope of local
variables. Remove memset() calls. Align expected and actual output
lines. Add comments.
---
 ChangeLog                          |  7 +++++++
 tests/uniwbrk/test-uc-wordbreaks.c | 28 +++++++++++++---------------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 56ff30e8ef..a31bdc0861 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2024-09-15  Bruno Haible  <br...@clisp.org>
+
+	uniwbrk tests: Modernize code.
+	* tests/uniwbrk/test-uc-wordbreaks.c (main): Reduce the scope of local
+	variables. Remove memset() calls. Align expected and actual output
+	lines. Add comments.
+
 2024-09-15  Bruno Haible  <br...@clisp.org>
 
 	unigbrk tests: Modernize code.
diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c
index 8213446f77..1c7bb1017e 100644
--- a/tests/uniwbrk/test-uc-wordbreaks.c
+++ b/tests/uniwbrk/test-uc-wordbreaks.c
@@ -65,10 +65,10 @@ int
 main (int argc, char *argv[])
 {
   const char *filename;
-  char line[4096];
-  int exit_code;
   FILE *stream;
+  int exit_code;
   int lineno;
+  char line[4096];
 
   if (argc != 2)
     {
@@ -88,26 +88,24 @@ main (int argc, char *argv[])
 
   exit_code = 0;
   lineno = 0;
-  while (fgets (line, sizeof line, stream))
+  while (fgets (line, sizeof (line), stream))
     {
-      char *comment;
-      const char *p;
-      uint32_t input[100];
-      char breaks[101];
-      char breaks_expected[101];
-      int i;
-
       lineno++;
 
-      memset (breaks, 0, sizeof (breaks));
-      memset (breaks_expected, 0, sizeof (breaks_expected));
-
-      comment = strchr (line, '#');
+      /* Cut off the trailing comment, if any.  */
+      char *comment = strchr (line, '#');
       if (comment != NULL)
         *comment = '\0';
+      /* Is the remaining line blank?  */
       if (line[strspn (line, " \t\r\n")] == '\0')
         continue;
 
+      const char *p;
+      uint32_t input[100];
+      char breaks[100];
+      char breaks_expected[101];
+      int i;
+
       i = 0;
       p = line;
       do
@@ -170,7 +168,7 @@ main (int argc, char *argv[])
                        input[j], wordbreakproperty_to_string (input_wbp));
             }
           fprintf (stderr, "\n");
-          fprintf (stderr, "%s:%d: actual: ", filename, lineno);
+          fprintf (stderr, "%s:%d: actual:   ", filename, lineno);
           for (j = 0; j < i - 1; j++)
             {
               int input_wbp = uc_wordbreak_property (input[j]);
-- 
2.34.1

From 1aee1fe7d5a1d8a1f2067c4eede0a34bd1e3a655 Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Sun, 15 Sep 2024 15:27:34 +0200
Subject: [PATCH 3/3] unilbrk: Strengthen tests.

* lib/gen-uni-tables.c: Add shell commands for creating
tests/unilbrk/LineBreakTest.txt.
* tests/unilbrk/LineBreakTest.txt: New file, from unicode.org.
* tests/unilbrk/test-uc-possible-linebreaks.c: New file, based on
tests/uniwbrk/test-uc-wordbreaks.c.
* tests/unilbrk/test-uc-possible-linebreaks.sh: New file, based on
tests/uniwbrk/test-uc-wordbreaks.sh.
* modules/unilbrk/u32-possible-linebreaks-tests (Files): Add them.
(Makefile.am): Arrange to compile test-uc-possible-linebreaks.c and test
test-uc-possible-linebreaks.sh.
---
 ChangeLog                                     |    14 +
 lib/gen-uni-tables.c                          |     4 +
 modules/unilbrk/u32-possible-linebreaks-tests |     9 +-
 tests/unilbrk/LineBreakTest.txt               | 16740 ++++++++++++++++
 tests/unilbrk/test-uc-possible-linebreaks.c   |   189 +
 tests/unilbrk/test-uc-possible-linebreaks.sh  |     3 +
 6 files changed, 16957 insertions(+), 2 deletions(-)
 create mode 100644 tests/unilbrk/LineBreakTest.txt
 create mode 100644 tests/unilbrk/test-uc-possible-linebreaks.c
 create mode 100755 tests/unilbrk/test-uc-possible-linebreaks.sh

diff --git a/ChangeLog b/ChangeLog
index a31bdc0861..54ad8d3eec 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2024-09-15  Bruno Haible  <br...@clisp.org>
+
+	unilbrk: Strengthen tests.
+	* lib/gen-uni-tables.c: Add shell commands for creating
+	tests/unilbrk/LineBreakTest.txt.
+	* tests/unilbrk/LineBreakTest.txt: New file, from unicode.org.
+	* tests/unilbrk/test-uc-possible-linebreaks.c: New file, based on
+	tests/uniwbrk/test-uc-wordbreaks.c.
+	* tests/unilbrk/test-uc-possible-linebreaks.sh: New file, based on
+	tests/uniwbrk/test-uc-wordbreaks.sh.
+	* modules/unilbrk/u32-possible-linebreaks-tests (Files): Add them.
+	(Makefile.am): Arrange to compile test-uc-possible-linebreaks.c and test
+	test-uc-possible-linebreaks.sh.
+
 2024-09-15  Bruno Haible  <br...@clisp.org>
 
 	uniwbrk tests: Modernize code.
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index f95a78dbf1..2ee7a686fc 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -12232,6 +12232,10 @@ main (int argc, char * argv[])
  *      > ../tests/unigbrk/GraphemeBreakTest.txt \\
  *   && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
  *        echo; \\
+ *        cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/LineBreakTest.txt; } \\
+ *      > ../tests/unilbrk/LineBreakTest.txt \\
+ *   && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
+ *        echo; \\
  *        cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/WordBreakTest.txt; } \\
  *      > ../tests/uniwbrk/WordBreakTest.txt"
  * End:
diff --git a/modules/unilbrk/u32-possible-linebreaks-tests b/modules/unilbrk/u32-possible-linebreaks-tests
index 7e5b50563a..319c86dd5c 100644
--- a/modules/unilbrk/u32-possible-linebreaks-tests
+++ b/modules/unilbrk/u32-possible-linebreaks-tests
@@ -1,5 +1,8 @@
 Files:
 tests/unilbrk/test-u32-possible-linebreaks.c
+tests/unilbrk/test-uc-possible-linebreaks.c
+tests/unilbrk/test-uc-possible-linebreaks.sh
+tests/unilbrk/LineBreakTest.txt
 tests/macros.h
 
 Depends-on:
@@ -7,7 +10,9 @@ Depends-on:
 configure.ac:
 
 Makefile.am:
-TESTS += test-u32-possible-linebreaks
-check_PROGRAMS += test-u32-possible-linebreaks
+TESTS += test-u32-possible-linebreaks unilbrk/test-uc-possible-linebreaks.sh
+check_PROGRAMS += test-u32-possible-linebreaks test-uc-possible-linebreaks
 test_u32_possible_linebreaks_SOURCES = unilbrk/test-u32-possible-linebreaks.c
 test_u32_possible_linebreaks_LDADD = $(LDADD) $(LIBUNISTRING)
+test_uc_possible_linebreaks_SOURCES = unilbrk/test-uc-possible-linebreaks.c
+test_uc_possible_linebreaks_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/tests/unilbrk/test-uc-possible-linebreaks.c b/tests/unilbrk/test-uc-possible-linebreaks.c
new file mode 100644
index 0000000000..cebead6723
--- /dev/null
+++ b/tests/unilbrk/test-uc-possible-linebreaks.c
@@ -0,0 +1,189 @@
+/* Line break function test, using test data from UCD.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation, either version 3 of the License,
+   or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Bruno Haible <br...@clisp.org>, 2024.  */
+
+#include <config.h>
+
+/* Specification. */
+#include <unilbrk.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int
+main (int argc, char *argv[])
+{
+  const char *filename;
+  FILE *stream;
+  int exit_code;
+  int lineno;
+  char line[16384];
+
+  if (argc != 2)
+    {
+      fprintf (stderr, "usage: %s FILENAME\n"
+               "where FILENAME is the location of the LineBreakTest.txt test file.\n",
+               argv[0]);
+      exit (1);
+    }
+
+  filename = argv[1];
+  stream = fopen (filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", filename);
+      exit (1);
+    }
+
+  exit_code = 0;
+  lineno = 0;
+  while (fgets (line, sizeof (line), stream))
+    {
+      lineno++;
+
+      /* Cut off the trailing comment, if any.  */
+      char *comment = strchr (line, '#');
+      if (comment != NULL)
+        *comment = '\0';
+      /* Is the remaining line blank?  */
+      if (line[strspn (line, " \t\r\n")] == '\0')
+        continue;
+
+      const char *p;
+      uint32_t input[1024];
+      char breaks[1024];
+      char breaks_expected[1025];
+      int i;
+
+      i = 0;
+      p = line;
+      do
+        {
+          p += strspn (p, " \t\r\n");
+          if (!strncmp (p, "\303\267" /* ÷ */, 2))
+            {
+              breaks_expected[i] = 1;
+              p += 2;
+            }
+          else if (!strncmp (p, "\303\227" /* × */, 2))
+            {
+              breaks_expected[i] = 0;
+              p += 2;
+            }
+          else
+            {
+              fprintf (stderr, "%s:%d.%d: syntax error expecting '÷' or '×'\n",
+                       filename, lineno, (int) (p - line + 1));
+              exit (1);
+            }
+
+          p += strspn (p, " \t\r\n");
+          if (*p != '\0')
+            {
+              unsigned int next_int;
+              int n;
+
+              if (sscanf (p, "%x%n", &next_int, &n) != 1)
+                {
+                  fprintf (stderr, "%s:%d.%d: syntax error at '%s' "
+                           "expecting hexadecimal Unicode code point number\n",
+                           filename, lineno, (int) (p - line + 1), p);
+                  exit (1);
+                }
+              p += n;
+
+              input[i] = next_int;
+            }
+
+          p += strspn (p, " \t\r\n");
+          i++;
+        }
+      while (*p != '\0');
+
+      u32_possible_linebreaks (input, i - 1, "UTF-8", breaks);
+
+      int matches = 1;
+      {
+        int j;
+        for (j = 0; j < i - 1; j++)
+          {
+            /* The character U+FFFC has line break property CB, which according
+               to rule (LB1) is resolved "into other line breaking classes
+               depending on criteria outside the scope of this algorithm".
+               Thus it makes no sense to check the breaks[] entry before or
+               after such a character.  */
+            if (!(input[j] == 0xFFFC
+                  || (j > 0 && input[j - 1] == 0xFFFC)
+                  /* Also consider intervening characters with property LBP_CM
+                     or LBP_ZWJ, per (LB9).  */
+                  || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D)
+                      && input[j - 2] == 0xFFFC)))
+              /* A regional indicator with a combining character is nonsense,
+                 because regional indicators are supposed to come in pairs.  */
+              if (!(j >= 2 && (input[0] >= 0x1F1E6 && input[0] <= 0x1F1FF)
+                    && input[1] == 0x0308))
+                /* There is a disagreement regarding whether to allow a line break
+                   after a U+0020 SPACE character at the start of the text.
+                   We consider that the start of the text is equivalent to the
+                   state after a newline was seen; hence the loop starts with
+                   property LBP_BK.  By the rules (LB4,LB5,LB6) an extra line
+                   break after a mandatory line break is undesired, even with
+                   intervening spaces (because these rules come before (LB18)).
+                   Whereas the LineBreakTest.txt file allows a line break after
+                   the space.
+                   Similarly when the first two characters at the start of the
+                   text have property LBP_CM and LBP_ZWJ, respectively. (LB9).  */
+                if (!(((j == 1 || (j > 1 && (input[j - 2] >= 0x000A && input[j - 2] <= 0x000D || input[j - 2] == 0x0085)))
+                       && input[j - 1] == 0x0020)
+                      || ((j == 2 || (j > 2 && (input[j - 3] >= 0x000A && input[j - 3] <= 0x000D || input[j - 3] == 0x0085)))
+                          && ((input[j - 2] == 0x0020 && input[j - 1] == 0x0020)
+                              || (input[j - 2] == 0x0308 && input[j - 1] == 0x200D)
+                              || (input[j - 2] == 0x200D && input[j - 1] == 0x0308)))))
+                  matches &= (!(breaks[j] == UC_BREAK_PROHIBITED
+                                || breaks[j] == UC_BREAK_MANDATORY
+                                || breaks[j] == UC_BREAK_CR_BEFORE_LF)
+                              || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY))
+                             == breaks_expected[j];
+          }
+      }
+      if (!matches)
+        {
+          int j;
+
+          fprintf (stderr, "%s:%d: expected: ", filename, lineno);
+          for (j = 0; j < i - 1; j++)
+            fprintf (stderr, "%s U+%04X ",
+                     breaks_expected[j] == 1 ? "\303\267" : "\303\227",
+                     input[j]);
+          fprintf (stderr, "\n");
+          fprintf (stderr, "%s:%d: actual:   ", filename, lineno);
+          for (j = 0; j < i - 1; j++)
+            fprintf (stderr, "%s U+%04X ",
+                     (!(breaks[j] == UC_BREAK_PROHIBITED
+                        || breaks[j] == UC_BREAK_MANDATORY
+                        || breaks[j] == UC_BREAK_CR_BEFORE_LF)
+                      || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY))
+                     ? "\303\267" : "\303\227",
+                     input[j]);
+          fprintf (stderr, "\n");
+          exit_code = 1;
+        }
+    }
+
+  return exit_code;
+}
diff --git a/tests/unilbrk/test-uc-possible-linebreaks.sh b/tests/unilbrk/test-uc-possible-linebreaks.sh
new file mode 100755
index 0000000000..0df7bb915f
--- /dev/null
+++ b/tests/unilbrk/test-uc-possible-linebreaks.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+${CHECKER} ./test-uc-possible-linebreaks${EXEEXT} "${srcdir}/unilbrk/LineBreakTest.txt"
diff --git a/tests/unilbrk/LineBreakTest.txt b/tests/unilbrk/LineBreakTest.txt
new file mode 100644
index 0000000000..75044be059
--- /dev/null
+++ b/tests/unilbrk/LineBreakTest.txt
(omitted)

unilbrk: Strengthen tests

Reply via email to