unilbrk: Fully implement Unicode rule (LB9)

Bruno Haible Sun, 15 Sep 2024 05:55:54 -0700

Part of the (LB9) rule was not implemented correctly in the unilbrk modules,
namely the requirement "Treat X (CM | ZWJ)* as if it were X."
This patch implements it, taking care not to introduce a conflict
with rule (LB8a) "Do not break after a zero width joiner."


This patch also fixes a couple of test failures from LineBreak.txt.


2024-09-15  Bruno Haible  <br...@clisp.org>

        unilbrk: Fully implement Unicode rule (LB9).
        * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop):
        New variable preceding_prop. Don't copy prop to prev_prop and ea to
        prev_ea if the conditions of rule (LB9) apply.
        * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop):
        Likewise.
        * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop):
        Likewise.

diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c
index d2d6343234..dde1aa6064 100644
--- a/lib/unilbrk/u16-possible-linebreaks.c
+++ b/lib/unilbrk/u16-possible-linebreaks.c
@@ -78,17 +78,20 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, 
const char *encoding,
           lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
         }
 
-      int prev_prop = LBP_BK; /* line break property of last character */
-      int prev_ea = 0;        /* EastAsian property of last character */
-      int prev2_ea = 0;       /* EastAsian property of character before the
-                                 last character */
-      bool prev_initial_hyphen = false; /* the last character was a 
word-initial
-                                           hyphen or U+2010 */
-      bool prev_nus = false; /* before the last character, there was a 
character
+      int preceding_prop = LBP_BK; /* line break property of preceding 
character */
+      int prev_prop = LBP_BK; /* line break property of previous character
+                                 (= last character, ignoring intervening 
characters of class CM or ZWJ) */
+      int prev_ea = 0;        /* EastAsian property of previous character
+                                 (= last character, ignoring intervening 
characters of class CM or ZWJ) */
+      int prev2_ea = 0;       /* EastAsian property of character before the 
previous character */
+      bool prev_initial_hyphen = false; /* the previous character was a
+                                           word-initial hyphen or U+2010 */
+      bool prev_nus = false; /* before the previous character, there was a 
character
                                 with line break property LBP_NU and since then
                                 only characters with line break property LBP_SY
                                 or LBP_IS */
-      int last_prop = LBP_BK; /* line break property of last non-space 
character */
+      int last_prop = LBP_BK; /* line break property of last non-space 
character
+                                 (= last character, ignoring intervening 
characters of class SP or CM or ZWJ) */
       char *seen_space = NULL; /* Was a space seen after the last non-space 
character? */
 
       /* Number of consecutive regional indicator (RI) characters seen
@@ -221,7 +224,7 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, 
const char *encoding,
                       /* (LB8) Break after zero-width space.  */
                       *p = UC_BREAK_POSSIBLE;
                     }
-                  else if (prev_prop == LBP_ZWJ)
+                  else if (preceding_prop == LBP_ZWJ)
                     {
                       /* (LB8a) Don't break right after a zero-width joiner.  
*/
                       *p = UC_BREAK_PROHIBITED;
@@ -368,21 +371,29 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t 
n, const char *encoding,
                   seen_space = NULL;
                 }
 
-              prev_initial_hyphen =
-                (prop == LBP_HY || uc == 0x2010)
-                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == 
LBP_LF
-                    || prev_prop == LBP_SP || prev_prop == LBP_ZW
-                    || prev_prop == LBP_CB || prev_prop == LBP_GL);
-              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
-                                              || prev_prop == LBP_AL2
-                                              || prev_prop == LBP_AS)
-                           ? LBP_AKLS_VI :
-                           prev_prop == LBP_HL && (prop == LBP_HY
-                                                   || (prop == LBP_BA && !ea))
-                           ? LBP_HL_BA :
-                           prop);
-              prev2_ea = prev_ea;
-              prev_ea = ea;
+              /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
+                 break class except BK, CR, LF, NL, SP, or ZW.  */
+              if (!((prop == LBP_CM || prop == LBP_ZWJ)
+                    && !(prev_prop == LBP_BK || prev_prop == LBP_LF || 
prev_prop == LBP_CR
+                         || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
+                {
+                  prev_initial_hyphen =
+                    (prop == LBP_HY || uc == 0x2010)
+                    && (prev_prop == LBP_BK || prev_prop == LBP_CR || 
prev_prop == LBP_LF
+                        || prev_prop == LBP_SP || prev_prop == LBP_ZW
+                        || prev_prop == LBP_CB || prev_prop == LBP_GL);
+                  prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
+                                                  || prev_prop == LBP_AL2
+                                                  || prev_prop == LBP_AS)
+                               ? LBP_AKLS_VI :
+                               prev_prop == LBP_HL && (prop == LBP_HY
+                                                       || (prop == LBP_BA && 
!ea))
+                               ? LBP_HL_BA :
+                               prop);
+                  prev2_ea = prev_ea;
+                  prev_ea = ea;
+                }
+              preceding_prop = prop;
             }
 
           prev_nus = nus;
diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c
index 10f39a8d05..9cdda2868d 100644
--- a/lib/unilbrk/u32-possible-linebreaks.c
+++ b/lib/unilbrk/u32-possible-linebreaks.c
@@ -72,17 +72,20 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, 
const char *encoding,
           lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
         }
 
-      int prev_prop = LBP_BK; /* line break property of last character */
-      int prev_ea = 0;        /* EastAsian property of last character */
-      int prev2_ea = 0;       /* EastAsian property of character before the
-                                 last character */
-      bool prev_initial_hyphen = false; /* the last character was a 
word-initial
-                                           hyphen or U+2010 */
-      bool prev_nus = false; /* before the last character, there was a 
character
+      int preceding_prop = LBP_BK; /* line break property of preceding 
character */
+      int prev_prop = LBP_BK; /* line break property of previous character
+                                 (= last character, ignoring intervening 
characters of class CM or ZWJ) */
+      int prev_ea = 0;        /* EastAsian property of previous character
+                                 (= last character, ignoring intervening 
characters of class CM or ZWJ) */
+      int prev2_ea = 0;       /* EastAsian property of character before the 
previous character */
+      bool prev_initial_hyphen = false; /* the previous character was a
+                                           word-initial hyphen or U+2010 */
+      bool prev_nus = false; /* before the previous character, there was a 
character
                                 with line break property LBP_NU and since then
                                 only characters with line break property LBP_SY
                                 or LBP_IS */
-      int last_prop = LBP_BK; /* line break property of last non-space 
character */
+      int last_prop = LBP_BK; /* line break property of last non-space 
character
+                                 (= last character, ignoring intervening 
characters of class SP or CM or ZWJ) */
       char *seen_space = NULL; /* Was a space seen after the last non-space 
character? */
 
       /* Number of consecutive regional indicator (RI) characters seen
@@ -214,7 +217,7 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, 
const char *encoding,
                       /* (LB8) Break after zero-width space.  */
                       *p = UC_BREAK_POSSIBLE;
                     }
-                  else if (prev_prop == LBP_ZWJ)
+                  else if (preceding_prop == LBP_ZWJ)
                     {
                       /* (LB8a) Don't break right after a zero-width joiner.  
*/
                       *p = UC_BREAK_PROHIBITED;
@@ -361,21 +364,29 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t 
n, const char *encoding,
                   seen_space = NULL;
                 }
 
-              prev_initial_hyphen =
-                (prop == LBP_HY || uc == 0x2010)
-                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == 
LBP_LF
-                    || prev_prop == LBP_SP || prev_prop == LBP_ZW
-                    || prev_prop == LBP_CB || prev_prop == LBP_GL);
-              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
-                                              || prev_prop == LBP_AL2
-                                              || prev_prop == LBP_AS)
-                           ? LBP_AKLS_VI :
-                           prev_prop == LBP_HL && (prop == LBP_HY
-                                                   || (prop == LBP_BA && !ea))
-                           ? LBP_HL_BA :
-                           prop);
-              prev2_ea = prev_ea;
-              prev_ea = ea;
+              /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
+                 break class except BK, CR, LF, NL, SP, or ZW.  */
+              if (!((prop == LBP_CM || prop == LBP_ZWJ)
+                    && !(prev_prop == LBP_BK || prev_prop == LBP_LF || 
prev_prop == LBP_CR
+                         || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
+                {
+                  prev_initial_hyphen =
+                    (prop == LBP_HY || uc == 0x2010)
+                    && (prev_prop == LBP_BK || prev_prop == LBP_CR || 
prev_prop == LBP_LF
+                        || prev_prop == LBP_SP || prev_prop == LBP_ZW
+                        || prev_prop == LBP_CB || prev_prop == LBP_GL);
+                  prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
+                                                  || prev_prop == LBP_AL2
+                                                  || prev_prop == LBP_AS)
+                               ? LBP_AKLS_VI :
+                               prev_prop == LBP_HL && (prop == LBP_HY
+                                                       || (prop == LBP_BA && 
!ea))
+                               ? LBP_HL_BA :
+                               prop);
+                  prev2_ea = prev_ea;
+                  prev_ea = ea;
+                }
+              preceding_prop = prop;
             }
 
           prev_nus = nus;
diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c
index 65248e5c40..6c26a8c26f 100644
--- a/lib/unilbrk/u8-possible-linebreaks.c
+++ b/lib/unilbrk/u8-possible-linebreaks.c
@@ -78,17 +78,20 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, 
const char *encoding,
           lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
         }
 
-      int prev_prop = LBP_BK; /* line break property of last character */
-      int prev_ea = 0;        /* EastAsian property of last character */
-      int prev2_ea = 0;       /* EastAsian property of character before the
-                                 last character */
-      bool prev_initial_hyphen = false; /* the last character was a 
word-initial
-                                           hyphen or U+2010 */
-      bool prev_nus = false; /* before the last character, there was a 
character
+      int preceding_prop = LBP_BK; /* line break property of preceding 
character */
+      int prev_prop = LBP_BK; /* line break property of previous character
+                                 (= last character, ignoring intervening 
characters of class CM or ZWJ) */
+      int prev_ea = 0;        /* EastAsian property of previous character
+                                 (= last character, ignoring intervening 
characters of class CM or ZWJ) */
+      int prev2_ea = 0;       /* EastAsian property of character before the 
previous character */
+      bool prev_initial_hyphen = false; /* the previous character was a
+                                           word-initial hyphen or U+2010 */
+      bool prev_nus = false; /* before the previous character, there was a 
character
                                 with line break property LBP_NU and since then
                                 only characters with line break property LBP_SY
                                 or LBP_IS */
-      int last_prop = LBP_BK; /* line break property of last non-space 
character */
+      int last_prop = LBP_BK; /* line break property of last non-space 
character
+                                 (= last character, ignoring intervening 
characters of class SP or CM or ZWJ) */
       char *seen_space = NULL; /* Was a space seen after the last non-space 
character? */
 
       /* Number of consecutive regional indicator (RI) characters seen
@@ -221,7 +224,7 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, 
const char *encoding,
                       /* (LB8) Break after zero-width space.  */
                       *p = UC_BREAK_POSSIBLE;
                     }
-                  else if (prev_prop == LBP_ZWJ)
+                  else if (preceding_prop == LBP_ZWJ)
                     {
                       /* (LB8a) Don't break right after a zero-width joiner.  
*/
                       *p = UC_BREAK_PROHIBITED;
@@ -368,21 +371,29 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, 
const char *encoding,
                   seen_space = NULL;
                 }
 
-              prev_initial_hyphen =
-                (prop == LBP_HY || uc == 0x2010)
-                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == 
LBP_LF
-                    || prev_prop == LBP_SP || prev_prop == LBP_ZW
-                    || prev_prop == LBP_CB || prev_prop == LBP_GL);
-              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
-                                              || prev_prop == LBP_AL2
-                                              || prev_prop == LBP_AS)
-                           ? LBP_AKLS_VI :
-                           prev_prop == LBP_HL && (prop == LBP_HY
-                                                   || (prop == LBP_BA && !ea))
-                           ? LBP_HL_BA :
-                           prop);
-              prev2_ea = prev_ea;
-              prev_ea = ea;
+              /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
+                 break class except BK, CR, LF, NL, SP, or ZW.  */
+              if (!((prop == LBP_CM || prop == LBP_ZWJ)
+                    && !(prev_prop == LBP_BK || prev_prop == LBP_LF || 
prev_prop == LBP_CR
+                         || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
+                {
+                  prev_initial_hyphen =
+                    (prop == LBP_HY || uc == 0x2010)
+                    && (prev_prop == LBP_BK || prev_prop == LBP_CR || 
prev_prop == LBP_LF
+                        || prev_prop == LBP_SP || prev_prop == LBP_ZW
+                        || prev_prop == LBP_CB || prev_prop == LBP_GL);
+                  prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
+                                                  || prev_prop == LBP_AL2
+                                                  || prev_prop == LBP_AS)
+                               ? LBP_AKLS_VI :
+                               prev_prop == LBP_HL && (prop == LBP_HY
+                                                       || (prop == LBP_BA && 
!ea))
+                               ? LBP_HL_BA :
+                               prop);
+                  prev2_ea = prev_ea;
+                  prev_ea = ea;
+                }
+              preceding_prop = prop;
             }
 
           prev_nus = nus;

unilbrk: Fully implement Unicode rule (LB9)

Reply via email to