And this patch makes sure that we also apply the same logic when a
line break was seen in the input string.


2024-09-15  Bruno Haible  <br...@clisp.org>

        unilbrk: Some more tweaks.
        * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop):
        Assign to preceding_prop at each loop round. Assign to prev2_ea,
        prev_ea, prev_nus also when a line break was seen.
        * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop):
        Likewise.
        * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop):
        Likewise.

diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c
index 9678041606..ac655f8c82 100644
--- a/lib/unilbrk/u16-possible-linebreaks.c
+++ b/lib/unilbrk/u16-possible-linebreaks.c
@@ -122,7 +122,7 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
               lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
             }
 
-          bool nus = /* ending at the last character, there was a character
+          bool nus = /* ending at the previous character, there was a character
                         with line break property LBP_NU and since then only
                         characters with line break property LBP_SY or LBP_IS */
             (prev_prop == LBP_NU
@@ -136,10 +136,8 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
                  a CR-LF sequence.  */
               if (prev_prop == cr && prop == LBP_LF)
                 p[-1] = UC_BREAK_CR_BEFORE_LF;
-              prev_prop = prop;
               last_prop = LBP_BK;
               seen_space = NULL;
-              prev_initial_hyphen = false;
             }
           else
             {
@@ -385,33 +383,33 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
                   last_prop = prop;
                   seen_space = NULL;
                 }
+            }
 
-              /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
-                 break class except BK, CR, LF, NL, SP, or ZW.  */
-              if (!((prop == LBP_CM || prop == LBP_ZWJ)
-                    && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
-                         || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
-                {
-                  prev_initial_hyphen =
-                    (prop == LBP_HY || uc == 0x2010)
-                    && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
-                        || prev_prop == LBP_SP || prev_prop == LBP_ZW
-                        || prev_prop == LBP_CB || prev_prop == LBP_GL);
-                  prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
-                                                  || prev_prop == LBP_AL2
-                                                  || prev_prop == LBP_AS)
-                               ? LBP_AKLS_VI :
-                               prev_prop == LBP_HL && (prop == LBP_HY
-                                                       || (prop == LBP_BA && !ea))
-                               ? LBP_HL_BA :
-                               prop);
-                  prev2_ea = prev_ea;
-                  prev_ea = ea;
-                }
-              preceding_prop = prop;
+          /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
+             break class except BK, CR, LF, NL, SP, or ZW.  */
+          if (!((prop == LBP_CM || prop == LBP_ZWJ)
+                && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
+                     || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
+            {
+              prev_initial_hyphen =
+                (prop == LBP_HY || uc == 0x2010)
+                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
+                    || prev_prop == LBP_SP || prev_prop == LBP_ZW
+                    || prev_prop == LBP_CB || prev_prop == LBP_GL);
+              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
+                                              || prev_prop == LBP_AL2
+                                              || prev_prop == LBP_AS)
+                           ? LBP_AKLS_VI :
+                           prev_prop == LBP_HL && (prop == LBP_HY
+                                                   || (prop == LBP_BA && !ea))
+                           ? LBP_HL_BA :
+                           prop);
+              prev2_ea = prev_ea;
+              prev_ea = ea;
+              prev_nus = nus;
             }
 
-          prev_nus = nus;
+          preceding_prop = prop;
 
           if (prop == LBP_RI)
             ri_count++;
diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c
index c12ca3abbb..f242c9a8ee 100644
--- a/lib/unilbrk/u32-possible-linebreaks.c
+++ b/lib/unilbrk/u32-possible-linebreaks.c
@@ -115,7 +115,7 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
               lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
             }
 
-          bool nus = /* ending at the last character, there was a character
+          bool nus = /* ending at the previous character, there was a character
                         with line break property LBP_NU and since then only
                         characters with line break property LBP_SY or LBP_IS */
             (prev_prop == LBP_NU
@@ -129,10 +129,8 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
                  a CR-LF sequence.  */
               if (prev_prop == cr && prop == LBP_LF)
                 p[-1] = UC_BREAK_CR_BEFORE_LF;
-              prev_prop = prop;
               last_prop = LBP_BK;
               seen_space = NULL;
-              prev_initial_hyphen = false;
             }
           else
             {
@@ -378,33 +376,33 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
                   last_prop = prop;
                   seen_space = NULL;
                 }
+            }
 
-              /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
-                 break class except BK, CR, LF, NL, SP, or ZW.  */
-              if (!((prop == LBP_CM || prop == LBP_ZWJ)
-                    && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
-                         || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
-                {
-                  prev_initial_hyphen =
-                    (prop == LBP_HY || uc == 0x2010)
-                    && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
-                        || prev_prop == LBP_SP || prev_prop == LBP_ZW
-                        || prev_prop == LBP_CB || prev_prop == LBP_GL);
-                  prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
-                                                  || prev_prop == LBP_AL2
-                                                  || prev_prop == LBP_AS)
-                               ? LBP_AKLS_VI :
-                               prev_prop == LBP_HL && (prop == LBP_HY
-                                                       || (prop == LBP_BA && !ea))
-                               ? LBP_HL_BA :
-                               prop);
-                  prev2_ea = prev_ea;
-                  prev_ea = ea;
-                }
-              preceding_prop = prop;
+          /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
+             break class except BK, CR, LF, NL, SP, or ZW.  */
+          if (!((prop == LBP_CM || prop == LBP_ZWJ)
+                && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
+                     || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
+            {
+              prev_initial_hyphen =
+                (prop == LBP_HY || uc == 0x2010)
+                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
+                    || prev_prop == LBP_SP || prev_prop == LBP_ZW
+                    || prev_prop == LBP_CB || prev_prop == LBP_GL);
+              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
+                                              || prev_prop == LBP_AL2
+                                              || prev_prop == LBP_AS)
+                           ? LBP_AKLS_VI :
+                           prev_prop == LBP_HL && (prop == LBP_HY
+                                                   || (prop == LBP_BA && !ea))
+                           ? LBP_HL_BA :
+                           prop);
+              prev2_ea = prev_ea;
+              prev_ea = ea;
+              prev_nus = nus;
             }
 
-          prev_nus = nus;
+          preceding_prop = prop;
 
           if (prop == LBP_RI)
             ri_count++;
diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c
index 3f32ee6289..90d64c51b8 100644
--- a/lib/unilbrk/u8-possible-linebreaks.c
+++ b/lib/unilbrk/u8-possible-linebreaks.c
@@ -122,7 +122,7 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
               lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
             }
 
-          bool nus = /* ending at the last character, there was a character
+          bool nus = /* ending at the previous character, there was a character
                         with line break property LBP_NU and since then only
                         characters with line break property LBP_SY or LBP_IS */
             (prev_prop == LBP_NU
@@ -136,10 +136,8 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
                  a CR-LF sequence.  */
               if (prev_prop == cr && prop == LBP_LF)
                 p[-1] = UC_BREAK_CR_BEFORE_LF;
-              prev_prop = prop;
               last_prop = LBP_BK;
               seen_space = NULL;
-              prev_initial_hyphen = false;
             }
           else
             {
@@ -385,33 +383,33 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
                   last_prop = prop;
                   seen_space = NULL;
                 }
+            }
 
-              /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
-                 break class except BK, CR, LF, NL, SP, or ZW.  */
-              if (!((prop == LBP_CM || prop == LBP_ZWJ)
-                    && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
-                         || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
-                {
-                  prev_initial_hyphen =
-                    (prop == LBP_HY || uc == 0x2010)
-                    && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
-                        || prev_prop == LBP_SP || prev_prop == LBP_ZW
-                        || prev_prop == LBP_CB || prev_prop == LBP_GL);
-                  prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
-                                                  || prev_prop == LBP_AL2
-                                                  || prev_prop == LBP_AS)
-                               ? LBP_AKLS_VI :
-                               prev_prop == LBP_HL && (prop == LBP_HY
-                                                       || (prop == LBP_BA && !ea))
-                               ? LBP_HL_BA :
-                               prop);
-                  prev2_ea = prev_ea;
-                  prev_ea = ea;
-                }
-              preceding_prop = prop;
+          /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
+             break class except BK, CR, LF, NL, SP, or ZW.  */
+          if (!((prop == LBP_CM || prop == LBP_ZWJ)
+                && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
+                     || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
+            {
+              prev_initial_hyphen =
+                (prop == LBP_HY || uc == 0x2010)
+                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
+                    || prev_prop == LBP_SP || prev_prop == LBP_ZW
+                    || prev_prop == LBP_CB || prev_prop == LBP_GL);
+              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
+                                              || prev_prop == LBP_AL2
+                                              || prev_prop == LBP_AS)
+                           ? LBP_AKLS_VI :
+                           prev_prop == LBP_HL && (prop == LBP_HY
+                                                   || (prop == LBP_BA && !ea))
+                           ? LBP_HL_BA :
+                           prop);
+              prev2_ea = prev_ea;
+              prev_ea = ea;
+              prev_nus = nus;
             }
 
-          prev_nus = nus;
+          preceding_prop = prop;
 
           if (prop == LBP_RI)
             ri_count++;




Reply via email to