And this patch makes sure that the same logic is also applied when a line break has been seen in the input string.
2024-09-15 Bruno Haible <br...@clisp.org> unilbrk: Some more tweaks. * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop): Assign to preceding_prop at each loop round. Assign to prev2_ea, prev_ea, prev_nus also when a line break was seen. * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop): Likewise. * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop): Likewise. diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c index 9678041606..ac655f8c82 100644 --- a/lib/unilbrk/u16-possible-linebreaks.c +++ b/lib/unilbrk/u16-possible-linebreaks.c @@ -122,7 +122,7 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, lookahead2_prop_ea = PROP_EA (LBP_BK, 0); } - bool nus = /* ending at the last character, there was a character + bool nus = /* ending at the previous character, there was a character with line break property LBP_NU and since then only characters with line break property LBP_SY or LBP_IS */ (prev_prop == LBP_NU @@ -136,10 +136,8 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, a CR-LF sequence. */ if (prev_prop == cr && prop == LBP_LF) p[-1] = UC_BREAK_CR_BEFORE_LF; - prev_prop = prop; last_prop = LBP_BK; seen_space = NULL; - prev_initial_hyphen = false; } else { @@ -385,33 +383,33 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, last_prop = prop; seen_space = NULL; } + } - /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line - break class except BK, CR, LF, NL, SP, or ZW. 
*/ - if (!((prop == LBP_CM || prop == LBP_ZWJ) - && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR - || prev_prop == LBP_SP || prev_prop == LBP_ZW))) - { - prev_initial_hyphen = - (prop == LBP_HY || uc == 0x2010) - && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF - || prev_prop == LBP_SP || prev_prop == LBP_ZW - || prev_prop == LBP_CB || prev_prop == LBP_GL); - prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK - || prev_prop == LBP_AL2 - || prev_prop == LBP_AS) - ? LBP_AKLS_VI : - prev_prop == LBP_HL && (prop == LBP_HY - || (prop == LBP_BA && !ea)) - ? LBP_HL_BA : - prop); - prev2_ea = prev_ea; - prev_ea = ea; - } - preceding_prop = prop; + /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line + break class except BK, CR, LF, NL, SP, or ZW. */ + if (!((prop == LBP_CM || prop == LBP_ZWJ) + && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR + || prev_prop == LBP_SP || prev_prop == LBP_ZW))) + { + prev_initial_hyphen = + (prop == LBP_HY || uc == 0x2010) + && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF + || prev_prop == LBP_SP || prev_prop == LBP_ZW + || prev_prop == LBP_CB || prev_prop == LBP_GL); + prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK + || prev_prop == LBP_AL2 + || prev_prop == LBP_AS) + ? LBP_AKLS_VI : + prev_prop == LBP_HL && (prop == LBP_HY + || (prop == LBP_BA && !ea)) + ? 
LBP_HL_BA : + prop); + prev2_ea = prev_ea; + prev_ea = ea; + prev_nus = nus; } - prev_nus = nus; + preceding_prop = prop; if (prop == LBP_RI) ri_count++; diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c index c12ca3abbb..f242c9a8ee 100644 --- a/lib/unilbrk/u32-possible-linebreaks.c +++ b/lib/unilbrk/u32-possible-linebreaks.c @@ -115,7 +115,7 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, lookahead2_prop_ea = PROP_EA (LBP_BK, 0); } - bool nus = /* ending at the last character, there was a character + bool nus = /* ending at the previous character, there was a character with line break property LBP_NU and since then only characters with line break property LBP_SY or LBP_IS */ (prev_prop == LBP_NU @@ -129,10 +129,8 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, a CR-LF sequence. */ if (prev_prop == cr && prop == LBP_LF) p[-1] = UC_BREAK_CR_BEFORE_LF; - prev_prop = prop; last_prop = LBP_BK; seen_space = NULL; - prev_initial_hyphen = false; } else { @@ -378,33 +376,33 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, last_prop = prop; seen_space = NULL; } + } - /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line - break class except BK, CR, LF, NL, SP, or ZW. */ - if (!((prop == LBP_CM || prop == LBP_ZWJ) - && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR - || prev_prop == LBP_SP || prev_prop == LBP_ZW))) - { - prev_initial_hyphen = - (prop == LBP_HY || uc == 0x2010) - && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF - || prev_prop == LBP_SP || prev_prop == LBP_ZW - || prev_prop == LBP_CB || prev_prop == LBP_GL); - prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK - || prev_prop == LBP_AL2 - || prev_prop == LBP_AS) - ? LBP_AKLS_VI : - prev_prop == LBP_HL && (prop == LBP_HY - || (prop == LBP_BA && !ea)) - ? 
LBP_HL_BA : - prop); - prev2_ea = prev_ea; - prev_ea = ea; - } - preceding_prop = prop; + /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line + break class except BK, CR, LF, NL, SP, or ZW. */ + if (!((prop == LBP_CM || prop == LBP_ZWJ) + && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR + || prev_prop == LBP_SP || prev_prop == LBP_ZW))) + { + prev_initial_hyphen = + (prop == LBP_HY || uc == 0x2010) + && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF + || prev_prop == LBP_SP || prev_prop == LBP_ZW + || prev_prop == LBP_CB || prev_prop == LBP_GL); + prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK + || prev_prop == LBP_AL2 + || prev_prop == LBP_AS) + ? LBP_AKLS_VI : + prev_prop == LBP_HL && (prop == LBP_HY + || (prop == LBP_BA && !ea)) + ? LBP_HL_BA : + prop); + prev2_ea = prev_ea; + prev_ea = ea; + prev_nus = nus; } - prev_nus = nus; + preceding_prop = prop; if (prop == LBP_RI) ri_count++; diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c index 3f32ee6289..90d64c51b8 100644 --- a/lib/unilbrk/u8-possible-linebreaks.c +++ b/lib/unilbrk/u8-possible-linebreaks.c @@ -122,7 +122,7 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, lookahead2_prop_ea = PROP_EA (LBP_BK, 0); } - bool nus = /* ending at the last character, there was a character + bool nus = /* ending at the previous character, there was a character with line break property LBP_NU and since then only characters with line break property LBP_SY or LBP_IS */ (prev_prop == LBP_NU @@ -136,10 +136,8 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, a CR-LF sequence. 
*/ if (prev_prop == cr && prop == LBP_LF) p[-1] = UC_BREAK_CR_BEFORE_LF; - prev_prop = prop; last_prop = LBP_BK; seen_space = NULL; - prev_initial_hyphen = false; } else { @@ -385,33 +383,33 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, last_prop = prop; seen_space = NULL; } + } - /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line - break class except BK, CR, LF, NL, SP, or ZW. */ - if (!((prop == LBP_CM || prop == LBP_ZWJ) - && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR - || prev_prop == LBP_SP || prev_prop == LBP_ZW))) - { - prev_initial_hyphen = - (prop == LBP_HY || uc == 0x2010) - && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF - || prev_prop == LBP_SP || prev_prop == LBP_ZW - || prev_prop == LBP_CB || prev_prop == LBP_GL); - prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK - || prev_prop == LBP_AL2 - || prev_prop == LBP_AS) - ? LBP_AKLS_VI : - prev_prop == LBP_HL && (prop == LBP_HY - || (prop == LBP_BA && !ea)) - ? LBP_HL_BA : - prop); - prev2_ea = prev_ea; - prev_ea = ea; - } - preceding_prop = prop; + /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line + break class except BK, CR, LF, NL, SP, or ZW. */ + if (!((prop == LBP_CM || prop == LBP_ZWJ) + && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR + || prev_prop == LBP_SP || prev_prop == LBP_ZW))) + { + prev_initial_hyphen = + (prop == LBP_HY || uc == 0x2010) + && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF + || prev_prop == LBP_SP || prev_prop == LBP_ZW + || prev_prop == LBP_CB || prev_prop == LBP_GL); + prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK + || prev_prop == LBP_AL2 + || prev_prop == LBP_AS) + ? LBP_AKLS_VI : + prev_prop == LBP_HL && (prop == LBP_HY + || (prop == LBP_BA && !ea)) + ? LBP_HL_BA : + prop); + prev2_ea = prev_ea; + prev_ea = ea; + prev_nus = nus; } - prev_nus = nus; + preceding_prop = prop; if (prop == LBP_RI) ri_count++;