The implementation of the Unicode line breaking rules (LB15a) and (LB15b), added on 2024-01-30, was not correct: it causes several test failures in Unicode.org's LineBreak.txt. This patch fixes it.
2024-09-15 Bruno Haible <br...@clisp.org> unilbrk: Fix bugs in implementation of Unicode rules (LB15a), (LB15b). * lib/gen-uni-tables.c (output_lbrk_rules_as_tables): Fix typo in comment. * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop): Do the LBP_QU2 to LBP_QU1 mapping for (LB15a) after the table lookup but before the assignment to last_prop. Do the LBP_QU3 to LBP_QU1 mapping for (LB15b) in a way that does not influence prev_prop or last_prop. * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop): Likewise. * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop): Likewise. diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 0ea9de9f28..6ac81d0821 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -9424,7 +9424,7 @@ output_lbrk_rules_as_tables (const char *filename, const char *version) set_table_cell (prohibited_with_sp, true); } - /* (LB15a) Do not break before an ambiguous quotation that is an initial + /* (LB15a) Do not break after an ambiguous quotation that is an initial punctuation, even after spaces. */ for (after = 0; after < NLBP; after++) { diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c index e327575a34..d2d6343234 100644 --- a/lib/unilbrk/u16-possible-linebreaks.c +++ b/lib/unilbrk/u16-possible-linebreaks.c @@ -158,51 +158,6 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, /* This is arbitrary. */ prop = LBP_AL1; break; - case LBP_QU2: - /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous - character's line break property was not one of - BK, CR, LF, OP, QU, GL, SP, ZW. 
*/ - switch (prev_prop) - { - case LBP_BK: - case LBP_CR: - case LBP_LF: - case LBP_OP1: case LBP_OP2: - case LBP_QU1: case LBP_QU2: case LBP_QU3: - case LBP_GL: - case LBP_SP: - case LBP_ZW: - break; - default: - prop = LBP_QU1; - break; - } - break; - case LBP_QU3: - /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next - character's line break property is not one of - BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */ - switch (PROP (lookahead1_prop_ea)) - { - case LBP_BK: - case LBP_CR: - case LBP_LF: - case LBP_SP: - case LBP_GL: - case LBP_WJ: - case LBP_CL: - case LBP_QU1: case LBP_QU2: case LBP_QU3: - case LBP_CP1: case LBP_CP2: - case LBP_EX: - case LBP_IS: - case LBP_SY: - case LBP_ZW: - break; - default: - prop = LBP_QU1; - break; - } - break; } /* Deal with spaces and combining characters. */ @@ -343,7 +298,35 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, } else { - switch (unilbrk_table [last_prop] [prop]) + int this_prop = prop; + if (prop == LBP_QU3) + { + /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the + next character's line break property is not one of + BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */ + switch (PROP (lookahead1_prop_ea)) + { + case LBP_BK: + case LBP_CR: + case LBP_LF: + case LBP_SP: + case LBP_GL: + case LBP_WJ: + case LBP_CL: + case LBP_QU1: case LBP_QU2: case LBP_QU3: + case LBP_CP1: case LBP_CP2: + case LBP_EX: + case LBP_IS: + case LBP_SY: + case LBP_ZW: + break; + default: + this_prop = LBP_QU1; + break; + } + } + + switch (unilbrk_table [last_prop] [this_prop]) { case D: *p = UC_BREAK_POSSIBLE; @@ -358,6 +341,29 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, abort (); } } + + if (prop == LBP_QU2) + { + /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the + previous character's line break property was not one of + BK, CR, LF, OP, QU, GL, SP, ZW. 
*/ + switch (prev_prop) + { + case LBP_BK: + case LBP_CR: + case LBP_LF: + case LBP_OP1: case LBP_OP2: + case LBP_QU1: case LBP_QU2: case LBP_QU3: + case LBP_GL: + case LBP_SP: + case LBP_ZW: + break; + default: + prop = LBP_QU1; + break; + } + } + last_prop = prop; seen_space = NULL; } diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c index b351ece0d4..10f39a8d05 100644 --- a/lib/unilbrk/u32-possible-linebreaks.c +++ b/lib/unilbrk/u32-possible-linebreaks.c @@ -151,51 +151,6 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, /* This is arbitrary. */ prop = LBP_AL1; break; - case LBP_QU2: - /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous - character's line break property was not one of - BK, CR, LF, OP, QU, GL, SP, ZW. */ - switch (prev_prop) - { - case LBP_BK: - case LBP_CR: - case LBP_LF: - case LBP_OP1: case LBP_OP2: - case LBP_QU1: case LBP_QU2: case LBP_QU3: - case LBP_GL: - case LBP_SP: - case LBP_ZW: - break; - default: - prop = LBP_QU1; - break; - } - break; - case LBP_QU3: - /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next - character's line break property is not one of - BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */ - switch (PROP (lookahead1_prop_ea)) - { - case LBP_BK: - case LBP_CR: - case LBP_LF: - case LBP_SP: - case LBP_GL: - case LBP_WJ: - case LBP_CL: - case LBP_QU1: case LBP_QU2: case LBP_QU3: - case LBP_CP1: case LBP_CP2: - case LBP_EX: - case LBP_IS: - case LBP_SY: - case LBP_ZW: - break; - default: - prop = LBP_QU1; - break; - } - break; } /* Deal with spaces and combining characters. 
*/ @@ -336,7 +291,35 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, } else { - switch (unilbrk_table [last_prop] [prop]) + int this_prop = prop; + if (prop == LBP_QU3) + { + /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the + next character's line break property is not one of + BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */ + switch (PROP (lookahead1_prop_ea)) + { + case LBP_BK: + case LBP_CR: + case LBP_LF: + case LBP_SP: + case LBP_GL: + case LBP_WJ: + case LBP_CL: + case LBP_QU1: case LBP_QU2: case LBP_QU3: + case LBP_CP1: case LBP_CP2: + case LBP_EX: + case LBP_IS: + case LBP_SY: + case LBP_ZW: + break; + default: + this_prop = LBP_QU1; + break; + } + } + + switch (unilbrk_table [last_prop] [this_prop]) { case D: *p = UC_BREAK_POSSIBLE; @@ -351,6 +334,29 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, abort (); } } + + if (prop == LBP_QU2) + { + /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the + previous character's line break property was not one of + BK, CR, LF, OP, QU, GL, SP, ZW. */ + switch (prev_prop) + { + case LBP_BK: + case LBP_CR: + case LBP_LF: + case LBP_OP1: case LBP_OP2: + case LBP_QU1: case LBP_QU2: case LBP_QU3: + case LBP_GL: + case LBP_SP: + case LBP_ZW: + break; + default: + prop = LBP_QU1; + break; + } + } + last_prop = prop; seen_space = NULL; } diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c index dd7d7c6fbd..65248e5c40 100644 --- a/lib/unilbrk/u8-possible-linebreaks.c +++ b/lib/unilbrk/u8-possible-linebreaks.c @@ -158,51 +158,6 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, /* This is arbitrary. */ prop = LBP_AL1; break; - case LBP_QU2: - /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous - character's line break property was not one of - BK, CR, LF, OP, QU, GL, SP, ZW. 
*/ - switch (prev_prop) - { - case LBP_BK: - case LBP_CR: - case LBP_LF: - case LBP_OP1: case LBP_OP2: - case LBP_QU1: case LBP_QU2: case LBP_QU3: - case LBP_GL: - case LBP_SP: - case LBP_ZW: - break; - default: - prop = LBP_QU1; - break; - } - break; - case LBP_QU3: - /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next - character's line break property is not one of - BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */ - switch (PROP (lookahead1_prop_ea)) - { - case LBP_BK: - case LBP_CR: - case LBP_LF: - case LBP_SP: - case LBP_GL: - case LBP_WJ: - case LBP_CL: - case LBP_QU1: case LBP_QU2: case LBP_QU3: - case LBP_CP1: case LBP_CP2: - case LBP_EX: - case LBP_IS: - case LBP_SY: - case LBP_ZW: - break; - default: - prop = LBP_QU1; - break; - } - break; } /* Deal with spaces and combining characters. */ @@ -343,7 +298,35 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, } else { - switch (unilbrk_table [last_prop] [prop]) + int this_prop = prop; + if (prop == LBP_QU3) + { + /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the + next character's line break property is not one of + BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */ + switch (PROP (lookahead1_prop_ea)) + { + case LBP_BK: + case LBP_CR: + case LBP_LF: + case LBP_SP: + case LBP_GL: + case LBP_WJ: + case LBP_CL: + case LBP_QU1: case LBP_QU2: case LBP_QU3: + case LBP_CP1: case LBP_CP2: + case LBP_EX: + case LBP_IS: + case LBP_SY: + case LBP_ZW: + break; + default: + this_prop = LBP_QU1; + break; + } + } + + switch (unilbrk_table [last_prop] [this_prop]) { case D: *p = UC_BREAK_POSSIBLE; @@ -358,6 +341,29 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, abort (); } } + + if (prop == LBP_QU2) + { + /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the + previous character's line break property was not one of + BK, CR, LF, OP, QU, GL, SP, ZW. 
*/ + switch (prev_prop) + { + case LBP_BK: + case LBP_CR: + case LBP_LF: + case LBP_OP1: case LBP_OP2: + case LBP_QU1: case LBP_QU2: case LBP_QU3: + case LBP_GL: + case LBP_SP: + case LBP_ZW: + break; + default: + prop = LBP_QU1; + break; + } + } + last_prop = prop; seen_space = NULL; }