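This patch enables Unicode line-breaking rule (LB29), which prohibits a break between numeric punctuation (class IS) and alphabetics. I had previously disabled the rule entirely because I did not like its effect in one special case: it prevents a break before the HTML tag "</P>" in strings like "<P>Some sentence.</P>". With this patch, the rule is enabled and only that special case (a break between a character with property IS and '<') remains exempt.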
2024-09-15 Bruno Haible <br...@clisp.org> unilbrk: Enable most cases of Unicode rule (LB29). * lib/gen-uni-tables.c (output_lbrk_rules_as_tables): Enable rule LB29. * lib/unilbrk/lbrktables.c: Regenerated. * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop): Allow a break between a character with property IS and '<'. * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop): Likewise. * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop): Likewise. diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 6ac81d0821..f95a78dbf1 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -9273,12 +9273,8 @@ output_lbrk_rules_as_tables (const char *filename, const char *version) /* (LB29) Do not break between numeric punctuation and alphabetics ("e.g."). */ - /* We don't implement this rule, because we find it desirable to break before - the HTML tag "</P>" in strings like "<P>Some sentence.</P>". */ -#if 0 before = LBP_IS; after = LBP_AL; set_table_cell (prohibited_no_sp, true); before = LBP_IS; after = LBP_HL; set_table_cell (prohibited_no_sp, true); -#endif /* (LB28a) Do not break inside the orthographic syllables of Brahmic scripts. 
*/ diff --git a/lib/unilbrk/lbrktables.c b/lib/unilbrk/lbrktables.c index 217933a526..c09f5855a7 100644 --- a/lib/unilbrk/lbrktables.c +++ b/lib/unilbrk/lbrktables.c @@ -54,7 +54,7 @@ const unsigned char unilbrk_table[41][41] = /* QU1 */ { P, I, I, I, I, I, P, P, P, P, I, I, I, I, I, I, P, P, I, I, I, P, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, }, /* QU2 */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, }, /* QU3 */ { P, I, D, I, D, I, P, P, P, P, I, I, D, D, I, D, P, P, D, D, D, P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, }, -/* IS */ { P, I, D, I, D, I, P, P, P, P, I, I, D, D, I, D, P, P, I, D, D, P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, }, +/* IS */ { P, I, D, I, D, I, P, P, P, P, I, I, D, D, I, D, P, P, I, D, D, P, I, I, D, D, D, D, D, D, D, I, D, D, D, D, D, D, I, D, D, }, /* NU */ { P, I, D, I, D, I, P, P, P, P, I, I, I, D, I, D, P, P, D, D, D, P, I, I, D, D, D, D, D, D, D, I, D, D, D, D, D, D, I, D, D, }, /* PO */ { P, I, D, I, D, I, P, P, P, P, I, I, D, D, I, D, P, P, I, D, D, P, I, I, D, D, D, D, D, D, D, I, D, D, D, D, D, D, I, D, D, }, /* PR */ { P, I, D, I, D, I, P, P, P, P, I, I, D, D, I, D, P, P, I, D, D, P, I, I, I, I, I, I, I, I, I, I, D, D, D, D, D, D, I, I, I, }, diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c index dde1aa6064..bf0f31057f 100644 --- a/lib/unilbrk/u16-possible-linebreaks.c +++ b/lib/unilbrk/u16-possible-linebreaks.c @@ -290,6 +290,14 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, Brahmic scripts, line 4. */ *p = UC_BREAK_PROHIBITED; } + else if (last_prop == LBP_IS && uc == 0x003C) + { + /* Partially disable (LB29) Do not break between numeric + punctuation and alphabetics ("e.g."). We find it + desirable to break before the HTML tag "</P>" in + strings like "<P>Some sentence.</P>". 
*/ + *p = UC_BREAK_POSSIBLE; + } else if (last_prop == LBP_RI && prop == LBP_RI) { /* (LB30a) Break between two regional indicator symbols diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c index 9cdda2868d..a7630467a6 100644 --- a/lib/unilbrk/u32-possible-linebreaks.c +++ b/lib/unilbrk/u32-possible-linebreaks.c @@ -283,6 +283,14 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding, Brahmic scripts, line 4. */ *p = UC_BREAK_PROHIBITED; } + else if (last_prop == LBP_IS && uc == 0x003C) + { + /* Partially disable (LB29) Do not break between numeric + punctuation and alphabetics ("e.g."). We find it + desirable to break before the HTML tag "</P>" in + strings like "<P>Some sentence.</P>". */ + *p = UC_BREAK_POSSIBLE; + } else if (last_prop == LBP_RI && prop == LBP_RI) { /* (LB30a) Break between two regional indicator symbols diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c index 6c26a8c26f..7abf20beed 100644 --- a/lib/unilbrk/u8-possible-linebreaks.c +++ b/lib/unilbrk/u8-possible-linebreaks.c @@ -290,6 +290,14 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding, Brahmic scripts, line 4. */ *p = UC_BREAK_PROHIBITED; } + else if (last_prop == LBP_IS && uc == 0x003C) + { + /* Partially disable (LB29) Do not break between numeric + punctuation and alphabetics ("e.g."). We find it + desirable to break before the HTML tag "</P>" in + strings like "<P>Some sentence.</P>". */ + *p = UC_BREAK_POSSIBLE; + } else if (last_prop == LBP_RI && prop == LBP_RI) { /* (LB30a) Break between two regional indicator symbols