unilbrk: Enable most cases of Unicode rule (LB29)

Bruno Haible Sun, 15 Sep 2024 06:01:50 -0700

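This patch enables Unicode rule (LB29). I had previously disabled it
because of its effect in one special case: it prohibits a line break
before the HTML tag "</P>" in strings like "<P>Some sentence.</P>".
With this patch, only that special case (a character with property IS
followed by '<') is exempted from the rule.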



2024-09-15  Bruno Haible  <br...@clisp.org>

        unilbrk: Enable most cases of Unicode rule (LB29).
        * lib/gen-uni-tables.c (output_lbrk_rules_as_tables): Enable rule LB29.
        * lib/unilbrk/lbrktables.c: Regenerated.
        * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks_loop):
        Allow a break between a character with property IS and '<'.
        * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks_loop):
        Likewise.
        * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks_loop):
        Likewise.

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index 6ac81d0821..f95a78dbf1 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -9273,12 +9273,8 @@ output_lbrk_rules_as_tables (const char *filename, const char *version)
 
   /* (LB29) Do not break between numeric punctuation and alphabetics
      ("e.g.").  */
-  /* We don't implement this rule, because we find it desirable to break before
-     the HTML tag "</P>" in strings like "<P>Some sentence.</P>".  */
-#if 0
   before = LBP_IS; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
   before = LBP_IS; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
-#endif
 
   /* (LB28a) Do not break inside the orthographic syllables of Brahmic
      scripts.  */
diff --git a/lib/unilbrk/lbrktables.c b/lib/unilbrk/lbrktables.c
index 217933a526..c09f5855a7 100644
--- a/lib/unilbrk/lbrktables.c
+++ b/lib/unilbrk/lbrktables.c
@@ -54,7 +54,7 @@ const unsigned char unilbrk_table[41][41] =
 /* QU1 */ {  P,  I,  I,  I,  I,  I,  P,  P,  P,  P,  I,  I,  I,  I,  I,  I,  P,  P,  I,  I,  I,  P,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I, },
 /* QU2 */ {  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P,  P, },
 /* QU3 */ {  P,  I,  D,  I,  D,  I,  P,  P,  P,  P,  I,  I,  D,  D,  I,  D,  P,  P,  D,  D,  D,  P,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D, },
-/*  IS */ {  P,  I,  D,  I,  D,  I,  P,  P,  P,  P,  I,  I,  D,  D,  I,  D,  P,  P,  I,  D,  D,  P,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D,  D, },
+/*  IS */ {  P,  I,  D,  I,  D,  I,  P,  P,  P,  P,  I,  I,  D,  D,  I,  D,  P,  P,  I,  D,  D,  P,  I,  I,  D,  D,  D,  D,  D,  D,  D,  I,  D,  D,  D,  D,  D,  D,  I,  D,  D, },
 /*  NU */ {  P,  I,  D,  I,  D,  I,  P,  P,  P,  P,  I,  I,  I,  D,  I,  D,  P,  P,  D,  D,  D,  P,  I,  I,  D,  D,  D,  D,  D,  D,  D,  I,  D,  D,  D,  D,  D,  D,  I,  D,  D, },
 /*  PO */ {  P,  I,  D,  I,  D,  I,  P,  P,  P,  P,  I,  I,  D,  D,  I,  D,  P,  P,  I,  D,  D,  P,  I,  I,  D,  D,  D,  D,  D,  D,  D,  I,  D,  D,  D,  D,  D,  D,  I,  D,  D, },
 /*  PR */ {  P,  I,  D,  I,  D,  I,  P,  P,  P,  P,  I,  I,  D,  D,  I,  D,  P,  P,  I,  D,  D,  P,  I,  I,  I,  I,  I,  I,  I,  I,  I,  I,  D,  D,  D,  D,  D,  D,  I,  I,  I, },
diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c
index dde1aa6064..bf0f31057f 100644
--- a/lib/unilbrk/u16-possible-linebreaks.c
+++ b/lib/unilbrk/u16-possible-linebreaks.c
@@ -290,6 +290,14 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
                          Brahmic scripts, line 4.  */
                       *p = UC_BREAK_PROHIBITED;
                     }
+                  else if (last_prop == LBP_IS && uc == 0x003C)
+                    {
+                      /* Partially disable (LB29) Do not break between numeric
+                         punctuation and alphabetics ("e.g.").  We find it
+                         desirable to break before the HTML tag "</P>" in
+                         strings like "<P>Some sentence.</P>".  */
+                      *p = UC_BREAK_POSSIBLE;
+                    }
                   else if (last_prop == LBP_RI && prop == LBP_RI)
                     {
                       /* (LB30a) Break between two regional indicator symbols
diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c
index 9cdda2868d..a7630467a6 100644
--- a/lib/unilbrk/u32-possible-linebreaks.c
+++ b/lib/unilbrk/u32-possible-linebreaks.c
@@ -283,6 +283,14 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
                          Brahmic scripts, line 4.  */
                       *p = UC_BREAK_PROHIBITED;
                     }
+                  else if (last_prop == LBP_IS && uc == 0x003C)
+                    {
+                      /* Partially disable (LB29) Do not break between numeric
+                         punctuation and alphabetics ("e.g.").  We find it
+                         desirable to break before the HTML tag "</P>" in
+                         strings like "<P>Some sentence.</P>".  */
+                      *p = UC_BREAK_POSSIBLE;
+                    }
                   else if (last_prop == LBP_RI && prop == LBP_RI)
                     {
                       /* (LB30a) Break between two regional indicator symbols
diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c
index 6c26a8c26f..7abf20beed 100644
--- a/lib/unilbrk/u8-possible-linebreaks.c
+++ b/lib/unilbrk/u8-possible-linebreaks.c
@@ -290,6 +290,14 @@ u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
                          Brahmic scripts, line 4.  */
                       *p = UC_BREAK_PROHIBITED;
                     }
+                  else if (last_prop == LBP_IS && uc == 0x003C)
+                    {
+                      /* Partially disable (LB29) Do not break between numeric
+                         punctuation and alphabetics ("e.g.").  We find it
+                         desirable to break before the HTML tag "</P>" in
+                         strings like "<P>Some sentence.</P>".  */
+                      *p = UC_BREAK_POSSIBLE;
+                    }
                   else if (last_prop == LBP_RI && prop == LBP_RI)
                     {
                       /* (LB30a) Break between two regional indicator symbols

unilbrk: Enable most cases of Unicode rule (LB29)

Reply via email to