i18npool/qa/cppunit/test_breakiterator.cxx       |  105 ++++++++++++++++++-----
 i18npool/source/breakiterator/data/dict_word.txt |    5 -
 i18npool/source/breakiterator/data/edit_word.txt |    5 -
 3 files changed, 94 insertions(+), 21 deletions(-)

New commits:
commit 174aa6e980f973cea9b1c402d03bd6dba951f5ae
Author:     Jonathan Clark <[email protected]>
AuthorDate: Mon Jul 15 15:30:16 2024 -0600
Commit:     Jonathan Clark <[email protected]>
CommitDate: Tue Jul 16 02:17:54 2024 +0200

    tdf#46950 Allow intra-word right double quotation mark
    
    Hebrew text may use the character RIGHT DOUBLE QUOTATION MARK as a
    substitute for HEBREW PUNCTUATION GERSHAYIM. This change customizes the
    ICU word BreakIterator rules to that end.
    
    Change-Id: I03a48729de103505a2f68f9a1635c0f0cd7d126a
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170536
    Reviewed-by: Jonathan Clark <[email protected]>
    Tested-by: Jenkins

diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 7e9f47ad22f1..baf1d47603c7 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -48,7 +48,7 @@ public:
     void testLegacyDictWordPrepostDash_nds_DE();
     void testLegacyDictWordPrepostDash_nl_NL();
     void testLegacyDictWordPrepostDash_sv_SE();
-    void testLegacyHebrewQuoteInsideWord();
+    void testHebrewGereshGershaim();
     void testLegacySurrogatePairs();
     void testWordCount();
 
@@ -71,7 +71,7 @@ public:
     CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE);
     CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL);
     CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE);
-    CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord);
+    CPPUNIT_TEST(testHebrewGereshGershaim);
     CPPUNIT_TEST(testLegacySurrogatePairs);
     CPPUNIT_TEST(testWordCount);
     CPPUNIT_TEST_SUITE_END();
@@ -1708,41 +1708,108 @@ void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE()
     }
 }
 
-void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
+void TestBreakIterator::testHebrewGereshGershaim()
 {
+    // In Hebrew documents, there are multiple valid ways to represent the geresh and gershaim
+    // intra-word punctuation marks. This test exhaustively exercises them.
+    //
+    // See the following bugs:
+    // i#51661: Add quotation mark as middle letter for Hebrew
+    // tdf#46950: Spell-checking breaks Hebrew words at intra-word single and double quotes
+
     lang::Locale aLocale;
 
     aLocale.Language = "he";
     aLocale.Country = "IL";
 
-    // i#51661: Add quotation mark as middle letter for Hebrew
+    // Unicode U+05F3 HEBREW PUNCTUATION GERESH
     {
-        auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
+        auto aTest = u"ג׳ירפה"_ustr;
 
-        i18n::Boundary aBounds
+        auto aBounds
             = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
 
-        aBounds
-            = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
+        aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
+                                            i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
     }
 
-    // i#51661: Add quotation mark as middle letter for Hebrew
+    // Apostrophe as geresh
     {
-        auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
+        auto aTest = u"ג'ירפה"_ustr;
 
-        i18n::Boundary aBounds = m_xBreak->getWordBoundary(
-            aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+        auto aBounds
+            = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
 
-        aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale,
+        aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+    }
+
+    // Right single quote as geresh
+    {
+        auto aTest = u"ג’ירפה"_ustr;
+
+        auto aBounds
+            = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
+                                            i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+    }
+
+    // Unicode U+05F4 HEBREW PUNCTUATION GERSHAYIM
+    {
+        auto aTest = u"דו״ח"_ustr;
+
+        auto aBounds
+            = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
+                                            i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+    }
+
+    // Double quote as gershayim
+    {
+        auto aTest = u"דו\"ח"_ustr;
+
+        auto aBounds
+            = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
+                                            i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+    }
+
+    // Right double quote as gershayim
+    {
+        auto aTest = u"דו”ח"_ustr;
+
+        auto aBounds
+            = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
+                                            i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
     }
 }
 
diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt
index deeec7dd659e..4a09af5cf1b2 100644
--- a/i18npool/source/breakiterator/data/dict_word.txt
+++ b/i18npool/source/breakiterator/data/dict_word.txt
@@ -50,7 +50,6 @@ $Katakana           = [\p{Word_Break = Katakana}];
 $Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
 $ALetter            = [\p{Word_Break = ALetter}];
 $Single_Quote       = [\p{Word_Break = Single_Quote}];
-$Double_Quote       = [\p{Word_Break = Double_Quote}];
 $MidNumLet          = [\p{Word_Break = MidNumLet}];
 $MidNum             = [\p{Word_Break = MidNum}];
 $Numeric            = [\p{Word_Break = Numeric}];
@@ -74,6 +73,10 @@ $ExcludedML         = [[:name = COLON:]
 # $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
 
+### tdf#46950: Right double-quotes are also used as substitutes for Hebrew gershaim
+# $Double_Quote       = [\p{Word_Break = Double_Quote}];
+$Double_Quote       = [[\p{Word_Break = Double_Quote}][:name= RIGHT DOUBLE QUOTATION MARK:]];
+
 ### END CUSTOMIZATION
 
 $Hiragana           = [:Hiragana:];
diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt
index 1e3bcd15b20d..8db21d9b281a 100644
--- a/i18npool/source/breakiterator/data/edit_word.txt
+++ b/i18npool/source/breakiterator/data/edit_word.txt
@@ -50,7 +50,6 @@ $Katakana           = [\p{Word_Break = Katakana}];
 $Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
 $ALetter            = [\p{Word_Break = ALetter}];
 $Single_Quote       = [\p{Word_Break = Single_Quote}];
-$Double_Quote       = [\p{Word_Break = Double_Quote}];
 $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidNum             = [\p{Word_Break = MidNum}];
 $Numeric            = [\p{Word_Break = Numeric}];
@@ -67,6 +66,10 @@ $MidNumLet          = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
 # $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]-[:name = NARROW NO-BREAK SPACE:]];
 
+### tdf#46950: Right double-quotes are also used as substitutes for Hebrew gershaim
+# $Double_Quote       = [\p{Word_Break = Double_Quote}];
+$Double_Quote       = [[\p{Word_Break = Double_Quote}][:name= RIGHT DOUBLE QUOTATION MARK:]];
+
 ### END CUSTOMIZATION
 
 $Hiragana           = [:Hiragana:];

Reply via email to