i18npool/qa/cppunit/test_breakiterator.cxx | 105 ++++++++++++++++++----- i18npool/source/breakiterator/data/dict_word.txt | 5 - i18npool/source/breakiterator/data/edit_word.txt | 5 - 3 files changed, 94 insertions(+), 21 deletions(-)
New commits: commit 174aa6e980f973cea9b1c402d03bd6dba951f5ae Author: Jonathan Clark <[email protected]> AuthorDate: Mon Jul 15 15:30:16 2024 -0600 Commit: Jonathan Clark <[email protected]> CommitDate: Tue Jul 16 02:17:54 2024 +0200 tdf#46950 Allow intra-word right double quotation mark Hebrew text may use the character RIGHT DOUBLE QUOTATION MARK as a substitute for HEBREW PUNCTUATION GERSHAYIM. This change customizes the ICU word BreakIterator rules to that end. Change-Id: I03a48729de103505a2f68f9a1635c0f0cd7d126a Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170536 Reviewed-by: Jonathan Clark <[email protected]> Tested-by: Jenkins diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 7e9f47ad22f1..baf1d47603c7 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -48,7 +48,7 @@ public: void testLegacyDictWordPrepostDash_nds_DE(); void testLegacyDictWordPrepostDash_nl_NL(); void testLegacyDictWordPrepostDash_sv_SE(); - void testLegacyHebrewQuoteInsideWord(); + void testHebrewGereshGershaim(); void testLegacySurrogatePairs(); void testWordCount(); @@ -71,7 +71,7 @@ public: CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE); CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL); CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE); - CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord); + CPPUNIT_TEST(testHebrewGereshGershaim); CPPUNIT_TEST(testLegacySurrogatePairs); CPPUNIT_TEST(testWordCount); CPPUNIT_TEST_SUITE_END(); @@ -1708,41 +1708,108 @@ void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE() } } -void TestBreakIterator::testLegacyHebrewQuoteInsideWord() +void TestBreakIterator::testHebrewGereshGershaim() { + // In Hebrew documents, there are multiple valid ways to represent the geresh and gershaim + // intra-word punctuation marks. This test exhaustively exercises them. + // + // See the following bugs: + // i#51661: Add quotation mark as middle letter for Hebrew + // tdf#46950: Spell-checking breaks Hebrew words at intra-word single and double quotes + lang::Locale aLocale; aLocale.Language = "he"; aLocale.Country = "IL"; - // i#51661: Add quotation mark as middle letter for Hebrew + // Unicode U+05F3 HEBREW PUNCTUATION GERESH { - auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; + auto aTest = u"ג׳ירפה"_ustr; - i18n::Boundary aBounds + auto aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - aBounds - = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } - // i#51661: Add quotation mark as middle letter for Hebrew + // Apostrophe as geresh { - auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; + auto aTest = u"ג'ירפה"_ustr; - i18n::Boundary aBounds = m_xBreak->getWordBoundary( - aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + auto aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, + aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } + + // Right single quote as geresh + { + auto aTest = u"ג’ירפה"_ustr; + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } + + // Unicode U+05F4 HEBREW PUNCTUATION GERSHAYIM + { + auto aTest = u"דו״ח"_ustr; + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + } + + // Double quote as gershayim + { + auto aTest = u"דו\"ח"_ustr; + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + } + + // Right double quote as gershayim + { + auto aTest = u"דו”ח"_ustr; + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); } } diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt index deeec7dd659e..4a09af5cf1b2 100644 --- a/i18npool/source/breakiterator/data/dict_word.txt +++ b/i18npool/source/breakiterator/data/dict_word.txt @@ -50,7 +50,6 @@ $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; $ALetter = [\p{Word_Break = ALetter}]; $Single_Quote = [\p{Word_Break = Single_Quote}]; -$Double_Quote = [\p{Word_Break = Double_Quote}]; $MidNumLet = [\p{Word_Break = MidNumLet}]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}]; @@ -74,6 +73,10 @@ $ExcludedML = [[:name = COLON:] # $MidLetter = [\p{Word_Break = MidLetter}]; $MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; +### tdf#46950: Right double-quotes are also used as substitutes for Hebrew gershaim +# $Double_Quote = [\p{Word_Break = Double_Quote}]; +$Double_Quote = [[\p{Word_Break = Double_Quote}][:name= RIGHT DOUBLE QUOTATION MARK:]]; + ### END CUSTOMIZATION $Hiragana = [:Hiragana:]; diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt index 1e3bcd15b20d..8db21d9b281a 100644 --- a/i18npool/source/breakiterator/data/edit_word.txt +++ b/i18npool/source/breakiterator/data/edit_word.txt @@ -50,7 +50,6 @@ $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; $ALetter = [\p{Word_Break = ALetter}]; $Single_Quote = [\p{Word_Break = Single_Quote}]; -$Double_Quote = [\p{Word_Break = Double_Quote}]; $MidLetter = [\p{Word_Break = MidLetter}]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}]; @@ -67,6 +66,10 @@ $MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; # $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]-[:name = NARROW NO-BREAK SPACE:]]; +### tdf#46950: Right double-quotes are also used as substitutes for Hebrew gershaim +# $Double_Quote = [\p{Word_Break = Double_Quote}]; +$Double_Quote = [[\p{Word_Break = Double_Quote}][:name= RIGHT DOUBLE QUOTATION MARK:]]; + ### END CUSTOMIZATION $Hiragana = [:Hiragana:];
