editeng/source/editeng/impedit4.cxx                     |    3 
 i18npool/qa/cppunit/test_breakiterator.cxx              |   98 +++++++++-------
 i18npool/source/breakiterator/breakiterator_unicode.cxx |    2 
 sw/qa/core/uwriter.cxx                                  |   42 ++++++
 sw/source/core/txtnode/txtedt.cxx                       |   67 +++++++---
 sw/uiconfig/swriter/ui/wordcount-mobile.ui              |    4 
 sw/uiconfig/swriter/ui/wordcount.ui                     |    2 
 7 files changed, 153 insertions(+), 65 deletions(-)

New commits:
commit aa938fe03cfb968f7f4ed9760dcbe579a74bdc02
Author:     Jonathan Clark <[email protected]>
AuthorDate: Tue Jul 16 16:50:10 2024 -0600
Commit:     Jonathan Clark <[email protected]>
CommitDate: Wed Jul 17 15:45:41 2024 +0200

    tdf#150621 Changed Korean word counting to use words
    
    Previously, Writer counted characters for all CJK languages, rather than
    words. This is the correct behavior for Chinese and Japanese, which make
    extensive use of ideographs. However, it is not correct for Korean.
    
    This change adjusts the Writer word count algorithm to count Korean
    words, rather than Korean characters.
    
    Change-Id: I6e77136867baca1a7b51248886ee5fd7073ad364
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170621
    Tested-by: Jenkins
    Reviewed-by: Jonathan Clark <[email protected]>

diff --git a/editeng/source/editeng/impedit4.cxx 
b/editeng/source/editeng/impedit4.cxx
index 35070a4dd740..e1f655c1cc89 100644
--- a/editeng/source/editeng/impedit4.cxx
+++ b/editeng/source/editeng/impedit4.cxx
@@ -2904,7 +2904,8 @@ EditSelection ImpEditEngine::TransliterateText( const 
EditSelection& rSelection,
             }
 
             i18n::Boundary aCurWordBndry( aSttBndry );
-            while (aCurWordBndry.endPos && aCurWordBndry.startPos <= 
aEndBndry.startPos)
+            while (aCurWordBndry.startPos != aCurWordBndry.endPos
+                   && aCurWordBndry.startPos <= aEndBndry.startPos)
             {
                 nCurrentStart = aCurWordBndry.startPos;
                 nCurrentEnd   = aCurWordBndry.endPos;
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx 
b/i18npool/qa/cppunit/test_breakiterator.cxx
index 3b4ae3efea3d..84ae6d5fe72b 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -1867,33 +1867,32 @@ void TestBreakIterator::testLegacySurrogatePairs()
 
 void TestBreakIterator::testWordCount()
 {
-    auto count_words_fn = [&](const OUString& str, const lang::Locale& 
aLocale) -> int
+    auto fnCountWords = [&](const OUString& aStr, const lang::Locale& aLocale) 
-> int
     {
-        int num_words = 0;
-        sal_Int32 next_pos = 0;
-        int iter_guard = 0;
+        int nWords = 0;
+        sal_Int32 nNextPos = 0;
+        int nIterGuard = 0;
 
-        if (m_xBreak->isBeginWord(str, next_pos, aLocale, 
i18n::WordType::WORD_COUNT))
+        if (m_xBreak->isBeginWord(aStr, nNextPos, aLocale, 
i18n::WordType::WORD_COUNT))
         {
-            ++num_words;
+            ++nWords;
         }
 
         while (true)
         {
-            CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard 
< 100);
+            CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++nIterGuard 
< 100);
 
-            auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, 
i18n::WordType::WORD_COUNT);
-
-            if (aBounds.endPos < next_pos || aBounds.startPos == 
aBounds.endPos)
+            auto aBounds = m_xBreak->nextWord(aStr, nNextPos, aLocale, 
i18n::WordType::WORD_COUNT);
+            if (aBounds.endPos == aBounds.startPos)
             {
                 break;
             }
 
-            next_pos = aBounds.endPos;
-            ++num_words;
+            nNextPos = aBounds.endPos;
+            ++nWords;
         }
 
-        return num_words;
+        return nWords;
     };
 
     // i#80815: "Word count differs from MS Word"
@@ -1903,29 +1902,29 @@ void TestBreakIterator::testWordCount()
         aLocale.Language = "en";
         aLocale.Country = "US";
 
-        const OUString str = u""
-                             "test data for word count issue #80815\n"
-                             "fo\\'sforos\n"
-                             "archipi\\'elago\n"
-                             "do\^me\n"
-                             "f**k\n"
-                             "\n"
-                             "battery-driven\n"
-                             "and/or\n"
-                             "apple(s)\n"
-                             "money+opportunity\n"
-                             "Micro$oft\n"
-                             "\n"
-                             "300$\n"
-                             "I(not you)\n"
-                             "a****n\n"
-                             "1+3=4\n"
-                             "\n"
-                             "aaaaaaa.aaaaaaa\n"
-                             "aaaaaaa,aaaaaaa\n"
-                             "aaaaaaa;aaaaaaa\n"_ustr;
-
-        CPPUNIT_ASSERT_EQUAL(24, count_words_fn(str, aLocale));
+        const OUString aStr = u""
+                              "test data for word count issue #80815\n"
+                              "fo\\'sforos\n"
+                              "archipi\\'elago\n"
+                              "do\^me\n"
+                              "f**k\n"
+                              "\n"
+                              "battery-driven\n"
+                              "and/or\n"
+                              "apple(s)\n"
+                              "money+opportunity\n"
+                              "Micro$oft\n"
+                              "\n"
+                              "300$\n"
+                              "I(not you)\n"
+                              "a****n\n"
+                              "1+3=4\n"
+                              "\n"
+                              "aaaaaaa.aaaaaaa\n"
+                              "aaaaaaa,aaaaaaa\n"
+                              "aaaaaaa;aaaaaaa\n"_ustr;
+
+        CPPUNIT_ASSERT_EQUAL(24, fnCountWords(aStr, aLocale));
     }
 
     // Test that the switch to upstream ICU for CJ word boundary analysis 
doesn't change word count.
@@ -1934,9 +1933,32 @@ void TestBreakIterator::testWordCount()
         aLocale.Language = "ja";
         aLocale.Country = "JP";
 
-        const OUString str = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
+        const OUString aStr = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
+
+        CPPUNIT_ASSERT_EQUAL(7, fnCountWords(aStr, aLocale));
+    }
+
+    // tdf#150621 Korean words should be counted individually, rather than by 
syllable.
+    //
+    // Per i#80815, the intention for the word count feature is to emulate the 
behavior of MS Word.
+    {
+        lang::Locale aLocale;
+        aLocale.Language = "ko";
+        aLocale.Country = "KR";
 
-        CPPUNIT_ASSERT_EQUAL(7, count_words_fn(str, aLocale));
+        // Basic case: Korean words are counted as space-delimited. In 
particular, grammatical
+        // particles are treated as part of the previous word.
+        CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"저는 영화를 봤어요"_ustr, aLocale));
+
+        // Mixed script: Korean is mostly written in hangul, but hanja are 
still used in certain
+        // situations (e.g. abbreviations in newspaper articles). For Chinese 
and Japanese, such
+        // ideographs would be counted individually as words. In Korean, 
however, they are treated
+        // no differently than hangul characters.
+        CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"불렀다...與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"불렀다 ...與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"불렀다 ... 與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"尹탄핵"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"尹 탄핵"_ustr, aLocale));
     }
 }
 
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx 
b/i18npool/source/breakiterator/breakiterator_unicode.cxx
index 9b47c433f296..6406158f558f 100644
--- a/i18npool/source/breakiterator/breakiterator_unicode.cxx
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -364,7 +364,7 @@ Boundary SAL_CALL BreakIterator_Unicode::nextWord( const 
OUString& Text, sal_Int
     Boundary rv;
     rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
     if( rv.startPos >= Text.getLength() || rv.startPos == 
icu::BreakIterator::DONE )
-        rv.endPos = result.startPos;
+        rv.endPos = rv.startPos;
     else {
         if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
              && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
diff --git a/sw/qa/core/uwriter.cxx b/sw/qa/core/uwriter.cxx
index 436e4c1c934a..d1549a8ae966 100644
--- a/sw/qa/core/uwriter.cxx
+++ b/sw/qa/core/uwriter.cxx
@@ -654,7 +654,7 @@ void SwDocTest::testSwScanner()
         pTextNode = aPaM.GetPointNode().GetTextNode();
         pTextNode->CountWords(aDocStat, 0, test.getLength());
         CPPUNIT_ASSERT_EQUAL_MESSAGE("words", static_cast<sal_uLong>(58), 
aDocStat.nWord);
-        CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean syllables", 
static_cast<sal_uLong>(43), aDocStat.nAsianWord);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean words", 
static_cast<sal_uLong>(43), aDocStat.nAsianWord);
         CPPUNIT_ASSERT_EQUAL_MESSAGE("non-whitespace chars", 
static_cast<sal_uLong>(105), aDocStat.nCharExcludingSpaces);
         CPPUNIT_ASSERT_EQUAL_MESSAGE("characters", 
static_cast<sal_uLong>(128), aDocStat.nChar);
     }
@@ -929,6 +929,46 @@ void SwDocTest::testSwScanner()
         CPPUNIT_ASSERT_EQUAL(sal_uLong(17), aDocStat.nChar);
         aDocStat.Reset();
     }
+
+    // tdf#150621 Korean words should be counted individually, rather than by 
syllable.
+    //
+    // Per i#80815, the intention for the word count feature is to emulate the 
behavior of MS Word.
+    {
+        auto fnAssertWords = [&](const OUString& aStr, sal_uLong nWords, 
sal_uLong nAsianWords)
+        {
+            
m_pDoc->getIDocumentContentOperations().AppendTextNode(*aPaM.GetPoint());
+
+            SvxLanguageItem aCJKLangItem(LANGUAGE_KOREAN, 
RES_CHRATR_CJK_LANGUAGE);
+            SvxLanguageItem aWestLangItem(LANGUAGE_ENGLISH_US, 
RES_CHRATR_LANGUAGE);
+            m_pDoc->getIDocumentContentOperations().InsertPoolItem(aPaM, 
aCJKLangItem);
+            m_pDoc->getIDocumentContentOperations().InsertPoolItem(aPaM, 
aWestLangItem);
+
+            m_pDoc->getIDocumentContentOperations().InsertString(aPaM, aStr);
+
+            SwDocStat aDocStat;
+            pTextNode = aPaM.GetPointNode().GetTextNode();
+            pTextNode->CountWords(aDocStat, 0, aStr.getLength());
+            CPPUNIT_ASSERT_EQUAL_MESSAGE("words", nWords, aDocStat.nWord);
+            CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean words", 
nAsianWords,
+                                         aDocStat.nAsianWord);
+        };
+
+        // Basic case: Korean words are counted as space-delimited. In 
particular, grammatical
+        // particles are treated as part of the previous word.
+        fnAssertWords(u"저는 영화를 봤어요"_ustr, 3, 3);
+
+        // Mixed script: Korean is mostly written in hangul, but hanja are 
still used in certain
+        // situations (e.g. abbreviations in newspaper articles). For Chinese 
and Japanese, such
+        // ideographs would be counted individually as words. In Korean, 
however, they are treated
+        // no differently than hangul characters.
+        fnAssertWords(u"尹탄핵"_ustr, 1, 1);
+        fnAssertWords(u"尹 탄핵"_ustr, 2, 2);
+
+        // These mixed-script results are anomalous, but reflect the behavior 
of MSW.
+        fnAssertWords(u"불렀다...與"_ustr, 1, 1);
+        fnAssertWords(u"불렀다 ...與"_ustr, 2, 1);
+        fnAssertWords(u"불렀다 ... 與"_ustr, 3, 2);
+    }
 }
 
 void SwDocTest::testMergePortionsDeleteNotSorted()
diff --git a/sw/source/core/txtnode/txtedt.cxx 
b/sw/source/core/txtnode/txtedt.cxx
index a1bfb0c0f3c1..24de64a8f9e0 100644
--- a/sw/source/core/txtnode/txtedt.cxx
+++ b/sw/source/core/txtnode/txtedt.cxx
@@ -793,40 +793,62 @@ 
SwScanner::SwScanner(std::function<LanguageType(sal_Int32, sal_Int32, bool)> aGe
 
 namespace
 {
-    //fdo#45271 for Asian words count characters instead of words
-    sal_Int32 forceEachAsianCodePointToWord(const OUString &rText, sal_Int32 
nBegin, sal_Int32 nLen)
+// tdf#45271 For Chinese and Japanese, count characters instead of words
+sal_Int32
+forceEachCJCodePointToWord(const OUString& rText, sal_Int32 nBegin, sal_Int32 
nLen,
+                           const ModelToViewHelper* pModelToView,
+                           std::function<LanguageType(sal_Int32, sal_Int32, 
bool)>& fnGetLangOfChar)
+{
+    if (nLen > 1)
     {
-        if (nLen > 1)
-        {
-            const uno::Reference< XBreakIterator > &rxBreak = 
g_pBreakIt->GetBreakIter();
+        const uno::Reference<XBreakIterator>& rxBreak = 
g_pBreakIt->GetBreakIter();
 
-            sal_uInt16 nCurrScript = rxBreak->getScriptType( rText, nBegin );
+        sal_uInt16 nCurrScript = rxBreak->getScriptType(rText, nBegin);
 
-            sal_Int32 indexUtf16 = nBegin;
-            rText.iterateCodePoints(&indexUtf16);
+        sal_Int32 indexUtf16 = nBegin;
+        rText.iterateCodePoints(&indexUtf16);
 
-            //First character is Asian, consider it a word :-(
-            if (nCurrScript == i18n::ScriptType::ASIAN)
+        // First character is Asian
+        if (nCurrScript == i18n::ScriptType::ASIAN)
+        {
+            auto aModelBeginPos = pModelToView->ConvertToModelPosition(nBegin);
+            auto aCurrentLang = fnGetLangOfChar(aModelBeginPos.mnPos, 
nCurrScript, false);
+
+            // tdf#150621 Korean words must be counted as-is
+            if (primary(aCurrentLang) == primary(LANGUAGE_KOREAN))
             {
-                nLen = indexUtf16 - nBegin;
                 return nLen;
             }
 
-            //First character was not Asian, consider appearance of any Asian 
character
-            //to be the end of the word
-            while (indexUtf16 < nBegin + nLen)
+            // Word is Chinese or Japanese, and must be truncated to a single 
character
+            return indexUtf16 - nBegin;
+        }
+
+        // First character was not Asian, consider appearance of any Asian 
character
+        // to be the end of the word
+        while (indexUtf16 < nBegin + nLen)
+        {
+            nCurrScript = rxBreak->getScriptType(rText, indexUtf16);
+            if (nCurrScript == i18n::ScriptType::ASIAN)
             {
-                nCurrScript = rxBreak->getScriptType( rText, indexUtf16 );
-                if (nCurrScript == i18n::ScriptType::ASIAN)
+                auto aModelBeginPos = 
pModelToView->ConvertToModelPosition(indexUtf16);
+                auto aCurrentLang = fnGetLangOfChar(aModelBeginPos.mnPos, 
nCurrScript, false);
+
+                // tdf#150621 Korean words must be counted as-is.
+                // Note that script changes intentionally do not delimit words 
for counting.
+                if (primary(aCurrentLang) == primary(LANGUAGE_KOREAN))
                 {
-                    nLen = indexUtf16 - nBegin;
                     return nLen;
                 }
-                rText.iterateCodePoints(&indexUtf16);
+
+                // Word tail contains Chinese or Japanese, and must be 
truncated
+                return indexUtf16 - nBegin;
             }
+            rText.iterateCodePoints(&indexUtf16);
         }
-        return nLen;
     }
+    return nLen;
+}
 }
 
 bool SwScanner::NextWord()
@@ -959,8 +981,11 @@ bool SwScanner::NextWord()
     if( ! m_nLength )
         return false;
 
-    if ( m_nWordType == i18n::WordType::WORD_COUNT )
-        m_nLength = forceEachAsianCodePointToWord(m_aText, m_nBegin, 
m_nLength);
+    if (m_nWordType == i18n::WordType::WORD_COUNT)
+    {
+        m_nLength = forceEachCJCodePointToWord(m_aText, m_nBegin, m_nLength, 
&m_ModelToView,
+                                               m_pGetLangOfChar);
+    }
 
     m_aPrevWord = m_aWord;
     m_aWord = m_aPreDashReplacementText.copy( m_nBegin, m_nLength );
diff --git a/sw/uiconfig/swriter/ui/wordcount-mobile.ui 
b/sw/uiconfig/swriter/ui/wordcount-mobile.ui
index a1bb1dfdf70b..de6ef6f5e9cd 100644
--- a/sw/uiconfig/swriter/ui/wordcount-mobile.ui
+++ b/sw/uiconfig/swriter/ui/wordcount-mobile.ui
@@ -202,7 +202,7 @@
                   <object class="GtkLabel" id="cjkcharsft">
                     <property name="can-focus">False</property>
                     <property name="no-show-all">True</property>
-                    <property name="label" translatable="yes" 
context="wordcount-mobile|cjkcharsft">Asian characters and Korean 
syllables</property>
+                    <property name="label" translatable="yes" 
context="wordcount-mobile|cjkcharsft">Asian characters and Korean 
words</property>
                     <property name="xalign">1</property>
                   </object>
                   <packing>
@@ -274,7 +274,7 @@
                   <object class="GtkLabel" id="cjkcharsft2">
                     <property name="can-focus">False</property>
                     <property name="no-show-all">True</property>
-                    <property name="label" translatable="yes" 
context="wordcount-mobile|cjkcharsft">Asian characters and Korean 
syllables</property>
+                    <property name="label" translatable="yes" 
context="wordcount-mobile|cjkcharsft">Asian characters and Korean 
words</property>
                     <property name="xalign">1</property>
                   </object>
                   <packing>
diff --git a/sw/uiconfig/swriter/ui/wordcount.ui 
b/sw/uiconfig/swriter/ui/wordcount.ui
index c42d8fbb30ad..5f11d31e0b7b 100644
--- a/sw/uiconfig/swriter/ui/wordcount.ui
+++ b/sw/uiconfig/swriter/ui/wordcount.ui
@@ -229,7 +229,7 @@
               <object class="GtkLabel" id="cjkcharsft">
                 <property name="can-focus">False</property>
                 <property name="no-show-all">True</property>
-                <property name="label" translatable="yes" 
context="wordcount|cjkcharsft">Asian characters and Korean syllables</property>
+                <property name="label" translatable="yes" 
context="wordcount|cjkcharsft">Asian characters and Korean words</property>
                 <property name="xalign">1</property>
               </object>
               <packing>

Reply via email to