i18npool/qa/cppunit/test_breakiterator.cxx          |   16 ++++++++++++++++
 i18npool/source/breakiterator/breakiterator_cjk.cxx |   19 +++++++++++++++++++
 2 files changed, 35 insertions(+)

New commits:
commit 441fded7f7fc8a2564075406933226a6eea73dd1
Author: Mark Hung <[email protected]>
Date:   Sun Oct 1 14:53:51 2017 +0800

    tdf#96197 do not break Korean words in the middle.
    
    Korean words are composed of Hangul and are separated
    by spaces or newlines. This patch improves the line-breaking
    function in the CJK break iterator so that it does not
    break Korean words in the middle. It now breaks at the
    first character of the last Korean word.
    
    Change-Id: I91b20733c0c5ec4755bf68eb0d7c14c42c1f3556
    Reviewed-on: https://gerrit.libreoffice.org/42987
    Tested-by: Jenkins <[email protected]>
    Reviewed-by: Eike Rathke <[email protected]>

diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 98a0bca96a77..552274864035 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -158,6 +158,22 @@ void TestBreakIterator::testLineBreaking()
             (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, 
aUserOptions);
         }
     }
+
+    //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
+    {
+        const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 
0xac00,
+                                       0xc0ac, 0xb294};
+        OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL));
+
+        aLocale.Language = "ko";
+        aLocale.Country = "KR";
+
+        {
+            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, 
aTest.getLength()-2, aLocale, 0,
+                aHyphOptions, aUserOptions);
+            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the 
Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
+        }
+    }
 }
 
 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx b/i18npool/source/breakiterator/breakiterator_cjk.cxx
index 98115e6c1d32..8a4244631759 100644
--- a/i18npool/source/breakiterator/breakiterator_cjk.cxx
+++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx
@@ -86,6 +86,8 @@ BreakIterator_CJK::getWordBoundary( const OUString& text, sal_Int32 anyPos,
     return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, 
wordType, bDirection);
 }
 
+#define isHangul(cCh) (((cCh)>=0xAC00&&(cCh)<=0xD7AF)||((cCh)>=0x1100&&(cCh)<=0x11FF)) // true for U+AC00..U+D7AF or U+1100..U+11FF; argument parenthesized so expression arguments expand correctly; cCh is evaluated more than once, so pass a side-effect-free expression
+
 LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
         const OUString& Text, sal_Int32 nStartPos,
         const css::lang::Locale& /*rLocale*/, sal_Int32 /*nMinBreakPos*/,
@@ -94,17 +96,34 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
 {
     LineBreakResults lbr;
 
+    const sal_Int32 nOldStartPos = nStartPos;
+
     if (bOptions.allowPunctuationOutsideMargin &&
             hangingCharacters.indexOf(Text[nStartPos]) != -1 &&
             (Text.iterateCodePoints( &nStartPos ), nStartPos == 
Text.getLength())) {
         ; // do nothing
     } else if (bOptions.applyForbiddenRules && 0 < nStartPos && nStartPos < 
Text.getLength()) {
+
         while (nStartPos > 0 &&
                 (bOptions.forbiddenBeginCharacters.indexOf(Text[nStartPos]) != 
-1 ||
                  bOptions.forbiddenEndCharacters.indexOf(Text[nStartPos-1]) != 
-1))
             Text.iterateCodePoints( &nStartPos, -1);
     }
 
+    // Prevent cutting Korean words in the middle.
+    if ( nOldStartPos == nStartPos && isHangul( Text[nStartPos] ) )
+    {
+        while ( nStartPos >= 0 && isHangul( Text[nStartPos] ) )
+            --nStartPos;
+
+        // beginning of the last Korean word.
+        if ( nStartPos < nOldStartPos )
+            ++nStartPos;
+
+        if ( nStartPos == 0 )
+            nStartPos = nOldStartPos;
+    }
+
     lbr.breakIndex = nStartPos;
     lbr.breakType = BreakType::WORDBOUNDARY;
     return lbr;
_______________________________________________
Libreoffice-commits mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

Reply via email to