sw/qa/extras/ww8export/data/tdf117636.doc |binary
 sw/qa/extras/ww8export/ww8export4.cxx     |   14 ++++++++++++++
 sw/source/filter/ww8/ww8par6.cxx          |   28 +++++++++++++++++++++-------
 3 files changed, 35 insertions(+), 7 deletions(-)

New commits:
commit 22152fd1278110bc625f5ad535f7f03a52e69ddb
Author:     Jonathan Clark <jonat...@libreoffice.org>
AuthorDate: Tue Jul 29 05:01:45 2025 -0600
Commit:     Jonathan Clark <jonat...@libreoffice.org>
CommitDate: Fri Aug 1 00:48:49 2025 +0200

    tdf#117636 sw: Ignore spurious language spans in DOC files
    
    Some DOC files may contain large amounts of unnecessary CLid* entries,
    indicating noop char locale changes. These entries don't affect document
    meaning, but may cause significant performance problems when handled
    naively.
    
    This is a fairly straightforward change to ignore language runs when the
    same language is already used for the font slot. Manual testing suggests
    newer versions of Word also implement the same optimization, so this
    should be safe.
    
    Change-Id: Ia3f34d3485acd2ee67ae9c081c9045ea36cf3fa2
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/188569
    Tested-by: Jenkins
    Reviewed-by: Jonathan Clark <jonat...@libreoffice.org>

diff --git a/sw/qa/extras/ww8export/data/tdf117636.doc b/sw/qa/extras/ww8export/data/tdf117636.doc
new file mode 100644
index 000000000000..10afef0d7151
Binary files /dev/null and b/sw/qa/extras/ww8export/data/tdf117636.doc differ
diff --git a/sw/qa/extras/ww8export/ww8export4.cxx b/sw/qa/extras/ww8export/ww8export4.cxx
index 6658664118fa..64c02421af30 100644
--- a/sw/qa/extras/ww8export/ww8export4.cxx
+++ b/sw/qa/extras/ww8export/ww8export4.cxx
@@ -832,6 +832,20 @@ CPPUNIT_TEST_FIXTURE(Test, testTdf167583)
     fnVerify(false);
 }
 
+DECLARE_WW8EXPORT_TEST(testTdf117636, "tdf117636.doc")
+{
+    xmlDocUniquePtr pXmlDoc = parseLayoutDump();
+
+    // The fix for bug 117636 works by coalescing runs that are separated by unnecessary
+    // language tags. There is some unrelated platform-specific difference in the number
+    // of spans in this document, but if the fix is working most of the line portions
+    // should be combined.
+    // Without the fix in place, this fails with:
+    // Expected: 1 or 2
+    // Actual:   6
+    CPPUNIT_ASSERT_LESS(3, countXPathNodes(pXmlDoc, "//SwLinePortion"));
+}
+
 } // end of anonymous namespace
 CPPUNIT_PLUGIN_IMPLEMENT();
 
diff --git a/sw/source/filter/ww8/ww8par6.cxx b/sw/source/filter/ww8/ww8par6.cxx
index 960f62bf019d..92ec2141ac79 100644
--- a/sw/source/filter/ww8/ww8par6.cxx
+++ b/sw/source/filter/ww8/ww8par6.cxx
@@ -4083,32 +4083,46 @@ void SwWW8ImplReader::Read_CharSet(sal_uInt16 , const sal_uInt8* pData, short nL
 
 void SwWW8ImplReader::Read_Language( sal_uInt16 nId, const sal_uInt8* pData, short nLen )
 {
-    switch( nId )
+    TypedWhichId<SvxLanguageItem> nTypedId = RES_CHRATR_LANGUAGE;
+    switch (nId)
     {
         case NS_sprm::v6::sprmCLid:
         case NS_sprm::CRgLid0_80::val:
         case NS_sprm::CRgLid0::val:
-            nId = RES_CHRATR_LANGUAGE;
+            nTypedId = RES_CHRATR_LANGUAGE;
             break;
         case NS_sprm::CRgLid1_80::val:
         case NS_sprm::CRgLid1::val:
-            nId = RES_CHRATR_CJK_LANGUAGE;
+            nTypedId = RES_CHRATR_CJK_LANGUAGE;
             break;
         case 83:  // WW2
         case 114: // WW7
         case NS_sprm::CLidBi::val:
-            nId = RES_CHRATR_CTL_LANGUAGE;
+            nTypedId = RES_CHRATR_CTL_LANGUAGE;
             break;
         default:
             return;
     }
 
     if (nLen < 2)                  // end of attribute
-        m_xCtrlStck->SetAttr( *m_pPaM->GetPoint(), nId );
+        m_xCtrlStck->SetAttr(*m_pPaM->GetPoint(), nTypedId);
     else
     {
-        sal_uInt16 nLang = SVBT16ToUInt16( pData );  // Language-Id
-        NewAttr(SvxLanguageItem(LanguageType(nLang), nId));
+        auto eLang = LanguageType(SVBT16ToUInt16(pData)); // Language-Id
+
+        // tdf#117636: Certain DOC files may contain spurious language spans.
+        // These may cause performance issues when present, and are stripped
+        // automatically by newer versions of Word. Do the same thing here.
+        std::optional<LanguageType> eCurrLanguage;
+        if (const SvxLanguageItem* pLang = GetFormatAttr(nTypedId); pLang)
+        {
+            eCurrLanguage = pLang->GetLanguage();
+        }
+
+        if (eLang != eCurrLanguage)
+        {
+            NewAttr(SvxLanguageItem(eLang, nTypedId));
+        }
     }
 }
 

Reply via email to