 sw/qa/extras/ww8export/data/tdf117636.doc |binary
 sw/qa/extras/ww8export/ww8export4.cxx     | 14 ++++++++++++++
 sw/source/filter/ww8/ww8par6.cxx          | 28 +++++++++++++++++++++-------
 3 files changed, 35 insertions(+), 7 deletions(-)
New commits: commit 22152fd1278110bc625f5ad535f7f03a52e69ddb Author: Jonathan Clark <jonat...@libreoffice.org> AuthorDate: Tue Jul 29 05:01:45 2025 -0600 Commit: Jonathan Clark <jonat...@libreoffice.org> CommitDate: Fri Aug 1 00:48:49 2025 +0200 tdf#117636 sw: Ignore spurious language spans in DOC files Some DOC files may contain large amounts of unnecessary CLid* entries, indicating noop char locale changes. These entries don't affect document meaning, but may cause significant performance problems when handled naively. This is a fairly straightforward change to ignore language runs when the same language is already used for the font slot. Manual testing suggests newer versions of Word also implement the same optimization, so this should be safe. Change-Id: Ia3f34d3485acd2ee67ae9c081c9045ea36cf3fa2 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/188569 Tested-by: Jenkins Reviewed-by: Jonathan Clark <jonat...@libreoffice.org> diff --git a/sw/qa/extras/ww8export/data/tdf117636.doc b/sw/qa/extras/ww8export/data/tdf117636.doc new file mode 100644 index 000000000000..10afef0d7151 Binary files /dev/null and b/sw/qa/extras/ww8export/data/tdf117636.doc differ diff --git a/sw/qa/extras/ww8export/ww8export4.cxx b/sw/qa/extras/ww8export/ww8export4.cxx index 6658664118fa..64c02421af30 100644 --- a/sw/qa/extras/ww8export/ww8export4.cxx +++ b/sw/qa/extras/ww8export/ww8export4.cxx @@ -832,6 +832,20 @@ CPPUNIT_TEST_FIXTURE(Test, testTdf167583) fnVerify(false); } +DECLARE_WW8EXPORT_TEST(testTdf117636, "tdf117636.doc") +{ + xmlDocUniquePtr pXmlDoc = parseLayoutDump(); + + // The fix for bug 117636 works by coalescing runs that are separated by unnecessary + // language tags. There is some unrelated platform-specific difference in the number + // of spans in this document, but if the fix is working most of the line portions + // should be combined. 
+ // Without the fix in place, this fails with: + // Expected: 1 or 2 + // Actual: 6 + CPPUNIT_ASSERT_LESS(3, countXPathNodes(pXmlDoc, "//SwLinePortion")); +} + } // end of anonymous namespace CPPUNIT_PLUGIN_IMPLEMENT(); diff --git a/sw/source/filter/ww8/ww8par6.cxx b/sw/source/filter/ww8/ww8par6.cxx index 960f62bf019d..92ec2141ac79 100644 --- a/sw/source/filter/ww8/ww8par6.cxx +++ b/sw/source/filter/ww8/ww8par6.cxx @@ -4083,32 +4083,46 @@ void SwWW8ImplReader::Read_CharSet(sal_uInt16 , const sal_uInt8* pData, short nL void SwWW8ImplReader::Read_Language( sal_uInt16 nId, const sal_uInt8* pData, short nLen ) { - switch( nId ) + TypedWhichId<SvxLanguageItem> nTypedId = RES_CHRATR_LANGUAGE; + switch (nId) { case NS_sprm::v6::sprmCLid: case NS_sprm::CRgLid0_80::val: case NS_sprm::CRgLid0::val: - nId = RES_CHRATR_LANGUAGE; + nTypedId = RES_CHRATR_LANGUAGE; break; case NS_sprm::CRgLid1_80::val: case NS_sprm::CRgLid1::val: - nId = RES_CHRATR_CJK_LANGUAGE; + nTypedId = RES_CHRATR_CJK_LANGUAGE; break; case 83: // WW2 case 114: // WW7 case NS_sprm::CLidBi::val: - nId = RES_CHRATR_CTL_LANGUAGE; + nTypedId = RES_CHRATR_CTL_LANGUAGE; break; default: return; } if (nLen < 2) // end of attribute - m_xCtrlStck->SetAttr( *m_pPaM->GetPoint(), nId ); + m_xCtrlStck->SetAttr(*m_pPaM->GetPoint(), nTypedId); else { - sal_uInt16 nLang = SVBT16ToUInt16( pData ); // Language-Id - NewAttr(SvxLanguageItem(LanguageType(nLang), nId)); + auto eLang = LanguageType(SVBT16ToUInt16(pData)); // Language-Id + + // tdf#117636: Certain DOC files may contain spurious language spans. + // These may cause performance issues when present, and are stripped + // automatically by newer versions of Word. Do the same thing here. + std::optional<LanguageType> eCurrLanguage; + if (const SvxLanguageItem* pLang = GetFormatAttr(nTypedId); pLang) + { + eCurrLanguage = pLang->GetLanguage(); + } + + if (eLang != eCurrLanguage) + { + NewAttr(SvxLanguageItem(eLang, nTypedId)); + } } }