sw/source/filter/ww8/ww8par.cxx | 136 +++++++++++++++++++++++++--------------- sw/source/filter/ww8/ww8par.hxx | 2 2 files changed, 88 insertions(+), 50 deletions(-)
New commits: commit 47b84f7e5143f445a087fc9ccc4fb29bbd88ff64 Author: Caolán McNamara <[email protected]> Date: Wed Aug 27 15:03:45 2014 +0100 Resolves: fdo#82904 non-Japanese ww95 documents claiming ms932 encoding Change-Id: I62f8d5c3cac71f83f5cdde114f66e8554a780538 diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx index 4fc41eb..54cfbcc 100644 --- a/sw/source/filter/ww8/ww8par.cxx +++ b/sw/source/filter/ww8/ww8par.cxx @@ -3037,8 +3037,51 @@ bool SwWW8ImplReader::ReadPlainChars(WW8_CP& rPos, sal_Int32 nEnd, sal_Int32 nCp // the correct FilePos has already been reached. const sal_Int32 nStrLen = std::min(nValidStrLen, SAL_MAX_INT32-1); - const rtl_TextEncoding eSrcCharSet = bVer67 ? GetCurrentCharSet() : + rtl_TextEncoding eSrcCharSet = bVer67 ? GetCurrentCharSet() : RTL_TEXTENCODING_MS_1252; + if (bVer67 && eSrcCharSet == RTL_TEXTENCODING_MS_932) + { + /* + fdo#82904 + + Older documents exported as word 95 that use unicode aware fonts will + have the charset of those fonts set to RTL_TEXTENCODING_MS_932 on + export as the conversion from RTL_TEXTENCODING_UNICODE. This is a serious + pain. + + We will try and use a fallback encoding if the conversion from + RTL_TEXTENCODING_MS_932 fails, but you can get unlucky and get a document + which isn't really in RTL_TEXTENCODING_MS_932 but parts of it form + valid RTL_TEXTENCODING_MS_932 by chance :-( + + We're not the only ones that struggle with this: Here's the help from + MSOffice 2003 on the topic: + + << + Earlier versions of Microsoft Word were sometimes used in conjunction with + third-party language-processing add-in programs designed to support Chinese or + Korean on English versions of Microsoft Windows. Use of these add-ins sometimes + results in incorrect text display in more recent versions of Word. + + However, you can set options to convert these documents so that text is + displayed correctly. On the Tools menu, click Options, and then click the + General tab. In the English Word 6.0/95 documents list, select Contain Asian + text (to have Word interpret the text as Asian code page data, regardless of + its font) or Automatically detect Asian text (to have Word attempt to determine + which parts of the text are meant to be Asian). + >> + + What we can try here is to ignore a RTL_TEXTENCODING_MS_932 codepage if + the language is not Japanese + */ + + const SfxPoolItem * pItem = GetFmtAttr(RES_CHRATR_CJK_LANGUAGE); + if (pItem != NULL && LANGUAGE_JAPANESE != static_cast<const SvxLanguageItem *>(pItem)->GetLanguage()) + { + SAL_WARN("sw.ww8", "discarding word95 RTL_TEXTENCODING_MS_932 encoding"); + eSrcCharSet = GetCharSetFromLanguage(); + } + } const rtl_TextEncoding eSrcCJKCharSet = bVer67 ? GetCurrentCJKCharSet() : RTL_TEXTENCODING_MS_1252; commit 4143d7bc7078fb367130e092a354b20da57585cc Author: Caolán McNamara <[email protected]> Date: Wed Aug 27 15:00:15 2014 +0100 sync GetCurrentCJKCharSet with GetCurrentCharSet Change-Id: Ibcf1fa35617ee8d7fab6b66e3e8c8881ad55c3e5 diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx index 53a2a02..4fc41eb 100644 --- a/sw/source/filter/ww8/ww8par.cxx +++ b/sw/source/filter/ww8/ww8par.cxx @@ -2809,6 +2809,24 @@ rtl_TextEncoding SwWW8ImplReader::GetCharSetFromLanguage() return msfilter::util::getBestTextEncodingFromLocale(aLocale); } +rtl_TextEncoding SwWW8ImplReader::GetCJKCharSetFromLanguage() +{ + /* + #i22206#/#i52786# + The (default) character set used for a run of text is the default + character set for the version of Word that last saved the document. + + This is a bit tentative, more might be required if the concept is correct. + When later version of word write older 6/95 documents the charset is + correctly set in the character runs involved, so its hard to reproduce + documents that require this to be sure of the process involved. + */ + const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_CJK_LANGUAGE); + LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM; + ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang)); + return msfilter::util::getBestTextEncodingFromLocale(aLocale); +} + rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet() { /* @@ -2846,33 +2864,12 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCJKCharSet() { if (!maFontSrcCJKCharSets.empty()) eSrcCharSet = maFontSrcCJKCharSets.top(); - if (!vColl.empty()) - { - if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && nCharFmt >= 0 && (size_t)nCharFmt < vColl.size() ) - eSrcCharSet = vColl[nCharFmt].GetCJKCharSet(); - if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW && nAktColl < vColl.size()) - eSrcCharSet = vColl[nAktColl].GetCJKCharSet(); - } + if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && nCharFmt >= 0 && (size_t)nCharFmt < vColl.size() ) + eSrcCharSet = vColl[nCharFmt].GetCJKCharSet(); + if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW && StyleExists(nAktColl) && nAktColl < vColl.size()) + eSrcCharSet = vColl[nAktColl].GetCJKCharSet(); if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) - { // patch from cmc for #i52786# - /* - #i22206#/#i52786# - The (default) character set used for a run of text is the default - character set for the version of Word that last saved the document. - - This is a bit tentative, more might be required if the concept is correct. - When later version of word write older 6/95 documents the charset is - correctly set in the character runs involved, so its hard to reproduce - documents that require this to be sure of the process involved. - */ - const SvxLanguageItem *pLang = - (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE); - if (pLang) - { - ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(pLang->GetLanguage())); - eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale); - } - } + eSrcCharSet = GetCJKCharSetFromLanguage(); } return eSrcCharSet; } diff --git a/sw/source/filter/ww8/ww8par.hxx b/sw/source/filter/ww8/ww8par.hxx index 50fe2c9..96963a2 100644 --- a/sw/source/filter/ww8/ww8par.hxx +++ b/sw/source/filter/ww8/ww8par.hxx @@ -1922,6 +1922,7 @@ public: // eigentlich private, geht aber leider nur public rtl_TextEncoding GetCurrentCharSet(); rtl_TextEncoding GetCurrentCJKCharSet(); rtl_TextEncoding GetCharSetFromLanguage(); + rtl_TextEncoding GetCJKCharSetFromLanguage(); void PostProcessAttrs(); static void ReadEmbeddedData(SvMemoryStream& rStrm, SwDocShell* pDocShell, struct HyperLinksTable& hlStr); commit 56c9850145faa9ac04c3f09633e56b6c8c22c6c4 Author: Caolán McNamara <[email protected]> Date: Wed Aug 27 14:57:05 2014 +0100 refactor into GetCharSetFromLanguage Change-Id: I54382b0dd0f6b6f21f635d75cb3ee3cefc1eb203 diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx index 6b4ec95..53a2a02 100644 --- a/sw/source/filter/ww8/ww8par.cxx +++ b/sw/source/filter/ww8/ww8par.cxx @@ -2791,6 +2791,24 @@ bool SwWW8ImplReader::ProcessSpecial(bool &rbReSync, WW8_CP nStartCp) return bTableRowEnd; } +rtl_TextEncoding SwWW8ImplReader::GetCharSetFromLanguage() +{ + /* + #i22206#/#i52786# + The (default) character set used for a run of text is the default + character set for the version of Word that last saved the document. + + This is a bit tentative, more might be required if the concept is correct. + When later version of word write older 6/95 documents the charset is + correctly set in the character runs involved, so its hard to reproduce + documents that require this to be sure of the process involved. + */ + const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE); + LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM; + ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang)); + return msfilter::util::getBestTextEncodingFromLocale(aLocale); +} + rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet() { /* @@ -2809,22 +2827,7 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCharSet() if ((eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) && StyleExists(nAktColl) && nAktColl < vColl.size()) eSrcCharSet = vColl[nAktColl].GetCharSet(); if (eSrcCharSet == RTL_TEXTENCODING_DONTKNOW) - { - /* - #i22206#/#i52786# - The (default) character set used for a run of text is the default - character set for the version of Word that last saved the document. - - This is a bit tentative, more might be required if the concept is correct. - When later version of word write older 6/95 documents the charset is - correctly set in the character runs involved, so its hard to reproduce - documents that require this to be sure of the process involved. - */ - const SvxLanguageItem *pLang = (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE); - LanguageType eLang = pLang ? pLang->GetLanguage() : LANGUAGE_SYSTEM; - ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(eLang)); - eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale); - } + eSrcCharSet = GetCharSetFromLanguage(); } return eSrcCharSet; } diff --git a/sw/source/filter/ww8/ww8par.hxx b/sw/source/filter/ww8/ww8par.hxx index d44e508..50fe2c9 100644 --- a/sw/source/filter/ww8/ww8par.hxx +++ b/sw/source/filter/ww8/ww8par.hxx @@ -1921,6 +1921,7 @@ public: // eigentlich private, geht aber leider nur public sal_uLong LoadDoc( SwPaM&,WW8Glossary *pGloss=0); rtl_TextEncoding GetCurrentCharSet(); rtl_TextEncoding GetCurrentCJKCharSet(); + rtl_TextEncoding GetCharSetFromLanguage(); void PostProcessAttrs(); static void ReadEmbeddedData(SvMemoryStream& rStrm, SwDocShell* pDocShell, struct HyperLinksTable& hlStr); commit 804d60d2ee4c099f685a6e42438fa0de15ca29be Author: Caolán McNamara <[email protected]> Date: Wed Aug 27 14:04:53 2014 +0100 duplicate with getBestTextEncodingFromLocale Change-Id: I73a69fdfee0b0f3af5bf6b4e52629dba7ed69630 diff --git a/sw/source/filter/ww8/ww8par.cxx b/sw/source/filter/ww8/ww8par.cxx index 3395868..6b4ec95 100644 --- a/sw/source/filter/ww8/ww8par.cxx +++ b/sw/source/filter/ww8/ww8par.cxx @@ -2866,15 +2866,8 @@ rtl_TextEncoding SwWW8ImplReader::GetCurrentCJKCharSet() (const SvxLanguageItem*)GetFmtAttr(RES_CHRATR_LANGUAGE); if (pLang) { - switch (pLang->GetLanguage()) - { - case LANGUAGE_CZECH: - eSrcCharSet = RTL_TEXTENCODING_MS_1250; - break; - default: - eSrcCharSet = RTL_TEXTENCODING_MS_1252; - break; - } + ::com::sun::star::lang::Locale aLocale(LanguageTag::convertToLocale(pLang->GetLanguage())); + eSrcCharSet = msfilter::util::getBestTextEncodingFromLocale(aLocale); } } }
_______________________________________________ Libreoffice-commits mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
