 include/rtl/uri.h              |    5 +++++
 sal/qa/rtl/uri/rtl_testuri.cxx |   41 +++++++++++++++++++++++++++++++++++++++++
 sal/rtl/uri.cxx                |    3 ++-
 3 files changed, 48 insertions(+), 1 deletion(-)
New commits:
commit cef555e9a43b2cf1f46763b768973f89c5c1b053
Author:     Stephan Bergmann <[email protected]>
AuthorDate: Fri Nov 3 16:07:44 2023 +0100
Commit:     Stephan Bergmann <[email protected]>
CommitDate: Fri Nov 3 17:26:58 2023 +0100

    Make ExternalUriReferenceTranslator more robust against broken UTF-8

    <https://lists.freedesktop.org/archives/libreoffice/2023-November/091151.html> "CppunitTest_stoc_uriproc failed on Windows" reports that translateToExternal("file:///abc/%feef") produces an empty string (indicating failure) instead of "file:///abc/%FEef" (as expected in stoc/test/uriproc/test_uriproc.cxx) when osl_getThreadTextEncoding() is Shift JIS.

    This was due to how the call to rtl::Uri::encode in Translator::translateToExternal (in stoc/source/uriproc/ExternalUriReferenceTranslator.cxx) behaved: It internally interpreted its input "%FE" as the single-byte Shift JIS character 0xFE, which gets mapped to U+2122 as an extension (see "APPLE additions over SJIS, we convert this like Apple, because I think, this gives better result, then [sic] we take a replacement char" in sal/textenc/tcvtjp6.tab) in readUcs4, but which in turn doesn't get mapped back to any Shift JIS character in writeEscapeChar.

    Translator::translateToExternal is the only user of rtl_UriEncodeStrictKeepEscapes, as introduced by 6ff5d3341dbc5df3f0cb5368ccb0e1089338916c "INTEGRATION: CWS c07v013_SRC680 (1.4.40); FILE MERGED: 2007/06/21 13:00:56 sb 1.4.40.1: #b6550116# Made XExternalUriReferenceTranslator.translateToExternal more robust when the input URL contains spurious non--UTF-8 octets like %FE (which are now copied verbatim, instead of signalling error)."

    To make the claim true that such "spurious non--UTF-8 octets like %FE" are always "copied verbatim", regardless of text encoding being used, repurpose rtl_UriEncodeStrictKeepEscapes to always treat any escape sequences that are present as (potentially broken) UTF-8.

Change-Id: I0fa0b14d3e3d44e4b5514e1b73c84c407a947ce9 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/158888 Tested-by: Jenkins Reviewed-by: Stephan Bergmann <[email protected]> diff --git a/include/rtl/uri.h b/include/rtl/uri.h index 1fbd0128ab89..28975060c03e 100644 --- a/include/rtl/uri.h +++ b/include/rtl/uri.h @@ -174,6 +174,11 @@ typedef enum /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting unmappable characters. + Also, any escape sequences that are present are always considered to be (potentially broken) + UTF-8. This mechanism is meant to be used on the result of a rtl_UriDecodeToIuri decoding, + which will thus only contain escape sequences representing either ASCII characters or broken + UTF-8 sequences, and which will all be kept as-is. + @since UDK 3.2.7 */ rtl_UriEncodeStrictKeepEscapes, diff --git a/sal/qa/rtl/uri/rtl_testuri.cxx b/sal/qa/rtl/uri/rtl_testuri.cxx index 6389f0222418..2686d3607d37 100644 --- a/sal/qa/rtl/uri/rtl_testuri.cxx +++ b/sal/qa/rtl/uri/rtl_testuri.cxx @@ -495,6 +495,47 @@ void Test::test_Uri() { aText1, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, RTL_TEXTENCODING_ISO_8859_5)); } + + CPPUNIT_ASSERT_EQUAL( + u"%80"_ustr, + rtl::Uri::encode( + u"%80"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, + RTL_TEXTENCODING_ASCII_US)); + CPPUNIT_ASSERT_EQUAL( + u"%80"_ustr, + rtl::Uri::encode( + u"%80"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, + RTL_TEXTENCODING_ISO_8859_1)); + CPPUNIT_ASSERT_EQUAL( + u"%80"_ustr, + rtl::Uri::encode( + u"%80"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, + RTL_TEXTENCODING_UTF8)); + CPPUNIT_ASSERT_EQUAL( + u"%80"_ustr, + rtl::Uri::encode( + u"%80"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, + RTL_TEXTENCODING_SHIFT_JIS)); + CPPUNIT_ASSERT_EQUAL( + u"%FE"_ustr, + rtl::Uri::encode( + u"%FE"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, + RTL_TEXTENCODING_ASCII_US)); + CPPUNIT_ASSERT_EQUAL( + 
u"%FE"_ustr, + rtl::Uri::encode( + u"%FE"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, + RTL_TEXTENCODING_ISO_8859_1)); + CPPUNIT_ASSERT_EQUAL( + u"%FE"_ustr, + rtl::Uri::encode( + u"%FE"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, + RTL_TEXTENCODING_UTF8)); + CPPUNIT_ASSERT_EQUAL( + u"%FE"_ustr, + rtl::Uri::encode( + u"%FE"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes, + RTL_TEXTENCODING_SHIFT_JIS)); } } diff --git a/sal/rtl/uri.cxx b/sal/rtl/uri.cxx index bb53385dbaab..06163084893f 100644 --- a/sal/rtl/uri.cxx +++ b/sal/rtl/uri.cxx @@ -515,7 +515,8 @@ void SAL_CALL rtl_uriEncode(rtl_uString * pText, sal_Bool const * pCharClass, (eMechanism == rtl_UriEncodeKeepEscapes || eMechanism == rtl_UriEncodeCheckEscapes || eMechanism == rtl_UriEncodeStrictKeepEscapes), - eCharset, &eType); + eMechanism == rtl_UriEncodeStrictKeepEscapes ? RTL_TEXTENCODING_UTF8 : eCharset, + &eType); switch (eType) {
