include/rtl/uri.h              |    5 +++++
 sal/qa/rtl/uri/rtl_testuri.cxx |   41 +++++++++++++++++++++++++++++++++++++++++
 sal/rtl/uri.cxx                |    3 ++-
 3 files changed, 48 insertions(+), 1 deletion(-)

New commits:
commit cef555e9a43b2cf1f46763b768973f89c5c1b053
Author:     Stephan Bergmann <[email protected]>
AuthorDate: Fri Nov 3 16:07:44 2023 +0100
Commit:     Stephan Bergmann <[email protected]>
CommitDate: Fri Nov 3 17:26:58 2023 +0100

    Make ExternalReferenceUriTranslator more robust against broken UTF-8
    
    <https://lists.freedesktop.org/archives/libreoffice/2023-November/091151.html>
    "CppunitTest_stoc_uriproc failed on Windows" reports that
    translateToExternal("file:///abc/%feef") produces an empty string (indicating
    failure) instead of "file:///abc/%FEef" (as expected in
    stoc/test/uriproc/test_uriproc.cxx) when osl_getThreadTextEncoding() is Shift
    JIS.
    
    This was due to how the call to rtl::Uri::encode in
    Translator::translateToExternal (in
    stoc/source/uriproc/ExternalUriReferenceTranslator.cxx) behaved:  It internally
    interpreted its input "%FE" as the single-byte Shift JIS character 0xFE.  Which
    gets mapped to U+2122 as an extension (see "APPLE additions over SJIS, we
    convert this like Apple, because I think, this gives better result, then [sic]
    we take a replacement char" in sal/textenc/tcvtjp6.tab) in readUcs4, but which
    in turn doesn't get mapped back to any Shift JIS character in writeEscapeChar.
    
    Translator::translateToExternal is the only user of
    rtl_UriEncodeStrictKeepEscapes, as introduced by
    6ff5d3341dbc5df3f0cb5368ccb0e1089338916c "INTEGRATION: CWS c07v013_SRC680
    (1.4.40); FILE MERGED: 2007/06/21 13:00:56 sb 1.4.40.1: #b6550116# Made
    XExternalUriReferenceTranslator.translateToExternal more robust when the input
    URL contains spurious non--UTF-8 octets like %FE (which are now copied verbatim,
    instead of signalling error)."
    
    To make the claim true that such "spurious non--UTF-8 octets like %FE" are
    always "copied verbatim", regardless of text encoding being used, repurpose
    rtl_UriEncodeStrictKeepEscapes to always treat any escape sequences that are
    present as (potentially broken) UTF-8.
    
    Change-Id: I0fa0b14d3e3d44e4b5514e1b73c84c407a947ce9
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/158888
    Tested-by: Jenkins
    Reviewed-by: Stephan Bergmann <[email protected]>

diff --git a/include/rtl/uri.h b/include/rtl/uri.h
index 1fbd0128ab89..28975060c03e 100644
--- a/include/rtl/uri.h
+++ b/include/rtl/uri.h
@@ -174,6 +174,11 @@ typedef enum
     /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting
         unmappable characters.
 
+        Also, any escape sequences that are present are always considered to be (potentially broken)
+        UTF-8.  This mechanism is meant to be used on the result of a rtl_UriDecodeToIuri decoding,
+        which will thus only contain escape sequences representing either ASCII characters or broken
+        UTF-8 sequences, and which will all be kept as-is.
+
         @since UDK 3.2.7
      */
     rtl_UriEncodeStrictKeepEscapes,
diff --git a/sal/qa/rtl/uri/rtl_testuri.cxx b/sal/qa/rtl/uri/rtl_testuri.cxx
index 6389f0222418..2686d3607d37 100644
--- a/sal/qa/rtl/uri/rtl_testuri.cxx
+++ b/sal/qa/rtl/uri/rtl_testuri.cxx
@@ -495,6 +495,47 @@ void Test::test_Uri() {
                 aText1, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
                 RTL_TEXTENCODING_ISO_8859_5));
     }
+
+    CPPUNIT_ASSERT_EQUAL(
+        u"%80"_ustr,
+        rtl::Uri::encode(
+            u"%80"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
+            RTL_TEXTENCODING_ASCII_US));
+    CPPUNIT_ASSERT_EQUAL(
+        u"%80"_ustr,
+        rtl::Uri::encode(
+            u"%80"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
+            RTL_TEXTENCODING_ISO_8859_1));
+    CPPUNIT_ASSERT_EQUAL(
+        u"%80"_ustr,
+        rtl::Uri::encode(
+            u"%80"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
+            RTL_TEXTENCODING_UTF8));
+    CPPUNIT_ASSERT_EQUAL(
+        u"%80"_ustr,
+        rtl::Uri::encode(
+            u"%80"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
+            RTL_TEXTENCODING_SHIFT_JIS));
+    CPPUNIT_ASSERT_EQUAL(
+        u"%FE"_ustr,
+        rtl::Uri::encode(
+            u"%FE"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
+            RTL_TEXTENCODING_ASCII_US));
+    CPPUNIT_ASSERT_EQUAL(
+        u"%FE"_ustr,
+        rtl::Uri::encode(
+            u"%FE"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
+            RTL_TEXTENCODING_ISO_8859_1));
+    CPPUNIT_ASSERT_EQUAL(
+        u"%FE"_ustr,
+        rtl::Uri::encode(
+            u"%FE"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
+            RTL_TEXTENCODING_UTF8));
+    CPPUNIT_ASSERT_EQUAL(
+        u"%FE"_ustr,
+        rtl::Uri::encode(
+            u"%FE"_ustr, rtl_UriCharClassUric, rtl_UriEncodeStrictKeepEscapes,
+            RTL_TEXTENCODING_SHIFT_JIS));
 }
 
 }
diff --git a/sal/rtl/uri.cxx b/sal/rtl/uri.cxx
index bb53385dbaab..06163084893f 100644
--- a/sal/rtl/uri.cxx
+++ b/sal/rtl/uri.cxx
@@ -515,7 +515,8 @@ void SAL_CALL rtl_uriEncode(rtl_uString * pText, sal_Bool const * pCharClass,
             (eMechanism == rtl_UriEncodeKeepEscapes
              || eMechanism == rtl_UriEncodeCheckEscapes
              || eMechanism == rtl_UriEncodeStrictKeepEscapes),
-            eCharset, &eType);
+            eMechanism == rtl_UriEncodeStrictKeepEscapes ? 
RTL_TEXTENCODING_UTF8 : eCharset,
+            &eType);
 
         switch (eType)
         {

Reply via email to