This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git


The following commit(s) were added to refs/heads/master by this push:
     new ba4b8468 InputStreamReader failed to load a multibyte UTF-8 sequence 
on a read boundary (#627)
ba4b8468 is described below

commit ba4b84682daeec1b50482d7d97c9227ae96be9be
Author: Stephen Webb <[email protected]>
AuthorDate: Thu Apr 9 09:27:01 2026 +1000

    InputStreamReader failed to load a multibyte UTF-8 sequence on a read 
boundary (#627)
    
    * CharsetDecoder::getUTF8Decoder no longer returns a TrivialCharsetDecoder 
when Log4cxx is built with LOG4CXX_CHAR=utf-8 (the default)
---
 src/main/cpp/charsetdecoder.cpp    | 27 +++++----------
 src/main/cpp/domconfigurator.cpp   |  2 +-
 src/main/cpp/inputstreamreader.cpp | 39 +++++++++++++---------
 src/test/cpp/filetestcase.cpp      | 67 +++++++++++++++++++++++++++++++++++++-
 4 files changed, 99 insertions(+), 36 deletions(-)

diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp
index 7aadc1ee..b7f852a7 100644
--- a/src/main/cpp/charsetdecoder.cpp
+++ b/src/main/cpp/charsetdecoder.cpp
@@ -280,10 +280,6 @@ class TrivialCharsetDecoder : public CharsetDecoder
                TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&);
 };
 
-
-#if LOG4CXX_LOGCHAR_IS_UTF8
-typedef TrivialCharsetDecoder UTF8CharsetDecoder;
-#else
 /**
 *    Converts from UTF-8 to std::wstring
 *
@@ -333,7 +329,6 @@ class UTF8CharsetDecoder : public CharsetDecoder
                UTF8CharsetDecoder(const UTF8CharsetDecoder&);
                UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&);
 };
-#endif
 
 /**
 *    Converts from ISO-8859-1 to LogString.
@@ -504,7 +499,11 @@ CharsetDecoder::~CharsetDecoder()
 CharsetDecoder* CharsetDecoder::createDefaultDecoder()
 {
 #if LOG4CXX_CHARSET_UTF8
+#if LOG4CXX_LOGCHAR_IS_UTF8
+       return new TrivialCharsetDecoder();
+#else
        return new UTF8CharsetDecoder();
+#endif
 #elif LOG4CXX_CHARSET_ISO88591 || defined(_WIN32_WCE)
        return new ISOLatinCharsetDecoder();
 #elif LOG4CXX_CHARSET_USASCII
@@ -535,19 +534,7 @@ CharsetDecoderPtr CharsetDecoder::getDefaultDecoder()
 
 CharsetDecoderPtr CharsetDecoder::getUTF8Decoder()
 {
-       static WideLife<CharsetDecoderPtr> decoder(new UTF8CharsetDecoder());
-
-       //
-       //  if invoked after static variable destruction
-       //     (if logging is called in the destructor of a static object)
-       //     then create a new decoder.
-       //
-       if (decoder.value() == 0)
-       {
-               return std::make_shared<UTF8CharsetDecoder>();
-       }
-
-       return decoder;
+       return std::make_shared<UTF8CharsetDecoder>();
 }
 
 CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder()
@@ -562,7 +549,11 @@ CharsetDecoderPtr CharsetDecoder::getDecoder(const 
LogString& charset)
                StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), 
LOG4CXX_STR("utf8")) ||
                StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), 
LOG4CXX_STR("cp65001")))
        {
+#if LOG4CXX_LOGCHAR_IS_UTF8
+               return std::make_shared<TrivialCharsetDecoder>();
+#else
                return std::make_shared<UTF8CharsetDecoder>();
+#endif
        }
        else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), 
LOG4CXX_STR("c")) ||
                charset == LOG4CXX_STR("646") ||
diff --git a/src/main/cpp/domconfigurator.cpp b/src/main/cpp/domconfigurator.cpp
index 82b0d0d9..69e457db 100644
--- a/src/main/cpp/domconfigurator.cpp
+++ b/src/main/cpp/domconfigurator.cpp
@@ -85,7 +85,7 @@ public: // Attributes
        bool appenderAdded{ false };
        AppenderMap     appenders;
        Pool p;
-       CharsetDecoderPtr utf8Decoder{ CharsetDecoder::getUTF8Decoder() };
+       CharsetDecoderPtr utf8Decoder{ 
CharsetDecoder::getDecoder(LOG4CXX_STR("UTF-8")) };
        apr_xml_doc* doc{ nullptr };
 
 public: // ...structor
diff --git a/src/main/cpp/inputstreamreader.cpp 
b/src/main/cpp/inputstreamreader.cpp
index ba5ce19b..53f839ac 100644
--- a/src/main/cpp/inputstreamreader.cpp
+++ b/src/main/cpp/inputstreamreader.cpp
@@ -20,6 +20,7 @@
 #include <log4cxx/helpers/exception.h>
 #include <log4cxx/helpers/pool.h>
 #include <log4cxx/helpers/bytebuffer.h>
+#include <log4cxx/helpers/stringhelper.h>
 
 using namespace LOG4CXX_NS;
 using namespace LOG4CXX_NS::helpers;
@@ -74,30 +75,36 @@ LogString InputStreamReader::read(Pool& p)
        const size_t BUFSIZE = 4096;
        ByteBuffer buf(p.pstralloc(BUFSIZE), BUFSIZE);
        LogString output;
+       log4cxx_status_t stat{ 0 };
 
        // read whole file
        while (m_priv->in->read(buf) >= 0)
        {
                buf.flip();
-               log4cxx_status_t stat = m_priv->dec->decode(buf, output);
-
-               if (stat != 0)
-               {
-                       throw IOException(LOG4CXX_STR("decode"), stat);
-               }
-
-               if (buf.remaining() > 0)
+               auto lastAvailableCount = buf.remaining();
+               stat = m_priv->dec->decode(buf, output);
+               if (buf.remaining() == lastAvailableCount)
                {
-                       if (buf.remaining() == BUFSIZE)
-                       {
-                               throw IOException(LOG4CXX_STR("Decoder made no 
progress"));
-                       }
-                       buf.carry();
+                       if (stat == 0)
+                               stat = -1;
+                       break;
                }
-               else
+               buf.carry();
+       }
+       if (stat != 0 && 0 < buf.remaining())
+       {
+               auto toHexDigit = [](int ch) -> int
                {
-                       buf.clear();
-               }
+                       return (10 <= ch ? (0x61 - 10) : 0x30) + ch;
+               };
+               LogString msg(LOG4CXX_STR("Unable to decode character 0x"));
+               auto ch = static_cast<unsigned int>(*buf.current());
+               msg.push_back(toHexDigit((ch & 0xF0) >> 4));
+               msg.push_back(toHexDigit((ch & 0xF)));
+               msg += LOG4CXX_STR(" at offset ");
+               Pool p;
+               StringHelper::toString(output.size(), p, msg);
+               throw RuntimeException(msg);
        }
 
        return output;
diff --git a/src/test/cpp/filetestcase.cpp b/src/test/cpp/filetestcase.cpp
index 00146c83..e0c45d17 100644
--- a/src/test/cpp/filetestcase.cpp
+++ b/src/test/cpp/filetestcase.cpp
@@ -28,6 +28,8 @@
 #include <log4cxx/helpers/inputstreamreader.h>
 #include <log4cxx/helpers/fileinputstream.h>
 #include <log4cxx/helpers/loglog.h>
+#include <log4cxx/helpers/bytebuffer.h>
+#include <log4cxx/helpers/transcoder.h>
 
 #if LOG4CXX_CFSTRING_API
        #include <CoreFoundation/CFString.h>
@@ -58,6 +60,8 @@ LOGUNIT_CLASS(FileTestCase)
        LOGUNIT_TEST(copyConstructor);
        LOGUNIT_TEST(assignment);
        LOGUNIT_TEST(deleteBackslashedFileName);
+       LOGUNIT_TEST(testSplitMultibyteUtf8);
+       LOGUNIT_TEST(testInvalidUtf8);
        LOGUNIT_TEST_SUITE_END();
 
 #ifdef _DEBUG
@@ -102,6 +106,8 @@ public:
                }
                catch (IOException& ex)
                {
+                       LOG4CXX_DECODE_CHAR(msg, ex.what());
+                       LogLog::debug(msg);
                }
        }
 
@@ -206,7 +212,66 @@ public:
                Pool pool;
                /*bool deleted = */file.deleteFile(pool);
        }
-};
 
+       class MockInputStream : public InputStream
+       {
+               ByteBuffer m_data;
+       public:
+               MockInputStream(const char* data, size_t charCount)
+                       : m_data(const_cast<char*>(data), charCount)
+               {}
+
+               int read(ByteBuffer& dst) override
+               {
+                       auto availableBytes = m_data.remaining();
+                       if (availableBytes < 1)
+                               return -1;
+                       int count = 0;
+                       for (auto p = m_data.current(); count < availableBytes 
&& dst.put(*p); ++p)
+                               ++count;
+                       m_data.increment_position(count);
+                       return count;
+               }
+
+               void close() override {}
+       };
+
+       /**
+        * Tests behavior when a multibyte UTF-8 sequence occurs on a read 
boundary
+        */
+       void testSplitMultibyteUtf8()
+       {
+               Pool p;
+               // InputStreamReader uses a buffer of size 4096
+               std::string input( 4094, 'A' );
+               // räksmörgås.josefsson.org
+               
input.append("\162\303\244\153\163\155\303\266\162\147\303\245\163\056\152\157\163\145\146\163\163\157\156\056\157\162\147");
+               InputStreamReader 
reader(std::make_shared<MockInputStream>(input.c_str(), input.size()), 
CharsetDecoder::getUTF8Decoder());
+               auto contentLS = reader.read(p);
+               LOG4CXX_ENCODE_CHAR(content, contentLS);
+               LOGUNIT_ASSERT_EQUAL(input, content);
+       }
+
+       /**
+        * Tests behavior given an incomplete multibyte UTF-8 sequence in the 
input
+        */
+       void testInvalidUtf8()
+       {
+               Pool p;
+               // 0xC2 is a generic start byte for a 2-byte sequence in UTF-8.
+               char input[] = { 'A', (char)0xC2, 'B', 'C', 0 };
+               InputStreamReader 
reader(std::make_shared<MockInputStream>(input, 4), 
CharsetDecoder::getUTF8Decoder());
+               try
+               {
+                       reader.read(p);
+                       LOGUNIT_ASSERT(false);
+               }
+               catch (const Exception& ex)
+               {
+                       LOG4CXX_DECODE_CHAR(msg, ex.what());
+                       LogLog::debug(msg);
+               }
+       }
+};
 
 LOGUNIT_TEST_SUITE_REGISTRATION(FileTestCase);

Reply via email to