This is an automated email from the ASF dual-hosted git repository.
swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
The following commit(s) were added to refs/heads/master by this push:
new ba4b8468 InputStreamReader failed to load a multibyte UTF-8 sequence
on a read boundary (#627)
ba4b8468 is described below
commit ba4b84682daeec1b50482d7d97c9227ae96be9be
Author: Stephen Webb <[email protected]>
AuthorDate: Thu Apr 9 09:27:01 2026 +1000
InputStreamReader failed to load a multibyte UTF-8 sequence on a read
boundary (#627)
* CharsetDecoder::getUTF8Decoder no longer returns a TrivialCharsetDecoder
when Log4cxx is built with LOG4CXX_CHAR=utf-8 (the default)
---
src/main/cpp/charsetdecoder.cpp | 27 +++++----------
src/main/cpp/domconfigurator.cpp | 2 +-
src/main/cpp/inputstreamreader.cpp | 39 +++++++++++++---------
src/test/cpp/filetestcase.cpp | 67 +++++++++++++++++++++++++++++++++++++-
4 files changed, 99 insertions(+), 36 deletions(-)
diff --git a/src/main/cpp/charsetdecoder.cpp b/src/main/cpp/charsetdecoder.cpp
index 7aadc1ee..b7f852a7 100644
--- a/src/main/cpp/charsetdecoder.cpp
+++ b/src/main/cpp/charsetdecoder.cpp
@@ -280,10 +280,6 @@ class TrivialCharsetDecoder : public CharsetDecoder
TrivialCharsetDecoder& operator=(const TrivialCharsetDecoder&);
};
-
-#if LOG4CXX_LOGCHAR_IS_UTF8
-typedef TrivialCharsetDecoder UTF8CharsetDecoder;
-#else
/**
* Converts from UTF-8 to std::wstring
*
@@ -333,7 +329,6 @@ class UTF8CharsetDecoder : public CharsetDecoder
UTF8CharsetDecoder(const UTF8CharsetDecoder&);
UTF8CharsetDecoder& operator=(const UTF8CharsetDecoder&);
};
-#endif
/**
* Converts from ISO-8859-1 to LogString.
@@ -504,7 +499,11 @@ CharsetDecoder::~CharsetDecoder()
CharsetDecoder* CharsetDecoder::createDefaultDecoder()
{
#if LOG4CXX_CHARSET_UTF8
+#if LOG4CXX_LOGCHAR_IS_UTF8
+ return new TrivialCharsetDecoder();
+#else
return new UTF8CharsetDecoder();
+#endif
#elif LOG4CXX_CHARSET_ISO88591 || defined(_WIN32_WCE)
return new ISOLatinCharsetDecoder();
#elif LOG4CXX_CHARSET_USASCII
@@ -535,19 +534,7 @@ CharsetDecoderPtr CharsetDecoder::getDefaultDecoder()
CharsetDecoderPtr CharsetDecoder::getUTF8Decoder()
{
- static WideLife<CharsetDecoderPtr> decoder(new UTF8CharsetDecoder());
-
- //
- // if invoked after static variable destruction
- // (if logging is called in the destructor of a static object)
- // then create a new decoder.
- //
- if (decoder.value() == 0)
- {
- return std::make_shared<UTF8CharsetDecoder>();
- }
-
- return decoder;
+ return std::make_shared<UTF8CharsetDecoder>();
}
CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder()
@@ -562,7 +549,11 @@ CharsetDecoderPtr CharsetDecoder::getDecoder(const
LogString& charset)
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"),
LOG4CXX_STR("utf8")) ||
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"),
LOG4CXX_STR("cp65001")))
{
+#if LOG4CXX_LOGCHAR_IS_UTF8
+ return std::make_shared<TrivialCharsetDecoder>();
+#else
return std::make_shared<UTF8CharsetDecoder>();
+#endif
}
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"),
LOG4CXX_STR("c")) ||
charset == LOG4CXX_STR("646") ||
diff --git a/src/main/cpp/domconfigurator.cpp b/src/main/cpp/domconfigurator.cpp
index 82b0d0d9..69e457db 100644
--- a/src/main/cpp/domconfigurator.cpp
+++ b/src/main/cpp/domconfigurator.cpp
@@ -85,7 +85,7 @@ public: // Attributes
bool appenderAdded{ false };
AppenderMap appenders;
Pool p;
- CharsetDecoderPtr utf8Decoder{ CharsetDecoder::getUTF8Decoder() };
+ CharsetDecoderPtr utf8Decoder{
CharsetDecoder::getDecoder(LOG4CXX_STR("UTF-8")) };
apr_xml_doc* doc{ nullptr };
public: // ...structor
diff --git a/src/main/cpp/inputstreamreader.cpp
b/src/main/cpp/inputstreamreader.cpp
index ba5ce19b..53f839ac 100644
--- a/src/main/cpp/inputstreamreader.cpp
+++ b/src/main/cpp/inputstreamreader.cpp
@@ -20,6 +20,7 @@
#include <log4cxx/helpers/exception.h>
#include <log4cxx/helpers/pool.h>
#include <log4cxx/helpers/bytebuffer.h>
+#include <log4cxx/helpers/stringhelper.h>
using namespace LOG4CXX_NS;
using namespace LOG4CXX_NS::helpers;
@@ -74,30 +75,36 @@ LogString InputStreamReader::read(Pool& p)
const size_t BUFSIZE = 4096;
ByteBuffer buf(p.pstralloc(BUFSIZE), BUFSIZE);
LogString output;
+ log4cxx_status_t stat{ 0 };
// read whole file
while (m_priv->in->read(buf) >= 0)
{
buf.flip();
- log4cxx_status_t stat = m_priv->dec->decode(buf, output);
-
- if (stat != 0)
- {
- throw IOException(LOG4CXX_STR("decode"), stat);
- }
-
- if (buf.remaining() > 0)
+ auto lastAvailableCount = buf.remaining();
+ stat = m_priv->dec->decode(buf, output);
+ if (buf.remaining() == lastAvailableCount)
{
- if (buf.remaining() == BUFSIZE)
- {
- throw IOException(LOG4CXX_STR("Decoder made no
progress"));
- }
- buf.carry();
+ if (stat == 0)
+ stat = -1;
+ break;
}
- else
+ buf.carry();
+ }
+ if (stat != 0 && 0 < buf.remaining())
+ {
+ auto toHexDigit = [](int ch) -> int
{
- buf.clear();
- }
+ return (10 <= ch ? (0x61 - 10) : 0x30) + ch;
+ };
+ LogString msg(LOG4CXX_STR("Unable to decode character 0x"));
+ auto ch = static_cast<unsigned int>(*buf.current());
+ msg.push_back(toHexDigit((ch & 0xF0) >> 4));
+ msg.push_back(toHexDigit((ch & 0xF)));
+ msg += LOG4CXX_STR(" at offset ");
+ Pool p;
+ StringHelper::toString(output.size(), p, msg);
+ throw RuntimeException(msg);
}
return output;
diff --git a/src/test/cpp/filetestcase.cpp b/src/test/cpp/filetestcase.cpp
index 00146c83..e0c45d17 100644
--- a/src/test/cpp/filetestcase.cpp
+++ b/src/test/cpp/filetestcase.cpp
@@ -28,6 +28,8 @@
#include <log4cxx/helpers/inputstreamreader.h>
#include <log4cxx/helpers/fileinputstream.h>
#include <log4cxx/helpers/loglog.h>
+#include <log4cxx/helpers/bytebuffer.h>
+#include <log4cxx/helpers/transcoder.h>
#if LOG4CXX_CFSTRING_API
#include <CoreFoundation/CFString.h>
@@ -58,6 +60,8 @@ LOGUNIT_CLASS(FileTestCase)
LOGUNIT_TEST(copyConstructor);
LOGUNIT_TEST(assignment);
LOGUNIT_TEST(deleteBackslashedFileName);
+ LOGUNIT_TEST(testSplitMultibyteUtf8);
+ LOGUNIT_TEST(testInvalidUtf8);
LOGUNIT_TEST_SUITE_END();
#ifdef _DEBUG
@@ -102,6 +106,8 @@ public:
}
catch (IOException& ex)
{
+ LOG4CXX_DECODE_CHAR(msg, ex.what());
+ LogLog::debug(msg);
}
}
@@ -206,7 +212,66 @@ public:
Pool pool;
/*bool deleted = */file.deleteFile(pool);
}
-};
+ class MockInputStream : public InputStream
+ {
+ ByteBuffer m_data;
+ public:
+ MockInputStream(const char* data, size_t charCount)
+ : m_data(const_cast<char*>(data), charCount)
+ {}
+
+ int read(ByteBuffer& dst) override
+ {
+ auto availableBytes = m_data.remaining();
+ if (availableBytes < 1)
+ return -1;
+ int count = 0;
+ for (auto p = m_data.current(); count < availableBytes
&& dst.put(*p); ++p)
+ ++count;
+ m_data.increment_position(count);
+ return count;
+ }
+
+ void close() override {}
+ };
+
+ /**
+ * Tests behavior when a multibyte UTF-8 sequence occurs on a read
boundary
+ */
+ void testSplitMultibyteUtf8()
+ {
+ Pool p;
+ // InputStreamReader uses a buffer of size 4096
+ std::string input( 4094, 'A' );
+ // räksmörgås.josefsson.org
+
input.append("\162\303\244\153\163\155\303\266\162\147\303\245\163\056\152\157\163\145\146\163\163\157\156\056\157\162\147");
+ InputStreamReader
reader(std::make_shared<MockInputStream>(input.c_str(), input.size()),
CharsetDecoder::getUTF8Decoder());
+ auto contentLS = reader.read(p);
+ LOG4CXX_ENCODE_CHAR(content, contentLS);
+ LOGUNIT_ASSERT_EQUAL(input, content);
+ }
+
+ /**
+ * Tests behavior given an incomplete multibyte UTF-8 sequence in the
input
+ */
+ void testInvalidUtf8()
+ {
+ Pool p;
+ // 0xC2 is a generic start byte for a 2-byte sequence in UTF-8.
+ char input[] = { 'A', (char)0xC2, 'B', 'C', 0 };
+ InputStreamReader
reader(std::make_shared<MockInputStream>(input, 4),
CharsetDecoder::getUTF8Decoder());
+ try
+ {
+ reader.read(p);
+ LOGUNIT_ASSERT(false);
+ }
+ catch (const Exception& ex)
+ {
+ LOG4CXX_DECODE_CHAR(msg, ex.what());
+ LogLog::debug(msg);
+ }
+ }
+};
LOGUNIT_TEST_SUITE_REGISTRATION(FileTestCase);