(logging-log4cxx) 01/01: Restore support for multi-byte codepoints in JSON output

swebb2066 Fri, 27 Mar 2026 22:28:41 -0700

This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch json_layout_unicode_support
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git


commit e41540bae8fe01447edd0cb37198d3938a4429de
Author: Stephen Webb <[email protected]>
AuthorDate: Sat Mar 28 15:14:01 2026 +1100

    Restore support for multi-byte codepoints in JSON output
---
 src/main/cpp/jsonlayout.cpp     | 34 +++++++++++++++-------------------
 src/test/cpp/jsonlayouttest.cpp | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/src/main/cpp/jsonlayout.cpp b/src/main/cpp/jsonlayout.cpp
index 54712359..d0f45f27 100644
--- a/src/main/cpp/jsonlayout.cpp
+++ b/src/main/cpp/jsonlayout.cpp
@@ -220,23 +220,24 @@ void JSONLayout::appendItem(const LogString& input, LogString& buf)
        /* add leading quote */
        buf.push_back(0x22);
 
-       size_t start = 0;
-       size_t index = 0;
-
-       for (int ch : input)
+       auto start = input.begin();
+       for (auto nextCodePoint = start; input.end() != nextCodePoint; )
        {
-               if (0x22 == ch || 0x5c == ch)
-                       ;
-               else if (0x20 <= ch)
-               {
-                       ++index;
-                       continue;
-               }
-               if (start < index)
+               auto lastCodePoint = nextCodePoint;
+               auto ch = Transcoder::decode(input, nextCodePoint);
+               if (nextCodePoint == lastCodePoint) // failed to decode input?
                {
-                       buf.append(input, start, index - start);
+                       nextCodePoint = input.end();
+                       ch = 0xFFFD; // The Unicode replacement character
                }
+               else if (0x22 == ch || 0x5c == ch) // double quote or backslash?
+                       ;
+               else if (0x20 <= ch) // not a control character?
+                       continue;
 
+               if (start != lastCodePoint)
+                       buf.append(start, lastCodePoint);
+               start = nextCodePoint;
                switch (ch)
                {
                        case 0x08:
@@ -290,13 +291,8 @@ void JSONLayout::appendItem(const LogString& input, LogString& buf)
                                buf.push_back(toHexDigit(ch & 0xF));
                                break;
                }
-               start = ++index;
-       }
-
-       if (start < input.size())
-       {
-               buf.append(input, start, input.size() - start);
        }
+       buf.append(start, input.end());
 
        /* add trailing quote */
        buf.push_back(0x22);
diff --git a/src/test/cpp/jsonlayouttest.cpp b/src/test/cpp/jsonlayouttest.cpp
index 75f17dc0..083cc481 100644
--- a/src/test/cpp/jsonlayouttest.cpp
+++ b/src/test/cpp/jsonlayouttest.cpp
@@ -59,6 +59,7 @@ LOGUNIT_CLASS(JSONLayoutTest), public JSONLayout
        LOGUNIT_TEST(testFormat);
        LOGUNIT_TEST(testFormatWithPrettyPrint);
        LOGUNIT_TEST(testGetSetLocationInfo);
+       LOGUNIT_TEST(testAppendQuotedEscapedString);
        LOGUNIT_TEST_SUITE_END();
 
 
@@ -492,6 +493,43 @@ public:
                layout.setPrettyPrint(false);
                LOGUNIT_ASSERT_EQUAL(false, layout.getPrettyPrint());
        }
+
+       /**
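+        * Tests that control characters and double quotes are escaped while multi-byte code points (and an unpaired surrogate, 0xD822) are handled as expected for each logchar type.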
+        */
+       void testAppendQuotedEscapedString()
+       {
+               std::string problemMessage = "'\001\"<Hello >\"\004'";
+               LogString expectedQuotedEscapedMessage = 
LOG4CXX_STR("\"'\\u0001\\\"<Hello >\\\"\\u0004'\"");
+               LOG4CXX_DECODE_CHAR(problemMessageLS, problemMessage);
+               LogString quotedEscapedMessage;
+               appendQuotedEscapedString(quotedEscapedMessage, 
problemMessageLS);
+               LOGUNIT_ASSERT_EQUAL(expectedQuotedEscapedMessage, 
quotedEscapedMessage);
+
+               // '\"räksmörgås.josefsson.org>\"'
+               std::string problemName = 
"'\"\162\303\244\153\163\155\303\266\162\147\303\245\163\056\152\157\163\145\146\163\163\157\156\056\157\162\147>\"'";
+               LOG4CXX_DECODE_CHAR(problemNameLS, problemName);
+               LogString expectedQuotedEscapedName = 
LOG4CXX_STR("\"'\\\"\162\303\244\153\163\155\303\266\162\147\303\245\163\056\152\157\163\145\146\163\163\157\156\056\157\162\147>\\\"'\"");
+               LogString quotedEscapedName;
+               appendQuotedEscapedString(quotedEscapedName, problemNameLS);
+               LOGUNIT_ASSERT_EQUAL(expectedQuotedEscapedName, 
quotedEscapedName);
+
+               Transcoder::encode(0xD822, problemNameLS); // Add a character (an unpaired surrogate) that cannot be converted to UTF-16
+#if LOG4CXX_LOGCHAR_IS_WCHAR && defined(__STDC_ISO_10646__)
+               
expectedQuotedEscapedName.replace(expectedQuotedEscapedName.size() - 1, 1, 
0xD822);
+               expectedQuotedEscapedName += 0x22; // Add a double quote at the end
+#elif LOG4CXX_LOGCHAR_IS_WCHAR
+               // encodeUTF16 adds 0xD822, but decodeUTF16 cannot convert 0xD822
+               // The Unicode replacement character is the following UTF-8 sequence
+               
expectedQuotedEscapedName.replace(expectedQuotedEscapedName.size() - 1, 1, 
"\xEF\xBF\xBD\"");
+#elif LOG4CXX_LOGCHAR_IS_UTF8
+               // 0xD822 is encoded in UTF-8 as 0xED 0xA0 0xA2
+               
expectedQuotedEscapedName.replace(expectedQuotedEscapedName.size() - 1, 1, 
"\xED\xa0\xa2\"");
+#endif
+               LogString escapedQuoted0xD822Name;
+               appendQuotedEscapedString(escapedQuoted0xD822Name, 
problemNameLS);
+               LOGUNIT_ASSERT_EQUAL(expectedQuotedEscapedName, 
escapedQuoted0xD822Name);
+       }
 };

(logging-log4cxx) 01/01: Restore support for multi-byte codepoints in JSON output

Reply via email to