This is an automated email from the ASF dual-hosted git repository.

swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git


The following commit(s) were added to refs/heads/master by this push:
     new 0406623b Restore support for multi-byte code points in XML and HTML output (#610)
0406623b is described below

commit 0406623b6e32998d23ba6cda8e86b57a18c1c4c3
Author: Stephen Webb <[email protected]>
AuthorDate: Sat Mar 21 13:52:09 2026 +1100

    Restore support for multi-byte code points in XML and HTML output (#610)
---
 src/main/cpp/transform.cpp                   | 97 ++++++++++++++--------------
 src/main/include/log4cxx/helpers/transform.h |  2 +-
 src/test/cpp/xml/xmllayouttest.cpp           | 22 +++++--
 3 files changed, 67 insertions(+), 54 deletions(-)

diff --git a/src/main/cpp/transform.cpp b/src/main/cpp/transform.cpp
index 9c56317b..32397a76 100644
--- a/src/main/cpp/transform.cpp
+++ b/src/main/cpp/transform.cpp
@@ -17,6 +17,7 @@
 
 #include <log4cxx/logstring.h>
 #include <log4cxx/helpers/transform.h>
+#include <log4cxx/helpers/transcoder.h>
 #include <log4cxx/helpers/widelife.h>
 #include <functional>
 
@@ -27,39 +28,42 @@ namespace
 {
 using CharProcessor = std::function<void(LogString&, int)>;
 
+// Allowable XML 1.0 characters are:
+// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 void appendValidCharacters(LogString& buf, const LogString& input, CharProcessor handler = {})
 {
-       static const logchar specials[] =
+       static const unsigned int specials[] =
                { 0x22 /* " */
                , 0x26 /* & */
                , 0x3C /* < */
                , 0x3E /* > */
                , 0x00
                };
-       size_t start = 0;
-       for (size_t index = 0; index < input.size(); ++index)
+       auto start = input.begin();
+       for (auto nextCodePoint = start; input.end() != nextCodePoint; )
        {
-               int ch = input[index];
-               // Allowable XML 1.0 characters are:
-               // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
-               if (0x20 <= ch && ch <= 0xD7FF)
+               auto lastCodePoint = nextCodePoint;
+               auto ch = Transcoder::decode(input, nextCodePoint);
+               if (nextCodePoint == lastCodePoint) // failed to decode input?
+                       nextCodePoint = input.end();
+               else if ((0x20 <= ch && ch <= 0xD7FF) &&
+                       specials[0] != ch &&
+                       specials[1] != ch &&
+                       specials[2] != ch &&
+                       specials[3] != ch)
                {
-                       auto pSpecial = &specials[0];
-                       while (*pSpecial && *pSpecial != ch)
-                               ++pSpecial;
-                       if (!*pSpecial)
-                               continue;
+                       continue;
                }
-               else if (0x9 == ch || 0xA == ch || 0xD == ch ||
+               else if ((0x9 == ch || 0xA == ch || 0xD == ch) ||
                                (0xE000 <= ch && ch <= 0xFFFD) ||
                                (0x10000 <= ch && ch <= 0x10FFFF))
                {
                        continue;
                }
 
-               if (start < index)
-                       buf.append(input, start, index - start);
-               start = index + 1;
+               if (start != lastCodePoint)
+                       buf.append(start, lastCodePoint);
+               start = nextCodePoint;
                switch (ch)
                {
                        case 0: // Do not output a NUL character
@@ -80,17 +84,17 @@ void appendValidCharacters(LogString& buf, const LogString& input, CharProcessor
                                buf.append(LOG4CXX_STR("&gt;"));
                                break;
 
+                       case 0xFFFF: // invalid sequence
+                               Transform::appendCharacterReference(buf, 0xFFFD); // The Unicode replacement character
+                               break;
+
                        default:
                                if (handler)
                                        handler(buf, ch);
                                break;
                }
        }
-
-       if (start < input.size())
-       {
-               buf.append(input, start, input.size() - start);
-       }
+       buf.append(start, input.end());
 }
 
 } // namespace
@@ -101,51 +105,50 @@ void Transform::appendEscapingCDATA(
        static const LogString CDATA_END(LOG4CXX_STR("]]>"));
        const LogString::size_type CDATA_END_LEN = 3;
        static const LogString CDATA_EMBEDED_END(LOG4CXX_STR("]]&gt;<![CDATA["));
-       size_t start = 0;
-       for (size_t index = 0; index < input.size(); ++index)
+       auto start = input.begin();
+       for (auto nextCodePoint = start; input.end() != nextCodePoint; )
        {
-               int ch = input[index];
                bool cdataEnd = false;
-               // Allowable XML 1.0 characters are:
-               // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
-               if (0x20 <= ch && ch <= 0xD7FF)
+               auto lastCodePoint = nextCodePoint;
+               auto ch = Transcoder::decode(input, nextCodePoint);
+               if (nextCodePoint == lastCodePoint) // failed to decode input?
                {
-                       if (CDATA_END[0] == ch &&
-                               index + CDATA_END_LEN <= input.size() &&
-                               0 == input.compare(index, CDATA_END_LEN, CDATA_END))
-                       {
-                               index += CDATA_END_LEN;
-                               cdataEnd = true;
-                       }
-                       else
+                       nextCodePoint = input.end();
+                       ch = 0xFFFD; // The Unicode replacement character
+               }
+               else if (CDATA_END[0] == ch && input.end() != nextCodePoint)
+               {
+                       lastCodePoint = nextCodePoint;
+                       if (CDATA_END[1] != Transcoder::decode(input, nextCodePoint) ||
+                               input.end() == nextCodePoint ||
+                               CDATA_END[2] != Transcoder::decode(input, nextCodePoint))
                        {
+                               nextCodePoint = lastCodePoint;
                                continue;
                        }
+                       lastCodePoint = nextCodePoint;
+                       cdataEnd = true;
                }
-               else if (0x9 == ch || 0xA == ch || 0xD == ch ||
+               else if ((0x20 <= ch && ch <= 0xD7FF) ||
+                               (0x9 == ch || 0xA == ch || 0xD == ch) ||
                                (0xE000 <= ch && ch <= 0xFFFD) ||
                                (0x10000 <= ch && ch <= 0x10FFFF))
                {
                        continue;
                }
 
-               if (start < index)
-                       buf.append(input, start, index - start);
+               if (start != lastCodePoint)
+                       buf.append(start, lastCodePoint);
                if (cdataEnd)
-               {
                        buf.append(CDATA_EMBEDED_END);
-                       --index;
-               }
                else if (0 != ch)
                        appendCharacterReference(buf, ch);
-               start = index + 1;
+               start = nextCodePoint;
        }
-
-       if (start < input.size())
-               buf.append(input, start, input.size() - start);
+       buf.append(start, input.end());
 }
 
-void Transform::appendCharacterReference(LogString& buf, int ch)
+void Transform::appendCharacterReference(LogString& buf, unsigned int ch)
 {
        auto toHexDigit = [](int ch) -> int
        {
@@ -155,7 +158,7 @@ void Transform::appendCharacterReference(LogString& buf, int ch)
        buf.push_back('#');
        buf.push_back('x');
        if (0xFFFFFFF < ch)
-               buf.push_back(toHexDigit((ch & 0x70000000) >> 28));
+               buf.push_back(toHexDigit((ch & 0xF0000000) >> 28));
        if (0xFFFFFF < ch)
                buf.push_back(toHexDigit((ch & 0xF000000) >> 24));
        if (0xFFFFF < ch)
diff --git a/src/main/include/log4cxx/helpers/transform.h b/src/main/include/log4cxx/helpers/transform.h
index 70ee53d0..301ea63c 100644
--- a/src/main/include/log4cxx/helpers/transform.h
+++ b/src/main/include/log4cxx/helpers/transform.h
@@ -66,7 +66,7 @@ class LOG4CXX_EXPORT Transform
                * @param buf output stream holding the XML data to this point.
                * @param ch the value to encode as a XML character reference
                */
-               static void appendCharacterReference(LogString& buf, int ch);
+               static void appendCharacterReference(LogString& buf, unsigned int ch);
 
                /**
                * Append a transformation of \c input onto \c buf.
diff --git a/src/test/cpp/xml/xmllayouttest.cpp b/src/test/cpp/xml/xmllayouttest.cpp
index 7ce31033..016d64c3 100644
--- a/src/test/cpp/xml/xmllayouttest.cpp
+++ b/src/test/cpp/xml/xmllayouttest.cpp
@@ -40,7 +40,6 @@
 #include <log4cxx/helpers/transcoder.h>
 #include <log4cxx/helpers/loglog.h>
 
-
 using namespace log4cxx;
 using namespace log4cxx::helpers;
 using namespace log4cxx::xml;
@@ -373,18 +372,29 @@ public:
         */
        void testProblemCharacters()
        {
-               std::string problemName = "'\"<com.example.bar>&\"'";
+               // '\"<räksmörgås.josefsson.org>&\"'
+               std::string problemName = "'\"\162\303\244\153\163\155\303\266\162\147\303\245\163\056\152\157\163\145\146\163\163\157\156\056\157\162\147>&\"'";
                LOG4CXX_DECODE_CHAR(problemNameLS, problemName);
+               auto loggerNameLS = problemNameLS;
+               auto levelNameLS = problemNameLS;
+               Transcoder::encode(0xD822, problemNameLS); // Add an invalid character that should be stripped from attribute values
+               auto keyLS = problemNameLS;
+               auto expectedKeyValue = problemName;
+#if LOG4CXX_LOGCHAR_IS_WCHAR && !defined(__STDC_ISO_10646__)
+               // encodeUTF16 adds 0xD822, but decodeUTF16 cannot convert 0xD822
+               // Expat translates the Unicode replacement character to the following
+               expectedKeyValue += "\xEF\xBF\xBD";
+#endif
                std::string problemMessage = "'\001\"<Hello >\"\004'";
                std::string expectedCdataValue = "'&#x1;\"<Hello >\"&#x4;'";
                std::string expectedAttributeValue = "'\"<Hello >\"'"; // Invalid characters stripped
                LOG4CXX_DECODE_CHAR(problemMessageLS, problemMessage);
-               LevelPtr level = LevelPtr(new XLevel(6000, problemNameLS, 7));
+               LevelPtr level = LevelPtr(new XLevel(6000, levelNameLS, 7));
                NDC::push(problemName);
                MDC::clear();
-               MDC::put(problemName, problemMessage);
+               MDC::put(keyLS, problemMessageLS);
                auto event = std::make_shared<LoggingEvent>
-                               (problemNameLS, level, problemMessageLS, LOG4CXX_LOCATION);
+                               (loggerNameLS, level, problemMessageLS, LOG4CXX_LOCATION);
                XMLLayout layout;
                layout.setProperties(true);
                Pool p;
@@ -414,7 +424,7 @@ public:
                                        break;
 
                                case 3:
-                                       checkPropertiesElement(node, problemName.c_str(), expectedAttributeValue.c_str());
+                                       checkPropertiesElement(node, expectedKeyValue.c_str(), expectedAttributeValue.c_str());
                                        break;
 
                                default:

Reply via email to