This is an automated email from the ASF dual-hosted git repository.
swebb2066 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/logging-log4cxx.git
The following commit(s) were added to refs/heads/master by this push:
new 0406623b Restore support for multi-byte code points in XML and HTML output (#610)
0406623b is described below
commit 0406623b6e32998d23ba6cda8e86b57a18c1c4c3
Author: Stephen Webb <[email protected]>
AuthorDate: Sat Mar 21 13:52:09 2026 +1100
Restore support for multi-byte code points in XML and HTML output (#610)
---
src/main/cpp/transform.cpp | 97 ++++++++++++++--------------
src/main/include/log4cxx/helpers/transform.h | 2 +-
src/test/cpp/xml/xmllayouttest.cpp | 22 +++++--
3 files changed, 67 insertions(+), 54 deletions(-)
diff --git a/src/main/cpp/transform.cpp b/src/main/cpp/transform.cpp
index 9c56317b..32397a76 100644
--- a/src/main/cpp/transform.cpp
+++ b/src/main/cpp/transform.cpp
@@ -17,6 +17,7 @@
#include <log4cxx/logstring.h>
#include <log4cxx/helpers/transform.h>
+#include <log4cxx/helpers/transcoder.h>
#include <log4cxx/helpers/widelife.h>
#include <functional>
@@ -27,39 +28,42 @@ namespace
{
using CharProcessor = std::function<void(LogString&, int)>;
+// Allowable XML 1.0 characters are:
+// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
void appendValidCharacters(LogString& buf, const LogString& input,
CharProcessor handler = {})
{
- static const logchar specials[] =
+ static const unsigned int specials[] =
{ 0x22 /* " */
, 0x26 /* & */
, 0x3C /* < */
, 0x3E /* > */
, 0x00
};
- size_t start = 0;
- for (size_t index = 0; index < input.size(); ++index)
+ auto start = input.begin();
+ for (auto nextCodePoint = start; input.end() != nextCodePoint; )
{
- int ch = input[index];
- // Allowable XML 1.0 characters are:
- // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
- if (0x20 <= ch && ch <= 0xD7FF)
+ auto lastCodePoint = nextCodePoint;
+ auto ch = Transcoder::decode(input, nextCodePoint);
+ if (nextCodePoint == lastCodePoint) // failed to decode input?
+ nextCodePoint = input.end();
+ else if ((0x20 <= ch && ch <= 0xD7FF) &&
+ specials[0] != ch &&
+ specials[1] != ch &&
+ specials[2] != ch &&
+ specials[3] != ch)
{
- auto pSpecial = &specials[0];
- while (*pSpecial && *pSpecial != ch)
- ++pSpecial;
- if (!*pSpecial)
- continue;
+ continue;
}
- else if (0x9 == ch || 0xA == ch || 0xD == ch ||
+ else if ((0x9 == ch || 0xA == ch || 0xD == ch) ||
(0xE000 <= ch && ch <= 0xFFFD) ||
(0x10000 <= ch && ch <= 0x10FFFF))
{
continue;
}
- if (start < index)
- buf.append(input, start, index - start);
- start = index + 1;
+ if (start != lastCodePoint)
+ buf.append(start, lastCodePoint);
+ start = nextCodePoint;
switch (ch)
{
case 0: // Do not output a NUL character
@@ -80,17 +84,17 @@ void appendValidCharacters(LogString& buf, const LogString& input, CharProcessor
buf.append(LOG4CXX_STR(">"));
break;
+ case 0xFFFF: // invalid sequence
+ Transform::appendCharacterReference(buf, 0xFFFD); // The Unicode replacement character
+ break;
+
default:
if (handler)
handler(buf, ch);
break;
}
}
-
- if (start < input.size())
- {
- buf.append(input, start, input.size() - start);
- }
+ buf.append(start, input.end());
}
} // namespace
@@ -101,51 +105,50 @@ void Transform::appendEscapingCDATA(
static const LogString CDATA_END(LOG4CXX_STR("]]>"));
const LogString::size_type CDATA_END_LEN = 3;
static const LogString CDATA_EMBEDED_END(LOG4CXX_STR("]]><![CDATA["));
- size_t start = 0;
- for (size_t index = 0; index < input.size(); ++index)
+ auto start = input.begin();
+ for (auto nextCodePoint = start; input.end() != nextCodePoint; )
{
- int ch = input[index];
bool cdataEnd = false;
- // Allowable XML 1.0 characters are:
- // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
- if (0x20 <= ch && ch <= 0xD7FF)
+ auto lastCodePoint = nextCodePoint;
+ auto ch = Transcoder::decode(input, nextCodePoint);
+ if (nextCodePoint == lastCodePoint) // failed to decode input?
{
- if (CDATA_END[0] == ch &&
- index + CDATA_END_LEN <= input.size() &&
- 0 == input.compare(index, CDATA_END_LEN, CDATA_END))
- {
- index += CDATA_END_LEN;
- cdataEnd = true;
- }
- else
+ nextCodePoint = input.end();
+ ch = 0xFFFD; // The Unicode replacement character
+ }
+ else if (CDATA_END[0] == ch && input.end() != nextCodePoint)
+ {
+ lastCodePoint = nextCodePoint;
+ if (CDATA_END[1] != Transcoder::decode(input, nextCodePoint) ||
+ input.end() == nextCodePoint ||
+ CDATA_END[2] != Transcoder::decode(input, nextCodePoint))
{
+ nextCodePoint = lastCodePoint;
continue;
}
+ lastCodePoint = nextCodePoint;
+ cdataEnd = true;
}
- else if (0x9 == ch || 0xA == ch || 0xD == ch ||
+ else if ((0x20 <= ch && ch <= 0xD7FF) ||
+ (0x9 == ch || 0xA == ch || 0xD == ch) ||
(0xE000 <= ch && ch <= 0xFFFD) ||
(0x10000 <= ch && ch <= 0x10FFFF))
{
continue;
}
- if (start < index)
- buf.append(input, start, index - start);
+ if (start != lastCodePoint)
+ buf.append(start, lastCodePoint);
if (cdataEnd)
- {
buf.append(CDATA_EMBEDED_END);
- --index;
- }
else if (0 != ch)
appendCharacterReference(buf, ch);
- start = index + 1;
+ start = nextCodePoint;
}
-
- if (start < input.size())
- buf.append(input, start, input.size() - start);
+ buf.append(start, input.end());
}
-void Transform::appendCharacterReference(LogString& buf, int ch)
+void Transform::appendCharacterReference(LogString& buf, unsigned int ch)
{
auto toHexDigit = [](int ch) -> int
{
@@ -155,7 +158,7 @@ void Transform::appendCharacterReference(LogString& buf, int ch)
buf.push_back('#');
buf.push_back('x');
if (0xFFFFFFF < ch)
- buf.push_back(toHexDigit((ch & 0x70000000) >> 28));
+ buf.push_back(toHexDigit((ch & 0xF0000000) >> 28));
if (0xFFFFFF < ch)
buf.push_back(toHexDigit((ch & 0xF000000) >> 24));
if (0xFFFFF < ch)
diff --git a/src/main/include/log4cxx/helpers/transform.h b/src/main/include/log4cxx/helpers/transform.h
index 70ee53d0..301ea63c 100644
--- a/src/main/include/log4cxx/helpers/transform.h
+++ b/src/main/include/log4cxx/helpers/transform.h
@@ -66,7 +66,7 @@ class LOG4CXX_EXPORT Transform
* @param buf output stream holding the XML data to this point.
* @param ch the value to encode as a XML character reference
*/
- static void appendCharacterReference(LogString& buf, int ch);
+ static void appendCharacterReference(LogString& buf, unsigned int ch);
/**
* Append a transformation of \c input onto \c buf.
diff --git a/src/test/cpp/xml/xmllayouttest.cpp b/src/test/cpp/xml/xmllayouttest.cpp
index 7ce31033..016d64c3 100644
--- a/src/test/cpp/xml/xmllayouttest.cpp
+++ b/src/test/cpp/xml/xmllayouttest.cpp
@@ -40,7 +40,6 @@
#include <log4cxx/helpers/transcoder.h>
#include <log4cxx/helpers/loglog.h>
-
using namespace log4cxx;
using namespace log4cxx::helpers;
using namespace log4cxx::xml;
@@ -373,18 +372,29 @@ public:
*/
void testProblemCharacters()
{
- std::string problemName = "'\"<com.example.bar>&\"'";
+ // '\"<räksmörgås.josefsson.org>&\"'
+ std::string problemName = "'\"\162\303\244\153\163\155\303\266\162\147\303\245\163\056\152\157\163\145\146\163\163\157\156\056\157\162\147>&\"'";
LOG4CXX_DECODE_CHAR(problemNameLS, problemName);
+ auto loggerNameLS = problemNameLS;
+ auto levelNameLS = problemNameLS;
+ Transcoder::encode(0xD822, problemNameLS); // Add an invalid character that should be stripped from attribute values
+ auto keyLS = problemNameLS;
+ auto expectedKeyValue = problemName;
+#if LOG4CXX_LOGCHAR_IS_WCHAR && !defined(__STDC_ISO_10646__)
+ // encodeUTF16 adds 0xD822, but decodeUTF16 cannot convert 0xD822
+ // Expat translates the Unicode replacement character to the following
+ expectedKeyValue += "\xEF\xBF\xBD";
+#endif
std::string problemMessage = "'\001\"<Hello >\"\004'";
std::string expectedCdataValue = "'\"<Hello >\"'";
std::string expectedAttributeValue = "'\"<Hello >\"'"; // Invalid characters stripped
LOG4CXX_DECODE_CHAR(problemMessageLS, problemMessage);
- LevelPtr level = LevelPtr(new XLevel(6000, problemNameLS, 7));
+ LevelPtr level = LevelPtr(new XLevel(6000, levelNameLS, 7));
NDC::push(problemName);
MDC::clear();
- MDC::put(problemName, problemMessage);
+ MDC::put(keyLS, problemMessageLS);
auto event = std::make_shared<LoggingEvent>
- (problemNameLS, level, problemMessageLS, LOG4CXX_LOCATION);
+ (loggerNameLS, level, problemMessageLS, LOG4CXX_LOCATION);
XMLLayout layout;
layout.setProperties(true);
Pool p;
@@ -414,7 +424,7 @@ public:
break;
case 3:
- checkPropertiesElement(node, problemName.c_str(), expectedAttributeValue.c_str());
+ checkPropertiesElement(node, expectedKeyValue.c_str(), expectedAttributeValue.c_str());
break;
default: