This is an automated email from the ASF dual-hosted git repository.
pkarwasz pushed a commit to branch 2.25.x
in repository https://gitbox.apache.org/repos/asf/logging-log4j2.git
The following commit(s) were added to refs/heads/2.25.x by this push:
new 25043cfad1 Log4j1XmlLayout: replace invalid XML characters with U+FFFD
(#4078)
25043cfad1 is described below
commit 25043cfad1bac2e43ebc4638450cbb73c51451c4
Author: Piotr P. Karwasz <[email protected]>
AuthorDate: Tue Mar 24 23:57:28 2026 +0100
Log4j1XmlLayout: replace invalid XML characters with U+FFFD (#4078)
This change sanitizes the output of `Log4j1XmlLayout` by replacing
characters that are not permitted in XML 1.0 with the Unicode replacement
character (`U+FFFD`).
This guarantees that the generated log output is always well-formed XML and
can be parsed by any XML 1.0–compliant parser, even when log data contains
control characters or other invalid code points.
Co-authored-by: Copilot <[email protected]>
---
.../apache/log4j/layout/Log4j1XmlLayoutTest.java | 42 ++++++-
.../logging/log4j/core/util/TransformTest.java | 90 ++++++++++++++
.../apache/logging/log4j/core/util/Transform.java | 137 ++++++++++++++-------
.../.2.x.x/4078_log4j1-xml-control-characters.xml | 12 ++
4 files changed, 235 insertions(+), 46 deletions(-)
diff --git
a/log4j-1.2-api/src/test/java/org/apache/log4j/layout/Log4j1XmlLayoutTest.java
b/log4j-1.2-api/src/test/java/org/apache/log4j/layout/Log4j1XmlLayoutTest.java
index 1af12167a6..9c8953f2d0 100644
---
a/log4j-1.2-api/src/test/java/org/apache/log4j/layout/Log4j1XmlLayoutTest.java
+++
b/log4j-1.2-api/src/test/java/org/apache/log4j/layout/Log4j1XmlLayoutTest.java
@@ -72,7 +72,8 @@ class Log4j1XmlLayoutTest {
final String expected = "<log4j:event logger=\"a.B\" timestamp=\"" +
event.getTimeMillis()
+ "\" level=\"INFO\" thread=\"main\">\r\n"
+ "<log4j:message><![CDATA[Hello, World]]></log4j:message>\r\n"
- + "<log4j:locationInfo class=\"pack.MyClass\"
method=\"myMethod\" file=\"MyClass.java\" line=\"17\"/>\r\n"
+ + "<log4j:locationInfo class=\"pack.MyClass\"
method=\"myMethod\" file=\"MyClass.java\""
+ + " line=\"17\"/>\r\n"
+ "<log4j:properties>\r\n"
+ "<log4j:data name=\"key1\" value=\"value1\"/>\r\n"
+ "<log4j:data name=\"key2\" value=\"value2\"/>\r\n"
@@ -81,4 +82,43 @@ class Log4j1XmlLayoutTest {
assertEquals(expected, result);
}
+
+ @Test
+ void testWithInvalidXmlCharacters() {
+ final Log4j1XmlLayout layout = Log4j1XmlLayout.createLayout(true,
true);
+
+ final String message =
"<>'\"&A\uD800B\uDE00C\u0000\u0001\u0002\u0003\uFFFE\uFFFF";
+ final String expectedMessage =
"<>'\"&A\uFFFDB\uFFFDC\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD";
+ final String expectedEscapedMessage =
+
"<>'"&A\uFFFDB\uFFFDC\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD";
+
+ final StringMap contextMap = ContextDataFactory.createContextData(1);
+ contextMap.putValue(message, message);
+ final Log4jLogEvent event = Log4jLogEvent.newBuilder()
+ .setLoggerName(message)
+ .setLevel(Level.forName(message, 100))
+ .setMessage(new SimpleMessage(message))
+ .setTimeMillis(System.currentTimeMillis() + 17)
+ .setIncludeLocation(true)
+ .setSource(new StackTraceElement(message, message, message,
17))
+ .setContextData(contextMap)
+ .build();
+
+ final String result = layout.toSerializable(event);
+
+ final String expected =
+ "<log4j:event logger=\"" + expectedEscapedMessage + "\"
timestamp=\"" + event.getTimeMillis()
+ + "\" level=\"" + expectedEscapedMessage + "\"
thread=\"main\">\r\n"
+ + "<log4j:message><![CDATA[" + expectedMessage +
"]]></log4j:message>\r\n"
+ + "<log4j:locationInfo class=\"" +
expectedEscapedMessage
+ + "\" method=\"" + expectedEscapedMessage
+ + "\" file=\"" + expectedEscapedMessage + "\"
line=\"17\"/>\r\n"
+ + "<log4j:properties>\r\n"
+ + "<log4j:data name=\"" + expectedEscapedMessage + "\"
value=\"" + expectedEscapedMessage
+ + "\"/>\r\n"
+ + "</log4j:properties>\r\n"
+ + "</log4j:event>\r\n\r\n";
+
+ assertEquals(expected, result);
+ }
}
diff --git
a/log4j-core-test/src/test/java/org/apache/logging/log4j/core/util/TransformTest.java
b/log4j-core-test/src/test/java/org/apache/logging/log4j/core/util/TransformTest.java
new file mode 100644
index 0000000000..a9a79e2afb
--- /dev/null
+++
b/log4j-core-test/src/test/java/org/apache/logging/log4j/core/util/TransformTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.logging.log4j.core.util;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+class TransformTest {
+
+ static Stream<Arguments> testEscapeHtmlTags() {
+ final char replacement = '\uFFFD';
+ return Stream.of(
+ // Empty
+ Arguments.of("", ""),
+ // characters that need to be escaped
+ Arguments.of("<\"Salt&Peppa'\">",
"<"Salt&Peppa'">"),
+ // control character replaced with U+FFFD
+ Arguments.of("A" + (char) 0x01 + "B", "A" + replacement + "B"),
+ // standalone low surrogate replaced with U+FFFD
+ Arguments.of("low" + Character.MIN_SURROGATE + "surrogate",
"low" + replacement + "surrogate"),
+ Arguments.of(Character.MIN_SURROGATE + "low", replacement +
"low"),
+ // standalone high surrogate replaced with U+FFFD
+ Arguments.of("high" + Character.MAX_SURROGATE + "surrogate",
"high" + replacement + "surrogate"),
+ Arguments.of(Character.MAX_SURROGATE + "high", replacement +
"high"),
+ // FFFE and FFFF
+ Arguments.of("invalid\uFFFEchars", "invalid" + replacement +
"chars"),
+ Arguments.of("invalid\uFFFFchars", "invalid" + replacement +
"chars"),
+ // whitespace characters are preserved
+ Arguments.of("tab\tnewline\ncr\r", "tab\tnewline\ncr\r"),
+ // character beyond BMP (emoji) preserved as surrogate pair
+ Arguments.of("emoji " + "\uD83D\uDE00" + " end", "emoji " +
"\uD83D\uDE00" + " end"));
+ }
+
+ @ParameterizedTest
+ @MethodSource
+ void testEscapeHtmlTags(final String input, final String expected) {
+ String actual = Transform.escapeHtmlTags(input);
+ assertThat(actual).isEqualTo(expected);
+ }
+
+ static Stream<Arguments> testAppendEscapingCData() {
+ final char replacement = '\uFFFD';
+ return Stream.of(
+ // Empty
+ Arguments.of("", ""),
+ // characters that need to be escaped
+ Arguments.of("<\"Salt&Peppa'\">", "<\"Salt&Peppa'\">"),
+ // control character replaced with U+FFFD
+ Arguments.of("A" + (char) 0x01 + "B", "A" + replacement + "B"),
+ // standalone low surrogate replaced with U+FFFD
+ Arguments.of("low" + Character.MIN_SURROGATE + "surrogate",
"low" + replacement + "surrogate"),
+ Arguments.of(Character.MIN_SURROGATE + "low", replacement +
"low"),
+ // standalone high surrogate replaced with U+FFFD
+ Arguments.of("high" + Character.MAX_SURROGATE + "surrogate",
"high" + replacement + "surrogate"),
+ Arguments.of(Character.MAX_SURROGATE + "high", replacement +
"high"),
+ // FFFE and FFFF
+ Arguments.of("invalid\uFFFEchars", "invalid" + replacement +
"chars"),
+ Arguments.of("invalid\uFFFFchars", "invalid" + replacement +
"chars"),
+ // whitespace characters are preserved
+ Arguments.of("tab\tnewline\ncr\r", "tab\tnewline\ncr\r"),
+ // character beyond BMP (emoji) preserved as surrogate pair
+ Arguments.of("emoji " + "\uD83D\uDE00" + " end", "emoji " +
"\uD83D\uDE00" + " end"));
+ }
+
+ @ParameterizedTest
+ @MethodSource
+ void testAppendEscapingCData(final String input, final String expected) {
+ StringBuilder cdata = new StringBuilder();
+ Transform.appendEscapingCData(cdata, input);
+ assertThat(cdata.toString()).isEqualTo(expected);
+ }
+}
diff --git
a/log4j-core/src/main/java/org/apache/logging/log4j/core/util/Transform.java
b/log4j-core/src/main/java/org/apache/logging/log4j/core/util/Transform.java
index e9eb2a5f39..136955448d 100644
--- a/log4j-core/src/main/java/org/apache/logging/log4j/core/util/Transform.java
+++ b/log4j-core/src/main/java/org/apache/logging/log4j/core/util/Transform.java
@@ -29,59 +29,41 @@ public final class Transform {
private static final String CDATA_EMBEDED_END = CDATA_END +
CDATA_PSEUDO_END + CDATA_START;
private static final int CDATA_END_LEN = CDATA_END.length();
+ private static final char REPLACEMENT_CHAR = '\uFFFD';
+
private Transform() {}
/**
- * This method takes a string which may contain HTML tags (ie,
- * <b>, <table>, etc) and replaces any
- * '<', '>' , '&' or '"'
- * characters with respective predefined entity references.
+ * Escapes characters in a string for safe inclusion in HTML or XML text.
*
- * @param input The text to be converted.
- * @return The input string with the special characters replaced.
+ * <p>Replaces the characters {@code <}, {@code >}, {@code &}, {@code "}
and {@code '} with their corresponding
+ * entity references ({@code <}, {@code >}, {@code &}, {@code
"}, and {@code '}). Any code point
+ * that is invalid in XML 1.0 is replaced with the Unicode replacement
character U+FFFD.</p>
+ *
+ * @param input The text to be escaped; may be {@code null} or empty.
+ * @return The escaped string, or the original {@code input} if no changes
were required.
*/
public static String escapeHtmlTags(final String input) {
- // Check if the string is null, zero length or devoid of special
characters
+ // Check if the string is null or zero length
// if so, return what was sent in.
-
- if (Strings.isEmpty(input)
- || (input.indexOf('"') == -1
- && input.indexOf('&') == -1
- && input.indexOf('<') == -1
- && input.indexOf('>') == -1)) {
+ if (Strings.isEmpty(input)) {
return input;
}
- // Use a StringBuilder in lieu of String concatenation -- it is
- // much more efficient this way.
-
- final StringBuilder buf = new StringBuilder(input.length() + 6);
-
- final int len = input.length();
- for (int i = 0; i < len; i++) {
- final char ch = input.charAt(i);
- if (ch > '>') {
- buf.append(ch);
- } else
- switch (ch) {
- case '<':
- buf.append("<");
- break;
- case '>':
- buf.append(">");
- break;
- case '&':
- buf.append("&");
- break;
- case '"':
- buf.append(""");
- break;
- default:
- buf.append(ch);
- break;
- }
+ // Only create a new string if we find a special character or invalid
code point.
+ // In the common case, this should avoid unnecessary allocations.
+ final int length = input.length();
+ for (int i = 0; i < length; ) {
+ final int cp = input.codePointAt(i);
+ if (!isValidXml10(cp) || isHtmlTagCharacter(cp)) {
+ final StringBuilder out = new StringBuilder(length);
+ out.append(input, 0, i);
+ appendEscapingHtmlTags(input, i, length, out);
+ return out.toString();
+ }
+ i += Character.charCount(cp);
}
- return buf.toString();
+ return input;
}
/**
@@ -97,11 +79,11 @@ public final class Transform {
if (str != null) {
int end = str.indexOf(CDATA_END);
if (end < 0) {
- buf.append(str);
+ appendSanitizedXml10(str, 0, str.length(), buf);
} else {
int start = 0;
while (end > -1) {
- buf.append(str.substring(start, end));
+ appendSanitizedXml10(str, start, end, buf);
buf.append(CDATA_EMBEDED_END);
start = end + CDATA_END_LEN;
if (start < str.length()) {
@@ -110,7 +92,7 @@ public final class Transform {
return;
}
}
- buf.append(str.substring(start));
+ appendSanitizedXml10(str, start, str.length(), buf);
}
}
}
@@ -185,4 +167,69 @@ public final class Transform {
}
return buf.toString();
}
+
+ private static void appendEscapingHtmlTags(final String input, int i,
final int length, final StringBuilder buf) {
+ while (i < length) {
+ final int ch = input.codePointAt(i);
+ switch (ch) {
+ case '<':
+ buf.append("<");
+ break;
+ case '>':
+ buf.append(">");
+ break;
+ case '&':
+ buf.append("&");
+ break;
+ case '"':
+ buf.append(""");
+ break;
+ case '\'':
+ buf.append("'");
+ break;
+ default:
+ buf.appendCodePoint(isValidXml10(ch) ? ch :
REPLACEMENT_CHAR);
+ break;
+ }
+ i += Character.charCount(ch);
+ }
+ }
+
+ private static boolean isHtmlTagCharacter(final int cp) {
+ return cp == '<' || cp == '>' || cp == '&' || cp == '"' || cp == '\'';
+ }
+
+ private static void appendSanitizedXml10(
+ final String input, final int start, final int end, final
StringBuilder out) {
+ for (int i = start; i < end; ) {
+ final int cp = input.codePointAt(i);
+ out.appendCodePoint(isValidXml10(cp) ? cp : REPLACEMENT_CHAR);
+ i += Character.charCount(cp);
+ }
+ }
+
+ /**
+ * Checks if a code point is valid in XML 1.0.
+ *
+ * @param codePoint a code point between {@code 0} and {@link
Character#MAX_CODE_POINT}
+ * @return {@code true} if it is a valid XML 1.0 code point
+ */
+ private static boolean isValidXml10(final int codePoint) {
+ assert codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT;
+ // XML 1.0 valid characters (Fifth Edition):
+ // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
[#x10000-#x10FFFF]
+
+ // [#x20–#xD7FF] (placed early as a fast path for the most common case)
+ return (codePoint >= ' ' && codePoint < Character.MIN_SURROGATE)
+ // #x9
+ || codePoint == '\t'
+ // #xA
+ || codePoint == '\n'
+ // #xD
+ || codePoint == '\r'
+ // [#xE000-#xFFFD]
+ || (codePoint > Character.MAX_SURROGATE && codePoint <= 0xFFFD)
+ // [#x10000-#x10FFFF]
+ || codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT;
+ }
}
diff --git a/src/changelog/.2.x.x/4078_log4j1-xml-control-characters.xml
b/src/changelog/.2.x.x/4078_log4j1-xml-control-characters.xml
new file mode 100644
index 0000000000..b9164561e6
--- /dev/null
+++ b/src/changelog/.2.x.x/4078_log4j1-xml-control-characters.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<entry xmlns="https://logging.apache.org/xml/ns"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="
+ https://logging.apache.org/xml/ns
+ https://logging.apache.org/xml/ns/log4j-changelog-0.xsd"
+ type="fixed">
+ <issue id="4078"
link="https://github.com/apache/logging-log4j2/pull/4078"/>
+ <description format="asciidoc">
+ Replace invalid characters in Log4j1XmlLayout output with the Unicode
replacement character (U+FFFD).
+ </description>
+</entry>