This is an automated email from the ASF dual-hosted git repository. pkarwasz pushed a commit to branch fix/2.25.x/log4j1-xml-control-characters in repository https://gitbox.apache.org/repos/asf/logging-log4j2.git
commit 77bd0a62c00efba3226032d71114345a93b6abc9 Author: Piotr P. Karwasz <[email protected]> AuthorDate: Tue Mar 24 18:27:11 2026 +0100 Log4j1XmlLayout: replace invalid XML characters with U+FFFD This change sanitizes the output of `Log4j1XmlLayout` by replacing characters that are not permitted in XML 1.0 with the Unicode replacement character (`U+FFFD`). This guarantees that the generated log output is always well-formed XML and can be parsed by any XML 1.0–compliant parser, even when log data contains control characters or other invalid code points. --- .../apache/log4j/layout/Log4j1XmlLayoutTest.java | 42 ++++++- .../logging/log4j/core/util/TransformTest.java | 90 +++++++++++++++ .../apache/logging/log4j/core/util/Transform.java | 124 ++++++++++++++------- .../.2.x.x/log4j1-xml-control-characters.xml | 11 ++ 4 files changed, 227 insertions(+), 40 deletions(-) diff --git a/log4j-1.2-api/src/test/java/org/apache/log4j/layout/Log4j1XmlLayoutTest.java b/log4j-1.2-api/src/test/java/org/apache/log4j/layout/Log4j1XmlLayoutTest.java index 1af12167a6..51de2fbb92 100644 --- a/log4j-1.2-api/src/test/java/org/apache/log4j/layout/Log4j1XmlLayoutTest.java +++ b/log4j-1.2-api/src/test/java/org/apache/log4j/layout/Log4j1XmlLayoutTest.java @@ -72,7 +72,8 @@ class Log4j1XmlLayoutTest { final String expected = "<log4j:event logger=\"a.B\" timestamp=\"" + event.getTimeMillis() + "\" level=\"INFO\" thread=\"main\">\r\n" + "<log4j:message><![CDATA[Hello, World]]></log4j:message>\r\n" - + "<log4j:locationInfo class=\"pack.MyClass\" method=\"myMethod\" file=\"MyClass.java\" line=\"17\"/>\r\n" + + "<log4j:locationInfo class=\"pack.MyClass\" method=\"myMethod\" file=\"MyClass.java\"" + + " line=\"17\"/>\r\n" + "<log4j:properties>\r\n" + "<log4j:data name=\"key1\" value=\"value1\"/>\r\n" + "<log4j:data name=\"key2\" value=\"value2\"/>\r\n" @@ -81,4 +82,43 @@ class Log4j1XmlLayoutTest { assertEquals(expected, result); } + + @Test + void testWithInvalidXmlCharacters() { + final Log4j1XmlLayout layout = Log4j1XmlLayout.createLayout(true, true); + + final String message = "<>'\"&A\uD800B\uDE00C\u0000\u0001\u0002\u0003\uFFFE\uFFFF"; + final String expectedMessage = "<>'\"&A\uFFFDB\uFFFDC\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD"; + final String expectedEscapedMessage = + "<>'"&A\uFFFDB\uFFFDC\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD"; + + final StringMap contextMap = ContextDataFactory.createContextData(1); + contextMap.putValue(message, message); + final Log4jLogEvent event = Log4jLogEvent.newBuilder() + .setLoggerName(message) + .setLevel(Level.forName(message, 100)) + .setMessage(new SimpleMessage(message)) + .setTimeMillis(System.currentTimeMillis() + 17) + .setIncludeLocation(true) + .setSource(new StackTraceElement(message, message, message, 17)) + .setContextData(contextMap) + .build(); + + final String result = layout.toSerializable(event); + + final String expected = + "<log4j:event logger=\"" + expectedEscapedMessage + "\" timestamp=\"" + event.getTimeMillis() + + "\" level=\"" + expectedEscapedMessage + "\" thread=\"main\">\r\n" + + "<log4j:message><![CDATA[" + expectedMessage + "]]></log4j:message>\r\n" + + "<log4j:locationInfo class=\"" + expectedEscapedMessage + + "\" method=\"" + expectedEscapedMessage + + "\" file=\"" + expectedEscapedMessage + "\" line=\"17\"/>\r\n" + + "<log4j:properties>\r\n" + + "<log4j:data name=\"" + expectedEscapedMessage + "\" value=\"" + expectedEscapedMessage + + "\"/>\r\n" + + "</log4j:properties>\r\n" + + "</log4j:event>\r\n\r\n"; + + assertEquals(expected, result); + } } diff --git a/log4j-core-test/src/test/java/org/apache/logging/log4j/core/util/TransformTest.java b/log4j-core-test/src/test/java/org/apache/logging/log4j/core/util/TransformTest.java new file mode 100644 index 0000000000..3dcf2ae863 --- /dev/null +++ b/log4j-core-test/src/test/java/org/apache/logging/log4j/core/util/TransformTest.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.logging.log4j.core.util; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.stream.Stream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class TransformTest { + + static Stream<Arguments> testEscapeHtmlTags() { + final char replacement = '\uFFFD'; + return Stream.of( + // Empty + Arguments.of("", ""), + // characters that need to be escaped + Arguments.of("<\"Salt&Peppa'\">", "<"Salt&Peppa'">"), + // control character replaced with U+FFFD + Arguments.of("A" + (char) 0x01 + "B", "A" + replacement + "B"), + // standalone low surrogate replaced with U+FFFD + Arguments.of("low" + Character.MIN_SURROGATE + "surrogate", "low" + replacement + "surrogate"), + Arguments.of(Character.MIN_SURROGATE + "low", replacement + "low"), + // standalone high surrogate replaced with U+FFFD + Arguments.of("high" + Character.MAX_SURROGATE + "surrogate", "high" + replacement + "surrogate"), + Arguments.of(Character.MAX_SURROGATE + "high", replacement + "high"), + // FFFE and FFFF + Arguments.of("invalid\uFFFEchars", "invalid" + replacement + "chars"), + Arguments.of("invalid\uFFFFchars", "invalid" + replacement + "chars"), + // whitespace characters are preserved + Arguments.of("tab\tnewline\ncr\r", "tab\tnewline\ncr\r"), + // character beyond BMP (emoji) preserved as surrogate pair + Arguments.of("emoji " + "\uD83D\uDE00" + " end", "emoji " + "\uD83D\uDE00" + " end")); + } + + @ParameterizedTest + @MethodSource + void testEscapeHtmlTags(final String input, final String expected) { + String actual = Transform.escapeHtmlTags(input); + assertThat(actual).isEqualTo(expected); + } + + static Stream<Arguments> testAppendEscapingCData() { + final char replacement = '\uFFFD'; + return Stream.of( + // Empty + Arguments.of("", ""), + // characters that need to be escaped + Arguments.of("<\"Salt&Peppa'\">", "<\"Salt&Peppa'\">"), + // control character replaced with U+FFFD + Arguments.of("A" + (char) 0x01 + "B", "A" + replacement + "B"), + // standalone low surrogate replaced with U+FFFD + Arguments.of("low" + Character.MIN_SURROGATE + "surrogate", "low" + replacement + "surrogate"), + Arguments.of(Character.MIN_SURROGATE + "low", replacement + "low"), + // standalone high surrogate replaced with U+FFFD + Arguments.of("high" + Character.MAX_SURROGATE + "surrogate", "high" + replacement + "surrogate"), + Arguments.of(Character.MAX_SURROGATE + "high", replacement + "high"), + // FFFE and FFFF + Arguments.of("invalid\uFFFEchars", "invalid" + replacement + "chars"), + Arguments.of("invalid\uFFFFchars", "invalid" + replacement + "chars"), + // whitespace characters are preserved + Arguments.of("tab\tnewline\ncr\r", "tab\tnewline\ncr\r"), + // character beyond BMP (emoji) preserved as surrogate pair + Arguments.of("emoji " + "\uD83D\uDE00" + " end", "emoji " + "\uD83D\uDE00" + " end")); + } + + @ParameterizedTest + @MethodSource + void testAppendEscapingCData(final String input, final String expected) { + StringBuilder cdata = new StringBuilder(); + Transform.appendEscapingCData(cdata, input); + assertThat(cdata.toString()).isEqualTo(expected); + } +} diff --git a/log4j-core/src/main/java/org/apache/logging/log4j/core/util/Transform.java b/log4j-core/src/main/java/org/apache/logging/log4j/core/util/Transform.java index e9eb2a5f39..5788b4c65c 100644 --- a/log4j-core/src/main/java/org/apache/logging/log4j/core/util/Transform.java +++ b/log4j-core/src/main/java/org/apache/logging/log4j/core/util/Transform.java @@ -29,6 +29,8 @@ public final class Transform { private static final String CDATA_EMBEDED_END = CDATA_END + CDATA_PSEUDO_END + CDATA_START; private static final int CDATA_END_LEN = CDATA_END.length(); + private static final char REPLACEMENT_CHAR = '\uFFFD'; + private Transform() {} /** @@ -41,47 +43,26 @@ public final class Transform { * @return The input string with the special characters replaced. */ public static String escapeHtmlTags(final String input) { - // Check if the string is null, zero length or devoid of special characters + // Check if the string is null or zero length // if so, return what was sent in. - - if (Strings.isEmpty(input) - || (input.indexOf('"') == -1 - && input.indexOf('&') == -1 - && input.indexOf('<') == -1 - && input.indexOf('>') == -1)) { + if (Strings.isEmpty(input)) { return input; } - // Use a StringBuilder in lieu of String concatenation -- it is - // much more efficient this way. - - final StringBuilder buf = new StringBuilder(input.length() + 6); - - final int len = input.length(); - for (int i = 0; i < len; i++) { - final char ch = input.charAt(i); - if (ch > '>') { - buf.append(ch); - } else - switch (ch) { - case '<': - buf.append("<"); - break; - case '>': - buf.append(">"); - break; - case '&': - buf.append("&"); - break; - case '"': - buf.append("""); - break; - default: - buf.append(ch); - break; - } + // Only create a new string if we find a special character or invalid code point. + // In the common case, this should avoid unnecessary allocations. + final int length = input.length(); + for (int i = 0; i < length; ) { + final int cp = input.codePointAt(i); + if (!isValidXml10(cp) || isHtmlTagCharacter(cp)) { + final StringBuilder out = new StringBuilder(length); + out.append(input, 0, i); + appendEscapingHtmlTags(input, i, length, out); + return out.toString(); + } + i += Character.charCount(cp); } - return buf.toString(); + return input; } /** @@ -97,11 +78,11 @@ public final class Transform { if (str != null) { int end = str.indexOf(CDATA_END); if (end < 0) { - buf.append(str); + appendSanitizedXml10(str, 0, str.length(), buf); } else { int start = 0; while (end > -1) { - buf.append(str.substring(start, end)); + appendSanitizedXml10(str, start, end, buf); buf.append(CDATA_EMBEDED_END); start = end + CDATA_END_LEN; if (start < str.length()) { @@ -110,7 +91,7 @@ public final class Transform { return; } } - buf.append(str.substring(start)); + appendSanitizedXml10(str, start, str.length(), buf); } } } @@ -185,4 +166,69 @@ public final class Transform { } return buf.toString(); } + + private static void appendEscapingHtmlTags(final String input, int i, final int length, final StringBuilder buf) { + while (i < length) { + final int ch = input.codePointAt(i); + switch (ch) { + case '<': + buf.append("<"); + break; + case '>': + buf.append(">"); + break; + case '&': + buf.append("&"); + break; + case '"': + buf.append("""); + break; + case '\'': + buf.append("'"); + break; + default: + buf.appendCodePoint(isValidXml10(ch) ? ch : REPLACEMENT_CHAR); + break; + } + i += Character.charCount(ch); + } + } + + private static boolean isHtmlTagCharacter(final int cp) { + return cp == '<' || cp == '>' || cp == '&' || cp == '"' || cp == '\''; + } + + private static void appendSanitizedXml10( + final String input, final int start, final int end, final StringBuilder out) { + for (int i = start; i < end; ) { + final int cp = input.codePointAt(i); + out.appendCodePoint(isValidXml10(cp) ? cp : REPLACEMENT_CHAR); + i += Character.charCount(cp); + } + } + + /** + * Checks if a code point is a valid in XML 1.0 + * + * @param codePoint a code point between {@code 0} and {@link Character#MAX_CODE_POINT} + * @return {@code true} if it is a valid XML 1.0 code point + */ + private static boolean isValidXml10(final int codePoint) { + assert codePoint >= 0 && codePoint <= Character.MAX_CODE_POINT; + // XML 1.0 valid characters (Fifth Edition): + // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + + // [#x20–#xD7FF] (placed early as a fast path for the most common case) + return (codePoint >= ' ' && codePoint < Character.MIN_SURROGATE) + // #x9 + || codePoint == '\t' + // #xA + || codePoint == '\n' + // #xD + || codePoint == '\r' + // [#xE000-#xFFFD] + || (codePoint > Character.MAX_SURROGATE && codePoint <= 0xFFFD) + // [#x10000-#x10FFFF] + || codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT; + } } diff --git a/src/changelog/.2.x.x/log4j1-xml-control-characters.xml b/src/changelog/.2.x.x/log4j1-xml-control-characters.xml new file mode 100644 index 0000000000..91e2f2b322 --- /dev/null +++ b/src/changelog/.2.x.x/log4j1-xml-control-characters.xml @@ -0,0 +1,11 @@ +<?xml version="1.0" encoding="UTF-8"?> +<entry xmlns="https://logging.apache.org/xml/ns" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation=" + https://logging.apache.org/xml/ns + https://logging.apache.org/xml/ns/log4j-changelog-0.xsd" + type="fixed"> + <description format="asciidoc"> + Replace invalid characters in Log4j1XmlLayout output with the Unicode replacement character (U+FFFD). + </description> +</entry>
