This is an automated email from the ASF dual-hosted git repository.

ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-io.git

commit 17f8b44d50372f4b540059232ed0ffa189eceb62
Author: Gary Gregory <garydgreg...@gmail.com>
AuthorDate: Tue Jan 2 09:08:58 2024 -0500

    XmlStreamReader can't parse XML document with multi-line prolog #550

    - Apply PR #550, not merged or would have caused the build to fail.
    - Implement fix
---
 src/changes/changes.xml | 1 +
 .../org/apache/commons/io/input/XmlStreamReader.java | 16 +++++++++++-----
 .../org/apache/commons/io/input/XmlStreamReaderTest.java | 10 ++++++++++
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index b0670bf4..7508b585 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -88,6 +88,7 @@ The <action> type attribute can be add,update,fix,remove.
       <action dev="ggregory" type="fix" issue="IO-807" due-to="Elliotte Rusty Harold, Gary Gregory">Characterization test for broken symlinks when copying directories #547.</action>
       <action dev="ggregory" type="fix" due-to="Gary Gregory">ClosedInputStream.read(byte[], int, int) does not always return -1.</action>
       <action dev="ggregory" type="fix" due-to="Gary Gregory">ClosedOutputStream.write(byte[], int, int) does not always throw IOException.</action>
+      <action dev="ggregory" type="fix" due-to="Sylwester Lachiewicz, Gary Gregory">XmlStreamReader can't parse XML document with multi-line prolog #550.</action>
       <!-- Add -->
       <action dev="ggregory" type="add" due-to="Gary Gregory">Add and use PathUtils.getFileName(Path, Function<Path, R>).</action>
       <action dev="ggregory" type="add" due-to="Gary Gregory">Add and use PathUtils.getFileNameString().</action>
diff --git a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
index 2b9b379d..ff16987f 100644
--- a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
+++ b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
@@ -214,6 +214,16 @@ public class XmlStreamReader extends Reader {
      * <p>
      * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">XML specification</a>.
      * </p>
+     * <p>
+     * Note the documented pattern is:
+     * </p>
+     * <pre>
+     * EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
+     * </pre>
+     * <p>
+     * However this does not match all the aliases that are supported by Java.
+     * For example, '437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro'.
+     * </p>
      */
     public static final Pattern ENCODING_PATTERN = Pattern.compile(
     // @formatter:off
@@ -223,10 +233,6 @@ public class XmlStreamReader extends Reader {
         + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")" // double-quoted
         + "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
         Pattern.MULTILINE);
-    // N.B. the documented pattern is
-    // EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
-    // However this does not match all the aliases that are supported by Java.
-    // e.g. '437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro'
     // @formatter:on

     private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
@@ -325,7 +331,7 @@ public class XmlStreamReader extends Reader {
                 inputStream.reset();
                 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
                 final StringBuilder prolog = new StringBuilder();
-                IOConsumer.forEach(bReader.lines(), prolog::append);
+                IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
                 final Matcher m = ENCODING_PATTERN.matcher(prolog);
                 if (m.find()) {
                     encoding = m.group(1).toUpperCase(Locale.ROOT);
diff --git a/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java b/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
index 63d587a8..de986c98 100644
--- a/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
+++ b/src/test/java/org/apache/commons/io/input/XmlStreamReaderTest.java
@@ -60,6 +60,8 @@ public class XmlStreamReaderTest {
     private static final String UTF_32LE = "UTF-32LE";
     private static final String UTF_32BE = "UTF-32BE";
     private static final String UTF_8 = StandardCharsets.UTF_8.name();
+
+    private static final String XML6 = "xml-prolog-encoding-new-line";
     private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
     private static final String XML4 = "xml-prolog-encoding-single-quotes";
     private static final String XML3 = "xml-prolog-encoding-double-quotes";
@@ -102,6 +104,8 @@ public class XmlStreamReaderTest {

     private static final MessageFormat XML_WITH_PROLOG = new MessageFormat(
             "<?xml version=\"1.0\"?>\n<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_NEW_LINES = new MessageFormat(
+            "<?xml\nversion\n=\n\"1.0\"\nencoding\n=\n\"{1}\"\n?>\n<root>{2}</root>");

     private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES = new MessageFormat(
             "<?xml version=\"1.0\" encoding=\"{1}\"?>\n<root>{2}</root>");
@@ -123,6 +127,7 @@ public class XmlStreamReaderTest {
         XMLs.put(XML3, XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES);
         XMLs.put(XML4, XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES);
         XMLs.put(XML5, XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES);
+        XMLs.put(XML6, XML_WITH_PROLOG_AND_ENCODING_NEW_LINES);
     }

     /**
@@ -624,5 +629,10 @@ public class XmlStreamReaderTest {
         xmlReader = new XmlStreamReader(is);
         assertEquals(xmlReader.getEncoding(), encoding);
         xmlReader.close();
+
+        is = getXmlInputStream("no-bom", XML6, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals(xmlReader.getEncoding(), encoding);
+        xmlReader.close();
     }
 }