This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-io.git
commit e403b84597a4299d1cd2d1b93224eade2530128c Author: Gary Gregory <gardgreg...@gmail.com> AuthorDate: Thu Jan 21 11:22:50 2021 -0500 Sort members. --- .../apache/commons/io/input/XmlStreamReader.java | 708 ++++++++++----------- 1 file changed, 354 insertions(+), 354 deletions(-) diff --git a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java index d77d382..ee80736 100644 --- a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java +++ b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java @@ -110,24 +110,161 @@ public class XmlStreamReader extends Reader { new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) }; - private final Reader reader; + private static final Pattern CHARSET_PATTERN = Pattern + .compile("charset=[\"']?([.[^; \"']]*)[\"']?"); - private final String encoding; + /** + * Pattern capturing the encoding of the "xml" processing instruction. + */ + public static final Pattern ENCODING_PATTERN = Pattern.compile( + "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", + Pattern.MULTILINE); - private final String defaultEncoding; + private static final String RAW_EX_1 = + "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; + + private static final String RAW_EX_2 = + "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; + + private static final String HTTP_EX_1 = + "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"; + + private static final String HTTP_EX_2 = + "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; + + private static final String HTTP_EX_3 = + "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME"; /** - * Returns the default encoding to use if none is set in HTTP content-type, - * XML prolog and the rules based on content-type are not adequate. - * <p> - * If it is NULL the content-type based rules are used. + * Returns charset parameter value, NULL if not present, NULL if + * httpContentType is NULL. * - * @return the default encoding to use. + * @param httpContentType the HTTP content type + * @return The content type encoding (upcased) */ - public String getDefaultEncoding() { - return defaultEncoding; + static String getContentTypeEncoding(final String httpContentType) { + String encoding = null; + if (httpContentType != null) { + final int i = httpContentType.indexOf(";"); + if (i > -1) { + final String postMime = httpContentType.substring(i + 1); + final Matcher m = CHARSET_PATTERN.matcher(postMime); + encoding = m.find() ? m.group(1) : null; + encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null; + } + } + return encoding; + } + + /** + * Returns MIME type or NULL if httpContentType is NULL. + * + * @param httpContentType the HTTP content type + * @return The mime content type + */ + static String getContentTypeMime(final String httpContentType) { + String mime = null; + if (httpContentType != null) { + final int i = httpContentType.indexOf(";"); + if (i >= 0) { + mime = httpContentType.substring(0, i); + } else { + mime = httpContentType; + } + mime = mime.trim(); + } + return mime; + } + + /** + * Returns the encoding declared in the <?xml encoding=...?>, NULL if none. + * + * @param inputStream InputStream to create the reader from. + * @param guessedEnc guessed encoding + * @return the encoding declared in the <?xml encoding=...?> + * @throws IOException thrown if there is a problem reading the stream. + */ + private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) + throws IOException { + String encoding = null; + if (guessedEnc != null) { + final byte[] bytes = new byte[BUFFER_SIZE]; + inputStream.mark(BUFFER_SIZE); + int offset = 0; + int max = BUFFER_SIZE; + int c = inputStream.read(bytes, offset, max); + int firstGT = -1; + String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning) + while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) { + offset += c; + max -= c; + c = inputStream.read(bytes, offset, max); + xmlProlog = new String(bytes, 0, offset, guessedEnc); + firstGT = xmlProlog.indexOf('>'); + } + if (firstGT == -1) { + if (c == -1) { + throw new IOException("Unexpected end of XML stream"); + } + throw new IOException( + "XML prolog or ROOT element not found on first " + + offset + " bytes"); + } + final int bytesRead = offset; + if (bytesRead > 0) { + inputStream.reset(); + final BufferedReader bReader = new BufferedReader(new StringReader( + xmlProlog.substring(0, firstGT + 1))); + final StringBuffer prolog = new StringBuffer(); + String line; + while ((line = bReader.readLine()) != null) { + prolog.append(line); + } + final Matcher m = ENCODING_PATTERN.matcher(prolog); + if (m.find()) { + encoding = m.group(1).toUpperCase(Locale.ROOT); + encoding = encoding.substring(1, encoding.length() - 1); + } + } + } + return encoding; + } + + /** + * Indicates if the MIME type belongs to the APPLICATION XML family. + * + * @param mime The mime type + * @return true if the mime type belongs to the APPLICATION XML family, + * otherwise false + */ + static boolean isAppXml(final String mime) { + return mime != null && + (mime.equals("application/xml") || + mime.equals("application/xml-dtd") || + mime.equals("application/xml-external-parsed-entity") || + mime.startsWith("application/") && mime.endsWith("+xml")); + } + + /** + * Indicates if the MIME type belongs to the TEXT XML family. + * + * @param mime The mime type + * @return true if the mime type belongs to the TEXT XML family, + * otherwise false + */ + static boolean isTextXml(final String mime) { + return mime != null && + (mime.equals("text/xml") || + mime.equals("text/xml-external-parsed-entity") || + mime.startsWith("text/") && mime.endsWith("+xml")); } + private final Reader reader; + + private final String encoding; + + private final String defaultEncoding; + /** * Creates a Reader for a File. * <p> @@ -229,62 +366,6 @@ public class XmlStreamReader extends Reader { } /** - * Creates a Reader using the InputStream of a URL. - * <p> - * If the URL is not of type HTTP and there is not 'content-type' header in - * the fetched data it uses the same logic used for Files. - * <p> - * If the URL is a HTTP Url or there is a 'content-type' header in the - * fetched data it uses the same logic used for an InputStream with - * content-type. - * <p> - * It does a lenient charset encoding detection, check the constructor with - * the lenient parameter for details. - * - * @param url URL to create a Reader from. - * @throws IOException thrown if there is a problem reading the stream of - * the URL. - */ - public XmlStreamReader(final URL url) throws IOException { - this(Objects.requireNonNull(url, "url").openConnection(), null); - } - - /** - * Creates a Reader using the InputStream of a URLConnection. - * <p> - * If the URLConnection is not of type HttpURLConnection and there is not - * 'content-type' header in the fetched data it uses the same logic used for - * files. - * <p> - * If the URLConnection is a HTTP Url or there is a 'content-type' header in - * the fetched data it uses the same logic used for an InputStream with - * content-type. - * <p> - * It does a lenient charset encoding detection, check the constructor with - * the lenient parameter for details. - * - * @param conn URLConnection to create a Reader from. - * @param defaultEncoding The default encoding - * @throws IOException thrown if there is a problem reading the stream of - * the URLConnection. - */ - public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException { - Objects.requireNonNull(conn, "conm"); - this.defaultEncoding = defaultEncoding; - final boolean lenient = true; - final String contentType = conn.getContentType(); - final InputStream inputStream = conn.getInputStream(); - final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, BUFFER_SIZE), false, BOMS); - final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); - if (conn instanceof HttpURLConnection || contentType != null) { - this.encoding = processHttpStream(bom, pis, contentType, lenient); - } else { - this.encoding = doRawStream(bom, pis, lenient); - } - this.reader = new InputStreamReader(pis, encoding); - } - - /** * Creates a Reader using an InputStream and the associated content-type * header. * <p> @@ -306,6 +387,7 @@ public class XmlStreamReader extends Reader { this(inputStream, httpContentType, true); } + /** * Creates a Reader using an InputStream and the associated content-type * header. This constructor is lenient regarding the encoding detection. @@ -335,19 +417,13 @@ public class XmlStreamReader extends Reader { * the charset encoding. * @param lenient indicates if the charset encoding detection should be * relaxed. - * @param defaultEncoding The default encoding * @throws IOException thrown if there is a problem reading the file. * @throws XmlStreamReaderException thrown if the charset encoding could not * be determined according to the specs. */ public XmlStreamReader(final InputStream inputStream, final String httpContentType, - final boolean lenient, final String defaultEncoding) throws IOException { - Objects.requireNonNull(inputStream, "inputStream"); - this.defaultEncoding = defaultEncoding; - final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, BUFFER_SIZE), false, BOMS); - final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); - this.encoding = processHttpStream(bom, pis, httpContentType, lenient); - this.reader = new InputStreamReader(pis, encoding); + final boolean lenient) throws IOException { + this(inputStream, httpContentType, lenient, null); } /** @@ -379,127 +455,155 @@ public class XmlStreamReader extends Reader { * the charset encoding. * @param lenient indicates if the charset encoding detection should be * relaxed. + * @param defaultEncoding The default encoding * @throws IOException thrown if there is a problem reading the file. * @throws XmlStreamReaderException thrown if the charset encoding could not * be determined according to the specs. */ public XmlStreamReader(final InputStream inputStream, final String httpContentType, - final boolean lenient) throws IOException { - this(inputStream, httpContentType, lenient, null); + final boolean lenient, final String defaultEncoding) throws IOException { + Objects.requireNonNull(inputStream, "inputStream"); + this.defaultEncoding = defaultEncoding; + final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, BUFFER_SIZE), false, BOMS); + final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); + this.encoding = processHttpStream(bom, pis, httpContentType, lenient); + this.reader = new InputStreamReader(pis, encoding); } /** - * Returns the charset encoding of the XmlStreamReader. + * Creates a Reader using the InputStream of a URL. + * <p> + * If the URL is not of type HTTP and there is not 'content-type' header in + * the fetched data it uses the same logic used for Files. + * <p> + * If the URL is a HTTP Url or there is a 'content-type' header in the + * fetched data it uses the same logic used for an InputStream with + * content-type. + * <p> + * It does a lenient charset encoding detection, check the constructor with + * the lenient parameter for details. * - * @return charset encoding. + * @param url URL to create a Reader from. + * @throws IOException thrown if there is a problem reading the stream of + * the URL. */ - public String getEncoding() { - return encoding; + public XmlStreamReader(final URL url) throws IOException { + this(Objects.requireNonNull(url, "url").openConnection(), null); } /** - * Invokes the underlying reader's {@code read(char[], int, int)} method. - * @param buf the buffer to read the characters into - * @param offset The start offset - * @param len The number of bytes to read - * @return the number of characters read or -1 if the end of stream - * @throws IOException if an I/O error occurs - */ - @Override - public int read(final char[] buf, final int offset, final int len) throws IOException { - return reader.read(buf, offset, len); - } - - /** - * Closes the XmlStreamReader stream. + * Creates a Reader using the InputStream of a URLConnection. + * <p> + * If the URLConnection is not of type HttpURLConnection and there is not + * 'content-type' header in the fetched data it uses the same logic used for + * files. + * <p> + * If the URLConnection is a HTTP Url or there is a 'content-type' header in + * the fetched data it uses the same logic used for an InputStream with + * content-type. + * <p> + * It does a lenient charset encoding detection, check the constructor with + * the lenient parameter for details. * - * @throws IOException thrown if there was a problem closing the stream. + * @param conn URLConnection to create a Reader from. + * @param defaultEncoding The default encoding + * @throws IOException thrown if there is a problem reading the stream of + * the URLConnection. */ - @Override - public void close() throws IOException { - reader.close(); + public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException { + Objects.requireNonNull(conn, "conm"); + this.defaultEncoding = defaultEncoding; + final boolean lenient = true; + final String contentType = conn.getContentType(); + final InputStream inputStream = conn.getInputStream(); + final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, BUFFER_SIZE), false, BOMS); + final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); + if (conn instanceof HttpURLConnection || contentType != null) { + this.encoding = processHttpStream(bom, pis, contentType, lenient); + } else { + this.encoding = doRawStream(bom, pis, lenient); + } + this.reader = new InputStreamReader(pis, encoding); } /** - * Process the raw stream. + * Calculate the HTTP encoding. * - * @param bom BOMInputStream to detect byte order marks - * @param pis BOMInputStream to guess XML encoding + * @param httpContentType The HTTP content type + * @param bomEnc BOM encoding + * @param xmlGuessEnc XML Guess encoding + * @param xmlEnc XML encoding * @param lenient indicates if the charset encoding detection should be * relaxed. - * @return the encoding to be used + * @return the HTTP encoding * @throws IOException thrown if there is a problem reading the stream. */ - private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient) - throws IOException { - final String bomEnc = bom.getBOMCharsetName(); - final String xmlGuessEnc = pis.getBOMCharsetName(); - final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); - try { - return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); - } catch (final XmlStreamReaderException ex) { - if (lenient) { - return doLenientDetection(null, ex); + String calculateHttpEncoding(final String httpContentType, + final String bomEnc, final String xmlGuessEnc, final String xmlEnc, + final boolean lenient) throws IOException { + + // Lenient and has XML encoding + if (lenient && xmlEnc != null) { + return xmlEnc; + } + + // Determine mime/encoding content types from HTTP Content Type + final String cTMime = getContentTypeMime(httpContentType); + final String cTEnc = getContentTypeEncoding(httpContentType); + final boolean appXml = isAppXml(cTMime); + final boolean textXml = isTextXml(cTMime); + + // Mime type NOT "application/xml" or "text/xml" + if (!appXml && !textXml) { + final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); + throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); + } + + // No content type encoding + if (cTEnc == null) { + if (appXml) { + return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); } - throw ex; + return defaultEncoding == null ? US_ASCII : defaultEncoding; } - } - /** - * Process a HTTP stream. - * - * @param bom BOMInputStream to detect byte order marks - * @param pis BOMInputStream to guess XML encoding - * @param httpContentType The HTTP content type - * @param lenient indicates if the charset encoding detection should be - * relaxed. - * @return the encoding to be used - * @throws IOException thrown if there is a problem reading the stream. - */ - private String processHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType, - final boolean lenient) throws IOException { - final String bomEnc = bom.getBOMCharsetName(); - final String xmlGuessEnc = pis.getBOMCharsetName(); - final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); - try { - return calculateHttpEncoding(httpContentType, bomEnc, xmlGuessEnc, xmlEnc, lenient); - } catch (final XmlStreamReaderException ex) { - if (lenient) { - return doLenientDetection(httpContentType, ex); + // UTF-16BE or UTF-16LE content type encoding + if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { + if (bomEnc != null) { + final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); + throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); } - throw ex; + return cTEnc; } - } - /** - * Do lenient detection. - * - * @param httpContentType content-type header to use for the resolution of - * the charset encoding. - * @param ex The thrown exception - * @return the encoding - * @throws IOException thrown if there is a problem reading the stream. - */ - private String doLenientDetection(String httpContentType, - XmlStreamReaderException ex) throws IOException { - if (httpContentType != null && httpContentType.startsWith("text/html")) { - httpContentType = httpContentType.substring("text/html".length()); - httpContentType = "text/xml" + httpContentType; - try { - return calculateHttpEncoding(httpContentType, ex.getBomEncoding(), - ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true); - } catch (final XmlStreamReaderException ex2) { - ex = ex2; + // UTF-16 content type encoding + if (cTEnc.equals(UTF_16)) { + if (bomEnc != null && bomEnc.startsWith(UTF_16)) { + return bomEnc; } + final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); + throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); } - String encoding = ex.getXmlEncoding(); - if (encoding == null) { - encoding = ex.getContentTypeEncoding(); + + // UTF-32BE or UTF-132E content type encoding + if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) { + if (bomEnc != null) { + final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); + throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); + } + return cTEnc; } - if (encoding == null) { - encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; + + // UTF-32 content type encoding + if (cTEnc.equals(UTF_32)) { + if (bomEnc != null && bomEnc.startsWith(UTF_32)) { + return bomEnc; + } + final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); + throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); } - return encoding; + + return cTEnc; } /** @@ -570,234 +674,130 @@ public class XmlStreamReader extends Reader { throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); } + /** + * Closes the XmlStreamReader stream. + * + * @throws IOException thrown if there was a problem closing the stream. + */ + @Override + public void close() throws IOException { + reader.close(); + } /** - * Calculate the HTTP encoding. + * Do lenient detection. * - * @param httpContentType The HTTP content type - * @param bomEnc BOM encoding - * @param xmlGuessEnc XML Guess encoding - * @param xmlEnc XML encoding - * @param lenient indicates if the charset encoding detection should be - * relaxed. - * @return the HTTP encoding + * @param httpContentType content-type header to use for the resolution of + * the charset encoding. + * @param ex The thrown exception + * @return the encoding * @throws IOException thrown if there is a problem reading the stream. */ - String calculateHttpEncoding(final String httpContentType, - final String bomEnc, final String xmlGuessEnc, final String xmlEnc, - final boolean lenient) throws IOException { - - // Lenient and has XML encoding - if (lenient && xmlEnc != null) { - return xmlEnc; - } - - // Determine mime/encoding content types from HTTP Content Type - final String cTMime = getContentTypeMime(httpContentType); - final String cTEnc = getContentTypeEncoding(httpContentType); - final boolean appXml = isAppXml(cTMime); - final boolean textXml = isTextXml(cTMime); - - // Mime type NOT "application/xml" or "text/xml" - if (!appXml && !textXml) { - final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); - throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); - } - - // No content type encoding - if (cTEnc == null) { - if (appXml) { - return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); - } - return defaultEncoding == null ? US_ASCII : defaultEncoding; - } - - // UTF-16BE or UTF-16LE content type encoding - if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { - if (bomEnc != null) { - final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); - throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); - } - return cTEnc; - } - - // UTF-16 content type encoding - if (cTEnc.equals(UTF_16)) { - if (bomEnc != null && bomEnc.startsWith(UTF_16)) { - return bomEnc; + private String doLenientDetection(String httpContentType, + XmlStreamReaderException ex) throws IOException { + if (httpContentType != null && httpContentType.startsWith("text/html")) { + httpContentType = httpContentType.substring("text/html".length()); + httpContentType = "text/xml" + httpContentType; + try { + return calculateHttpEncoding(httpContentType, ex.getBomEncoding(), + ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true); + } catch (final XmlStreamReaderException ex2) { + ex = ex2; } - final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); - throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); } - - // UTF-32BE or UTF-132E content type encoding - if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) { - if (bomEnc != null) { - final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); - throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); - } - return cTEnc; + String encoding = ex.getXmlEncoding(); + if (encoding == null) { + encoding = ex.getContentTypeEncoding(); } - - // UTF-32 content type encoding - if (cTEnc.equals(UTF_32)) { - if (bomEnc != null && bomEnc.startsWith(UTF_32)) { - return bomEnc; - } - final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); - throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); + if (encoding == null) { + encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; } - - return cTEnc; + return encoding; } /** - * Returns MIME type or NULL if httpContentType is NULL. + * Process the raw stream. * - * @param httpContentType the HTTP content type - * @return The mime content type + * @param bom BOMInputStream to detect byte order marks + * @param pis BOMInputStream to guess XML encoding + * @param lenient indicates if the charset encoding detection should be + * relaxed. + * @return the encoding to be used + * @throws IOException thrown if there is a problem reading the stream. */ - static String getContentTypeMime(final String httpContentType) { - String mime = null; - if (httpContentType != null) { - final int i = httpContentType.indexOf(";"); - if (i >= 0) { - mime = httpContentType.substring(0, i); - } else { - mime = httpContentType; + private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient) + throws IOException { + final String bomEnc = bom.getBOMCharsetName(); + final String xmlGuessEnc = pis.getBOMCharsetName(); + final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); + try { + return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); + } catch (final XmlStreamReaderException ex) { + if (lenient) { + return doLenientDetection(null, ex); } - mime = mime.trim(); + throw ex; } - return mime; } - private static final Pattern CHARSET_PATTERN = Pattern - .compile("charset=[\"']?([.[^; \"']]*)[\"']?"); - /** - * Returns charset parameter value, NULL if not present, NULL if - * httpContentType is NULL. + * Returns the default encoding to use if none is set in HTTP content-type, + * XML prolog and the rules based on content-type are not adequate. + * <p> + * If it is NULL the content-type based rules are used. * - * @param httpContentType the HTTP content type - * @return The content type encoding (upcased) + * @return the default encoding to use. */ - static String getContentTypeEncoding(final String httpContentType) { - String encoding = null; - if (httpContentType != null) { - final int i = httpContentType.indexOf(";"); - if (i > -1) { - final String postMime = httpContentType.substring(i + 1); - final Matcher m = CHARSET_PATTERN.matcher(postMime); - encoding = m.find() ? m.group(1) : null; - encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null; - } - } - return encoding; + public String getDefaultEncoding() { + return defaultEncoding; } /** - * Pattern capturing the encoding of the "xml" processing instruction. - */ - public static final Pattern ENCODING_PATTERN = Pattern.compile( - "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", - Pattern.MULTILINE); - - /** - * Returns the encoding declared in the <?xml encoding=...?>, NULL if none. + * Returns the charset encoding of the XmlStreamReader. * - * @param inputStream InputStream to create the reader from. - * @param guessedEnc guessed encoding - * @return the encoding declared in the <?xml encoding=...?> - * @throws IOException thrown if there is a problem reading the stream. + * @return charset encoding. */ - private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) - throws IOException { - String encoding = null; - if (guessedEnc != null) { - final byte[] bytes = new byte[BUFFER_SIZE]; - inputStream.mark(BUFFER_SIZE); - int offset = 0; - int max = BUFFER_SIZE; - int c = inputStream.read(bytes, offset, max); - int firstGT = -1; - String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning) - while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) { - offset += c; - max -= c; - c = inputStream.read(bytes, offset, max); - xmlProlog = new String(bytes, 0, offset, guessedEnc); - firstGT = xmlProlog.indexOf('>'); - } - if (firstGT == -1) { - if (c == -1) { - throw new IOException("Unexpected end of XML stream"); - } - throw new IOException( - "XML prolog or ROOT element not found on first " - + offset + " bytes"); - } - final int bytesRead = offset; - if (bytesRead > 0) { - inputStream.reset(); - final BufferedReader bReader = new BufferedReader(new StringReader( - xmlProlog.substring(0, firstGT + 1))); - final StringBuffer prolog = new StringBuffer(); - String line; - while ((line = bReader.readLine()) != null) { - prolog.append(line); - } - final Matcher m = ENCODING_PATTERN.matcher(prolog); - if (m.find()) { - encoding = m.group(1).toUpperCase(Locale.ROOT); - encoding = encoding.substring(1, encoding.length() - 1); - } - } - } + public String getEncoding() { return encoding; } /** - * Indicates if the MIME type belongs to the APPLICATION XML family. + * Process a HTTP stream. * - * @param mime The mime type - * @return true if the mime type belongs to the APPLICATION XML family, - * otherwise false + * @param bom BOMInputStream to detect byte order marks + * @param pis BOMInputStream to guess XML encoding + * @param httpContentType The HTTP content type + * @param lenient indicates if the charset encoding detection should be + * relaxed. + * @return the encoding to be used + * @throws IOException thrown if there is a problem reading the stream. */ - static boolean isAppXml(final String mime) { - return mime != null && - (mime.equals("application/xml") || - mime.equals("application/xml-dtd") || - mime.equals("application/xml-external-parsed-entity") || - mime.startsWith("application/") && mime.endsWith("+xml")); + private String processHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType, + final boolean lenient) throws IOException { + final String bomEnc = bom.getBOMCharsetName(); + final String xmlGuessEnc = pis.getBOMCharsetName(); + final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); + try { + return calculateHttpEncoding(httpContentType, bomEnc, xmlGuessEnc, xmlEnc, lenient); + } catch (final XmlStreamReaderException ex) { + if (lenient) { + return doLenientDetection(httpContentType, ex); + } + throw ex; + } } /** - * Indicates if the MIME type belongs to the TEXT XML family. - * - * @param mime The mime type - * @return true if the mime type belongs to the TEXT XML family, - * otherwise false + * Invokes the underlying reader's {@code read(char[], int, int)} method. + * @param buf the buffer to read the characters into + * @param offset The start offset + * @param len The number of bytes to read + * @return the number of characters read or -1 if the end of stream + * @throws IOException if an I/O error occurs */ - static boolean isTextXml(final String mime) { - return mime != null && - (mime.equals("text/xml") || - mime.equals("text/xml-external-parsed-entity") || - mime.startsWith("text/") && mime.endsWith("+xml")); + @Override + public int read(final char[] buf, final int offset, final int len) throws IOException { + return reader.read(buf, offset, len); } - private static final String RAW_EX_1 = - "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; - - private static final String RAW_EX_2 = - "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; - - private static final String HTTP_EX_1 = - "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"; - - private static final String HTTP_EX_2 = - "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; - - private static final String HTTP_EX_3 = - "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME"; - }