Author: markt Date: Thu Dec 15 21:34:06 2016 New Revision: 1774526 URL: http://svn.apache.org/viewvc?rev=1774526&view=rev Log: Add a new encoding detector implementation. The BoM encoding detection is based on the previous code. The prolog encoding detection delegates to the JRE's XML parser rather than the custom Jasper parser.
Added: tomcat/trunk/java/org/apache/jasper/compiler/EncodingDetector.java (with props) Modified: tomcat/trunk/java/org/apache/jasper/security/SecurityClassLoad.java Added: tomcat/trunk/java/org/apache/jasper/compiler/EncodingDetector.java URL: http://svn.apache.org/viewvc/tomcat/trunk/java/org/apache/jasper/compiler/EncodingDetector.java?rev=1774526&view=auto ============================================================================== --- tomcat/trunk/java/org/apache/jasper/compiler/EncodingDetector.java (added) +++ tomcat/trunk/java/org/apache/jasper/compiler/EncodingDetector.java Thu Dec 15 21:34:06 2016 @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.jasper.compiler; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; + +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +/* + * The BoM detection is derived from: + * http://svn.us.apache.org/viewvc/tomcat/trunk/java/org/apache/jasper/xmlparser/XMLEncodingDetector.java?annotate=1742248 + */ +class EncodingDetector { + + private static final XMLInputFactory XML_INPUT_FACTORY; + static { + XML_INPUT_FACTORY = XMLInputFactory.newFactory(); + } + + private final BomResult bomResult; + private final String prologEncoding; + + + /* + * TODO: Refactor Jasper InputStream creation and handling so the + * InputStream passed to this method is buffered and therefore saves + * on multiple opening and re-opening of the same file. + */ + EncodingDetector(InputStream is) throws IOException { + // Keep buffer size to a minimum here. BoM will be no more than 4 bytes + // so that is the maximum we need to buffer + BufferedInputStream bis = new BufferedInputStream(is, 4); + bis.mark(4); + + bomResult = processBom(bis); + + // Reset the stream back to the start to allow the XML prolog detection + // to work. Skip any BoM we discovered. 
+ bis.reset(); + if (bomResult != null) { + for (int i = 0; i < bomResult.skip; i++) { + is.read(); + } + } + + prologEncoding = getPrologEncoding(bis); + } + + + String getBomEncoding() { + return bomResult.encoding; + } + + + Boolean getBigEndian() { + return bomResult.bigEndian; + } + + + int getSkip() { + return bomResult.skip; + } + + + String getPrologEncoding() { + return prologEncoding; + } + + + private String getPrologEncoding(InputStream stream) { + String encoding = null; + try { + XMLStreamReader xmlStreamReader = XML_INPUT_FACTORY.createXMLStreamReader(stream); + encoding = xmlStreamReader.getCharacterEncodingScheme(); + } catch (XMLStreamException e) { + // Ignore + } + return encoding; + } + + + private BomResult processBom(InputStream stream) { + // Read first four bytes (or as many are available) and determine + // encoding + try { + final byte[] b4 = new byte[4]; + int count = 0; + int singleByteRead; + while (count < 4) { + singleByteRead = stream.read(); + if (singleByteRead == -1) { + break; + } + b4[count] = (byte) singleByteRead; + count++; + } + + return parseBom(b4, count); + } catch (IOException ioe) { + // Failed. 
+ return new BomResult("UTF-8", null, 0); + } + } + + + private BomResult parseBom(byte[] b4, int count) { + + if (count < 2) { + return new BomResult("UTF-8", null, 0); + } + + // UTF-16, with BOM + int b0 = b4[0] & 0xFF; + int b1 = b4[1] & 0xFF; + if (b0 == 0xFE && b1 == 0xFF) { + // UTF-16, big-endian + return new BomResult("UTF-16BE", Boolean.TRUE, 2); + } + if (b0 == 0xFF && b1 == 0xFE) { + // UTF-16, little-endian + return new BomResult("UTF-16LE", Boolean.FALSE, 2); + } + + // default to UTF-8 if we don't have enough bytes to make a + // good determination of the encoding + if (count < 3) { + return new BomResult("UTF-8", null, 0); + } + + // UTF-8 with a BOM + int b2 = b4[2] & 0xFF; + if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { + return new BomResult("UTF-8", null, 3); + } + + // default to UTF-8 if we don't have enough bytes to make a + // good determination of the encoding + if (count < 4) { + return new BomResult("UTF-8", null, 0); + } + + // other encodings + int b3 = b4[3] & 0xFF; + if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { + // UCS-4, big endian (1234) + return new BomResult("ISO-10646-UCS-4", Boolean.TRUE, 4); + } + if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { + // UCS-4, little endian (4321) + return new BomResult("ISO-10646-UCS-4", Boolean.FALSE, 4); + } + if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { + // UCS-4, unusual octet order (2143) + // REVISIT: What should this be? + return new BomResult("ISO-10646-UCS-4", null, 4); + } + if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { + // UCS-4, unusual octet order (3412) + // REVISIT: What should this be? + return new BomResult("ISO-10646-UCS-4", null, 4); + } + if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { + // UTF-16, big-endian, no BOM + // (or could turn out to be UCS-2... + // REVISIT: What should this be? 
+ return new BomResult("UTF-16BE", Boolean.TRUE, 4); + } + if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { + // UTF-16, little-endian, no BOM + // (or could turn out to be UCS-2... + return new BomResult("UTF-16LE", Boolean.FALSE, 4); + } + if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { + // EBCDIC + // a la xerces1, return CP037 instead of EBCDIC here + return new BomResult("CP037", null, 4); + } + + // default encoding + return new BomResult("UTF-8", null, 0); + } + + + private static class BomResult { + + public final String encoding; + public final Boolean bigEndian; + public final int skip; + + public BomResult(String encoding, Boolean bigEndian, int skip) { + this.encoding = encoding; + this.bigEndian = bigEndian; + this.skip = skip; + } + } +} Propchange: tomcat/trunk/java/org/apache/jasper/compiler/EncodingDetector.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: tomcat/trunk/java/org/apache/jasper/security/SecurityClassLoad.java URL: http://svn.apache.org/viewvc/tomcat/trunk/java/org/apache/jasper/security/SecurityClassLoad.java?rev=1774526&r1=1774525&r2=1774526&view=diff ============================================================================== --- tomcat/trunk/java/org/apache/jasper/security/SecurityClassLoad.java (original) +++ tomcat/trunk/java/org/apache/jasper/security/SecurityClassLoad.java Thu Dec 15 21:34:06 2016 @@ -39,6 +39,10 @@ public final class SecurityClassLoad { final String basePackage = "org.apache.jasper."; try { + // Ensure XMLInputFactory is loaded with Tomcat's class loader + loader.loadClass( basePackage + + "compiler.EncodingDetector"); + loader.loadClass( basePackage + "runtime.JspFactoryImpl$PrivilegedGetPageContext"); loader.loadClass( basePackage + --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@tomcat.apache.org For additional commands, e-mail: dev-h...@tomcat.apache.org