Modified: commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java URL: http://svn.apache.org/viewvc/commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java?rev=995859&r1=995858&r2=995859&view=diff ============================================================================== --- commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java (original) +++ commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java Fri Sep 10 16:33:35 2010 @@ -23,442 +23,442 @@ import org.apache.sanselan.common.Binary public abstract class UnicodeUtils implements BinaryConstants { - /** - * This class should never be instantiated. - */ - private UnicodeUtils() - { - } - - public static class UnicodeException extends Exception - { - public UnicodeException(String message) - { - super(message); - } - } - - // A default single-byte charset. - public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0; - public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1; - public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2; - public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3; - public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4; - public static final int CHAR_ENCODING_CODE_UTF_8 = 5; - public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1; - - // /* - // * Guess the character encoding of arbitrary character data in a data - // * buffer. - // * - // * The data may not run to the end of the buffer; it may be terminated. - // This - // * makes the problem much harder, since the character data may be followed - // * by arbitrary data. - // */ - // public static int guessCharacterEncoding(byte bytes[], int index) - // { - // int length = bytes.length - index; - // - // if (length < 1) - // return CHAR_ENCODING_CODE_AMBIGUOUS; - // - // if (length >= 2) - // { - // // look for BOM. - // - // int c1 = 0xff & bytes[index]; - // int c2 = 0xff & bytes[index + 1]; - // if (c1 == 0xFF && c2 == 0xFE) - // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM; - // else if (c1 == 0xFE && c2 == 0xFF) - // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM; - // } - // - // } - // - // /* - // * Guess the character encoding of arbitrary character data in a data - // * buffer. - // * - // * The data fills the entire buffer. If it is terminated, the terminator - // * byte(s) will be the last bytes in the buffer. - // * - // * This makes the problem a bit easier. - // */ - // public static int guessCharacterEncodingSimple(byte bytes[], int index) - // throws UnicodeException - // { - // int length = bytes.length - index; - // - // if (length < 1) - // return CHAR_ENCODING_CODE_AMBIGUOUS; - // - // if (length >= 2) - // { - // // identify or eliminate UTF-16 with a BOM. - // - // int c1 = 0xff & bytes[index]; - // int c2 = 0xff & bytes[index + 1]; - // if (c1 == 0xFF && c2 == 0xFE) - // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM; - // else if (c1 == 0xFE && c2 == 0xFF) - // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM; - // } - // - // if (length >= 2) - // { - // // look for optional double-byte terminator. - // - // int c1 = 0xff & bytes[bytes.length - 2]; - // int c2 = 0xff & bytes[bytes.length - 1]; - // if (c1 == 0 && c2 == 0) - // { - // // definitely a flavor of UTF-16. - // if (length % 2 != 0) - // throw new UnicodeException( - // "Character data with double-byte terminator has an odd length."); - // - // boolean mayHaveTerminator = true; - // boolean mustHaveTerminator = false; - // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM( - // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index, - // mayHaveTerminator, mustHaveTerminator); - // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM( - // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index, - // mayHaveTerminator, mustHaveTerminator); - // if ((!possibleBigEndian) && (!possibleLittleEndian)) - // throw new UnicodeException( - // "Invalid character data, possibly UTF-16."); - // if (possibleBigEndian && possibleLittleEndian) - // return CHAR_ENCODING_CODE_AMBIGUOUS; - // if (possibleBigEndian) - // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM; - // if (possibleLittleEndian) - // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM; - // } - // } - // - // List possibleEncodings = new ArrayList(); - // if (length % 2 == 0) - // { - // boolean mayHaveTerminator = true; - // boolean mustHaveTerminator = false; - // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM( - // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index, - // mayHaveTerminator, mustHaveTerminator); - // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM( - // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index, - // mayHaveTerminator, mustHaveTerminator); - // - // if (possibleBigEndian) - // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM; - // if (possibleLittleEndian) - // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM; - // } - // - // } - - public static final boolean isValidISO_8859_1(String s) - { - try - { - String roundtrip = new String(s.getBytes("ISO-8859-1"), - "ISO-8859-1"); - return s.equals(roundtrip); - } catch (UnsupportedEncodingException e) - { - // should never be thrown. - throw new RuntimeException("Error parsing string.", e); - } - } - - /* - * Return the index of the first utf-16 terminator (ie. two even-aligned - * nulls). If not found, return -1. - */ - private static int findFirstDoubleByteTerminator(byte bytes[], int index) - { - for (int i = index; i < bytes.length - 1; i += 2) - { - int c1 = 0xff & bytes[index]; - int c2 = 0xff & bytes[index + 1]; - if (c1 == 0 && c2 == 0) - return i; - } - return -1; - } - - public final int findEndWithTerminator(byte bytes[], int index) - throws UnicodeException - { - return findEnd(bytes, index, true); - } - - public final int findEndWithoutTerminator(byte bytes[], int index) - throws UnicodeException - { - return findEnd(bytes, index, false); - } - - protected abstract int findEnd(byte bytes[], int index, - boolean includeTerminator) throws UnicodeException; - - public static UnicodeUtils getInstance(int charEncodingCode) - throws UnicodeException - { - switch (charEncodingCode) - { - case CHAR_ENCODING_CODE_ISO_8859_1: - return new UnicodeMetricsASCII(); - case CHAR_ENCODING_CODE_UTF_8: - // Debug.debug("CHAR_ENCODING_CODE_UTF_8"); - return new UnicodeMetricsUTF8(); - case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM: - case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM: - // Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM"); - return new UnicodeMetricsUTF16WithBOM(); - case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM: - return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN); - case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM: - return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN); - default: - throw new UnicodeException("Unknown char encoding code: " - + charEncodingCode); - } - } - - private static class UnicodeMetricsASCII extends UnicodeUtils - { - public int findEnd(byte bytes[], int index, boolean includeTerminator) - throws UnicodeException - { - for (int i = index; i < bytes.length; i++) - { - if (bytes[i] == 0) - return includeTerminator ? i + 1 : i; - } - return bytes.length; - // throw new UnicodeException("Terminator not found."); - } - } - - // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils - // { - // public int findEnd(byte bytes[], int index, boolean includeTerminator) - // throws UnicodeException - // { - // for (int i = index; i < bytes.length; i++) - // { - // if (bytes[i] == 0) - // return includeTerminator ? i + 1 : i; - // } - // return bytes.length; - // // throw new UnicodeException("Terminator not found."); - // } - // } - - private static class UnicodeMetricsUTF8 extends UnicodeUtils - { - - public int findEnd(byte bytes[], int index, boolean includeTerminator) - throws UnicodeException - { - // http://en.wikipedia.org/wiki/UTF-8 - - while (true) - { - if (index == bytes.length) - return bytes.length; - if (index > bytes.length) - throw new UnicodeException("Terminator not found."); - - int c1 = 0xff & bytes[index++]; - if (c1 == 0) - return includeTerminator ? index : index - 1; - else if (c1 <= 0x7f) - continue; - else if (c1 <= 0xDF) - { - if (index >= bytes.length) - throw new UnicodeException("Invalid unicode."); - - int c2 = 0xff & bytes[index++]; - if (c2 < 0x80 || c2 > 0xBF) - throw new UnicodeException("Invalid code point."); - } else if (c1 <= 0xEF) - { - if (index >= bytes.length - 1) - throw new UnicodeException("Invalid unicode."); - - int c2 = 0xff & bytes[index++]; - if (c2 < 0x80 || c2 > 0xBF) - throw new UnicodeException("Invalid code point."); - int c3 = 0xff & bytes[index++]; - if (c3 < 0x80 || c3 > 0xBF) - throw new UnicodeException("Invalid code point."); - } else if (c1 <= 0xF4) - { - if (index >= bytes.length - 2) - throw new UnicodeException("Invalid unicode."); - - int c2 = 0xff & bytes[index++]; - if (c2 < 0x80 || c2 > 0xBF) - throw new UnicodeException("Invalid code point."); - int c3 = 0xff & bytes[index++]; - if (c3 < 0x80 || c3 > 0xBF) - throw new UnicodeException("Invalid code point."); - int c4 = 0xff & bytes[index++]; - if (c4 < 0x80 || c4 > 0xBF) - throw new UnicodeException("Invalid code point."); - } else - throw new UnicodeException("Invalid code point."); - } - } - } - - private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils - { - protected static final int BYTE_ORDER_BIG_ENDIAN = 0; - protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1; - protected int byteOrder = BYTE_ORDER_BIG_ENDIAN; - - public UnicodeMetricsUTF16(int byteOrder) - { - this.byteOrder = byteOrder; - } - - public boolean isValid(byte bytes[], int index, - boolean mayHaveTerminator, boolean mustHaveTerminator) - throws UnicodeException - { - // http://en.wikipedia.org/wiki/UTF-16/UCS-2 - - while (true) - { - if (index == bytes.length) - { - // end of buffer, no terminator found. - return !mustHaveTerminator; - } - - if (index >= bytes.length - 1) - { - // end of odd-length buffer, no terminator found. - return false; - } - - int c1 = 0xff & bytes[index++]; - int c2 = 0xff & bytes[index++]; - int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2; - - if (c1 == 0 && c2 == 0) - { - // terminator found. - return mayHaveTerminator; - } - - if (msb1 >= 0xD8) - { - // Surrogate pair found. - - if (msb1 >= 0xDC) - { - // invalid first surrogate. - return false; - } - - if (index >= bytes.length - 1) - { - // missing second surrogate. - return false; - } - - // second word. - int c3 = 0xff & bytes[index++]; - int c4 = 0xff & bytes[index++]; - int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4; - if (msb2 < 0xDC) - { - // invalid second surrogate. - return false; - } - } - } - } - - public int findEnd(byte bytes[], int index, boolean includeTerminator) - throws UnicodeException - { - // http://en.wikipedia.org/wiki/UTF-16/UCS-2 - - while (true) - { - if (index == bytes.length) - return bytes.length; - if (index > bytes.length - 1) - throw new UnicodeException("Terminator not found."); - - int c1 = 0xff & bytes[index++]; - int c2 = 0xff & bytes[index++]; - int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2; - - if (c1 == 0 && c2 == 0) - { - return includeTerminator ? index : index - 2; - } else if (msb1 >= 0xD8) - { - if (index > bytes.length - 1) - throw new UnicodeException("Terminator not found."); - - // second word. - int c3 = 0xff & bytes[index++]; - int c4 = 0xff & bytes[index++]; - int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4; - if (msb2 < 0xDC) - throw new UnicodeException("Invalid code point."); - } - } - } - } - - private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16 - { - - public UnicodeMetricsUTF16NoBOM(final int byteOrder) - { - super(byteOrder); - } - - } - - private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16 - { - - public UnicodeMetricsUTF16WithBOM() - { - super(BYTE_ORDER_BIG_ENDIAN); - } - - public int findEnd(byte bytes[], int index, boolean includeTerminator) - throws UnicodeException - { - // http://en.wikipedia.org/wiki/UTF-16/UCS-2 - - if (index >= bytes.length - 1) - throw new UnicodeException("Missing BOM."); - - int c1 = 0xff & bytes[index++]; - int c2 = 0xff & bytes[index++]; - if (c1 == 0xFF && c2 == 0xFE) - byteOrder = BYTE_ORDER_LITTLE_ENDIAN; - else if (c1 == 0xFE && c2 == 0xFF) - byteOrder = BYTE_ORDER_BIG_ENDIAN; - else - throw new UnicodeException("Invalid byte order mark."); - - return super.findEnd(bytes, index, includeTerminator); - } - } + /** + * This class should never be instantiated. + */ + private UnicodeUtils() + { + } + + public static class UnicodeException extends Exception + { + public UnicodeException(String message) + { + super(message); + } + } + + // A default single-byte charset. + public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0; + public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1; + public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2; + public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3; + public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4; + public static final int CHAR_ENCODING_CODE_UTF_8 = 5; + public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1; + + // /* + // * Guess the character encoding of arbitrary character data in a data + // * buffer. + // * + // * The data may not run to the end of the buffer; it may be terminated. + // This + // * makes the problem much harder, since the character data may be followed + // * by arbitrary data. + // */ + // public static int guessCharacterEncoding(byte bytes[], int index) + // { + // int length = bytes.length - index; + // + // if (length < 1) + // return CHAR_ENCODING_CODE_AMBIGUOUS; + // + // if (length >= 2) + // { + // // look for BOM. + // + // int c1 = 0xff & bytes[index]; + // int c2 = 0xff & bytes[index + 1]; + // if (c1 == 0xFF && c2 == 0xFE) + // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM; + // else if (c1 == 0xFE && c2 == 0xFF) + // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM; + // } + // + // } + // + // /* + // * Guess the character encoding of arbitrary character data in a data + // * buffer. + // * + // * The data fills the entire buffer. If it is terminated, the terminator + // * byte(s) will be the last bytes in the buffer. + // * + // * This makes the problem a bit easier. + // */ + // public static int guessCharacterEncodingSimple(byte bytes[], int index) + // throws UnicodeException + // { + // int length = bytes.length - index; + // + // if (length < 1) + // return CHAR_ENCODING_CODE_AMBIGUOUS; + // + // if (length >= 2) + // { + // // identify or eliminate UTF-16 with a BOM. + // + // int c1 = 0xff & bytes[index]; + // int c2 = 0xff & bytes[index + 1]; + // if (c1 == 0xFF && c2 == 0xFE) + // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM; + // else if (c1 == 0xFE && c2 == 0xFF) + // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM; + // } + // + // if (length >= 2) + // { + // // look for optional double-byte terminator. + // + // int c1 = 0xff & bytes[bytes.length - 2]; + // int c2 = 0xff & bytes[bytes.length - 1]; + // if (c1 == 0 && c2 == 0) + // { + // // definitely a flavor of UTF-16. + // if (length % 2 != 0) + // throw new UnicodeException( + // "Character data with double-byte terminator has an odd length."); + // + // boolean mayHaveTerminator = true; + // boolean mustHaveTerminator = false; + // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM( + // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index, + // mayHaveTerminator, mustHaveTerminator); + // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM( + // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index, + // mayHaveTerminator, mustHaveTerminator); + // if ((!possibleBigEndian) && (!possibleLittleEndian)) + // throw new UnicodeException( + // "Invalid character data, possibly UTF-16."); + // if (possibleBigEndian && possibleLittleEndian) + // return CHAR_ENCODING_CODE_AMBIGUOUS; + // if (possibleBigEndian) + // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM; + // if (possibleLittleEndian) + // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM; + // } + // } + // + // List possibleEncodings = new ArrayList(); + // if (length % 2 == 0) + // { + // boolean mayHaveTerminator = true; + // boolean mustHaveTerminator = false; + // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM( + // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index, + // mayHaveTerminator, mustHaveTerminator); + // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM( + // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index, + // mayHaveTerminator, mustHaveTerminator); + // + // if (possibleBigEndian) + // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM; + // if (possibleLittleEndian) + // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM; + // } + // + // } + + public static final boolean isValidISO_8859_1(String s) + { + try + { + String roundtrip = new String(s.getBytes("ISO-8859-1"), + "ISO-8859-1"); + return s.equals(roundtrip); + } catch (UnsupportedEncodingException e) + { + // should never be thrown. + throw new RuntimeException("Error parsing string.", e); + } + } + + /* + * Return the index of the first utf-16 terminator (ie. two even-aligned + * nulls). If not found, return -1. + */ + private static int findFirstDoubleByteTerminator(byte bytes[], int index) + { + for (int i = index; i < bytes.length - 1; i += 2) + { + int c1 = 0xff & bytes[index]; + int c2 = 0xff & bytes[index + 1]; + if (c1 == 0 && c2 == 0) + return i; + } + return -1; + } + + public final int findEndWithTerminator(byte bytes[], int index) + throws UnicodeException + { + return findEnd(bytes, index, true); + } + + public final int findEndWithoutTerminator(byte bytes[], int index) + throws UnicodeException + { + return findEnd(bytes, index, false); + } + + protected abstract int findEnd(byte bytes[], int index, + boolean includeTerminator) throws UnicodeException; + + public static UnicodeUtils getInstance(int charEncodingCode) + throws UnicodeException + { + switch (charEncodingCode) + { + case CHAR_ENCODING_CODE_ISO_8859_1: + return new UnicodeMetricsASCII(); + case CHAR_ENCODING_CODE_UTF_8: + // Debug.debug("CHAR_ENCODING_CODE_UTF_8"); + return new UnicodeMetricsUTF8(); + case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM: + case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM: + // Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM"); + return new UnicodeMetricsUTF16WithBOM(); + case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM: + return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN); + case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM: + return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN); + default: + throw new UnicodeException("Unknown char encoding code: " + + charEncodingCode); + } + } + + private static class UnicodeMetricsASCII extends UnicodeUtils + { + public int findEnd(byte bytes[], int index, boolean includeTerminator) + throws UnicodeException + { + for (int i = index; i < bytes.length; i++) + { + if (bytes[i] == 0) + return includeTerminator ? i + 1 : i; + } + return bytes.length; + // throw new UnicodeException("Terminator not found."); + } + } + + // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils + // { + // public int findEnd(byte bytes[], int index, boolean includeTerminator) + // throws UnicodeException + // { + // for (int i = index; i < bytes.length; i++) + // { + // if (bytes[i] == 0) + // return includeTerminator ? i + 1 : i; + // } + // return bytes.length; + // // throw new UnicodeException("Terminator not found."); + // } + // } + + private static class UnicodeMetricsUTF8 extends UnicodeUtils + { + + public int findEnd(byte bytes[], int index, boolean includeTerminator) + throws UnicodeException + { + // http://en.wikipedia.org/wiki/UTF-8 + + while (true) + { + if (index == bytes.length) + return bytes.length; + if (index > bytes.length) + throw new UnicodeException("Terminator not found."); + + int c1 = 0xff & bytes[index++]; + if (c1 == 0) + return includeTerminator ? index : index - 1; + else if (c1 <= 0x7f) + continue; + else if (c1 <= 0xDF) + { + if (index >= bytes.length) + throw new UnicodeException("Invalid unicode."); + + int c2 = 0xff & bytes[index++]; + if (c2 < 0x80 || c2 > 0xBF) + throw new UnicodeException("Invalid code point."); + } else if (c1 <= 0xEF) + { + if (index >= bytes.length - 1) + throw new UnicodeException("Invalid unicode."); + + int c2 = 0xff & bytes[index++]; + if (c2 < 0x80 || c2 > 0xBF) + throw new UnicodeException("Invalid code point."); + int c3 = 0xff & bytes[index++]; + if (c3 < 0x80 || c3 > 0xBF) + throw new UnicodeException("Invalid code point."); + } else if (c1 <= 0xF4) + { + if (index >= bytes.length - 2) + throw new UnicodeException("Invalid unicode."); + + int c2 = 0xff & bytes[index++]; + if (c2 < 0x80 || c2 > 0xBF) + throw new UnicodeException("Invalid code point."); + int c3 = 0xff & bytes[index++]; + if (c3 < 0x80 || c3 > 0xBF) + throw new UnicodeException("Invalid code point."); + int c4 = 0xff & bytes[index++]; + if (c4 < 0x80 || c4 > 0xBF) + throw new UnicodeException("Invalid code point."); + } else + throw new UnicodeException("Invalid code point."); + } + } + } + + private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils + { + protected static final int BYTE_ORDER_BIG_ENDIAN = 0; + protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1; + protected int byteOrder = BYTE_ORDER_BIG_ENDIAN; + + public UnicodeMetricsUTF16(int byteOrder) + { + this.byteOrder = byteOrder; + } + + public boolean isValid(byte bytes[], int index, + boolean mayHaveTerminator, boolean mustHaveTerminator) + throws UnicodeException + { + // http://en.wikipedia.org/wiki/UTF-16/UCS-2 + + while (true) + { + if (index == bytes.length) + { + // end of buffer, no terminator found. + return !mustHaveTerminator; + } + + if (index >= bytes.length - 1) + { + // end of odd-length buffer, no terminator found. + return false; + } + + int c1 = 0xff & bytes[index++]; + int c2 = 0xff & bytes[index++]; + int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2; + + if (c1 == 0 && c2 == 0) + { + // terminator found. + return mayHaveTerminator; + } + + if (msb1 >= 0xD8) + { + // Surrogate pair found. + + if (msb1 >= 0xDC) + { + // invalid first surrogate. + return false; + } + + if (index >= bytes.length - 1) + { + // missing second surrogate. + return false; + } + + // second word. + int c3 = 0xff & bytes[index++]; + int c4 = 0xff & bytes[index++]; + int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4; + if (msb2 < 0xDC) + { + // invalid second surrogate. + return false; + } + } + } + } + + public int findEnd(byte bytes[], int index, boolean includeTerminator) + throws UnicodeException + { + // http://en.wikipedia.org/wiki/UTF-16/UCS-2 + + while (true) + { + if (index == bytes.length) + return bytes.length; + if (index > bytes.length - 1) + throw new UnicodeException("Terminator not found."); + + int c1 = 0xff & bytes[index++]; + int c2 = 0xff & bytes[index++]; + int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2; + + if (c1 == 0 && c2 == 0) + { + return includeTerminator ? index : index - 2; + } else if (msb1 >= 0xD8) + { + if (index > bytes.length - 1) + throw new UnicodeException("Terminator not found."); + + // second word. + int c3 = 0xff & bytes[index++]; + int c4 = 0xff & bytes[index++]; + int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4; + if (msb2 < 0xDC) + throw new UnicodeException("Invalid code point."); + } + } + } + } + + private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16 + { + + public UnicodeMetricsUTF16NoBOM(final int byteOrder) + { + super(byteOrder); + } + + } + + private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16 + { + + public UnicodeMetricsUTF16WithBOM() + { + super(BYTE_ORDER_BIG_ENDIAN); + } + + public int findEnd(byte bytes[], int index, boolean includeTerminator) + throws UnicodeException + { + // http://en.wikipedia.org/wiki/UTF-16/UCS-2 + + if (index >= bytes.length - 1) + throw new UnicodeException("Missing BOM."); + + int c1 = 0xff & bytes[index++]; + int c2 = 0xff & bytes[index++]; + if (c1 == 0xFF && c2 == 0xFE) + byteOrder = BYTE_ORDER_LITTLE_ENDIAN; + else if (c1 == 0xFE && c2 == 0xFF) + byteOrder = BYTE_ORDER_BIG_ENDIAN; + else + throw new UnicodeException("Invalid byte order mark."); + + return super.findEnd(bytes, index, includeTerminator); + } + } }