Modified: 
commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java
URL: 
http://svn.apache.org/viewvc/commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java?rev=995859&r1=995858&r2=995859&view=diff
==============================================================================
--- 
commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java
 (original)
+++ 
commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java
 Fri Sep 10 16:33:35 2010
@@ -23,442 +23,442 @@ import org.apache.sanselan.common.Binary
 
 public abstract class UnicodeUtils implements BinaryConstants
 {
-       /**
-        * This class should never be instantiated.
-        */
-       private UnicodeUtils()
-       {
-       }
-       
-       public static class UnicodeException extends Exception
-       {
-               public UnicodeException(String message)
-               {
-                       super(message);
-               }
-       }
-
-       // A default single-byte charset.
-       public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
-       public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 
1;
-       public static final int 
CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
-       public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
-       public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM 
= 4;
-       public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
-       public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;
-
-       // /*
-       // * Guess the character encoding of arbitrary character data in a data
-       // * buffer.
-       // *
-       // * The data may not run to the end of the buffer; it may be 
terminated.
-       // This
-       // * makes the problem much harder, since the character data may be 
followed
-       // * by arbitrary data.
-       // */
-       // public static int guessCharacterEncoding(byte bytes[], int index)
-       // {
-       // int length = bytes.length - index;
-       //
-       // if (length < 1)
-       // return CHAR_ENCODING_CODE_AMBIGUOUS;
-       //
-       // if (length >= 2)
-       // {
-       // // look for BOM.
-       //
-       // int c1 = 0xff & bytes[index];
-       // int c2 = 0xff & bytes[index + 1];
-       // if (c1 == 0xFF && c2 == 0xFE)
-       // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
-       // else if (c1 == 0xFE && c2 == 0xFF)
-       // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
-       // }
-       //
-       // }
-       //
-       // /*
-       // * Guess the character encoding of arbitrary character data in a data
-       // * buffer.
-       // *
-       // * The data fills the entire buffer. If it is terminated, the 
terminator
-       // * byte(s) will be the last bytes in the buffer.
-       // *
-       // * This makes the problem a bit easier.
-       // */
-       // public static int guessCharacterEncodingSimple(byte bytes[], int 
index)
-       // throws UnicodeException
-       // {
-       // int length = bytes.length - index;
-       //
-       // if (length < 1)
-       // return CHAR_ENCODING_CODE_AMBIGUOUS;
-       //
-       // if (length >= 2)
-       // {
-       // // identify or eliminate UTF-16 with a BOM.
-       //
-       // int c1 = 0xff & bytes[index];
-       // int c2 = 0xff & bytes[index + 1];
-       // if (c1 == 0xFF && c2 == 0xFE)
-       // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
-       // else if (c1 == 0xFE && c2 == 0xFF)
-       // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
-       // }
-       //
-       // if (length >= 2)
-       // {
-       // // look for optional double-byte terminator.
-       //
-       // int c1 = 0xff & bytes[bytes.length - 2];
-       // int c2 = 0xff & bytes[bytes.length - 1];
-       // if (c1 == 0 && c2 == 0)
-       // {
-       // // definitely a flavor of UTF-16.
-       // if (length % 2 != 0)
-       // throw new UnicodeException(
-       // "Character data with double-byte terminator has an odd length.");
-       //
-       // boolean mayHaveTerminator = true;
-       // boolean mustHaveTerminator = false;
-       // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
-       // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
-       // mayHaveTerminator, mustHaveTerminator);
-       // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
-       // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
-       // mayHaveTerminator, mustHaveTerminator);
-       // if ((!possibleBigEndian) && (!possibleLittleEndian))
-       // throw new UnicodeException(
-       // "Invalid character data, possibly UTF-16.");
-       // if (possibleBigEndian && possibleLittleEndian)
-       // return CHAR_ENCODING_CODE_AMBIGUOUS;
-       // if (possibleBigEndian)
-       // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
-       // if (possibleLittleEndian)
-       // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
-       // }
-       // }
-       //
-       // List possibleEncodings = new ArrayList();
-       // if (length % 2 == 0)
-       // {
-       // boolean mayHaveTerminator = true;
-       // boolean mustHaveTerminator = false;
-       // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
-       // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
-       // mayHaveTerminator, mustHaveTerminator);
-       // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
-       // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
-       // mayHaveTerminator, mustHaveTerminator);
-       //
-       // if (possibleBigEndian)
-       // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
-       // if (possibleLittleEndian)
-       // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
-       // }
-       //
-       // }
-
-       public static final boolean isValidISO_8859_1(String s)
-       {
-               try
-               {
-                       String roundtrip = new String(s.getBytes("ISO-8859-1"),
-                                       "ISO-8859-1");
-                       return s.equals(roundtrip);
-               } catch (UnsupportedEncodingException e)
-               {
-                       // should never be thrown.
-                       throw new RuntimeException("Error parsing string.", e);
-               }
-       }
-
-       /*
-        * Return the index of the first utf-16 terminator (ie. two even-aligned
-        * nulls). If not found, return -1.
-        */
-       private static int findFirstDoubleByteTerminator(byte bytes[], int 
index)
-       {
-               for (int i = index; i < bytes.length - 1; i += 2)
-               {
-                       int c1 = 0xff & bytes[index];
-                       int c2 = 0xff & bytes[index + 1];
-                       if (c1 == 0 && c2 == 0)
-                               return i;
-               }
-               return -1;
-       }
-
-       public final int findEndWithTerminator(byte bytes[], int index)
-                       throws UnicodeException
-       {
-               return findEnd(bytes, index, true);
-       }
-
-       public final int findEndWithoutTerminator(byte bytes[], int index)
-                       throws UnicodeException
-       {
-               return findEnd(bytes, index, false);
-       }
-
-       protected abstract int findEnd(byte bytes[], int index,
-                       boolean includeTerminator) throws UnicodeException;
-
-       public static UnicodeUtils getInstance(int charEncodingCode)
-                       throws UnicodeException
-       {
-               switch (charEncodingCode)
-               {
-               case CHAR_ENCODING_CODE_ISO_8859_1:
-                       return new UnicodeMetricsASCII();
-               case CHAR_ENCODING_CODE_UTF_8:
-                       // Debug.debug("CHAR_ENCODING_CODE_UTF_8");
-                       return new UnicodeMetricsUTF8();
-               case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
-               case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
-                       // Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM");
-                       return new UnicodeMetricsUTF16WithBOM();
-               case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
-                       return new 
UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);
-               case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
-                       return new 
UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);
-               default:
-                       throw new UnicodeException("Unknown char encoding code: 
"
-                                       + charEncodingCode);
-               }
-       }
-
-       private static class UnicodeMetricsASCII extends UnicodeUtils
-       {
-               public int findEnd(byte bytes[], int index, boolean 
includeTerminator)
-                               throws UnicodeException
-               {
-                       for (int i = index; i < bytes.length; i++)
-                       {
-                               if (bytes[i] == 0)
-                                       return includeTerminator ? i + 1 : i;
-                       }
-                       return bytes.length;
-                       // throw new UnicodeException("Terminator not found.");
-               }
-       }
-
-       // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
-       // {
-       // public int findEnd(byte bytes[], int index, boolean 
includeTerminator)
-       // throws UnicodeException
-       // {
-       // for (int i = index; i < bytes.length; i++)
-       // {
-       // if (bytes[i] == 0)
-       // return includeTerminator ? i + 1 : i;
-       // }
-       // return bytes.length;
-       // // throw new UnicodeException("Terminator not found.");
-       // }
-       // }
-
-       private static class UnicodeMetricsUTF8 extends UnicodeUtils
-       {
-
-               public int findEnd(byte bytes[], int index, boolean 
includeTerminator)
-                               throws UnicodeException
-               {
-                       // http://en.wikipedia.org/wiki/UTF-8
-
-                       while (true)
-                       {
-                               if (index == bytes.length)
-                                       return bytes.length;
-                               if (index > bytes.length)
-                                       throw new UnicodeException("Terminator 
not found.");
-
-                               int c1 = 0xff & bytes[index++];
-                               if (c1 == 0)
-                                       return includeTerminator ? index : 
index - 1;
-                               else if (c1 <= 0x7f)
-                                       continue;
-                               else if (c1 <= 0xDF)
-                               {
-                                       if (index >= bytes.length)
-                                               throw new 
UnicodeException("Invalid unicode.");
-
-                                       int c2 = 0xff & bytes[index++];
-                                       if (c2 < 0x80 || c2 > 0xBF)
-                                               throw new 
UnicodeException("Invalid code point.");
-                               } else if (c1 <= 0xEF)
-                               {
-                                       if (index >= bytes.length - 1)
-                                               throw new 
UnicodeException("Invalid unicode.");
-
-                                       int c2 = 0xff & bytes[index++];
-                                       if (c2 < 0x80 || c2 > 0xBF)
-                                               throw new 
UnicodeException("Invalid code point.");
-                                       int c3 = 0xff & bytes[index++];
-                                       if (c3 < 0x80 || c3 > 0xBF)
-                                               throw new 
UnicodeException("Invalid code point.");
-                               } else if (c1 <= 0xF4)
-                               {
-                                       if (index >= bytes.length - 2)
-                                               throw new 
UnicodeException("Invalid unicode.");
-
-                                       int c2 = 0xff & bytes[index++];
-                                       if (c2 < 0x80 || c2 > 0xBF)
-                                               throw new 
UnicodeException("Invalid code point.");
-                                       int c3 = 0xff & bytes[index++];
-                                       if (c3 < 0x80 || c3 > 0xBF)
-                                               throw new 
UnicodeException("Invalid code point.");
-                                       int c4 = 0xff & bytes[index++];
-                                       if (c4 < 0x80 || c4 > 0xBF)
-                                               throw new 
UnicodeException("Invalid code point.");
-                               } else
-                                       throw new UnicodeException("Invalid 
code point.");
-                       }
-               }
-       }
-
-       private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
-       {
-               protected static final int BYTE_ORDER_BIG_ENDIAN = 0;
-               protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1;
-               protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;
-
-               public UnicodeMetricsUTF16(int byteOrder)
-               {
-                       this.byteOrder = byteOrder;
-               }
-
-               public boolean isValid(byte bytes[], int index,
-                               boolean mayHaveTerminator, boolean 
mustHaveTerminator)
-                               throws UnicodeException
-               {
-                       // http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
-                       while (true)
-                       {
-                               if (index == bytes.length)
-                               {
-                                       // end of buffer, no terminator found.
-                                       return !mustHaveTerminator;
-                               }
-
-                               if (index >= bytes.length - 1)
-                               {
-                                       // end of odd-length buffer, no 
terminator found.
-                                       return false;
-                               }
-
-                               int c1 = 0xff & bytes[index++];
-                               int c2 = 0xff & bytes[index++];
-                               int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? 
c1 : c2;
-
-                               if (c1 == 0 && c2 == 0)
-                               {
-                                       // terminator found.
-                                       return mayHaveTerminator;
-                               }
-
-                               if (msb1 >= 0xD8)
-                               {
-                                       // Surrogate pair found.
-
-                                       if (msb1 >= 0xDC)
-                                       {
-                                               // invalid first surrogate.
-                                               return false;
-                                       }
-
-                                       if (index >= bytes.length - 1)
-                                       {
-                                               // missing second surrogate.
-                                               return false;
-                                       }
-
-                                       // second word.
-                                       int c3 = 0xff & bytes[index++];
-                                       int c4 = 0xff & bytes[index++];
-                                       int msb2 = byteOrder == 
BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
-                                       if (msb2 < 0xDC)
-                                       {
-                                               // invalid second surrogate.
-                                               return false;
-                                       }
-                               }
-                       }
-               }
-
-               public int findEnd(byte bytes[], int index, boolean 
includeTerminator)
-                               throws UnicodeException
-               {
-                       // http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
-                       while (true)
-                       {
-                               if (index == bytes.length)
-                                       return bytes.length;
-                               if (index > bytes.length - 1)
-                                       throw new UnicodeException("Terminator 
not found.");
-
-                               int c1 = 0xff & bytes[index++];
-                               int c2 = 0xff & bytes[index++];
-                               int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? 
c1 : c2;
-
-                               if (c1 == 0 && c2 == 0)
-                               {
-                                       return includeTerminator ? index : 
index - 2;
-                               } else if (msb1 >= 0xD8)
-                               {
-                                       if (index > bytes.length - 1)
-                                               throw new 
UnicodeException("Terminator not found.");
-
-                                       // second word.
-                                       int c3 = 0xff & bytes[index++];
-                                       int c4 = 0xff & bytes[index++];
-                                       int msb2 = byteOrder == 
BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
-                                       if (msb2 < 0xDC)
-                                               throw new 
UnicodeException("Invalid code point.");
-                               }
-                       }
-               }
-       }
-
-       private static class UnicodeMetricsUTF16NoBOM extends 
UnicodeMetricsUTF16
-       {
-
-               public UnicodeMetricsUTF16NoBOM(final int byteOrder)
-               {
-                       super(byteOrder);
-               }
-
-       }
-
-       private static class UnicodeMetricsUTF16WithBOM extends 
UnicodeMetricsUTF16
-       {
-
-               public UnicodeMetricsUTF16WithBOM()
-               {
-                       super(BYTE_ORDER_BIG_ENDIAN);
-               }
-
-               public int findEnd(byte bytes[], int index, boolean 
includeTerminator)
-                               throws UnicodeException
-               {
-                       // http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
-                       if (index >= bytes.length - 1)
-                               throw new UnicodeException("Missing BOM.");
-
-                       int c1 = 0xff & bytes[index++];
-                       int c2 = 0xff & bytes[index++];
-                       if (c1 == 0xFF && c2 == 0xFE)
-                               byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
-                       else if (c1 == 0xFE && c2 == 0xFF)
-                               byteOrder = BYTE_ORDER_BIG_ENDIAN;
-                       else
-                               throw new UnicodeException("Invalid byte order 
mark.");
-
-                       return super.findEnd(bytes, index, includeTerminator);
-               }
-       }
+    /**
+     * Private so that this class can only be subclassed by the nested
+     * implementations declared in this file; callers obtain one of those
+     * through {@link #getInstance(int)} rather than constructing it directly.
+     */
+    private UnicodeUtils()
+    {
+    }
+
+    /**
+     * Checked exception thrown by this class and its nested implementations
+     * when byte data is rejected by an encoding-specific scan, or when an
+     * unknown encoding code is requested.
+     */
+    public static class UnicodeException extends Exception
+    {
+        /**
+         * @param message description of the problem.
+         */
+        public UnicodeException(String message)
+        {
+            super(message);
+        }
+    }
+
+    // Encoding codes understood by getInstance(int).
+    // A default single-byte charset.
+    public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
+    public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
+    public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 
2;
+    public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
+    public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
+    public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
+    // Has no case in getInstance(int), so passing it there throws.
+    public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;
+
+    // /*
+    // * Guess the character encoding of arbitrary character data in a data
+    // * buffer.
+    // *
+    // * The data may not run to the end of the buffer; it may be terminated.
+    // This
+    // * makes the problem much harder, since the character data may be 
followed
+    // * by arbitrary data.
+    // */
+    // public static int guessCharacterEncoding(byte bytes[], int index)
+    // {
+    // int length = bytes.length - index;
+    //
+    // if (length < 1)
+    // return CHAR_ENCODING_CODE_AMBIGUOUS;
+    //
+    // if (length >= 2)
+    // {
+    // // look for BOM.
+    //
+    // int c1 = 0xff & bytes[index];
+    // int c2 = 0xff & bytes[index + 1];
+    // if (c1 == 0xFF && c2 == 0xFE)
+    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
+    // else if (c1 == 0xFE && c2 == 0xFF)
+    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
+    // }
+    //
+    // }
+    //
+    // /*
+    // * Guess the character encoding of arbitrary character data in a data
+    // * buffer.
+    // *
+    // * The data fills the entire buffer. If it is terminated, the terminator
+    // * byte(s) will be the last bytes in the buffer.
+    // *
+    // * This makes the problem a bit easier.
+    // */
+    // public static int guessCharacterEncodingSimple(byte bytes[], int index)
+    // throws UnicodeException
+    // {
+    // int length = bytes.length - index;
+    //
+    // if (length < 1)
+    // return CHAR_ENCODING_CODE_AMBIGUOUS;
+    //
+    // if (length >= 2)
+    // {
+    // // identify or eliminate UTF-16 with a BOM.
+    //
+    // int c1 = 0xff & bytes[index];
+    // int c2 = 0xff & bytes[index + 1];
+    // if (c1 == 0xFF && c2 == 0xFE)
+    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
+    // else if (c1 == 0xFE && c2 == 0xFF)
+    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
+    // }
+    //
+    // if (length >= 2)
+    // {
+    // // look for optional double-byte terminator.
+    //
+    // int c1 = 0xff & bytes[bytes.length - 2];
+    // int c2 = 0xff & bytes[bytes.length - 1];
+    // if (c1 == 0 && c2 == 0)
+    // {
+    // // definitely a flavor of UTF-16.
+    // if (length % 2 != 0)
+    // throw new UnicodeException(
+    // "Character data with double-byte terminator has an odd length.");
+    //
+    // boolean mayHaveTerminator = true;
+    // boolean mustHaveTerminator = false;
+    // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
+    // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
+    // mayHaveTerminator, mustHaveTerminator);
+    // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
+    // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
+    // mayHaveTerminator, mustHaveTerminator);
+    // if ((!possibleBigEndian) && (!possibleLittleEndian))
+    // throw new UnicodeException(
+    // "Invalid character data, possibly UTF-16.");
+    // if (possibleBigEndian && possibleLittleEndian)
+    // return CHAR_ENCODING_CODE_AMBIGUOUS;
+    // if (possibleBigEndian)
+    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
+    // if (possibleLittleEndian)
+    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
+    // }
+    // }
+    //
+    // List possibleEncodings = new ArrayList();
+    // if (length % 2 == 0)
+    // {
+    // boolean mayHaveTerminator = true;
+    // boolean mustHaveTerminator = false;
+    // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
+    // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
+    // mayHaveTerminator, mustHaveTerminator);
+    // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
+    // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
+    // mayHaveTerminator, mustHaveTerminator);
+    //
+    // if (possibleBigEndian)
+    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
+    // if (possibleLittleEndian)
+    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
+    // }
+    //
+    // }
+
+    /**
+     * Reports whether a string survives an encode/decode round trip through
+     * ISO-8859-1 unchanged.
+     *
+     * @param s the string to test.
+     * @return true if decoding the ISO-8859-1 bytes of <code>s</code> yields
+     *         a string equal to <code>s</code>.
+     * @throws RuntimeException wrapping the UnsupportedEncodingException if
+     *         the charset name is not recognized.
+     */
+    public static final boolean isValidISO_8859_1(String s)
+    {
+        try
+        {
+            // Encode, decode, and compare with the input.
+            String roundtrip = new String(s.getBytes("ISO-8859-1"),
+                    "ISO-8859-1");
+            return s.equals(roundtrip);
+        } catch (UnsupportedEncodingException e)
+        {
+            // should never be thrown.
+            throw new RuntimeException("Error parsing string.", e);
+        }
+    }
+
+    /**
+     * Returns the index of the first UTF-16 terminator, i.e. the first pair
+     * of zero bytes found when stepping two bytes at a time from
+     * <code>index</code>.
+     *
+     * @param bytes the buffer to scan.
+     * @param index offset of the first byte pair to examine.
+     * @return the offset of the first zero pair, or -1 if none is found.
+     */
+    private static int findFirstDoubleByteTerminator(byte bytes[], int index)
+    {
+        for (int i = index; i < bytes.length - 1; i += 2)
+        {
+            // Examine the pair at the current loop position, not the pair
+            // at the start offset.
+            int c1 = 0xff & bytes[i];
+            int c2 = 0xff & bytes[i + 1];
+            if (c1 == 0 && c2 == 0)
+                return i;
+        }
+        return -1;
+    }
+
+    /**
+     * Finds the end of the character data that starts at <code>index</code>,
+     * counting the terminator as part of the data.
+     *
+     * @param bytes the buffer holding the character data.
+     * @param index offset at which the character data starts.
+     * @return the value of <code>findEnd(bytes, index, true)</code>.
+     * @throws UnicodeException if the encoding-specific scan rejects the data.
+     */
+    public final int findEndWithTerminator(byte bytes[], int index)
+            throws UnicodeException
+    {
+        return findEnd(bytes, index, true);
+    }
+
+    /**
+     * Finds the end of the character data that starts at <code>index</code>,
+     * not counting the terminator as part of the data.
+     *
+     * @param bytes the buffer holding the character data.
+     * @param index offset at which the character data starts.
+     * @return the value of <code>findEnd(bytes, index, false)</code>.
+     * @throws UnicodeException if the encoding-specific scan rejects the data.
+     */
+    public final int findEndWithoutTerminator(byte bytes[], int index)
+            throws UnicodeException
+    {
+        return findEnd(bytes, index, false);
+    }
+
+    /**
+     * Encoding-specific scan for the end of the character data that starts
+     * at <code>index</code>.
+     *
+     * In the implementations in this file the result is the offset just past
+     * the terminator when <code>includeTerminator</code> is true, the offset
+     * of the terminator when it is false, and <code>bytes.length</code> when
+     * the data runs exactly to the end of the buffer without a terminator.
+     *
+     * @param bytes the buffer holding the character data.
+     * @param index offset at which the character data starts.
+     * @param includeTerminator whether the terminator counts as data.
+     * @return the end offset as described above.
+     * @throws UnicodeException if the implementation rejects the data.
+     */
+    protected abstract int findEnd(byte bytes[], int index,
+            boolean includeTerminator) throws UnicodeException;
+
+    /**
+     * Creates the helper that matches an encoding code.
+     *
+     * @param charEncodingCode one of the CHAR_ENCODING_CODE_* constants
+     *        handled by the switch below.
+     * @return a new helper instance for that encoding.
+     * @throws UnicodeException if the code has no case in the switch.
+     */
+    public static UnicodeUtils getInstance(int charEncodingCode)
+            throws UnicodeException
+    {
+        switch (charEncodingCode)
+        {
+        case CHAR_ENCODING_CODE_ISO_8859_1:
+            return new UnicodeMetricsASCII();
+        case CHAR_ENCODING_CODE_UTF_8:
+            // Debug.debug("CHAR_ENCODING_CODE_UTF_8");
+            return new UnicodeMetricsUTF8();
+        // Both BOM variants share one helper: it reads the byte order from
+        // the BOM itself.
+        case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
+        case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
+            // Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM");
+            return new UnicodeMetricsUTF16WithBOM();
+        // NOTE(review): BYTE_ORDER_* below resolve in this class's scope
+        // (presumably inherited from BinaryConstants), not to the constants
+        // of the same name declared inside UnicodeMetricsUTF16 (0 and 1).
+        // Confirm the values agree, since UnicodeMetricsUTF16 compares
+        // byteOrder against its own constants.
+        case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
+            return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);
+        case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
+            return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);
+        default:
+            throw new UnicodeException("Unknown char encoding code: "
+                    + charEncodingCode);
+        }
+    }
+
+    /**
+     * Helper for single-byte data: the terminator is a single zero byte.
+     * Returned by getInstance for CHAR_ENCODING_CODE_ISO_8859_1.
+     */
+    private static class UnicodeMetricsASCII extends UnicodeUtils
+    {
+        /**
+         * Scans forward from <code>index</code> for the first zero byte.
+         *
+         * @return the offset just past the zero byte if
+         *         <code>includeTerminator</code> is true, the offset of the
+         *         zero byte otherwise, or <code>bytes.length</code> if no
+         *         zero byte is found.
+         */
+        public int findEnd(byte bytes[], int index, boolean includeTerminator)
+                throws UnicodeException
+        {
+            for (int i = index; i < bytes.length; i++)
+            {
+                if (bytes[i] == 0)
+                    return includeTerminator ? i + 1 : i;
+            }
+            // A missing terminator is tolerated: the data is taken to run
+            // to the end of the buffer.
+            return bytes.length;
+            // throw new UnicodeException("Terminator not found.");
+        }
+    }
+
+    // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
+    // {
+    // public int findEnd(byte bytes[], int index, boolean includeTerminator)
+    // throws UnicodeException
+    // {
+    // for (int i = index; i < bytes.length; i++)
+    // {
+    // if (bytes[i] == 0)
+    // return includeTerminator ? i + 1 : i;
+    // }
+    // return bytes.length;
+    // // throw new UnicodeException("Terminator not found.");
+    // }
+    // }
+
+    /**
+     * Helper for UTF-8 data: the terminator is a single zero byte, and
+     * multi-byte sequences are validated while scanning.
+     */
+    private static class UnicodeMetricsUTF8 extends UnicodeUtils
+    {
+
+        /**
+         * Scans forward from <code>index</code>, one UTF-8 sequence at a
+         * time, until a zero byte or the end of the buffer is reached.
+         *
+         * @return the offset just past the zero byte if
+         *         <code>includeTerminator</code> is true, the offset of the
+         *         zero byte otherwise, or <code>bytes.length</code> if the
+         *         data runs to the end of the buffer without a terminator.
+         * @throws UnicodeException if <code>index</code> lies beyond the
+         *         buffer, a sequence is truncated, a lead byte is not a
+         *         legal UTF-8 lead byte, or a continuation byte is outside
+         *         0x80-0xBF.
+         */
+        public int findEnd(byte bytes[], int index, boolean includeTerminator)
+                throws UnicodeException
+        {
+            // http://en.wikipedia.org/wiki/UTF-8
+
+            while (true)
+            {
+                if (index == bytes.length)
+                    return bytes.length;
+                if (index > bytes.length)
+                    throw new UnicodeException("Terminator not found.");
+
+                int c1 = 0xff & bytes[index++];
+                if (c1 == 0)
+                    return includeTerminator ? index : index - 1;
+                else if (c1 <= 0x7f)
+                    continue;
+                else if (c1 < 0xC2)
+                {
+                    // 0x80-0xBF are continuation bytes and 0xC0/0xC1 can
+                    // only start overlong encodings; none of them may lead
+                    // a sequence.
+                    throw new UnicodeException("Invalid code point.");
+                } else if (c1 <= 0xDF)
+                {
+                    // two-byte sequence: one continuation byte must follow.
+                    if (index >= bytes.length)
+                        throw new UnicodeException("Invalid unicode.");
+
+                    int c2 = 0xff & bytes[index++];
+                    if (c2 < 0x80 || c2 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                } else if (c1 <= 0xEF)
+                {
+                    // three-byte sequence: two continuation bytes must follow.
+                    if (index >= bytes.length - 1)
+                        throw new UnicodeException("Invalid unicode.");
+
+                    int c2 = 0xff & bytes[index++];
+                    if (c2 < 0x80 || c2 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                    int c3 = 0xff & bytes[index++];
+                    if (c3 < 0x80 || c3 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                } else if (c1 <= 0xF4)
+                {
+                    // four-byte sequence: three continuation bytes must follow.
+                    if (index >= bytes.length - 2)
+                        throw new UnicodeException("Invalid unicode.");
+
+                    int c2 = 0xff & bytes[index++];
+                    if (c2 < 0x80 || c2 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                    int c3 = 0xff & bytes[index++];
+                    if (c3 < 0x80 || c3 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                    int c4 = 0xff & bytes[index++];
+                    if (c4 < 0x80 || c4 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                } else
+                    throw new UnicodeException("Invalid code point.");
+            }
+        }
+    }
+
+    /**
+     * Shared logic for UTF-16 data: the terminator is a pair of zero bytes
+     * read as one 16-bit unit, and surrogate pairs are checked while
+     * scanning.
+     *
+     * NOTE(review): BYTE_ORDER_BIG_ENDIAN / BYTE_ORDER_LITTLE_ENDIAN are
+     * redeclared here with the values 0 and 1. Code outside this nested
+     * class that uses the same simple names resolves them in the enclosing
+     * class (presumably from BinaryConstants) - confirm the values agree.
+     */
+    private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
+    {
+        protected static final int BYTE_ORDER_BIG_ENDIAN = 0;
+        protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1;
+        protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;
+
+        /**
+         * @param byteOrder byte order used to pick the most significant byte
+         *        of each 16-bit unit; compared against
+         *        BYTE_ORDER_BIG_ENDIAN of this class.
+         */
+        public UnicodeMetricsUTF16(int byteOrder)
+        {
+            this.byteOrder = byteOrder;
+        }
+
+        /**
+         * Checks whether the bytes from <code>index</code> onward form
+         * well-paired UTF-16 units.
+         *
+         * @param mayHaveTerminator result returned when a zero unit is met.
+         * @param mustHaveTerminator if true, reaching the end of the buffer
+         *        without a zero unit makes the data invalid.
+         * @return false for a dangling trailing byte, a low surrogate without
+         *         a preceding high surrogate, or a high surrogate that is
+         *         not followed by a low surrogate; otherwise as described
+         *         for the two flags.
+         */
+        public boolean isValid(byte bytes[], int index,
+                boolean mayHaveTerminator, boolean mustHaveTerminator)
+                throws UnicodeException
+        {
+            // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+            while (true)
+            {
+                if (index == bytes.length)
+                {
+                    // end of buffer, no terminator found.
+                    return !mustHaveTerminator;
+                }
+
+                if (index >= bytes.length - 1)
+                {
+                    // end of odd-length buffer, no terminator found.
+                    return false;
+                }
+
+                int c1 = 0xff & bytes[index++];
+                int c2 = 0xff & bytes[index++];
+                int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
+
+                if (c1 == 0 && c2 == 0)
+                {
+                    // terminator found.
+                    return mayHaveTerminator;
+                }
+
+                // Only 0xD800-0xDFFF are surrogates; units from 0xE000 up
+                // are ordinary characters and need no second word.
+                if (msb1 >= 0xD8 && msb1 <= 0xDF)
+                {
+                    // Surrogate pair found.
+
+                    if (msb1 >= 0xDC)
+                    {
+                        // invalid first surrogate.
+                        return false;
+                    }
+
+                    if (index >= bytes.length - 1)
+                    {
+                        // missing second surrogate.
+                        return false;
+                    }
+
+                    // second word.
+                    int c3 = 0xff & bytes[index++];
+                    int c4 = 0xff & bytes[index++];
+                    int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
+                    if (msb2 < 0xDC || msb2 > 0xDF)
+                    {
+                        // invalid second surrogate.
+                        return false;
+                    }
+                }
+            }
+        }
+
+        /**
+         * Scans forward from <code>index</code>, one UTF-16 unit (or
+         * surrogate pair) at a time, until a zero unit or the end of the
+         * buffer is reached.
+         *
+         * @return the offset just past the zero unit if
+         *         <code>includeTerminator</code> is true, the offset of the
+         *         zero unit otherwise, or <code>bytes.length</code> if the
+         *         data runs to the end of the buffer without a terminator.
+         * @throws UnicodeException if fewer than two bytes remain for a
+         *         unit, or if surrogates are not correctly paired.
+         */
+        public int findEnd(byte bytes[], int index, boolean includeTerminator)
+                throws UnicodeException
+        {
+            // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+            while (true)
+            {
+                if (index == bytes.length)
+                    return bytes.length;
+                // Two bytes are read below, so a single dangling byte must
+                // be rejected here rather than overrun the buffer.
+                if (index >= bytes.length - 1)
+                    throw new UnicodeException("Terminator not found.");
+
+                int c1 = 0xff & bytes[index++];
+                int c2 = 0xff & bytes[index++];
+                int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
+
+                if (c1 == 0 && c2 == 0)
+                {
+                    return includeTerminator ? index : index - 2;
+                } else if (msb1 >= 0xD8 && msb1 <= 0xDF)
+                {
+                    // Only 0xD800-0xDFFF are surrogates; a low surrogate
+                    // may not come first.
+                    if (msb1 >= 0xDC)
+                        throw new UnicodeException("Invalid code point.");
+
+                    if (index >= bytes.length - 1)
+                        throw new UnicodeException("Terminator not found.");
+
+                    // second word.
+                    int c3 = 0xff & bytes[index++];
+                    int c4 = 0xff & bytes[index++];
+                    int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
+                    if (msb2 < 0xDC || msb2 > 0xDF)
+                        throw new UnicodeException("Invalid code point.");
+                }
+            }
+        }
+    }
+
+    /**
+     * UTF-16 helper for data without a byte order mark: the byte order is
+     * fixed by the constructor argument and all scanning is inherited
+     * unchanged from UnicodeMetricsUTF16.
+     */
+    private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
+    {
+
+        /**
+         * @param byteOrder byte order passed through to the superclass.
+         */
+        public UnicodeMetricsUTF16NoBOM(final int byteOrder)
+        {
+            super(byteOrder);
+        }
+
+    }
+
+    /**
+     * UTF-16 helper for data that starts with a byte order mark: the byte
+     * order is read from the first two bytes on every findEnd call.
+     */
+    private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
+    {
+
+        /**
+         * Starts out big-endian; findEnd overwrites the byte order from the
+         * BOM before scanning.
+         */
+        public UnicodeMetricsUTF16WithBOM()
+        {
+            super(BYTE_ORDER_BIG_ENDIAN);
+        }
+
+        /**
+         * Consumes the two-byte BOM at <code>index</code>, sets the byte
+         * order from it, and delegates the rest of the scan to the
+         * superclass.
+         *
+         * @return the result of the superclass scan started just past the
+         *         BOM.
+         * @throws UnicodeException if fewer than two bytes are available at
+         *         <code>index</code>, if they are neither FF FE nor FE FF,
+         *         or if the superclass scan rejects the data.
+         */
+        public int findEnd(byte bytes[], int index, boolean includeTerminator)
+                throws UnicodeException
+        {
+            // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+            if (index >= bytes.length - 1)
+                throw new UnicodeException("Missing BOM.");
+
+            int c1 = 0xff & bytes[index++];
+            int c2 = 0xff & bytes[index++];
+            // FF FE marks little-endian data, FE FF big-endian data.
+            if (c1 == 0xFF && c2 == 0xFE)
+                byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
+            else if (c1 == 0xFE && c2 == 0xFF)
+                byteOrder = BYTE_ORDER_BIG_ENDIAN;
+            else
+                throw new UnicodeException("Invalid byte order mark.");
+
+            return super.findEnd(bytes, index, includeTerminator);
+        }
+    }
 
 }


Reply via email to