Author: vsiveton
Date: Tue May 19 12:18:28 2009
New Revision: 776288

URL: http://svn.apache.org/viewvc?rev=776288&view=rev
Log:
o improved escapeHTML and unescapeHtml for all entities
o added more test cases
o updated parser test
o imported part of the ASF Harmony project (java.lang.Character code point helpers)

Modified:
    
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java
    
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java
    
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java

Modified: 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java
URL: 
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java?rev=776288&r1=776287&r2=776288&view=diff
==============================================================================
--- 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java
 (original)
+++ 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java
 Tue May 19 12:18:28 2009
@@ -20,15 +20,15 @@
  */
 
 import java.io.UnsupportedEncodingException;
-
+import java.util.ArrayList;
 import java.util.Hashtable;
+import java.util.List;
 
 import javax.swing.text.html.HTML.Tag;
 
 import org.apache.commons.lang.StringEscapeUtils;
-
 import org.apache.maven.doxia.markup.HtmlMarkup;
-import org.apache.xerces.util.XMLChar;
+import org.codehaus.plexus.util.StringUtils;
 
 /**
  * The <code>HtmlTools</code> class defines methods to HTML handling.
@@ -117,6 +117,12 @@
      * If <code>xmlMode</code> is true, every other character than the above 
remains unchanged,
      * if <code>xmlMode</code> is false, non-ascii characters get replaced by 
their hex code.
      *
+     * <b>Note</b>: all characters are encoded, i.e.:
+     * <pre>
+     * \u0159       = &#38;#x159;
+     * \uD835\uDFED = &#38;#x1d7ed;
+     * </pre>
+     *
      * @param text The String to escape, may be null.
      * @param xmlMode set to <code>false</code> to replace non-ascii 
characters.
      * @return The escaped text or the empty string if text == null.
@@ -164,25 +170,9 @@
                         else
                         {
                             buffer.append( "&#x" );
-                            if ( XMLChar.isHighSurrogate( c ) )
+                            if ( isHighSurrogate( c ) )
                             {
-                                int c2 = text.charAt( ++i );
-                                if ( XMLChar.isLowSurrogate( c2 ) )
-                                {
-                                    int sup = XMLChar.supplemental( c, (char) 
c2 );
-                                    if ( !XMLChar.isValid( sup ) )
-                                    {
-                                        throw new IllegalArgumentException( 
"Invalid XML character "
-                                            + Integer.toString( sup, 16 ) + " 
in " + text );
-                                    }
-
-                                    buffer.append( Integer.toHexString( sup ) 
);
-                                }
-                                else
-                                {
-                                    throw new IllegalArgumentException( 
"Invalid XML character "
-                                        + Integer.toString( c2, 16 ) + " in " 
+ text );
-                                }
+                                buffer.append( Integer.toHexString( 
toCodePoint( c, text.charAt( ++i ) ) ) );
                             }
                             else
                             {
@@ -207,15 +197,67 @@
      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
      * will become "&lt;Fran&ccedil;ais&gt;".</p>
      *
-     * @param text the <code>String</code> to unescape, may be null.
+     * <b>Note</b>: all unicode entities are decoded, i.e.:
+     * <pre>
+     * &#38;#x159;   = \u0159
+     * &#38;#x1d7ed; = \uD835\uDFED
+     * </pre>
      *
+     * @param text the <code>String</code> to unescape, may be null.
      * @return a new unescaped <code>String</code>, <code>null</code> if null 
string input.
-     *
      * @since 1.1.1.
      */
     public static String unescapeHtml( String text )
     {
-        return StringEscapeUtils.unescapeHtml( text );
+        if ( text == null )
+        {
+            return null;
+        }
+
+        String unescaped = StringEscapeUtils.unescapeHtml( text );
+
+        if ( !text.equals( unescaped ))
+        {
+            return unescaped;
+        }
+
+        String tmp = text;
+        List entities = new ArrayList();
+        while ( true )
+        {
+            int i = tmp.indexOf( "&#x" );
+            if ( i == -1 )
+            {
+                break;
+            }
+
+            tmp = tmp.substring( i + 3 );
+            if ( tmp.indexOf( ';' ) == -1 )
+            {
+                throw new IllegalArgumentException( "Wrong HTML near '..." + 
tmp + "'" );
+            }
+
+            String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
+            try
+            {
+                Integer.parseInt( entity, 16 );
+            }
+            catch ( Exception e )
+            {
+                throw new IllegalArgumentException( "Wrong HTML near '..." + 
tmp + "'" );
+            }
+            entities.add( entity );
+        }
+
+        for ( int i = 0; i < entities.size(); i++ )
+        {
+            String entity = (String) entities.get( i );
+
+            int codePoint = Integer.parseInt( entity, 16 );
+            text = StringUtils.replace( text, "&#x" + entity + ";", new 
String( toChars( codePoint ) ) );
+        }
+
+        return text;
     }
 
     /**
@@ -338,4 +380,57 @@
     {
         // utility class
     }
+
+    //
+    // Imported code from ASF Harmony project
+    // 
http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
+    //
+
+    private static int toCodePoint( char high, char low )
+    {
+        // See RFC 2781, Section 2.2
+        // http://www.faqs.org/rfcs/rfc2781.html
+        int h = ( high & 0x3FF ) << 10;
+        int l = low & 0x3FF;
+        return ( h | l ) + 0x10000;
+    }
+
+    private static final char MIN_HIGH_SURROGATE = '\uD800';
+    private static final char MAX_HIGH_SURROGATE = '\uDBFF';
+
+    private static boolean isHighSurrogate( char ch )
+    {
+        return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
+    }
+
+    private static final int MIN_CODE_POINT = 0x000000;
+    private static final int MAX_CODE_POINT = 0x10FFFF;
+    private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
+
+    private static boolean isValidCodePoint( int codePoint )
+    {
+        return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
+    }
+
+    private static boolean isSupplementaryCodePoint( int codePoint )
+    {
+        return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT 
>= codePoint );
+    }
+
+    private static char[] toChars( int codePoint )
+    {
+        if ( !isValidCodePoint( codePoint ) )
+        {
+            throw new IllegalArgumentException();
+        }
+
+        if ( isSupplementaryCodePoint( codePoint ) )
+        {
+            int cpPrime = codePoint - 0x10000;
+            int high = 0xD800 | ( ( cpPrime >> 10 ) & 0x3FF );
+            int low = 0xDC00 | ( cpPrime & 0x3FF );
+            return new char[] { (char) high, (char) low };
+        }
+        return new char[] { (char) codePoint };
+    }
 }

Modified: 
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java
URL: 
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java?rev=776288&r1=776287&r2=776288&view=diff
==============================================================================
--- 
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java
 (original)
+++ 
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java
 Tue May 19 12:18:28 2009
@@ -348,8 +348,8 @@
         assertEquals( "\u0161",  (String) event.getArgs()[0] );
 
         event = (SinkEventElement) it.next();
-        assertEquals( "unknown", event.getName() );
-        assertEquals( "&#x1d7ed;",  (String) event.getArgs()[0] );
+        assertEquals( "text", event.getName() );
+        assertEquals( "\uD835\uDFED",  (String) event.getArgs()[0] );
 
         event = (SinkEventElement) it.next();
         assertEquals( "bold_", event.getName() );
@@ -382,10 +382,9 @@
         assertEquals( "text", textEvt.getName() );
         assertEquals( "\u0159", textEvt.getArgs()[0] );
 
-        // TODO this should be emitted as the same text event as well
         textEvt = (SinkEventElement) it.next();
-        assertEquals( "unknown", textEvt.getName() );
-        assertEquals( "&#x1d7ed;", textEvt.getArgs()[0] );
+        assertEquals( "text", textEvt.getName() );
+        assertEquals( "\uD835\uDFED",  (String) textEvt.getArgs()[0] );
 
         textEvt = (SinkEventElement) it.next();
         assertEquals( "text", textEvt.getName() );
@@ -406,10 +405,9 @@
         assertEquals( "text", textEvt.getName() );
         assertEquals( "\u0159", textEvt.getArgs()[0] );
 
-        // TODO this should be emitted as the same text event as well
         textEvt = (SinkEventElement) it.next();
-        assertEquals( "unknown", textEvt.getName() );
-        assertEquals( "&#x1d7ed;", textEvt.getArgs()[0] );
+        assertEquals( "text", textEvt.getName() );
+        assertEquals( "\uD835\uDFED",  (String) textEvt.getArgs()[0] );
 
         textEvt = (SinkEventElement) it.next();
         assertEquals( "text", textEvt.getName() );

Modified: 
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java
URL: 
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java?rev=776288&r1=776287&r2=776288&view=diff
==============================================================================
--- 
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java
 (original)
+++ 
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java
 Tue May 19 12:18:28 2009
@@ -19,6 +19,7 @@
  * under the License.
  */
 
+import org.apache.commons.lang.StringEscapeUtils;
 import org.codehaus.plexus.PlexusTestCase;
 
 /**
@@ -46,6 +47,7 @@
         // xml mode
         assertEquals( HtmlTools.escapeHTML( "\u00e4", true ), "\u00e4" );
         assertEquals( HtmlTools.escapeHTML( "\u00e4", false ), "&#xe4;" );
+        assertEquals( HtmlTools.escapeHTML( "\u0159", false ), "&#x159;" );
         assertEquals( HtmlTools.escapeHTML( "\uD835\uDFED", false ), 
"&#x1d7ed;" );
     }
 
@@ -62,7 +64,20 @@
         assertEquals( "\"", HtmlTools.unescapeHtml( "&quot;" ) );
         assertEquals( "&amp;", HtmlTools.unescapeHtml( "&amp;amp;" ) );
         assertEquals( "&lt;Fran&ccedil;ais&gt;", HtmlTools.unescapeHtml( 
"&amp;lt;Fran&amp;ccedil;ais&amp;gt;" ) );
-        assertEquals( "&#x12345;", HtmlTools.unescapeHtml( "&#x12345;" ) );
+        assertEquals( "\u0159", HtmlTools.unescapeHtml( "&#x159;" ) );
+        assertEquals( "\uD808\uDF45", HtmlTools.unescapeHtml( "&#x12345;" ) );
+        assertEquals( "\uD835\uDFED", HtmlTools.unescapeHtml( "&#x1d7ed;" ) );
+        assertEquals( "\uD808\uDF45\uD835\uDFED", HtmlTools.unescapeHtml( 
"&#x12345;&#x1d7ed;" ) );
+
+        try
+        {
+            HtmlTools.unescapeHtml( "test &#x1d7ed test" );
+            assertTrue( false );
+        }
+        catch ( IllegalArgumentException e )
+        {
+            assertTrue( true );
+        }
     }
 
     /**


Reply via email to