Author: vsiveton Date: Tue May 19 12:18:28 2009 New Revision: 776288 URL: http://svn.apache.org/viewvc?rev=776288&view=rev Log: o improved escapeHTML and unescapeHTML for all entities o added more test cases o updated parser test o import part of ASF Harmony project
Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java?rev=776288&r1=776287&r2=776288&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/util/HtmlTools.java Tue May 19 12:18:28 2009 @@ -20,15 +20,15 @@ */ import java.io.UnsupportedEncodingException; - +import java.util.ArrayList; import java.util.Hashtable; +import java.util.List; import javax.swing.text.html.HTML.Tag; import org.apache.commons.lang.StringEscapeUtils; - import org.apache.maven.doxia.markup.HtmlMarkup; -import org.apache.xerces.util.XMLChar; +import org.codehaus.plexus.util.StringUtils; /** * The <code>HtmlTools</code> class defines methods to HTML handling. @@ -117,6 +117,12 @@ * If <code>xmlMode</code> is true, every other character than the above remains unchanged, * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code. * + * <b>Note</b>: all characters are encoded, i.e.: + * <pre> + * \u0159 = &#x159; + * \uD835\uDFED = &#x1d7ed; + * </pre> + * * @param text The String to escape, may be null. * @param xmlMode set to <code>false</code> to replace non-ascii characters. * @return The escaped text or the empty string if text == null. 
@@ -164,25 +170,9 @@ else { buffer.append( "&#x" ); - if ( XMLChar.isHighSurrogate( c ) ) + if ( isHighSurrogate( c ) ) { - int c2 = text.charAt( ++i ); - if ( XMLChar.isLowSurrogate( c2 ) ) - { - int sup = XMLChar.supplemental( c, (char) c2 ); - if ( !XMLChar.isValid( sup ) ) - { - throw new IllegalArgumentException( "Invalid XML character " - + Integer.toString( sup, 16 ) + " in " + text ); - } - - buffer.append( Integer.toHexString( sup ) ); - } - else - { - throw new IllegalArgumentException( "Invalid XML character " - + Integer.toString( c2, 16 ) + " in " + text ); - } + buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) ); } else { @@ -207,15 +197,67 @@ * <p>For example, the string "&lt;Fran&ccedil;ais&gt;" * will become "<Français>".</p> * - * @param text the <code>String</code> to unescape, may be null. + * <b>Note</b>: all unicode entities are decoded, i.e.: + * <pre> + * &#x159; = \u0159 + * &#x1d7ed; = \uD835\uDFED + * </pre> * + * @param text the <code>String</code> to unescape, may be null. * @return a new unescaped <code>String</code>, <code>null</code> if null string input. - * * @since 1.1.1. */ public static String unescapeHtml( String text ) { - return StringEscapeUtils.unescapeHtml( text ); + if ( text == null ) + { + return null; + } + + String unescaped = StringEscapeUtils.unescapeHtml( text ); + + if ( !text.equals( unescaped )) + { + return unescaped; + } + + String tmp = text; + List entities = new ArrayList(); + while ( true ) + { + int i = tmp.indexOf( "&#x" ); + if ( i == -1 ) + { + break; + } + + tmp = tmp.substring( i + 3 ); + if ( tmp.indexOf( ';' ) == -1 ) + { + throw new IllegalArgumentException( "Wrong HTML near '..." + tmp + "'" ); + } + + String entity = tmp.substring( 0, tmp.indexOf( ';' ) ); + try + { + Integer.parseInt( entity, 16 ); + } + catch ( Exception e ) + { + throw new IllegalArgumentException( "Wrong HTML near '..." 
+ tmp + "'" ); + } + entities.add( entity ); + } + + for ( int i = 0; i < entities.size(); i++ ) + { + String entity = (String) entities.get( i ); + + int codePoint = Integer.parseInt( entity, 16 ); + text = StringUtils.replace( text, "&#x" + entity + ";", new String( toChars( codePoint ) ) ); + } + + return text; } /** @@ -338,4 +380,57 @@ { // utility class } + + // + // Imported code from ASF Harmony project + // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java + // + + private static int toCodePoint( char high, char low ) + { + // See RFC 2781, Section 2.2 + // http://www.faqs.org/rfcs/rfc2781.html + int h = ( high & 0x3FF ) << 10; + int l = low & 0x3FF; + return ( h | l ) + 0x10000; + } + + private static final char MIN_HIGH_SURROGATE = '\uD800'; + private static final char MAX_HIGH_SURROGATE = '\uDBFF'; + + private static boolean isHighSurrogate( char ch ) + { + return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch ); + } + + private static final int MIN_CODE_POINT = 0x000000; + private static final int MAX_CODE_POINT = 0x10FFFF; + private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; + + private static boolean isValidCodePoint( int codePoint ) + { + return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint ); + } + + private static boolean isSupplementaryCodePoint( int codePoint ) + { + return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint ); + } + + private static char[] toChars( int codePoint ) + { + if ( !isValidCodePoint( codePoint ) ) + { + throw new IllegalArgumentException(); + } + + if ( isSupplementaryCodePoint( codePoint ) ) + { + int cpPrime = codePoint - 0x10000; + int high = 0xD800 | ( ( cpPrime >> 10 ) & 0x3FF ); + int low = 0xDC00 | ( cpPrime & 0x3FF ); + return new char[] { (char) high, (char) low }; + } + return new char[] { (char) codePoint }; + } } Modified: 
maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java?rev=776288&r1=776287&r2=776288&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java Tue May 19 12:18:28 2009 @@ -348,8 +348,8 @@ assertEquals( "\u0161", (String) event.getArgs()[0] ); event = (SinkEventElement) it.next(); - assertEquals( "unknown", event.getName() ); - assertEquals( "&#x1d7ed;", (String) event.getArgs()[0] ); + assertEquals( "text", event.getName() ); + assertEquals( "\uD835\uDFED", (String) event.getArgs()[0] ); event = (SinkEventElement) it.next(); assertEquals( "bold_", event.getName() ); @@ -382,10 +382,9 @@ assertEquals( "text", textEvt.getName() ); assertEquals( "\u0159", textEvt.getArgs()[0] ); - // TODO this should be emitted as the same text event as well textEvt = (SinkEventElement) it.next(); - assertEquals( "unknown", textEvt.getName() ); - assertEquals( "&#x1d7ed;", textEvt.getArgs()[0] ); + assertEquals( "text", textEvt.getName() ); + assertEquals( "\uD835\uDFED", (String) textEvt.getArgs()[0] ); textEvt = (SinkEventElement) it.next(); assertEquals( "text", textEvt.getName() ); @@ -406,10 +405,9 @@ assertEquals( "text", textEvt.getName() ); assertEquals( "\u0159", textEvt.getArgs()[0] ); - // TODO this should be emitted as the same text event as well textEvt = (SinkEventElement) it.next(); - assertEquals( "unknown", textEvt.getName() ); - assertEquals( "&#x1d7ed;", textEvt.getArgs()[0] ); + assertEquals( "text", textEvt.getName() ); + assertEquals( "\uD835\uDFED", (String) textEvt.getArgs()[0] ); textEvt = (SinkEventElement) it.next(); assertEquals( "text",
textEvt.getName() ); Modified: maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java?rev=776288&r1=776287&r2=776288&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/util/HtmlToolsTest.java Tue May 19 12:18:28 2009 @@ -19,6 +19,7 @@ * under the License. */ +import org.apache.commons.lang.StringEscapeUtils; import org.codehaus.plexus.PlexusTestCase; /** @@ -46,6 +47,7 @@ // xml mode assertEquals( HtmlTools.escapeHTML( "\u00e4", true ), "\u00e4" ); assertEquals( HtmlTools.escapeHTML( "\u00e4", false ), "&#xe4;" ); + assertEquals( HtmlTools.escapeHTML( "\u0159", false ), "&#x159;" ); assertEquals( HtmlTools.escapeHTML( "\uD835\uDFED", false ), "&#x1d7ed;" ); } @@ -62,7 +64,20 @@ assertEquals( "\"", HtmlTools.unescapeHtml( "&quot;" ) ); assertEquals( "&", HtmlTools.unescapeHtml( "&amp;" ) ); assertEquals( "<Français>", HtmlTools.unescapeHtml( "&lt;Fran&ccedil;ais&gt;" ) ); - assertEquals( "&#x12345;", HtmlTools.unescapeHtml( "&#x12345;" ) ); + assertEquals( "\u0159", HtmlTools.unescapeHtml( "&#x159;" ) ); + assertEquals( "\uD808\uDF45", HtmlTools.unescapeHtml( "&#x12345;" ) ); + assertEquals( "\uD835\uDFED", HtmlTools.unescapeHtml( "&#x1d7ed;" ) ); + assertEquals( "\uD808\uDF45\uD835\uDFED", HtmlTools.unescapeHtml( "&#x12345;&#x1d7ed;" ) ); + + try + { + HtmlTools.unescapeHtml( "test &#x1d7ed test" ); + assertTrue( false ); + } + catch ( IllegalArgumentException e ) + { + assertTrue( true ); + } } /**