Author: vsiveton Date: Fri Jan 30 01:56:23 2009 New Revision: 739137 URL: http://svn.apache.org/viewvc?rev=739137&view=rev Log: DOXIA-250: Xml parser should handle entities defined in doctype
o better handle of entities o updated test case Modified: maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java Modified: maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java?rev=739137&r1=739136&r2=739137&view=diff ============================================================================== --- maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java (original) +++ maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java Fri Jan 30 01:56:23 2009 @@ -23,6 +23,7 @@ import com.gargoylesoftware.htmlunit.html.HtmlDivision; import com.gargoylesoftware.htmlunit.html.HtmlElement; import com.gargoylesoftware.htmlunit.html.HtmlHeader2; +import com.gargoylesoftware.htmlunit.html.HtmlHeader3; import com.gargoylesoftware.htmlunit.html.HtmlHeader4; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlParagraph; @@ -30,9 +31,8 @@ import java.util.Iterator; - /** - * + * Verify the <code>site/xdoc/entityTest.xml</code> * * @author ltheussl * @version $Id$ @@ -78,17 +78,41 @@ assertNotNull( h4 ); assertEquals( h4.asText().trim(), "Entities" ); + div = (HtmlDivision) elementIterator.next(); + + HtmlHeader3 h3 = (HtmlHeader3) elementIterator.next(); + assertNotNull( h3 ); + assertEquals( h3.asText().trim(), 
"Generic Entities" ); + + a = (HtmlAnchor) elementIterator.next(); + HtmlParagraph p = (HtmlParagraph) elementIterator.next(); assertNotNull( p ); - assertEquals( p.asText().trim(), "'&' '<' '>' '\"' ''' ' ' ' '" ); + assertEquals( p.asText().trim(), "'&' '<' '>' '\"' '''" ); div = (HtmlDivision) elementIterator.next(); - assertNotNull( div ); - assertEquals( div.getAttributeValue( "class" ), "section" ); - h4 = (HtmlHeader4) elementIterator.next(); - assertNotNull( h4 ); - assertEquals( h4.asText().trim(), "Comment" ); + h3 = (HtmlHeader3) elementIterator.next(); + assertNotNull( h3 ); + assertEquals( h3.asText().trim(), "Local Entities" ); + + a = (HtmlAnchor) elementIterator.next(); + + p = (HtmlParagraph) elementIterator.next(); + assertNotNull( p ); + assertEquals( p.asText().trim(), "'Î' 'Î' 'Î'" ); + + div = (HtmlDivision) elementIterator.next(); + + h3 = (HtmlHeader3) elementIterator.next(); + assertNotNull( h3 ); + assertEquals( h3.asText().trim(), "DTD Entities" ); + + a = (HtmlAnchor) elementIterator.next(); + + p = (HtmlParagraph) elementIterator.next(); + assertNotNull( p ); + assertEquals( p.asText().trim(), "' ' '¡' '¢'" ); div = (HtmlDivision) elementIterator.next(); assertNotNull( div ); @@ -106,6 +130,17 @@ assertNotNull( pre ); assertEquals( pre.asText().trim(), "<project xmlns:ant=\"jelly:ant\">" ); + p = (HtmlParagraph) elementIterator.next(); + assertNotNull( p ); + assertEquals( p.asText().trim(), "' ' '¡'" ); + + elementIterator.next(); // div + elementIterator.next(); // hr + elementIterator.next(); // div + elementIterator.next(); // div + elementIterator.next(); // hr + elementIterator.next(); // hr + assertFalse( elementIterator.hasNext() ); } } Modified: maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml URL: http://svn.apache.org/viewvc/maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml?rev=739137&r1=739136&r2=739137&view=diff 
============================================================================== --- maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml (original) +++ maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml Fri Jan 30 01:56:23 2009 @@ -19,15 +19,14 @@ --> <!DOCTYPE document [ - <!-- These are the entity sets for ISO Latin 1 characters for the XHTML --> - <!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent"> - %HTMLlat1; - <!-- These are the entity sets for special characters for the XHTML --> - <!ENTITY % HTMLsymbol PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent"> - %HTMLsymbol; - <!-- These are the entity sets for symbol characters for the XHTML --> - <!ENTITY % HTMLspecial PUBLIC "-//W3C//ENTITIES Special for XHTML//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent"> - %HTMLspecial; +<!-- These are the entity sets for ISO Latin 1 characters for the XHTML --> +<!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent"> +%HTMLlat1; +<!-- Some entities from http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent --> +<!ENTITY Alpha "Α"> <!-- greek capital letter alpha, U+0391 --> +<!ENTITY Beta "Β"> <!-- greek capital letter beta, U+0392 --> +<!ENTITY Gamma "Γ"> <!-- greek capital letter gamma, +U+0393 ISOgrk3 --> ]> <document xmlns="http://maven.apache.org/XDOC/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" @@ -41,13 +40,18 @@ <section name="section name"> <h4>Entities</h4> - <p>'&' '<' '>' '"' ''' ' ' ' '</p> + <h3>Generic Entities</h3> + <p>'&' '<' '>' '"' '''</p> - <h4>Comment</h4> - <!-- a comment and nothing else! 
--> + <h3>Local Entities</h3> + <p>'Α' 'Β' 'Γ'</p> + + <h3>DTD Entities</h3> + <p>' ' '¡' '¢'</p> <h4>CDATA</h4> <source><![CDATA[<project xmlns:ant="jelly:ant">]]></source> + <p><![CDATA[' ' '¡']]></p> </section> Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java?rev=739137&r1=739136&r2=739137&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java Fri Jan 30 01:56:23 2009 @@ -42,4 +42,10 @@ /** CDATA string: "CDATA" */ String CDATA = "CDATA"; + + /** DOCTYPE start string: "<!DOCTYPE" */ + String DOCTYPE_START = "<!DOCTYPE"; + + /** ENTITY start string: "<!ENTITY" */ + String ENTITY_START = "<!ENTITY"; } Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java?rev=739137&r1=739136&r2=739137&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java Fri Jan 30 01:56:23 2009 @@ -30,6 +30,7 @@ import java.io.StringReader; import java.net.URL; import java.util.Hashtable; +import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Locale; import java.util.Map; @@ -70,16 +71,21 @@ extends AbstractParser implements XmlMarkup { - /** Entity pattern for HTML entity, i.e. 
&nbsp; see http://www.w3.org/TR/REC-xml/#NT-EntityDecl */ + /** Entity pattern for HTML entity, i.e. &nbsp; "<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*> + * <br/> + * see <a href="http://www.w3.org/TR/REC-xml/#NT-EntityDecl">http://www.w3.org/TR/REC-xml/#NT-EntityDecl</a> */ private static final Pattern PATTERN_ENTITY_1 = - Pattern.compile( "<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>" ); + Pattern.compile( ENTITY_START + "(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>" ); - /** Entity pattern for Unicode entity, i.e. &#38; see http://www.w3.org/TR/REC-xml/#NT-EntityDecl */ + /** Entity pattern for Unicode entity, i.e. &#38; "<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&(#x?[0-9a-fA-F]{1,4};)*)(\\s)*\"(\\s)*>" + * <br/> + * see <a href="http://www.w3.org/TR/REC-xml/#NT-EntityDecl">http://www.w3.org/TR/REC-xml/#NT-EntityDecl</a> */ private static final Pattern PATTERN_ENTITY_2 = - Pattern.compile( "<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&#x?[0-9a-fA-F]{1,4};)(\\s)*\"(\\s)*>" ); + Pattern.compile( ENTITY_START + "(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&(#x?[0-9a-fA-F]{1,4};)*)(\\s)*\"(\\s)*>" ); - /** Doctype pattern as defined in http://www.w3.org/TR/REC-xml/#NT-doctypedecl */ - private static final Pattern PATTERN_DOCTYPE = Pattern.compile( ".*<!DOCTYPE([^>]*)>.*" ); + /** Doctype pattern i.e. 
".*<!DOCTYPE([^>]*)>.*" + * see <a href="http://www.w3.org/TR/REC-xml/#NT-doctypedecl">http://www.w3.org/TR/REC-xml/#NT-doctypedecl</a> */ + private static final Pattern PATTERN_DOCTYPE = Pattern.compile( ".*" + DOCTYPE_START + "([^>]*)>.*" ); /** Tag pattern as defined in http://www.w3.org/TR/REC-xml/#NT-Name */ private static final Pattern PATTERN_TAG = Pattern.compile( ".*<([A-Za-z][A-Za-z0-9:_.-]*)([^>]*)>.*" ); @@ -247,43 +253,13 @@ } else if ( eventType == XmlPullParser.DOCDECL ) { - String text = parser.getText(); - int entitiesCount = StringUtils.countMatches( text, "<!ENTITY" ); - // entities defined in a local doctype - if ( entitiesCount > 0 ) + addLocalEntities( parser, parser.getText() ); + + for ( Iterator it = CachedFileEntityResolver.ENTITY_CACHE.values().iterator(); it.hasNext(); ) { - int start = text.indexOf( "<" ); - int end = text.lastIndexOf( ">" ); - if ( start != -1 && end != -1 ) - { - text = text.substring( start, end + 1 ); - for ( int i = 0; i < entitiesCount; i++ ) - { - String tmp = text.substring( text.indexOf( "<" ), text.indexOf( ">" ) + 1 ); - Matcher matcher = PATTERN_ENTITY_1.matcher( tmp ); - if ( matcher.find() && matcher.groupCount() == 7 ) - { - String entityName = matcher.group( 2 ); - String entityValue = matcher.group( 5 ); - - parser.defineEntityReplacementText( entityName, entityValue ); - getLocalEntities().put( entityName, entityValue ); - } - else - { - matcher = PATTERN_ENTITY_2.matcher( text ); - if ( matcher.find() && matcher.groupCount() == 7 ) - { - String entityName = matcher.group( 2 ); - String entityValue = matcher.group( 5 ); - - parser.defineEntityReplacementText( entityName, entityValue ); - getLocalEntities().put( entityName, entityValue ); - } - } - text = StringUtils.replace( text, tmp, "" ).trim(); - } - } + byte[] res = (byte[])it.next(); + + addDTDEntities( parser, new String( res ) ); } } @@ -589,6 +565,123 @@ } /** + * Add an entity given by <code>entityName</code> and <code>entityValue</code> to 
{@link #entities}. + * <br/> + * By default, we exclude the default XML entities: &amp;, &lt;, &gt;, &quot; and &apos;. + * + * @param parser not null + * @param entityName not null + * @param entityValue not null + * @throws XmlPullParserException if any + * @see {@link XmlPullParser#defineEntityReplacementText(String, String)} + */ + private void addEntity( XmlPullParser parser, String entityName, String entityValue ) + throws XmlPullParserException + { + if ( entityName.endsWith( "amp" ) || entityName.endsWith( "lt" ) || entityName.endsWith( "gt" ) + || entityName.endsWith( "quot" ) || entityName.endsWith( "apos" ) ) + { + return; + } + + parser.defineEntityReplacementText( entityName, entityValue ); + getLocalEntities().put( entityName, entityValue ); + } + + /** + * Handle entities defined in a local doctype as the following: + * <pre> + * <!DOCTYPE foo [ + * <!ENTITY bar "&#x160;"> + * <!ENTITY bar1 "&#x161;"> + * ]> + * </pre> + * + * @param parser not null + * @param text not null + * @throws XmlPullParserException if any + */ + private void addLocalEntities( XmlPullParser parser, String text ) + throws XmlPullParserException + { + int entitiesCount = StringUtils.countMatches( text, ENTITY_START ); + if ( entitiesCount > 0 ) + { + // text should be foo [...] 
+ int start = text.indexOf( "[" ); + int end = text.lastIndexOf( "]" ); + if ( start != -1 && end != -1 ) + { + text = text.substring( start + 1, end ); + addDTDEntities( parser, text ); + } + } + } + + /** + * Handle entities defined in external doctypes as the following: + * <pre> + * <!DOCTYPE foo [ + * <!-- These are the entity sets for ISO Latin 1 characters for the XHTML --> + * <!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent"> + * %HTMLlat1; + * ]> + * </pre> + * + * @param parser not null + * @param text not null + * @throws XmlPullParserException if any + */ + private void addDTDEntities( XmlPullParser parser, String text ) + throws XmlPullParserException + { + int entitiesCount = StringUtils.countMatches( text, ENTITY_START ); + if ( entitiesCount > 0 ) + { + BufferedReader reader = new BufferedReader( new StringReader( text ) ); + String line; + String tmpLine = ""; + try + { + Matcher matcher; + while ( ( line = reader.readLine() ) != null ) + { + tmpLine += "\n" + line; + matcher = PATTERN_ENTITY_1.matcher( tmpLine ); + if ( matcher.find() && matcher.groupCount() == 7 ) + { + String entityName = matcher.group( 2 ); + String entityValue = matcher.group( 5 ); + + addEntity( parser, entityName, entityValue ); + tmpLine = ""; + } + else + { + matcher = PATTERN_ENTITY_2.matcher( tmpLine ); + if ( matcher.find() && matcher.groupCount() == 8 ) + { + String entityName = matcher.group( 2 ); + String entityValue = matcher.group( 5 ); + + addEntity( parser, entityName, entityValue ); + tmpLine = ""; + } + } + } + } + catch ( IOException e ) + { + // nop + } + finally + { + IOUtil.close( reader ); + } + } + } + + /** * Convenience class to beautify <code>SAXParseException</code> messages. 
 */ static class MessagesErrorHandler @@ -714,13 +807,14 @@ public static class CachedFileEntityResolver implements EntityResolver { - private static final Map cache = new Hashtable(); + /** Map with systemId as key and the content of systemId as byte[]. */ + protected static final Map ENTITY_CACHE = new Hashtable(); /** {@inheritDoc} */ public InputSource resolveEntity( String publicId, String systemId ) throws SAXException, IOException { - byte[] res = (byte[]) cache.get( systemId ); + byte[] res = (byte[]) ENTITY_CACHE.get( systemId ); // already cached? if ( res == null ) { @@ -758,7 +852,7 @@ res = toByteArray( temp.toURL() ); } - cache.put( systemId, res ); + ENTITY_CACHE.put( systemId, res ); } InputSource is = new InputSource( new ByteArrayInputStream( res ) );