Author: vsiveton
Date: Fri Jan 30 01:56:23 2009
New Revision: 739137

URL: http://svn.apache.org/viewvc?rev=739137&view=rev
Log:
DOXIA-250: Xml parser should handle entities defined in doctype

o better handling of entities
o updated test case

Modified:
    
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java
    
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml
    
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java
    
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java

Modified: 
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java
URL: 
http://svn.apache.org/viewvc/maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java?rev=739137&r1=739136&r2=739137&view=diff
==============================================================================
--- 
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java
 (original)
+++ 
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/java/org/apache/maven/doxia/siterenderer/EntitiesVerifier.java
 Fri Jan 30 01:56:23 2009
@@ -23,6 +23,7 @@
 import com.gargoylesoftware.htmlunit.html.HtmlDivision;
 import com.gargoylesoftware.htmlunit.html.HtmlElement;
 import com.gargoylesoftware.htmlunit.html.HtmlHeader2;
+import com.gargoylesoftware.htmlunit.html.HtmlHeader3;
 import com.gargoylesoftware.htmlunit.html.HtmlHeader4;
 import com.gargoylesoftware.htmlunit.html.HtmlPage;
 import com.gargoylesoftware.htmlunit.html.HtmlParagraph;
@@ -30,9 +31,8 @@
 
 import java.util.Iterator;
 
-
 /**
- * 
+ * Verify the <code>site/xdoc/entityTest.xml</code>
  *
  * @author ltheussl
  * @version $Id$
@@ -78,17 +78,41 @@
         assertNotNull( h4 );
         assertEquals( h4.asText().trim(), "Entities" );
 
+        div = (HtmlDivision) elementIterator.next();
+
+        HtmlHeader3 h3 = (HtmlHeader3) elementIterator.next();
+        assertNotNull( h3 );
+        assertEquals( h3.asText().trim(), "Generic Entities" );
+
+        a = (HtmlAnchor) elementIterator.next();
+
         HtmlParagraph p = (HtmlParagraph) elementIterator.next();
         assertNotNull( p );
-        assertEquals( p.asText().trim(), "'&' '<' '>' '\"' ''' ' ' ' '" );
+        assertEquals( p.asText().trim(), "'&' '<' '>' '\"' '''" );
 
         div = (HtmlDivision) elementIterator.next();
-        assertNotNull( div );
-        assertEquals( div.getAttributeValue( "class" ), "section" );
 
-        h4 = (HtmlHeader4) elementIterator.next();
-        assertNotNull( h4 );
-        assertEquals( h4.asText().trim(), "Comment" );
+        h3 = (HtmlHeader3) elementIterator.next();
+        assertNotNull( h3 );
+        assertEquals( h3.asText().trim(), "Local Entities" );
+
+        a = (HtmlAnchor) elementIterator.next();
+
+        p = (HtmlParagraph) elementIterator.next();
+        assertNotNull( p );
+        assertEquals( p.asText().trim(), "'Α' 'Β' 'Γ'" );
+
+        div = (HtmlDivision) elementIterator.next();
+
+        h3 = (HtmlHeader3) elementIterator.next();
+        assertNotNull( h3 );
+        assertEquals( h3.asText().trim(), "DTD Entities" );
+
+        a = (HtmlAnchor) elementIterator.next();
+
+        p = (HtmlParagraph) elementIterator.next();
+        assertNotNull( p );
+        assertEquals( p.asText().trim(), "' ' '¡' '¢'" );
 
         div = (HtmlDivision) elementIterator.next();
         assertNotNull( div );
@@ -106,6 +130,17 @@
         assertNotNull( pre );
         assertEquals( pre.asText().trim(), "<project xmlns:ant=\"jelly:ant\">" 
);
 
+        p = (HtmlParagraph) elementIterator.next();
+        assertNotNull( p );
+        assertEquals( p.asText().trim(), "'&nbsp;' '&iexcl;'" );
+
+        elementIterator.next(); // div
+        elementIterator.next(); // hr
+        elementIterator.next(); // div
+        elementIterator.next(); // div
+        elementIterator.next(); // hr
+        elementIterator.next(); // hr
+
         assertFalse( elementIterator.hasNext() );
     }
 }

Modified: 
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml
URL: 
http://svn.apache.org/viewvc/maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml?rev=739137&r1=739136&r2=739137&view=diff
==============================================================================
--- 
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml
 (original)
+++ 
maven/doxia/doxia-sitetools/trunk/doxia-site-renderer/src/test/resources/site/xdoc/entityTest.xml
 Fri Jan 30 01:56:23 2009
@@ -19,15 +19,14 @@
 -->
 
 <!DOCTYPE document [
-  <!-- These are the entity sets for ISO Latin 1 characters for the XHTML -->
-  <!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">
-  %HTMLlat1;
-  <!-- These are the entity sets for special characters for the XHTML -->
-  <!ENTITY % HTMLsymbol PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent">
-  %HTMLsymbol;
-  <!-- These are the entity sets for symbol characters for the XHTML -->
-  <!ENTITY % HTMLspecial PUBLIC "-//W3C//ENTITIES Special for XHTML//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent">
-  %HTMLspecial;
+<!-- These are the entity sets for ISO Latin 1 characters for the XHTML -->
+<!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">
+%HTMLlat1;
+<!-- Some entities from http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent -->
+<!ENTITY Alpha    "&#913;"> <!-- greek capital letter alpha, U+0391 -->
+<!ENTITY Beta     "&#914;"> <!-- greek capital letter beta, U+0392 -->
+<!ENTITY Gamma    "&#915;"> <!-- greek capital letter gamma,
+U+0393 ISOgrk3 -->
 ]>
 <document xmlns="http://maven.apache.org/XDOC/2.0"
   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
@@ -41,13 +40,18 @@
     <section name="section name">
 
       <h4>Entities</h4>
-      <p>'&amp;' '&lt;' '&gt;' '&quot;' '&apos;' '&nbsp;' '&#160;'</p>
+      <h3>Generic Entities</h3>
+      <p>'&amp;' '&lt;' '&gt;' '&quot;' '&apos;'</p>
 
-      <h4>Comment</h4>
-      <!-- a comment and nothing else! -->
+      <h3>Local Entities</h3>
+      <p>'&Alpha;' '&Beta;' '&Gamma;'</p>
+
+      <h3>DTD Entities</h3>
+      <p>'&nbsp;' '&iexcl;' '&cent;'</p>
 
       <h4>CDATA</h4>
       <source><![CDATA[<project xmlns:ant="jelly:ant">]]></source>
+      <p><![CDATA['&nbsp;' '&iexcl;']]></p>
 
     </section>
 

Modified: 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java
URL: 
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java?rev=739137&r1=739136&r2=739137&view=diff
==============================================================================
--- 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java
 (original)
+++ 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/markup/XmlMarkup.java
 Fri Jan 30 01:56:23 2009
@@ -42,4 +42,10 @@
 
     /** CDATA string: "CDATA" */
     String CDATA = "CDATA";
+
+    /** DOCTYPE start string: "&lt;!DOCTYPE" */
+    String DOCTYPE_START = "<!DOCTYPE";
+
+    /** ENTITY start string: "&lt;!ENTITY" */
+    String ENTITY_START = "<!ENTITY";
 }

Modified: 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java
URL: 
http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java?rev=739137&r1=739136&r2=739137&view=diff
==============================================================================
--- 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java
 (original)
+++ 
maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java
 Fri Jan 30 01:56:23 2009
@@ -30,6 +30,7 @@
 import java.io.StringReader;
 import java.net.URL;
 import java.util.Hashtable;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.Locale;
 import java.util.Map;
@@ -70,16 +71,21 @@
     extends AbstractParser
     implements XmlMarkup
 {
-    /** Entity pattern for HTML entity, i.e. &#38;nbsp; see 
http://www.w3.org/TR/REC-xml/#NT-EntityDecl */
+    /** Entity pattern for HTML entity, i.e. &#38;nbsp; 
"<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>
+     * <br/>
+     * see <a 
href="http://www.w3.org/TR/REC-xml/#NT-EntityDecl">http://www.w3.org/TR/REC-xml/#NT-EntityDecl</a>
 */
     private static final Pattern PATTERN_ENTITY_1 =
-        Pattern.compile( 
"<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>" );
+        Pattern.compile( ENTITY_START + 
"(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>" );
 
-    /** Entity pattern for Unicode entity, i.e. &#38;#38; see 
http://www.w3.org/TR/REC-xml/#NT-EntityDecl */
+    /** Entity pattern for Unicode entity, i.e. &#38;#38; 
"<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&(#x?[0-9a-fA-F]{1,4};)*)(\\s)*\"(\\s)*>"
+     * <br/>
+     * see <a 
href="http://www.w3.org/TR/REC-xml/#NT-EntityDecl">http://www.w3.org/TR/REC-xml/#NT-EntityDecl</a>
 */
     private static final Pattern PATTERN_ENTITY_2 =
-        Pattern.compile( 
"<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&#x?[0-9a-fA-F]{1,4};)(\\s)*\"(\\s)*>"
 );
+        Pattern.compile( ENTITY_START + 
"(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&(#x?[0-9a-fA-F]{1,4};)*)(\\s)*\"(\\s)*>" );
 
-    /** Doctype pattern as defined in 
http://www.w3.org/TR/REC-xml/#NT-doctypedecl */
-    private static final Pattern PATTERN_DOCTYPE = Pattern.compile( 
".*<!DOCTYPE([^>]*)>.*" );
+    /** Doctype pattern i.e. ".*<!DOCTYPE([^>]*)>.*"
+     * see <a 
href="http://www.w3.org/TR/REC-xml/#NT-doctypedecl">http://www.w3.org/TR/REC-xml/#NT-doctypedecl</a>
 */
+    private static final Pattern PATTERN_DOCTYPE = Pattern.compile( ".*" + 
DOCTYPE_START + "([^>]*)>.*" );
 
     /** Tag pattern as defined in http://www.w3.org/TR/REC-xml/#NT-Name */
     private static final Pattern PATTERN_TAG = Pattern.compile( 
".*<([A-Za-z][A-Za-z0-9:_.-]*)([^>]*)>.*" );
@@ -247,43 +253,13 @@
             }
             else if ( eventType == XmlPullParser.DOCDECL )
             {
-                String text = parser.getText();
-                int entitiesCount = StringUtils.countMatches( text, "<!ENTITY" 
);
-                // entities defined in a local doctype
-                if ( entitiesCount > 0 )
+                addLocalEntities( parser, parser.getText() );
+
+                for ( Iterator it = 
CachedFileEntityResolver.ENTITY_CACHE.values().iterator(); it.hasNext(); )
                 {
-                    int start = text.indexOf( "<" );
-                    int end = text.lastIndexOf( ">" );
-                    if ( start != -1 && end != -1 )
-                    {
-                        text = text.substring( start, end + 1 );
-                        for ( int i = 0; i < entitiesCount; i++ )
-                        {
-                            String tmp = text.substring( text.indexOf( "<" ), 
text.indexOf( ">" ) + 1 );
-                            Matcher matcher = PATTERN_ENTITY_1.matcher( tmp );
-                            if ( matcher.find() && matcher.groupCount() == 7 )
-                            {
-                                String entityName = matcher.group( 2 );
-                                String entityValue = matcher.group( 5 );
-
-                                parser.defineEntityReplacementText( 
entityName, entityValue );
-                                getLocalEntities().put( entityName, 
entityValue );
-                            }
-                            else
-                            {
-                                matcher = PATTERN_ENTITY_2.matcher( text );
-                                if ( matcher.find() && matcher.groupCount() == 
7 )
-                                {
-                                    String entityName = matcher.group( 2 );
-                                    String entityValue = matcher.group( 5 );
-
-                                    parser.defineEntityReplacementText( 
entityName, entityValue );
-                                    getLocalEntities().put( entityName, 
entityValue );
-                                }
-                            }
-                            text = StringUtils.replace( text, tmp, "" ).trim();
-                        }
-                    }
+                    byte[] res = (byte[])it.next();
+
+                    addDTDEntities( parser, new String( res ) );
                 }
             }
 
@@ -589,6 +565,123 @@
     }
 
     /**
+     * Add an entity given by <code>entityName</code> and 
<code>entityValue</code> to {@link #entities}.
+     * <br/>
+     * By default, we exclude the default XML entities: &#38;amp;, &#38;lt;, 
&#38;gt;, &#38;quot; and &#38;apos;.
+     *
+     * @param parser not null
+     * @param entityName not null
+     * @param entityValue not null
+     * @throws XmlPullParserException if any
+     * @see {@link XmlPullParser#defineEntityReplacementText(String, 
String)}
+     */
+    private void addEntity( XmlPullParser parser, String entityName, String 
entityValue )
+        throws XmlPullParserException
+    {
+        if ( entityName.endsWith( "amp" ) || entityName.endsWith( "lt" ) || 
entityName.endsWith( "gt" )
+            || entityName.endsWith( "quot" ) || entityName.endsWith( "apos" ) )
+        {
+            return;
+        }
+
+        parser.defineEntityReplacementText( entityName, entityValue );
+        getLocalEntities().put( entityName, entityValue );
+    }
+
+    /**
+     * Handle entities defined in a local doctype as the following:
+     * <pre>
+     * &lt;!DOCTYPE foo [
+     *   &lt;!ENTITY bar "&#38;#x160;"&gt;
+     *   &lt;!ENTITY bar1 "&#38;#x161;"&gt;
+     * ]&gt;
+     * </pre>
+     *
+     * @param parser not null
+     * @param text not null
+     * @throws XmlPullParserException if any
+     */
+    private void addLocalEntities( XmlPullParser parser, String text )
+        throws XmlPullParserException
+    {
+        int entitiesCount = StringUtils.countMatches( text, ENTITY_START );
+        if ( entitiesCount > 0 )
+        {
+            // text should be foo [...]
+            int start = text.indexOf( "[" );
+            int end = text.lastIndexOf( "]" );
+            if ( start != -1 && end != -1 )
+            {
+                text = text.substring( start + 1, end );
+                addDTDEntities( parser, text );
+            }
+        }
+    }
+
+    /**
+     * Handle entities defined in external doctypes as the following:
+     * <pre>
+     * &lt;!DOCTYPE foo [
+     *   &lt;!-- These are the entity sets for ISO Latin 1 characters for the 
XHTML --&gt;
+     *   &lt;!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for 
XHTML//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent"&gt;
+     *   %HTMLlat1;
+     * ]&gt;
+     * </pre>
+     *
+     * @param parser not null
+     * @param text not null
+     * @throws XmlPullParserException if any
+     */
+    private void addDTDEntities( XmlPullParser parser, String text )
+        throws XmlPullParserException
+    {
+        int entitiesCount = StringUtils.countMatches( text, ENTITY_START );
+        if ( entitiesCount > 0 )
+        {
+            BufferedReader reader = new BufferedReader( new StringReader( text 
) );
+            String line;
+            String tmpLine = "";
+            try
+            {
+                Matcher matcher;
+                while ( ( line = reader.readLine() ) != null )
+                {
+                    tmpLine += "\n" + line;
+                    matcher = PATTERN_ENTITY_1.matcher( tmpLine );
+                    if ( matcher.find() && matcher.groupCount() == 7 )
+                    {
+                        String entityName = matcher.group( 2 );
+                        String entityValue = matcher.group( 5 );
+
+                        addEntity( parser, entityName, entityValue );
+                        tmpLine = "";
+                    }
+                    else
+                    {
+                        matcher = PATTERN_ENTITY_2.matcher( tmpLine );
+                        if ( matcher.find() && matcher.groupCount() == 8 )
+                        {
+                            String entityName = matcher.group( 2 );
+                            String entityValue = matcher.group( 5 );
+
+                            addEntity( parser, entityName, entityValue );
+                            tmpLine = "";
+                        }
+                    }
+                }
+            }
+            catch ( IOException e )
+            {
+                // nop
+            }
+            finally
+            {
+                IOUtil.close( reader );
+            }
+        }
+    }
+
+    /**
      * Convenience class to beautify <code>SAXParseException</code> messages.
      */
     static class MessagesErrorHandler
@@ -714,13 +807,14 @@
     public static class CachedFileEntityResolver
         implements EntityResolver
     {
-        private static final Map cache = new Hashtable();
+        /** Map with systemId as key and the content of systemId as byte[]. */
+        protected static final Map ENTITY_CACHE = new Hashtable();
 
         /** {@inheritDoc} */
         public InputSource resolveEntity( String publicId, String systemId )
             throws SAXException, IOException
         {
-            byte[] res = (byte[]) cache.get( systemId );
+            byte[] res = (byte[]) ENTITY_CACHE.get( systemId );
             // already cached?
             if ( res == null )
             {
@@ -758,7 +852,7 @@
                     res = toByteArray( temp.toURL() );
                 }
 
-                cache.put( systemId, res );
+                ENTITY_CACHE.put( systemId, res );
             }
 
             InputSource is = new InputSource( new ByteArrayInputStream( res ) 
);


Reply via email to