Author: vsiveton Date: Sun Nov 2 16:51:25 2008 New Revision: 709994 URL: http://svn.apache.org/viewvc?rev=709994&view=rev Log: DOXIA-250: Xml parser should handle entities defined in doctype
o fixed the AbstractXmlParser to handle defined entities o added a test case Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java maven/doxia/doxia/trunk/doxia-modules/doxia-module-xhtml/src/test/java/org/apache/maven/doxia/module/xhtml/XhtmlParserTest.java Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java?rev=709994&r1=709993&r2=709994&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java Sun Nov 2 16:51:25 2008 @@ -21,12 +21,15 @@ import java.io.IOException; import java.io.Reader; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.maven.doxia.macro.MacroExecutionException; import org.apache.maven.doxia.markup.XmlMarkup; import org.apache.maven.doxia.sink.Sink; import org.apache.maven.doxia.sink.SinkEventAttributeSet; - import org.codehaus.plexus.util.StringUtils; import org.codehaus.plexus.util.xml.pull.MXParser; import org.codehaus.plexus.util.xml.pull.XmlPullParser; @@ -43,12 +46,22 @@ extends AbstractParser implements XmlMarkup { + /** Entity pattern for HTML entity, i.e. &nbsp; */ + private static final Pattern PATTERN_ENTITY_1 = + Pattern.compile( "<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>" ); + + /** Entity pattern for Unicode entity, i.e. 
&#38; */ + private static final Pattern PATTERN_ENTITY_2 = + Pattern.compile( "<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&#x?[0-9a-fA-F]{1,4};)(\\s)*\"(\\s)*>" ); + private boolean ignorable; private boolean collapsible; private boolean trimmable; + private Map entities; + /** {@inheritDoc} */ public void parse( Reader source, Sink sink ) throws ParseException @@ -65,8 +78,8 @@ } catch ( XmlPullParserException ex ) { - throw new ParseException( "Error parsing the model: " + ex.getMessage(), ex, ex.getLineNumber(), ex - .getColumnNumber() ); + throw new ParseException( "Error parsing the model: " + ex.getMessage(), ex, ex.getLineNumber(), + ex.getColumnNumber() ); } catch ( MacroExecutionException ex ) { @@ -180,7 +193,44 @@ } else if ( eventType == XmlPullParser.DOCDECL ) { - // nop + String text = parser.getText(); + int entitiesCount = StringUtils.countMatches( text, "<!ENTITY" ); + // entities defined in a local doctype + if ( entitiesCount > 0 ) + { + int start = text.indexOf( "<" ); + int end = text.lastIndexOf( ">" ); + if ( start != -1 && end != -1 ) + { + text = text.substring( start, end + 1 ); + for ( int i = 0; i < entitiesCount; i++ ) + { + String tmp = text.substring( text.indexOf( "<" ), text.indexOf( ">" ) + 1 ); + Matcher matcher = PATTERN_ENTITY_1.matcher( tmp ); + if ( matcher.find() && matcher.groupCount() == 7 ) + { + String entityName = matcher.group( 2 ); + String entityValue = matcher.group( 5 ); + + parser.defineEntityReplacementText( entityName, entityValue ); + getLocalEntities().put( entityName, entityValue ); + } + else + { + matcher = PATTERN_ENTITY_2.matcher( text ); + if ( matcher.find() && matcher.groupCount() == 7 ) + { + String entityName = matcher.group( 2 ); + String entityValue = matcher.group( 5 ); + + parser.defineEntityReplacementText( entityName, entityValue ); + getLocalEntities().put( entityName, entityValue ); + } + } + text = StringUtils.replace( text, tmp, "" ).trim(); + } + } + } } try @@ -355,4 +405,25 @@ return
text; } + + /** + * Return the defined entities in a local doctype, i.e.: + * <pre> + * <!DOCTYPE foo [ + * <!ENTITY bar "&#x160;"> + * <!ENTITY bar1 "&#x161;"> + * ]> + * </pre> + * + * @return a map of the defined entities in a local doctype. + */ + protected Map getLocalEntities() + { + if ( entities == null ) + { + entities = new LinkedHashMap(); + } + + return entities; + } } Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java?rev=709994&r1=709993&r2=709994&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java Sun Nov 2 16:51:25 2008 @@ -698,7 +698,14 @@ } else { - sink.text( text ); + if ( getLocalEntities().containsKey( textChars ) ) + { + sink.rawText( text ); + } + else + { + sink.text( text ); + } } } Modified: maven/doxia/doxia/trunk/doxia-modules/doxia-module-xhtml/src/test/java/org/apache/maven/doxia/module/xhtml/XhtmlParserTest.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-modules/doxia-module-xhtml/src/test/java/org/apache/maven/doxia/module/xhtml/XhtmlParserTest.java?rev=709994&r1=709993&r2=709994&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-modules/doxia-module-xhtml/src/test/java/org/apache/maven/doxia/module/xhtml/XhtmlParserTest.java (original) +++ maven/doxia/doxia/trunk/doxia-modules/doxia-module-xhtml/src/test/java/org/apache/maven/doxia/module/xhtml/XhtmlParserTest.java Sun Nov 2 16:51:25 2008 @@ -19,10 +19,12 @@ * under the License. 
*/ +import java.io.StringWriter; import java.util.Iterator; import org.apache.maven.doxia.parser.AbstractParserTest; import org.apache.maven.doxia.parser.Parser; +import org.apache.maven.doxia.sink.Sink; import org.apache.maven.doxia.sink.SinkEventElement; import org.apache.maven.doxia.sink.SinkEventTestingSink; @@ -104,4 +106,32 @@ assertFalse( it.hasNext() ); } + /** + * @throws Exception if any + */ + public void testDoxia250() + throws Exception + { + StringBuffer sb = new StringBuffer(); + sb.append( "<!DOCTYPE test [" ).append( EOL ); + sb.append( "<!ENTITY " ).append( EOL ).append( " foo " ).append( EOL ).append( " \" " ) + .append( EOL ).append( " &#x159; " ).append( EOL ).append( " \">" ).append( EOL ); + sb.append( "<!ENTITY " ).append( EOL ).append( " foo1 " ).append( EOL ).append( " \" " ) + .append( EOL ).append( " &nbsp; " ).append( EOL ).append( " \">" ).append( EOL ); + sb.append( "<!ENTITY " ).append( EOL ).append( " foo2 " ).append( EOL ).append( " \" " ) + .append( EOL ).append( " &#x161; " ).append( EOL ).append( " \">" ).append( EOL ); + sb.append( "]>" ).append( EOL ); + sb.append( "<html><body>&foo;&foo1;&foo2;</body></html>" ); + + String text = sb.toString(); + StringWriter w = new StringWriter(); + Sink sink = new XhtmlSink( w ); + // Should fail when fixing DOXIA-263 I guess. + ( (XhtmlParser) createParser() ).parse( text.toString(), sink ); + String result = w.toString(); + + assertTrue( result.indexOf( "&#x159;" ) != -1 ); + assertTrue( result.indexOf( "&nbsp;" ) != -1 ); + assertTrue( result.indexOf( "&#x161;" ) != -1 ); + } }