Author: olamy Date: Mon Jan 21 21:59:27 2013 New Revision: 1436646 URL: http://svn.apache.org/viewvc?rev=1436646&view=rev Log: [DOXIA-480] XhtmlBaseParser ignores XHTML default entities Submitted by Andrius Velykis
Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java?rev=1436646&r1=1436645&r2=1436646&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/AbstractXmlParser.java Mon Jan 21 21:59:27 2013 @@ -135,6 +135,10 @@ public abstract class AbstractXmlParser XmlPullParser parser = new MXParser(); parser.setInput( src ); + + // allow parser initialization, e.g. for additional entities in XHTML + // Note: do it after input is set, otherwise values are reset + initXmlParser(parser); sink.enableLogging( getLog() ); @@ -153,6 +157,18 @@ public abstract class AbstractXmlParser setSecondParsing( false ); init(); } + + /** + * Initializes the parser with custom entities or other options. + * + * @param parser A parser, not null. 
+ * @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem initializing the parser + */ + protected void initXmlParser( XmlPullParser parser ) + throws XmlPullParserException + { + // nop + } /** * {@inheritDoc} Modified: maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java?rev=1436646&r1=1436645&r2=1436646&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/main/java/org/apache/maven/doxia/parser/XhtmlBaseParser.java Mon Jan 21 21:59:27 2013 @@ -99,6 +99,285 @@ public class XhtmlBaseParser } /** + * {@inheritDoc} + * + * Adds all XHTML (HTML 4.0) entities to the parser so that they can be recognized and resolved + * without additional DTD. 
+ */ + @Override + protected void initXmlParser(XmlPullParser parser) + throws XmlPullParserException + { + super.initXmlParser(parser); + + // the entities taken from org.apache.maven.doxia.document.io.xpp3.DocumentXpp3Reader, + // which is generated automatically + + // ---------------------------------------------------------------------- + // Latin 1 entities + // ---------------------------------------------------------------------- + + parser.defineEntityReplacementText( "nbsp", "\u00a0" ); + parser.defineEntityReplacementText( "iexcl", "\u00a1" ); + parser.defineEntityReplacementText( "cent", "\u00a2" ); + parser.defineEntityReplacementText( "pound", "\u00a3" ); + parser.defineEntityReplacementText( "curren", "\u00a4" ); + parser.defineEntityReplacementText( "yen", "\u00a5" ); + parser.defineEntityReplacementText( "brvbar", "\u00a6" ); + parser.defineEntityReplacementText( "sect", "\u00a7" ); + parser.defineEntityReplacementText( "uml", "\u00a8" ); + parser.defineEntityReplacementText( "copy", "\u00a9" ); + parser.defineEntityReplacementText( "ordf", "\u00aa" ); + parser.defineEntityReplacementText( "laquo", "\u00ab" ); + parser.defineEntityReplacementText( "not", "\u00ac" ); + parser.defineEntityReplacementText( "shy", "\u00ad" ); + parser.defineEntityReplacementText( "reg", "\u00ae" ); + parser.defineEntityReplacementText( "macr", "\u00af" ); + parser.defineEntityReplacementText( "deg", "\u00b0" ); + parser.defineEntityReplacementText( "plusmn", "\u00b1" ); + parser.defineEntityReplacementText( "sup2", "\u00b2" ); + parser.defineEntityReplacementText( "sup3", "\u00b3" ); + parser.defineEntityReplacementText( "acute", "\u00b4" ); + parser.defineEntityReplacementText( "micro", "\u00b5" ); + parser.defineEntityReplacementText( "para", "\u00b6" ); + parser.defineEntityReplacementText( "middot", "\u00b7" ); + parser.defineEntityReplacementText( "cedil", "\u00b8" ); + parser.defineEntityReplacementText( "sup1", "\u00b9" ); + parser.defineEntityReplacementText( 
"ordm", "\u00ba" ); + parser.defineEntityReplacementText( "raquo", "\u00bb" ); + parser.defineEntityReplacementText( "frac14", "\u00bc" ); + parser.defineEntityReplacementText( "frac12", "\u00bd" ); + parser.defineEntityReplacementText( "frac34", "\u00be" ); + parser.defineEntityReplacementText( "iquest", "\u00bf" ); + parser.defineEntityReplacementText( "Agrave", "\u00c0" ); + parser.defineEntityReplacementText( "Aacute", "\u00c1" ); + parser.defineEntityReplacementText( "Acirc", "\u00c2" ); + parser.defineEntityReplacementText( "Atilde", "\u00c3" ); + parser.defineEntityReplacementText( "Auml", "\u00c4" ); + parser.defineEntityReplacementText( "Aring", "\u00c5" ); + parser.defineEntityReplacementText( "AElig", "\u00c6" ); + parser.defineEntityReplacementText( "Ccedil", "\u00c7" ); + parser.defineEntityReplacementText( "Egrave", "\u00c8" ); + parser.defineEntityReplacementText( "Eacute", "\u00c9" ); + parser.defineEntityReplacementText( "Ecirc", "\u00ca" ); + parser.defineEntityReplacementText( "Euml", "\u00cb" ); + parser.defineEntityReplacementText( "Igrave", "\u00cc" ); + parser.defineEntityReplacementText( "Iacute", "\u00cd" ); + parser.defineEntityReplacementText( "Icirc", "\u00ce" ); + parser.defineEntityReplacementText( "Iuml", "\u00cf" ); + parser.defineEntityReplacementText( "ETH", "\u00d0" ); + parser.defineEntityReplacementText( "Ntilde", "\u00d1" ); + parser.defineEntityReplacementText( "Ograve", "\u00d2" ); + parser.defineEntityReplacementText( "Oacute", "\u00d3" ); + parser.defineEntityReplacementText( "Ocirc", "\u00d4" ); + parser.defineEntityReplacementText( "Otilde", "\u00d5" ); + parser.defineEntityReplacementText( "Ouml", "\u00d6" ); + parser.defineEntityReplacementText( "times", "\u00d7" ); + parser.defineEntityReplacementText( "Oslash", "\u00d8" ); + parser.defineEntityReplacementText( "Ugrave", "\u00d9" ); + parser.defineEntityReplacementText( "Uacute", "\u00da" ); + parser.defineEntityReplacementText( "Ucirc", "\u00db" ); + 
parser.defineEntityReplacementText( "Uuml", "\u00dc" ); + parser.defineEntityReplacementText( "Yacute", "\u00dd" ); + parser.defineEntityReplacementText( "THORN", "\u00de" ); + parser.defineEntityReplacementText( "szlig", "\u00df" ); + parser.defineEntityReplacementText( "agrave", "\u00e0" ); + parser.defineEntityReplacementText( "aacute", "\u00e1" ); + parser.defineEntityReplacementText( "acirc", "\u00e2" ); + parser.defineEntityReplacementText( "atilde", "\u00e3" ); + parser.defineEntityReplacementText( "auml", "\u00e4" ); + parser.defineEntityReplacementText( "aring", "\u00e5" ); + parser.defineEntityReplacementText( "aelig", "\u00e6" ); + parser.defineEntityReplacementText( "ccedil", "\u00e7" ); + parser.defineEntityReplacementText( "egrave", "\u00e8" ); + parser.defineEntityReplacementText( "eacute", "\u00e9" ); + parser.defineEntityReplacementText( "ecirc", "\u00ea" ); + parser.defineEntityReplacementText( "euml", "\u00eb" ); + parser.defineEntityReplacementText( "igrave", "\u00ec" ); + parser.defineEntityReplacementText( "iacute", "\u00ed" ); + parser.defineEntityReplacementText( "icirc", "\u00ee" ); + parser.defineEntityReplacementText( "iuml", "\u00ef" ); + parser.defineEntityReplacementText( "eth", "\u00f0" ); + parser.defineEntityReplacementText( "ntilde", "\u00f1" ); + parser.defineEntityReplacementText( "ograve", "\u00f2" ); + parser.defineEntityReplacementText( "oacute", "\u00f3" ); + parser.defineEntityReplacementText( "ocirc", "\u00f4" ); + parser.defineEntityReplacementText( "otilde", "\u00f5" ); + parser.defineEntityReplacementText( "ouml", "\u00f6" ); + parser.defineEntityReplacementText( "divide", "\u00f7" ); + parser.defineEntityReplacementText( "oslash", "\u00f8" ); + parser.defineEntityReplacementText( "ugrave", "\u00f9" ); + parser.defineEntityReplacementText( "uacute", "\u00fa" ); + parser.defineEntityReplacementText( "ucirc", "\u00fb" ); + parser.defineEntityReplacementText( "uuml", "\u00fc" ); + parser.defineEntityReplacementText( 
"yacute", "\u00fd" ); + parser.defineEntityReplacementText( "thorn", "\u00fe" ); + parser.defineEntityReplacementText( "yuml", "\u00ff" ); + + // ---------------------------------------------------------------------- + // Special entities + // ---------------------------------------------------------------------- + + parser.defineEntityReplacementText( "OElig", "\u0152" ); + parser.defineEntityReplacementText( "oelig", "\u0153" ); + parser.defineEntityReplacementText( "Scaron", "\u0160" ); + parser.defineEntityReplacementText( "scaron", "\u0161" ); + parser.defineEntityReplacementText( "Yuml", "\u0178" ); + parser.defineEntityReplacementText( "circ", "\u02c6" ); + parser.defineEntityReplacementText( "tilde", "\u02dc" ); + parser.defineEntityReplacementText( "ensp", "\u2002" ); + parser.defineEntityReplacementText( "emsp", "\u2003" ); + parser.defineEntityReplacementText( "thinsp", "\u2009" ); + parser.defineEntityReplacementText( "zwnj", "\u200c" ); + parser.defineEntityReplacementText( "zwj", "\u200d" ); + parser.defineEntityReplacementText( "lrm", "\u200e" ); + parser.defineEntityReplacementText( "rlm", "\u200f" ); + parser.defineEntityReplacementText( "ndash", "\u2013" ); + parser.defineEntityReplacementText( "mdash", "\u2014" ); + parser.defineEntityReplacementText( "lsquo", "\u2018" ); + parser.defineEntityReplacementText( "rsquo", "\u2019" ); + parser.defineEntityReplacementText( "sbquo", "\u201a" ); + parser.defineEntityReplacementText( "ldquo", "\u201c" ); + parser.defineEntityReplacementText( "rdquo", "\u201d" ); + parser.defineEntityReplacementText( "bdquo", "\u201e" ); + parser.defineEntityReplacementText( "dagger", "\u2020" ); + parser.defineEntityReplacementText( "Dagger", "\u2021" ); + parser.defineEntityReplacementText( "permil", "\u2030" ); + parser.defineEntityReplacementText( "lsaquo", "\u2039" ); + parser.defineEntityReplacementText( "rsaquo", "\u203a" ); + parser.defineEntityReplacementText( "euro", "\u20ac" ); + + // 
---------------------------------------------------------------------- + // Symbol entities + // ---------------------------------------------------------------------- + + parser.defineEntityReplacementText( "fnof", "\u0192" ); + parser.defineEntityReplacementText( "Alpha", "\u0391" ); + parser.defineEntityReplacementText( "Beta", "\u0392" ); + parser.defineEntityReplacementText( "Gamma", "\u0393" ); + parser.defineEntityReplacementText( "Delta", "\u0394" ); + parser.defineEntityReplacementText( "Epsilon", "\u0395" ); + parser.defineEntityReplacementText( "Zeta", "\u0396" ); + parser.defineEntityReplacementText( "Eta", "\u0397" ); + parser.defineEntityReplacementText( "Theta", "\u0398" ); + parser.defineEntityReplacementText( "Iota", "\u0399" ); + parser.defineEntityReplacementText( "Kappa", "\u039a" ); + parser.defineEntityReplacementText( "Lambda", "\u039b" ); + parser.defineEntityReplacementText( "Mu", "\u039c" ); + parser.defineEntityReplacementText( "Nu", "\u039d" ); + parser.defineEntityReplacementText( "Xi", "\u039e" ); + parser.defineEntityReplacementText( "Omicron", "\u039f" ); + parser.defineEntityReplacementText( "Pi", "\u03a0" ); + parser.defineEntityReplacementText( "Rho", "\u03a1" ); + parser.defineEntityReplacementText( "Sigma", "\u03a3" ); + parser.defineEntityReplacementText( "Tau", "\u03a4" ); + parser.defineEntityReplacementText( "Upsilon", "\u03a5" ); + parser.defineEntityReplacementText( "Phi", "\u03a6" ); + parser.defineEntityReplacementText( "Chi", "\u03a7" ); + parser.defineEntityReplacementText( "Psi", "\u03a8" ); + parser.defineEntityReplacementText( "Omega", "\u03a9" ); + parser.defineEntityReplacementText( "alpha", "\u03b1" ); + parser.defineEntityReplacementText( "beta", "\u03b2" ); + parser.defineEntityReplacementText( "gamma", "\u03b3" ); + parser.defineEntityReplacementText( "delta", "\u03b4" ); + parser.defineEntityReplacementText( "epsilon", "\u03b5" ); + parser.defineEntityReplacementText( "zeta", "\u03b6" ); + 
parser.defineEntityReplacementText( "eta", "\u03b7" ); + parser.defineEntityReplacementText( "theta", "\u03b8" ); + parser.defineEntityReplacementText( "iota", "\u03b9" ); + parser.defineEntityReplacementText( "kappa", "\u03ba" ); + parser.defineEntityReplacementText( "lambda", "\u03bb" ); + parser.defineEntityReplacementText( "mu", "\u03bc" ); + parser.defineEntityReplacementText( "nu", "\u03bd" ); + parser.defineEntityReplacementText( "xi", "\u03be" ); + parser.defineEntityReplacementText( "omicron", "\u03bf" ); + parser.defineEntityReplacementText( "pi", "\u03c0" ); + parser.defineEntityReplacementText( "rho", "\u03c1" ); + parser.defineEntityReplacementText( "sigmaf", "\u03c2" ); + parser.defineEntityReplacementText( "sigma", "\u03c3" ); + parser.defineEntityReplacementText( "tau", "\u03c4" ); + parser.defineEntityReplacementText( "upsilon", "\u03c5" ); + parser.defineEntityReplacementText( "phi", "\u03c6" ); + parser.defineEntityReplacementText( "chi", "\u03c7" ); + parser.defineEntityReplacementText( "psi", "\u03c8" ); + parser.defineEntityReplacementText( "omega", "\u03c9" ); + parser.defineEntityReplacementText( "thetasym", "\u03d1" ); + parser.defineEntityReplacementText( "upsih", "\u03d2" ); + parser.defineEntityReplacementText( "piv", "\u03d6" ); + parser.defineEntityReplacementText( "bull", "\u2022" ); + parser.defineEntityReplacementText( "hellip", "\u2026" ); + parser.defineEntityReplacementText( "prime", "\u2032" ); + parser.defineEntityReplacementText( "Prime", "\u2033" ); + parser.defineEntityReplacementText( "oline", "\u203e" ); + parser.defineEntityReplacementText( "frasl", "\u2044" ); + parser.defineEntityReplacementText( "weierp", "\u2118" ); + parser.defineEntityReplacementText( "image", "\u2111" ); + parser.defineEntityReplacementText( "real", "\u211c" ); + parser.defineEntityReplacementText( "trade", "\u2122" ); + parser.defineEntityReplacementText( "alefsym", "\u2135" ); + parser.defineEntityReplacementText( "larr", "\u2190" ); + 
parser.defineEntityReplacementText( "uarr", "\u2191" ); + parser.defineEntityReplacementText( "rarr", "\u2192" ); + parser.defineEntityReplacementText( "darr", "\u2193" ); + parser.defineEntityReplacementText( "harr", "\u2194" ); + parser.defineEntityReplacementText( "crarr", "\u21b5" ); + parser.defineEntityReplacementText( "lArr", "\u21d0" ); + parser.defineEntityReplacementText( "uArr", "\u21d1" ); + parser.defineEntityReplacementText( "rArr", "\u21d2" ); + parser.defineEntityReplacementText( "dArr", "\u21d3" ); + parser.defineEntityReplacementText( "hArr", "\u21d4" ); + parser.defineEntityReplacementText( "forall", "\u2200" ); + parser.defineEntityReplacementText( "part", "\u2202" ); + parser.defineEntityReplacementText( "exist", "\u2203" ); + parser.defineEntityReplacementText( "empty", "\u2205" ); + parser.defineEntityReplacementText( "nabla", "\u2207" ); + parser.defineEntityReplacementText( "isin", "\u2208" ); + parser.defineEntityReplacementText( "notin", "\u2209" ); + parser.defineEntityReplacementText( "ni", "\u220b" ); + parser.defineEntityReplacementText( "prod", "\u220f" ); + parser.defineEntityReplacementText( "sum", "\u2211" ); + parser.defineEntityReplacementText( "minus", "\u2212" ); + parser.defineEntityReplacementText( "lowast", "\u2217" ); + parser.defineEntityReplacementText( "radic", "\u221a" ); + parser.defineEntityReplacementText( "prop", "\u221d" ); + parser.defineEntityReplacementText( "infin", "\u221e" ); + parser.defineEntityReplacementText( "ang", "\u2220" ); + parser.defineEntityReplacementText( "and", "\u2227" ); + parser.defineEntityReplacementText( "or", "\u2228" ); + parser.defineEntityReplacementText( "cap", "\u2229" ); + parser.defineEntityReplacementText( "cup", "\u222a" ); + parser.defineEntityReplacementText( "int", "\u222b" ); + parser.defineEntityReplacementText( "there4", "\u2234" ); + parser.defineEntityReplacementText( "sim", "\u223c" ); + parser.defineEntityReplacementText( "cong", "\u2245" ); + 
parser.defineEntityReplacementText( "asymp", "\u2248" ); + parser.defineEntityReplacementText( "ne", "\u2260" ); + parser.defineEntityReplacementText( "equiv", "\u2261" ); + parser.defineEntityReplacementText( "le", "\u2264" ); + parser.defineEntityReplacementText( "ge", "\u2265" ); + parser.defineEntityReplacementText( "sub", "\u2282" ); + parser.defineEntityReplacementText( "sup", "\u2283" ); + parser.defineEntityReplacementText( "nsub", "\u2284" ); + parser.defineEntityReplacementText( "sube", "\u2286" ); + parser.defineEntityReplacementText( "supe", "\u2287" ); + parser.defineEntityReplacementText( "oplus", "\u2295" ); + parser.defineEntityReplacementText( "otimes", "\u2297" ); + parser.defineEntityReplacementText( "perp", "\u22a5" ); + parser.defineEntityReplacementText( "sdot", "\u22c5" ); + parser.defineEntityReplacementText( "lceil", "\u2308" ); + parser.defineEntityReplacementText( "rceil", "\u2309" ); + parser.defineEntityReplacementText( "lfloor", "\u230a" ); + parser.defineEntityReplacementText( "rfloor", "\u230b" ); + parser.defineEntityReplacementText( "lang", "\u2329" ); + parser.defineEntityReplacementText( "rang", "\u232a" ); + parser.defineEntityReplacementText( "loz", "\u25ca" ); + parser.defineEntityReplacementText( "spades", "\u2660" ); + parser.defineEntityReplacementText( "clubs", "\u2663" ); + parser.defineEntityReplacementText( "hearts", "\u2665" ); + parser.defineEntityReplacementText( "diams", "\u2666" ); + } + + /** * <p> * Goes through a common list of possible html start tags. These include only tags that can go into * the body of a xhtml document and so should be re-usable by different xhtml-based parsers. 
Modified: maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java URL: http://svn.apache.org/viewvc/maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java?rev=1436646&r1=1436645&r2=1436646&view=diff ============================================================================== --- maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java (original) +++ maven/doxia/doxia/trunk/doxia-core/src/test/java/org/apache/maven/doxia/parser/XhtmlBaseParserTest.java Mon Jan 21 21:59:27 2013 @@ -425,6 +425,60 @@ public class XhtmlBaseParserTest } /** @throws Exception */ + public void testXhtmlEntities() + throws Exception + { + final String text = "<body><h2>&laquo;&reg;</h2><p>&ldquo;&rsquo;&Phi;&larr;</p></body>"; + + parser.parse( text, sink ); + + Iterator<SinkEventElement> it = sink.getEventList().iterator(); + + assertEquals( "section1", ( it.next() ).getName() ); + assertEquals( "sectionTitle1", ( it.next() ).getName() ); + + // Couple symbols from Latin-1: + // http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Latin-1_characters + + SinkEventElement textEvt = it.next(); + assertEquals( "text", textEvt.getName() ); + assertEquals( "\u00AB", textEvt.getArgs()[0] ); + + textEvt = it.next(); + assertEquals( "text", textEvt.getName() ); + assertEquals( "\u00AE", textEvt.getArgs()[0] ); + + assertEquals( "sectionTitle1_", ( it.next() ).getName() ); + assertEquals( "paragraph", ( it.next() ).getName() ); + + // Couple symbols from Special characters: + // http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters + + textEvt = it.next(); + assertEquals( "text", textEvt.getName() ); + assertEquals( "\u201C", textEvt.getArgs()[0] ); + + textEvt = it.next(); + assertEquals( "text", textEvt.getName() ); + assertEquals( "\u2019", textEvt.getArgs()[0] ); + + // Couple symbols from Symbols: + // http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Symbols + + textEvt = it.next(); + 
assertEquals( "text", textEvt.getName() ); + assertEquals( "\u03A6", textEvt.getArgs()[0] ); + + textEvt = it.next(); + assertEquals( "text", textEvt.getName() ); + assertEquals( "\u2190", textEvt.getArgs()[0] ); + + assertEquals( "paragraph_", ( it.next() ).getName() ); + + assertFalse( it.hasNext() ); + } + + /** @throws Exception */ public void testDecoration() throws Exception {