On Wed, Oct 7, 2009 at 6:54 PM, Adam Foltzer <acfolt...@gmail.com> wrote:
> Here's a sample: > > <?xml version="1.0" encoding="ISO-8859-1"?> > <!DOCTYPE document [ > <!ENTITY nbsp " "> > <!ENTITY copy "©"> > <!ENTITY reg "®"> > ]> > <document> > <kbml version="-//Indiana University//DTD KBML 0.9//EN"> > <kbq>In Mac OS X, how do I enable or disable the firewall?</kbq> > <body> > <p><kbh docid="aghe" access="allowed">Mac OS > X<domain>all</domain><visibility>visible</visibility></kbh> includes > an easy-to-use <kbh docid="aoru" > > access="allowed">firewall<domain>all</domain><visibility>visible</visibility></kbh> > that > can prevent potentially harmful incoming connections from other > computers. To turn it on or off:</p> > > > <h3>Mac OS X 10.6 (Snow Leopard)</h3> > > <ol><li>From the Apple menu, select <mi>System Preferences...†</mi>. > When the <code>System Preferences</code> window appears, from the > <mi>View</mi> menu, select <mi>Security</mi>. > > <br clear="none"/><br clear="none"/> > </li><li>Click the <mi>Firewall</mi> tab. > > ... > > </li></ol> > </body> > <xtra> > <term weight="0">macos</term> > <term weight="0">macintosh</term> > <term weight="0">apple</term> > <term weight="0">macosx</term> > > ... > > </xtra> > </kbml> > <metadata> > <docid>aozg</docid> > <owner firstname="" lastname="Macintosh Support">scmac</owner> > > ... > > </metadata> > </document> > > The /document/kbml/kbq works fine, but as you can see, it has no > children. The actual content of the document is within the body > element, though, which requires some flattening. > > Adam, I'm not able to reproduce your problem. I wrote a test case using your xml and configuration and it passes. 
Diff below: Index: contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java =================================================================== --- contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java (revision 824015) +++ contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java (working copy) @@ -109,6 +109,85 @@ } @Test + @SuppressWarnings("unchecked") + public void testFlatten() throws Exception { + String xml = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n" + + "<!DOCTYPE document [\n" + + "<!ENTITY nbsp \" \">\n" + + "<!ENTITY copy \"©\">\n" + + "<!ENTITY reg \"®\">\n" + + "]>\n" + + "<document>\n" + + " <kbml version=\"-//Indiana University//DTD KBML 0.9//EN\">\n" + + " <kbq>In Mac OS X, how do I enable or disable the firewall?</kbq>\n" + + " <body>\n" + + "<p><kbh docid=\"aghe\" access=\"allowed\">Mac OS\n" + + "X<domain>all</domain><visibility>visible</visibility></kbh> includes\n" + + "an easy-to-use <kbh docid=\"aoru\"\n" + + "access=\"allowed\">firewall<domain>all</domain><visibility>visible</visibility></kbh>\n" + + "that\n" + + "can prevent potentially harmful incoming connections from other\n" + + "computers. 
To turn it on or off:</p>\n" + + "\n" + + "\n" + + "<h3>Mac OS X 10.6 (Snow Leopard)</h3>\n" + + "\n" + + "<ol><li>From the Apple menu, select <mi>System Preferences...†</mi>.\n" + + "When the <code>System Preferences</code> window appears, from the\n" + + "<mi>View</mi> menu, select <mi>Security</mi>.\n" + + "\n" + + "<br clear=\"none\"/><br clear=\"none\"/>\n" + + "</li><li>Click the <mi>Firewall</mi> tab.\n" + + "\n" + + "...\n" + + "\n" + + "</li></ol>\n" + + "</body>\n" + + " <xtra>\n" + + " <term weight=\"0\">macos</term>\n" + + " <term weight=\"0\">macintosh</term>\n" + + " <term weight=\"0\">apple</term>\n" + + " <term weight=\"0\">macosx</term>\n" + + "\n" + + "...\n" + + "\n" + + " </xtra>\n" + + " </kbml>\n" + + " <metadata>\n" + + " <docid>aozg</docid>\n" + + " <owner firstname=\"\" lastname=\"Macintosh Support\">scmac</owner>\n" + + "\n" + + "...\n" + + "\n" + + " </metadata>\n" + + "</document>"; + Map entityAttrs = createMap("name", "kbxml", "url", "testdata.xml", + XPathEntityProcessor.FOR_EACH, "/document", "transformer", "HTMLStripTransformer"); + List fields = new ArrayList(); + fields.add(createMap("column", "content", "xpath", "/document/kbml/body" ,"flatten","true", "stripHTML", "true")); + fields.add(createMap("column", "title", "xpath", "/document/kbml/kbq")); + Context c = AbstractDataImportHandlerTest.getContext(null, + new VariableResolverImpl(), getDataSource(xml), Context.FULL_DUMP, fields, entityAttrs); + XPathEntityProcessor xPathEntityProcessor = new XPathEntityProcessor(); + xPathEntityProcessor.init(c); + Map<String, Object> result = null; + while (true) { + Map<String, Object> row = xPathEntityProcessor.nextRow(); + if (row == null) + break; + result = row; + } + System.out.println("result.get(\"content\") = " + result.get("content")); + Assert.assertNotNull(result.get("content")); + Assert.assertTrue(result.get("content").toString().trim().length() > 0); + HTMLStripTransformer t = new HTMLStripTransformer(); + 
t.transformRow(result, c); + System.out.println("result.get(\"content\") = " + result.get("content")); + Assert.assertNotNull(result.get("content")); + Assert.assertTrue(result.get("content").toString().trim().length() > 0); + } + + @Test public void withFieldsAndXpathStream() throws Exception { Map entityAttrs = createMap("name", "e", "url", "cd.xml", XPathEntityProcessor.FOR_EACH, "/catalog/cd", "stream", "true", "batchSize","1");

--
Regards,
Shalin Shekhar Mangar.
Index: contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java =================================================================== --- contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java (revision 824015) +++ contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java (working copy) @@ -109,6 +109,85 @@ } @Test + @SuppressWarnings("unchecked") + public void testFlatten() throws Exception { + String xml = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n" + + "<!DOCTYPE document [\n" + + "<!ENTITY nbsp \" \">\n" + + "<!ENTITY copy \"©\">\n" + + "<!ENTITY reg \"®\">\n" + + "]>\n" + + "<document>\n" + + " <kbml version=\"-//Indiana University//DTD KBML 0.9//EN\">\n" + + " <kbq>In Mac OS X, how do I enable or disable the firewall?</kbq>\n" + + " <body>\n" + + "<p><kbh docid=\"aghe\" access=\"allowed\">Mac OS\n" + + "X<domain>all</domain><visibility>visible</visibility></kbh> includes\n" + + "an easy-to-use <kbh docid=\"aoru\"\n" + + "access=\"allowed\">firewall<domain>all</domain><visibility>visible</visibility></kbh>\n" + + "that\n" + + "can prevent potentially harmful incoming connections from other\n" + + "computers. 
To turn it on or off:</p>\n" + + "\n" + + "\n" + + "<h3>Mac OS X 10.6 (Snow Leopard)</h3>\n" + + "\n" + + "<ol><li>From the Apple menu, select <mi>System Preferences...†</mi>.\n" + + "When the <code>System Preferences</code> window appears, from the\n" + + "<mi>View</mi> menu, select <mi>Security</mi>.\n" + + "\n" + + "<br clear=\"none\"/><br clear=\"none\"/>\n" + + "</li><li>Click the <mi>Firewall</mi> tab.\n" + + "\n" + + "...\n" + + "\n" + + "</li></ol>\n" + + "</body>\n" + + " <xtra>\n" + + " <term weight=\"0\">macos</term>\n" + + " <term weight=\"0\">macintosh</term>\n" + + " <term weight=\"0\">apple</term>\n" + + " <term weight=\"0\">macosx</term>\n" + + "\n" + + "...\n" + + "\n" + + " </xtra>\n" + + " </kbml>\n" + + " <metadata>\n" + + " <docid>aozg</docid>\n" + + " <owner firstname=\"\" lastname=\"Macintosh Support\">scmac</owner>\n" + + "\n" + + "...\n" + + "\n" + + " </metadata>\n" + + "</document>"; + Map entityAttrs = createMap("name", "kbxml", "url", "testdata.xml", + XPathEntityProcessor.FOR_EACH, "/document", "transformer", "HTMLStripTransformer"); + List fields = new ArrayList(); + fields.add(createMap("column", "content", "xpath", "/document/kbml/body" ,"flatten","true", "stripHTML", "true")); + fields.add(createMap("column", "title", "xpath", "/document/kbml/kbq")); + Context c = AbstractDataImportHandlerTest.getContext(null, + new VariableResolverImpl(), getDataSource(xml), Context.FULL_DUMP, fields, entityAttrs); + XPathEntityProcessor xPathEntityProcessor = new XPathEntityProcessor(); + xPathEntityProcessor.init(c); + Map<String, Object> result = null; + while (true) { + Map<String, Object> row = xPathEntityProcessor.nextRow(); + if (row == null) + break; + result = row; + } + System.out.println("result.get(\"content\") = " + result.get("content")); + Assert.assertNotNull(result.get("content")); + Assert.assertTrue(result.get("content").toString().trim().length() > 0); + HTMLStripTransformer t = new HTMLStripTransformer(); + 
t.transformRow(result, c); + System.out.println("result.get(\"content\") = " + result.get("content")); + Assert.assertNotNull(result.get("content")); + Assert.assertTrue(result.get("content").toString().trim().length() > 0); + } + + @Test public void withFieldsAndXpathStream() throws Exception { Map entityAttrs = createMap("name", "e", "url", "cd.xml", XPathEntityProcessor.FOR_EACH, "/catalog/cd", "stream", "true", "batchSize","1");