I spent some more time thinking about this. Do you really need to use the TikaEntityProcessor? From what I can tell, it doesn't offer anything new to the document you are building that couldn't be accomplished by the XPathEntityProcessor alone.
I also tried to get the Advanced Parsing<http://wiki.apache.org/solr/TikaEntityProcessor>example to work without success. There are some obvious typos (<document> instead of </document>) and an odd order to the pieces (<dataSources> is enclosed by <document>). It also looks like FieldStreamDataSource<http://lucene.apache.org/solr/4_3_1/solr-dataimporthandler/org/apache/solr/handler/dataimport/FieldStreamDataSource.html>is the one that is meant to work in this context. If Koji is still around maybe he could offer some help? Otherwise this bit of erroneous instruction should probably be removed from the wiki. Cheers, Tricia $ svn diff Index: solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java =================================================================== --- solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java (revision 1526990) +++ solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java (working copy) @@ -99,13 +99,13 @@ runFullImport(getConfigHTML("identity")); assertQ(req("*:*"), testsHTMLIdentity); } - + private String getConfigHTML(String htmlMapper) { return "<dataConfig>" + " <dataSource type='BinFileDataSource'/>" + " <document>" + - " <entity name='Tika' format='xml' processor='TikaEntityProcessor' " + + " <entity name='Tika' format='html' processor='TikaEntityProcessor' " + " url='" + getFile("dihextras/structured.html").getAbsolutePath() + "' " + ((htmlMapper == null) ? 
"" : (" htmlMapper='" + htmlMapper + "'")) + ">" + " <field column='text'/>" + @@ -114,4 +114,36 @@ "</dataConfig>"; } + private String[] testsHTMLH1 = { + "//*[@numFound='1']" + , "//str[@name='h1'][contains(.,'H1 Header')]" + }; + + @Test + public void testTikaHTMLMapperSubEntity() throws Exception { + runFullImport(getConfigSubEntity("identity")); + assertQ(req("*:*"), testsHTMLH1); + } + + private String getConfigSubEntity(String htmlMapper) { + return + "<dataConfig>" + + "<dataSource type='BinFileDataSource' name='bin'/>" + + "<dataSource type='FieldStreamDataSource' name='fld'/>" + + "<document>" + + "<entity name='tika' processor='TikaEntityProcessor' url='" + getFile("dihextras/structured.html").getAbsolutePath() + "' dataSource='bin' format='html' rootEntity='false'>" + + "<!--Do appropriate mapping here meta=\"true\" means it is a metadata field -->" + + "<field column='Author' meta='true' name='author'/>" + + "<field column='title' meta='true' name='title'/>" + + "<!--'text' is an implicit field emited by TikaEntityProcessor . 
Map it appropriately-->" + + "<field name='text' column='text'/>" + + "<entity name='detail' type='XPathEntityProcessor' forEach='/html' dataSource='fld' dataField='tika.text' rootEntity='true' >" + + "<field xpath='//div' column='foo'/>" + + "<field xpath='//h1' column='h1' />" + + "</entity>" + + "</entity>" + + "</document>" + + "</dataConfig>"; + } + } Index: solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml =================================================================== --- solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml (revision 1526990) +++ solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml (working copy) @@ -194,6 +194,8 @@ <field name="title" type="string" indexed="true" stored="true"/> <field name="author" type="string" indexed="true" stored="true" /> <field name="text" type="text" indexed="true" stored="true" /> + <field name="h1" type="text" indexed="true" stored="true" /> + <field name="foo" type="text" indexed="true" stored="true" /> </fields> <!-- field for the QueryParser to use when an explicit fieldname is absent --> I find the SqlEntityProcessor part particularly odd. 
That's the default right?: 2405 T12 C1 oashd.SqlEntityProcessor.initQuery ERROR The query failed 'null' java.lang.RuntimeException: unsupported type : class java.lang.String at org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:89) at org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:1) at org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59) at org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73) at org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243) at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:469) at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:495) at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:408) at org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:323) at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:231) at org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:411) at org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:476) at org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179) at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1859) at org.apache.solr.util.TestHarness.query(TestHarness.java:291) at org.apache.solr.handler.dataimport.AbstractDataImportHandlerTestCase.runFullImport(AbstractDataImportHandlerTestCase.java:96) at org.apache.solr.handler.dataimport.TestTikaEntityProcessor.testTikaHTMLMapperSubEntity(TestTikaEntityProcessor.java:124) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:601) at com.carrotsearch.randomizedtesting.RandomizedRunner.invoke(RandomizedRunner.java:1559) at com.carrotsearch.randomizedtesting.RandomizedRunner.access$600(RandomizedRunner.java:79) at com.carrotsearch.randomizedtesting.RandomizedRunner$6.evaluate(RandomizedRunner.java:737) at com.carrotsearch.randomizedtesting.RandomizedRunner$7.evaluate(RandomizedRunner.java:773) at com.carrotsearch.randomizedtesting.RandomizedRunner$8.evaluate(RandomizedRunner.java:787) at com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53) at org.apache.lucene.util.TestRuleSetupTeardownChained$1.evaluate(TestRuleSetupTeardownChained.java:50) at org.apache.lucene.util.TestRuleFieldCacheSanity$1.evaluate(TestRuleFieldCacheSanity.java:51) at org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46) at com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55) at org.apache.lucene.util.TestRuleThreadAndTestName$1.evaluate(TestRuleThreadAndTestName.java:49) at org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70) at org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48) at com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36) at com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358) at com.carrotsearch.randomizedtesting.ThreadLeakControl.forkTimeoutingTask(ThreadLeakControl.java:782) at com.carrotsearch.randomizedtesting.ThreadLeakControl$3.evaluate(ThreadLeakControl.java:442) at com.carrotsearch.randomizedtesting.RandomizedRunner.runSingleTest(RandomizedRunner.java:746) at 
com.carrotsearch.randomizedtesting.RandomizedRunner$3.evaluate(RandomizedRunner.java:648) at com.carrotsearch.randomizedtesting.RandomizedRunner$4.evaluate(RandomizedRunner.java:682) at com.carrotsearch.randomizedtesting.RandomizedRunner$5.evaluate(RandomizedRunner.java:693) at com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36) at com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53) at org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46) at org.apache.lucene.util.TestRuleStoreClassName$1.evaluate(TestRuleStoreClassName.java:42) at com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55) at com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39) at com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39) at com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36) at org.apache.lucene.util.TestRuleAssertionsRequired$1.evaluate(TestRuleAssertionsRequired.java:43) at org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48) at org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70) at org.apache.lucene.util.TestRuleIgnoreTestSuites$1.evaluate(TestRuleIgnoreTestSuites.java:55) at com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36) at com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358) at java.lang.Thread.run(Thread.java:722) On Fri, Sep 27, 2013 at 3:55 AM, Andreas Owen <a...@conx.ch> wrote: > i removed the FieldReaderDataSource and dataSource="fld" but it didn't > help. 
i get the following for each document: > DataImportHandlerException: Exception in invoking url null > Processing Document # 9 > nullpointerexception > > > On 26. Sep 2013, at 8:39 PM, P Williams wrote: > > > Hi, > > > > Haven't tried this myself but maybe try leaving out the > > FieldReaderDataSource entirely. From my quick searching looks like it's > > tied to SQL. Did you try copying the > > http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing example > > exactly? What happens when you leave out FieldReaderDataSource? > > > > Cheers, > > Tricia > > > > > > On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <a...@conx.ch> wrote: > > > >> i'm using solr 4.3.1 and the dataimporter. i am trying to use > >> XPathEntityProcessor within the TikaEntityProcessor for indexing > html-pages > >> but i'm getting this error for each document. i have also tried > >> dataField="tika.text" and dataField="text" to no avail. the nested > >> XPathEntityProcessor "detail" creates the error, the rest works fine. > what > >> am i doing wrong? 
> >> > >> error: > >> > >> ERROR - 2013-09-26 12:08:49.006; > >> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed > >> 'null' > >> java.lang.ClassCastException: java.io.StringReader cannot be cast to > >> java.util.Iterator > >> at > >> > org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59) > >> at > >> > org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73) > >> at > >> > org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227) > >> at > >> > org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422) > >> at > >> > org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487) > >> at > >> > org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179) > >> at > >> > org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) > >> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820) > >> at > >> > org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656) > >> at > >> > org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359) > >> at > >> > org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155) > >> at > >> > 
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307) > >> at > >> > org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453) > >> at > >> > org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137) > >> at > >> > org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560) > >> at > >> > org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231) > >> at > >> > org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072) > >> at > >> > org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382) > >> at > >> > org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193) > >> at > >> > org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006) > >> at > >> > org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135) > >> at > >> > org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255) > >> at > >> > org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154) > >> at > >> > org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116) > >> at org.eclipse.jetty.server.Server.handle(Server.java:365) > >> at > >> > org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485) > >> at > >> > org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53) > >> at > >> > org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937) > >> at > >> > org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998) > >> at > org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856) > >> at > >> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240) > >> at > >> > 
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72) > >> at > >> > org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264) > >> at > >> > org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608) > >> at > >> > org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543) > >> at java.lang.Thread.run(Unknown Source) > >> ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException; > >> Exception in entity : > >> detail:org.apache.solr.handler.dataimport.DataImportHandlerException: > >> java.lang.ClassCastException: java.io.StringReader cannot be cast to > >> java.util.Iterator > >> at > >> > org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65) > >> at > >> > org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73) > >> at > >> > org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319) > >> at > >> > org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227) > >> at > >> > org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422) > >> at > >> > org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487) > >> at > >> > org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179) > >> at > >> > 
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) > >> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820) > >> at > >> > org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656) > >> at > >> > org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359) > >> at > >> > org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155) > >> at > >> > org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307) > >> at > >> > org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453) > >> at > >> > org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137) > >> at > >> > org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560) > >> at > >> > org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231) > >> at > >> > org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072) > >> at > >> > org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382) > >> at > >> > org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193) > >> at > >> > org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006) > >> at > >> > org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135) > >> at > >> > org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255) > >> at > >> > org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154) > >> at > >> > org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116) > >> at org.eclipse.jetty.server.Server.handle(Server.java:365) > >> at > >> > org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485) > >> at > >> > org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53) > >> at > >> > 
org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937) > >> at > >> > org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998) > >> at > org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856) > >> at > >> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240) > >> at > >> > org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72) > >> at > >> > org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264) > >> at > >> > org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608) > >> at > >> > org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543) > >> at java.lang.Thread.run(Unknown Source) > >> Caused by: java.lang.ClassCastException: java.io.StringReader cannot be > >> cast to java.util.Iterator > >> at > >> > org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59) > >> ... 
41 more > >> > >> > >> > >> data-config.xml > >> > >> <dataConfig> > >> <dataSource type="BinURLDataSource" name="dataFile"/> > >> <dataSource type="BinURLDataSource" name="dataUrl"/> > >> <dataSource type="URLDataSource" name="main"/> > >> <dataSource type="FieldReaderDataSource" name="fld"/> > >> <document> > >> <entity name="rec" processor="XPathEntityProcessor" > >> > url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml" > >> forEach="/docs/doc" dataSource="main"> > >> <field column="title" xpath="//title" /> > >> <field column="id" xpath="//id" /> > >> <field column="file" xpath="//file" /> > >> <field column="url" xpath="//url" /> > >> <field column="urlParse" xpath="//urlParse" /> > >> <field column="last_modified" xpath="//last_modified" /> > >> <field column="Author" xpath="//author" /> > >> > >> <entity name="tika" processor="TikaEntityProcessor" > >> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html"> > >> <field column="text"/> > >> > >> <entity name="detail" type="XPathEntityProcessor" > >> forEach="/html" dataSource="fld" dataField="${tika.text}" > rootEntity="true" > >> onError="skip"> > >> <field xpath="//h1" column="h_1" /> > >> </entity> > >> </entity> > >> </entity> > >> </document> > >> </dataConfig> > >