I spent some more time thinking about this.  Do you really need to use the
TikaEntityProcessor?  It doesn't offer anything new to the document you are
building that couldn't be accomplished by the XPathEntityProcessor alone
from what I can tell.

I also tried to get the Advanced
Parsing <http://wiki.apache.org/solr/TikaEntityProcessor> example to
work without success.  There are some obvious typos (<document>
instead of </document>) and an odd order to the pieces (<dataSources> is
enclosed by <document>).  It also looks like
FieldStreamDataSource <http://lucene.apache.org/solr/4_3_1/solr-dataimporthandler/org/apache/solr/handler/dataimport/FieldStreamDataSource.html> is
the one that is meant to work in this context.  If Koji is still around,
maybe he could offer some help?  Otherwise these erroneous
instructions should probably be removed from the wiki.

Cheers,
Tricia

$ svn diff
Index:
solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
===================================================================
---
solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
     (revision 1526990)
+++
solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
     (working copy)
@@ -99,13 +99,13 @@
     runFullImport(getConfigHTML("identity"));
     assertQ(req("*:*"), testsHTMLIdentity);
   }
-
+
   private String getConfigHTML(String htmlMapper) {
     return
         "<dataConfig>" +
             "  <dataSource type='BinFileDataSource'/>" +
             "  <document>" +
-            "    <entity name='Tika' format='xml'
processor='TikaEntityProcessor' " +
+            "    <entity name='Tika' format='html'
processor='TikaEntityProcessor' " +
             "       url='" +
getFile("dihextras/structured.html").getAbsolutePath() + "' " +
             ((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper +
"'")) + ">" +
             "      <field column='text'/>" +
@@ -114,4 +114,36 @@
             "</dataConfig>";

   }
+  private String[] testsHTMLH1 = {
+      "//*[@numFound='1']"
+      , "//str[@name='h1'][contains(.,'H1 Header')]"
+  };
+
+  @Test
+  public void testTikaHTMLMapperSubEntity() throws Exception {
+    runFullImport(getConfigSubEntity("identity"));
+    assertQ(req("*:*"), testsHTMLH1);
+  }
+
+  private String getConfigSubEntity(String htmlMapper) {
+    return
+        "<dataConfig>" +
+        "<dataSource type='BinFileDataSource' name='bin'/>" +
+        "<dataSource type='FieldStreamDataSource' name='fld'/>" +
+        "<document>" +
+        "<entity name='tika' processor='TikaEntityProcessor' url='" +
getFile("dihextras/structured.html").getAbsolutePath() + "'
dataSource='bin' format='html' rootEntity='false'>" +
+        "<!--Do appropriate mapping here  meta=\"true\" means it is a
metadata field -->" +
+        "<field column='Author' meta='true' name='author'/>" +
+        "<field column='title' meta='true' name='title'/>" +
+        "<!--'text' is an implicit field emited by TikaEntityProcessor .
Map it appropriately-->" +
+        "<field name='text' column='text'/>" +
+        "<entity name='detail' type='XPathEntityProcessor' forEach='/html'
dataSource='fld' dataField='tika.text' rootEntity='true' >" +
+        "<field xpath='//div'  column='foo'/>" +
+        "<field xpath='//h1'  column='h1' />" +
+        "</entity>" +
+        "</entity>" +
+        "</document>" +
+        "</dataConfig>";
+  }
+
 }
Index:
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
===================================================================
---
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
   (revision 1526990)
+++
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml
   (working copy)
@@ -194,6 +194,8 @@
    <field name="title" type="string" indexed="true" stored="true"/>
    <field name="author" type="string" indexed="true" stored="true" />
    <field name="text" type="text" indexed="true" stored="true" />
+   <field name="h1" type="text" indexed="true" stored="true" />
+   <field name="foo" type="text" indexed="true" stored="true" />

  </fields>
  <!-- field for the QueryParser to use when an explicit fieldname is
absent -->


I find the SqlEntityProcessor part particularly odd.  That's the default
processor, right?  Here is the error from the new test above:
2405 T12 C1 oashd.SqlEntityProcessor.initQuery ERROR The query failed
'null' java.lang.RuntimeException: unsupported type : class java.lang.String
at
org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:89)
 at
org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:1)
at
org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
 at
org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
at
org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
 at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:469)
at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:495)
 at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:408)
at
org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:323)
 at
org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:231)
at
org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:411)
 at
org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:476)
at
org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
 at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1859)
 at org.apache.solr.util.TestHarness.query(TestHarness.java:291)
at
org.apache.solr.handler.dataimport.AbstractDataImportHandlerTestCase.runFullImport(AbstractDataImportHandlerTestCase.java:96)
 at
org.apache.solr.handler.dataimport.TestTikaEntityProcessor.testTikaHTMLMapperSubEntity(TestTikaEntityProcessor.java:124)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
 at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
 at java.lang.reflect.Method.invoke(Method.java:601)
at
com.carrotsearch.randomizedtesting.RandomizedRunner.invoke(RandomizedRunner.java:1559)
 at
com.carrotsearch.randomizedtesting.RandomizedRunner.access$600(RandomizedRunner.java:79)
at
com.carrotsearch.randomizedtesting.RandomizedRunner$6.evaluate(RandomizedRunner.java:737)
 at
com.carrotsearch.randomizedtesting.RandomizedRunner$7.evaluate(RandomizedRunner.java:773)
at
com.carrotsearch.randomizedtesting.RandomizedRunner$8.evaluate(RandomizedRunner.java:787)
 at
com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
at
org.apache.lucene.util.TestRuleSetupTeardownChained$1.evaluate(TestRuleSetupTeardownChained.java:50)
 at
org.apache.lucene.util.TestRuleFieldCacheSanity$1.evaluate(TestRuleFieldCacheSanity.java:51)
at
org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
 at
com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
at
org.apache.lucene.util.TestRuleThreadAndTestName$1.evaluate(TestRuleThreadAndTestName.java:49)
 at
org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
at
org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
 at
com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
at
com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
 at
com.carrotsearch.randomizedtesting.ThreadLeakControl.forkTimeoutingTask(ThreadLeakControl.java:782)
at
com.carrotsearch.randomizedtesting.ThreadLeakControl$3.evaluate(ThreadLeakControl.java:442)
 at
com.carrotsearch.randomizedtesting.RandomizedRunner.runSingleTest(RandomizedRunner.java:746)
at
com.carrotsearch.randomizedtesting.RandomizedRunner$3.evaluate(RandomizedRunner.java:648)
 at
com.carrotsearch.randomizedtesting.RandomizedRunner$4.evaluate(RandomizedRunner.java:682)
at
com.carrotsearch.randomizedtesting.RandomizedRunner$5.evaluate(RandomizedRunner.java:693)
 at
com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
at
com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53)
 at
org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46)
at
org.apache.lucene.util.TestRuleStoreClassName$1.evaluate(TestRuleStoreClassName.java:42)
 at
com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55)
at
com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
 at
com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39)
at
com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
 at
org.apache.lucene.util.TestRuleAssertionsRequired$1.evaluate(TestRuleAssertionsRequired.java:43)
at
org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48)
 at
org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70)
at
org.apache.lucene.util.TestRuleIgnoreTestSuites$1.evaluate(TestRuleIgnoreTestSuites.java:55)
 at
com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36)
at
com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358)
 at java.lang.Thread.run(Thread.java:722)



On Fri, Sep 27, 2013 at 3:55 AM, Andreas Owen <a...@conx.ch> wrote:

> i removed the FieldReaderDataSource and dataSource="fld" but it didn't
> help. i get the following for each document:
>         DataImportHandlerException: Exception in invoking url null
> Processing Document # 9
>         nullpointerexception
>
>
> On 26. Sep 2013, at 8:39 PM, P Williams wrote:
>
> > Hi,
> >
> > Haven't tried this myself but maybe try leaving out the
> > FieldReaderDataSource entirely.  From my quick searching looks like it's
> > tied to SQL.  Did you try copying the
> > http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing example
> > exactly?  What happens when you leave out FieldReaderDataSource?
> >
> > Cheers,
> > Tricia
> >
> >
> > On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <a...@conx.ch> wrote:
> >
> >> i'm using solr 4.3.1 and the dataimporter. i am trying to use
> >> XPathEntityProcessor within the TikaEntityProcessor for indexing
> html-pages
> >> but i'm getting this error for each document. i have also tried
> >> dataField="tika.text" and dataField="text" to no avail. the nested
> >> XPathEntityProcessor "detail" creates the error, the rest works fine.
> what
> >> am i doing wrong?
> >>
> >> error:
> >>
> >> ERROR - 2013-09-26 12:08:49.006;
> >> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed
> >> 'null'
> >> java.lang.ClassCastException: java.io.StringReader cannot be cast to
> >> java.util.Iterator
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
> >>        at
> >>
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
> >>        at
> >>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> >>        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> >>        at
> >>
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> >>        at
> >>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> >>        at
> >>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> >>        at
> >>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> >>        at
> >>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> >>        at org.eclipse.jetty.server.Server.handle(Server.java:365)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> >>        at
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> >>        at
> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> >>        at
> >> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> >>        at
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> >>        at
> >>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> >>        at
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> >>        at
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> >>        at java.lang.Thread.run(Unknown Source)
> >> ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException;
> >> Exception in entity :
> >> detail:org.apache.solr.handler.dataimport.DataImportHandlerException:
> >> java.lang.ClassCastException: java.io.StringReader cannot be cast to
> >> java.util.Iterator
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65)
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73)
> >>        at
> >>
> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487)
> >>        at
> >>
> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179)
> >>        at
> >>
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> >>        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359)
> >>        at
> >>
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> >>        at
> >>
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
> >>        at
> >>
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
> >>        at
> >>
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
> >>        at
> >>
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> >>        at
> >>
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> >>        at
> >>
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> >>        at
> >>
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> >>        at org.eclipse.jetty.server.Server.handle(Server.java:365)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
> >>        at
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
> >>        at
> >>
> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
> >>        at
> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
> >>        at
> >> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> >>        at
> >>
> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> >>        at
> >>
> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> >>        at
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> >>        at
> >>
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> >>        at java.lang.Thread.run(Unknown Source)
> >> Caused by: java.lang.ClassCastException: java.io.StringReader cannot be
> >> cast to java.util.Iterator
> >>        at
> >>
> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59)
> >>        ... 41 more
> >>
> >>
> >>
> >> data-config.xml
> >>
> >> <dataConfig>
> >>        <dataSource type="BinURLDataSource" name="dataFile"/>
> >>        <dataSource type="BinURLDataSource" name="dataUrl"/>
> >>        <dataSource type="URLDataSource" name="main"/>
> >>        <dataSource type="FieldReaderDataSource" name="fld"/>
> >> <document>
> >> <entity name="rec" processor="XPathEntityProcessor"
> >>
> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
> >> forEach="/docs/doc" dataSource="main">
> >>                <field column="title" xpath="//title" />
> >>                <field column="id" xpath="//id" />
> >>                <field column="file" xpath="//file" />
> >>                <field column="url" xpath="//url" />
> >>                <field column="urlParse" xpath="//urlParse" />
> >>                <field column="last_modified" xpath="//last_modified" />
> >>                <field column="Author" xpath="//author" />
> >>
> >>                <entity name="tika" processor="TikaEntityProcessor"
> >> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html">
> >>                        <field column="text"/>
> >>
> >>                        <entity name="detail" type="XPathEntityProcessor"
> >> forEach="/html" dataSource="fld" dataField="${tika.text}"
> rootEntity="true"
> >> onError="skip">
> >>                                <field xpath="//h1" column="h_1" />
> >>                        </entity>
> >>                </entity>
> >>        </entity>
> >> </document>
> >> </dataConfig>
>
>

Reply via email to