I am trying to index a slew of web pages, but I want to restrict what gets
indexed.
I'm trying to use a DataImportHandler to do this.
My initial config to test this approach isn't doing what I expect:
<dataConfig>
  <!-- File-based data source; the inner entity "x" refers to it by name. -->
  <dataSource type="FileDataSource" name="myfilereader"/>
  <document>
    <!--
      Outer entity: enumerates files under baseDir whose names match the
      fileName regex, descending into subdirectories (recursive="true").
      newerThan limits the listing relative to the last import time, and
      rootEntity="false" marks this entity as not being the document root,
      so rows of the nested entity are what get indexed.
      dataSource="null" because this entity only lists files.
    -->
    <entity name="jcurrent"
            processor="FileListEntityProcessor"
            dataSource="null"
            baseDir="/var/www/web/A10078"
            fileName=".*html"
            recursive="true"
            newerThan="${dataimporter.last_index_time}"
            rootEntity="false">
      <!--
        Inner entity: opens each listed file via its absolute path and emits
        one row per /html/body element.
        NOTE(review): dataField is documented for FieldReaderDataSource; it is
        presumably ignored with FileDataSource. TODO confirm.
      -->
      <entity name="x"
              processor="XPathEntityProcessor"
              dataSource="myfilereader"
              url="${jcurrent.fileAbsolutePath}"
              forEach="/html/body"
              stream="false"
              dataField="text">
        <!--
          Collects the text of p elements into column "p".
          NOTE(review): XPathEntityProcessor supports only a subset of XPath,
          and without flatten="true" text held in child elements of p is
          presumably not included. Verify against the DIH documentation; the
          input must also be well-formed XML for the parser to read it.
        -->
        <field column="p" xpath="//p"/>
      </entity>
    </entity>
  </document>
</dataConfig>
The FileListEntityProcessor is feeding me the files as expected,
but the XPathEntityProcessor is only processing one <p>, and it's
coming up empty:
"entity:jcurrent",
[
null,
"----------- row #1-------------",
"file",
"A10078.html",
"fileSize",
43635,
"fileLastModified",
"2015-08-12T22:44:19Z",
"fileDir",
"/var/www/web/A10078",
"fileAbsolutePath",
"/var/www/web/A10078/A10078.html",
null,
"---------------------------------------------",
"entity:x",
[
"document#1",
[
"query",
"/var/www/web/A10078/A10078.html",
"time-taken",
"0:0:0.0",
null,
"----------- row #1-------------",
"p",
"",
"$forEach",
"/html/body",
null,
"---------------------------------------------"
],
"document#1",
[]
]
]
],