I am trying to index a slew of web pages but want to restrict what gets indexed

I'm trying to use the DataImportHandler to do this.

My initial config to test this approach isn't doing what I expect:


<dataConfig>
  <!-- File-based data source; referenced by the inner entity to open each HTML file. -->
  <dataSource type="FileDataSource" name="myfilereader"/>
  <document>
    <!-- Outer entity: lists files under baseDir (recursively) whose names match
         fileName and that are newer than the last index time. It needs no data
         source of its own, and rootEntity="false" leaves document creation to
         the nested entity. -->
    <entity name="jcurrent"
            processor="FileListEntityProcessor"
            dataSource="null"
            baseDir="/var/www/web/A10078"
            fileName=".*html"
            recursive="true"
            newerThan="${dataimporter.last_index_time}"
            rootEntity="false">

      <!-- Inner entity: reads the file reported by the outer entity and emits
           one record per /html/body element.
           NOTE(review): dataField is normally paired with a FieldReaderDataSource;
           with a FileDataSource it is presumably ignored. Confirm. -->
      <entity name="x"
              processor="XPathEntityProcessor"
              dataSource="myfilereader"
              url="${jcurrent.fileAbsolutePath}"
              forEach="/html/body"
              stream="false"
              dataField="text">
        <!-- NOTE(review): if the <p> elements contain nested markup, the field may
             need flatten="true" to collect their text. Confirm against the
             XPathEntityProcessor documentation. -->
        <field column="p" xpath="//p"/>
      </entity>
    </entity>
  </document>
</dataConfig>

The FileListEntityProcessor is feeding me the files as expected.

But the XPathEntityProcessor is only processing one <p>, and it's coming up empty?

"entity:jcurrent",
    [
      null,
      "----------- row #1-------------",
      "file",
      "A10078.html",
      "fileSize",
      43635,
      "fileLastModified",
      "2015-08-12T22:44:19Z",
      "fileDir",
      "/var/www/web/A10078",
      "fileAbsolutePath",
      "/var/www/web/A10078/A10078.html",
      null,
      "---------------------------------------------",
      "entity:x",
      [
        "document#1",
        [
          "query",
          "/var/www/web/A10078/A10078.html",
          "time-taken",
          "0:0:0.0",
          null,
          "----------- row #1-------------",
          "p",
          "",
          "$forEach",
          "/html/body",
          null,
          "---------------------------------------------"
        ],
        "document#1",
        []
      ]
    ]
  ],

Reply via email to