Hi All,
I need to index the documents present in my file system at various
locations (e.g. C:\docs, d:\docs).
Is there any way I can specify this in my DIH
configuration?
Here is my configuration:-
<document>
  <!-- Outer entity "sd": lists the files found under baseDir whose names match
       the fileName regular expression. recursive="false" keeps the listing to
       that one directory level.
       NOTE(review): baseDir names a single directory here; to cover several
       locations (e.g. C:\docs and D:\docs) the usual approach is one such
       entity per directory. Confirm against the DIH documentation. -->
  <entity name="sd"
          processor="FileListEntityProcessor"
          fileName="docx$|doc$|pdf$|xls$|xlsx$|html$|rtf$|txt$|zip$"
          baseDir="G:\\Desktop\\"
          recursive="false"
          rootEntity="true"
          transformer="DateFormatTransformer"
          onerror="continue">
    <!-- Nested entity "tikatest": for each file row produced by "sd", reads the
         file at its absolute path through the "bin" data source (declared
         outside this snippet) and extracts it as plain text. -->
    <entity name="tikatest"
            processor="org.apache.solr.handler.dataimport.TikaEntityProcessor"
            url="${sd.fileAbsolutePath}"
            format="text"
            dataSource="bin">
      <field column="Author" name="author" meta="true"/>
      <!-- The Content-Type metadata is what ends up in "title"; the mapping of
           the real title is commented out just below. -->
      <field column="Content-Type" name="title" meta="true"/>
      <!-- field column="title" name="title" meta="true"/ -->
      <field column="text" name="all_text"/>
    </entity>
    <!-- field column="fileLastModified" name="date"
         dateTimeFormat="yyyy-MM-dd'T'hh:mm:ss" / -->
    <field column="fileSize" name="size"/>
    <field column="file" name="filename"/>
  </entity>
  <!--baseDir="../site"-->
</document>
/ Pankaj Bhatt.