Hello, I have been trying to find out why DIH in FileListEntityProcessor mode did not appear to be recursing into subdirectories. Going through FileListEntityProcessor.java, I eventually tumbled to the fact that my fileName filter setting from data-config.xml was also being applied to directory names.
<entity name="jc" processor="FileListEntityProcessor" fileName=".*\.xml" newerThan="'NOW-1000DAYS'" recursive="true" rootEntity="false" dataSource="null" baseDir="/Volumes/spare/ts/stuff/ford"> Now, I feel that the fileName filter should be applied to files fed into the parser; it should not be applied to the directory names we are recursing through. I bodged the code as follows to adjust the behavior so that the "fileName" and "excludes" attributes of "entity" only apply to filenames and not directory names. It now recurses through my directory tree, only indexing the appropriate files! I think the new behavior is more standard. Is this a valid change? Regards Fergus. --- /Volumes/spare/ts/apache-solr-nightlyjan23/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java 2009-02-01 18:19:38.000000000 +0000 +++ /Volumes/spare/ts/apache-solr-nightlyjan29/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java 2008-10-02 20:38:30.000000000 +0100 @@ -85,10 +85,11 @@ if (r != null) recursive = Boolean.parseBoolean(r); excludes = context.getEntityAttribute(EXCLUDES); - if (excludes != null) { + if (excludes != null) excludes = resolver.replaceTokens(excludes); + if (excludes != null) excludesPattern = Pattern.compile(excludes); - } + } private Date getDate(String dateStr) { @@ -139,41 +140,42 @@ return getFromRowCache(); while (true) { Map<String, Object> r = getNext(); - if (r != null) r = applyTransformer(r); - return r; + if (r == null) + return null; + r = applyTransformer(r); + if (r != null) + return r; } } private void getFolderFiles(File dir, final List<Map<String, Object>> fileDetails) { - // Fetch an array of file objects that pass the filter, however the - // returned array is never populated; accept() always returns false. - // Rather we make use of the fileDetails array which is populated as - // a side affect of the accept method.
dir.list(new FilenameFilter() { public boolean accept(File dir, String name) { - File fileObj = new File(dir, name); - LOG.info("Testing acceptance of dir:"+dir +" name:"+name); - if (fileObj.isDirectory()) { - LOG.info(" Recursing into directory "+fileObj); - if (recursive) getFolderFiles(fileObj, fileDetails); - } - else if (fileNamePattern == null) { + if (fileNamePattern == null) { addDetails(fileDetails, dir, name); - } - else if (fileNamePattern.matcher(name).find()) { - if (excludesPattern != null && excludesPattern.matcher(name).find()) return false; + return false; + } + if (fileNamePattern.matcher(name).find()) { + if (excludesPattern != null && excludesPattern.matcher(name).find()) + return false; addDetails(fileDetails, dir, name); - } - return false; } - }); - } + + return false; + } + }); + } private void addDetails(List<Map<String, Object>> files, File dir, String name) { Map<String, Object> details = new HashMap<String, Object>(); File aFile = new File(dir, name); - if (aFile.isDirectory()) return; + if (aFile.isDirectory()) { + if (!recursive) + return; + getFolderFiles(aFile, files); + return; + } long sz = aFile.length(); Date lastModified = new Date(aFile.lastModified()); if (biggerThan != -1 && sz <= biggerThan) -- =============================================================== Fergus McMenemie Email:fer...@twig.me.uk Techmore Ltd Phone:(UK) 07721 376021 Unix/Mac/Intranets Analyst Programmer ===============================================================