Hi,
I created a custom ZIPURLDataSource class to download and unzip a zipped XML file from an HTTP URL, and it seems to be working (at least I get no errors), but no data is imported. Here is my configuration in rss-data-config.xml: <dataConfig> <dataSource type="ZIPURLDataSource" connectionTimeout="15000" readTimeout="30000"/> <document> <entity name="cve-2002" pk="link" url="https://nvd.nist.gov/feeds/xml/cve/nvdcve-2.0-2002.xml.zip" processor="XPathEntityProcessor" forEach="/nvd/entry" transformer="DateFormatTransformer"> <field column="id" xpath="/nvd/entry/@id" commonField="true" /> <field column="cve" xpath="/nvd/entry/cve-id" commonField="true" /> <field column="cwe" xpath="/nvd/entry/cwe/@id" commonField="true" /> <field column="vulnerable-configuration" xpath="/nvd/entry/vulnerable-configuration/logical-test/fact-ref/@name" commonField="false" /> <field column="vulnerable-software" xpath="/nvd/entry/vulnerable-software-list/product" commonField="false" /> <field column="published" xpath="/nvd/entry/published-datetime" commonField="false" /> <field column="modified" xpath="/nvd/entry/last-modified-datetime" commonField="false" /> <field column="summary" xpath="/nvd/entry/summary" commonField="false" /> </entity> </document> </dataConfig> Attached is the ZIPURLDataSource.java file. It does unzip and save the raw XML to disk, and I have verified that the saved file is valid XML. 
The file has one or more entries (here is an example): <nvd xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:patch="http://scap.nist.gov/schema/patch/0.1" xmlns:vuln="http://scap.nist.gov/schema/vulnerability/0.4" xmlns:cvss="http://scap.nist.gov/schema/cvss-v2/0.2" xmlns:cpe-lang="http://cpe.mitre.org/language/2.0" xmlns="http://scap.nist.gov/schema/feed/vulnerability/2.0" pub_date="2015-01-10T05:37:05" xsi:schemaLocation="http://scap.nist.gov/schema/patch/0.1 http://nvd.nist.gov/schema/patch_0.1.xsd http://scap.nist.gov/schema/scap-core/0.1 http://nvd.nist.gov/schema/scap-core_0.1.xsd http://scap.nist.gov/schema/feed/vulnerability/2.0 http://nvd.nist.gov/schema/nvd-cve-feed_2.0.xsd" nvd_xml_version="2.0"> <entry id="CVE-1999-0001"> <vuln:vulnerable-configuration id="http://nvd.nist.gov/"> <cpe-lang:logical-test operator="OR" negate="false"> <cpe-lang:fact-ref name="cpe:/o:bsdi:bsd_os:3.1"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.0"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.1"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.1.5.1"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.2"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0.5"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.5"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.6"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.6.1"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.7"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.7.1"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.3"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.4"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.5"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.6"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.8"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:3.0"/> 
<cpe-lang:fact-ref name="cpe:/o:openbsd:openbsd:2.3"/> <cpe-lang:fact-ref name="cpe:/o:openbsd:openbsd:2.4"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.2"/> <cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0.1"/> </cpe-lang:logical-test> </vuln:vulnerable-configuration> <vuln:vulnerable-software-list> <vuln:product>cpe:/o:freebsd:freebsd:2.2.8</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:1.1.5.1</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.2.3</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.2.2</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.2.5</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.2.4</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.0.5</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.2.6</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.1.6.1</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.0.1</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.2</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.0</vuln:product> <vuln:product>cpe:/o:openbsd:openbsd:2.3</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:3.0</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:1.1</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.1.6</vuln:product> <vuln:product>cpe:/o:openbsd:openbsd:2.4</vuln:product> <vuln:product>cpe:/o:bsdi:bsd_os:3.1</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:1.0</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.1.7</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:1.2</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.1.5</vuln:product> <vuln:product>cpe:/o:freebsd:freebsd:2.1.7.1</vuln:product> </vuln:vulnerable-software-list> <vuln:cve-id>CVE-1999-0001</vuln:cve-id> <vuln:published-datetime>1999-12-30T00:00:00.000-05:00</vuln:published-datetime> <vuln:last-modified-datetime>2010-12-16T00:00:00.000-05:00</vuln:last-modified-datetime> <vuln:cvss> <cvss:base_metrics> <cvss:score>5.0</cvss:score> <cvss:access-vector>NETWORK</cvss:access-vector> 
<cvss:access-complexity>LOW</cvss:access-complexity> <cvss:authentication>NONE</cvss:authentication> <cvss:confidentiality-impact>NONE</cvss:confidentiality-impact> <cvss:integrity-impact>NONE</cvss:integrity-impact> <cvss:availability-impact>PARTIAL</cvss:availability-impact> <cvss:source>http://nvd.nist.gov</cvss:source> <cvss:generated-on-datetime>2004-01-01T00:00:00.000-05:00</cvss:generated-on-datetime> </cvss:base_metrics> </vuln:cvss> <vuln:cwe id="CWE-20"/> <vuln:references reference_type="UNKNOWN" xml:lang="en"> <vuln:source>OSVDB</vuln:source> <vuln:reference href="http://www.osvdb.org/5707" xml:lang="en">5707</vuln:reference> </vuln:references> <vuln:references reference_type="UNKNOWN" xml:lang="en"> <vuln:source>CONFIRM</vuln:source> <vuln:reference href="http://www.openbsd.org/errata23.html#tcpfix" xml:lang="en">http://www.openbsd.org/errata23.html#tcpfix</vuln:reference> </vuln:references> <vuln:summary>ip_input.c in BSD-derived TCP/IP implementations allows remote attackers to cause a denial of service (crash or hang) via crafted packets.</vuln:summary> </entry> Here is the curl command: curl http://127.0.0.1:8983/solr/nvd-rss/dataimport?command=full-import And here is the output from the console for Jetty: main{StandardDirectoryReader(segments_1:1:nrt)} 2407 [coreLoadExecutor-5-thread-1] INFO org.apache.solr.core.CoreContainer registering core: nvd-rss 2409 [main] INFO org.apache.solr.servlet.SolrDispatchFilter user.dir=/Users/carlroberts/dev/solr-4.10.3/example 2409 [main] INFO org.apache.solr.servlet.SolrDispatchFilter SolrDispatchFilter.init() done 2431 [main] INFO org.eclipse.jetty.server.AbstractConnector Started SocketConnector@0.0.0.0:8983 2450 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore [nvd-rss] webapp=null path=null params={event=firstSearcher&q=static+firstSearcher+warming+in+solrconfig.xml&distrib=false} hits=0 status=0 QTime=43 2451 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore QuerySenderListener 
done. 2451 [searcherExecutor-6-thread-1] INFO org.apache.solr.handler.component.SpellCheckComponent Loading spell index for spellchecker: default 2451 [searcherExecutor-6-thread-1] INFO org.apache.solr.handler.component.SpellCheckComponent Loading spell index for spellchecker: wordbreak 2452 [searcherExecutor-6-thread-1] INFO org.apache.solr.handler.component.SuggestComponent Loading suggester index for: mySuggester 2452 [searcherExecutor-6-thread-1] INFO org.apache.solr.spelling.suggest.SolrSuggester reload() 2452 [searcherExecutor-6-thread-1] INFO org.apache.solr.spelling.suggest.SolrSuggester build() 2459 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore [nvd-rss] Registered new searcher Searcher@df9e84e[nvd-rss] main{StandardDirectoryReader(segments_1:1:nrt)} 8371 [qtp1640586218-17] INFO org.apache.solr.handler.dataimport.DataImporter Loading DIH Configuration: rss-data-config.xml 8379 [qtp1640586218-17] INFO org.apache.solr.handler.dataimport.DataImporter Data Configuration loaded successfully 8383 [Thread-15] INFO org.apache.solr.handler.dataimport.DataImporter Starting Full Import 8384 [qtp1640586218-17] INFO org.apache.solr.core.SolrCore [nvd-rss] webapp=/solr path=/dataimport params={command=full-import} status=0 QTime=15 8396 [Thread-15] INFO org.apache.solr.handler.dataimport.SimplePropertiesWriter Read dataimport.properties 23431 [commitScheduler-8-thread-1] INFO org.apache.solr.update.UpdateHandler start commit{,optimize=false,openSearcher=false,waitSearcher=true,expungeDeletes=false,softCommit=false,prepareCommit=false} 23431 [commitScheduler-8-thread-1] INFO org.apache.solr.update.UpdateHandler No uncommitted changes. Skipping IW.commit. 
23432 [commitScheduler-8-thread-1] INFO org.apache.solr.update.UpdateHandler end_commit_flush 47189 [Thread-15] INFO org.apache.solr.handler.dataimport.ZIPURLDataSource raw bytes={19485161} 47301 [Thread-15] INFO org.apache.solr.handler.dataimport.ZIPURLDataSource bytes available are {19485161} 47840 [Thread-15] INFO org.apache.solr.handler.dataimport.DocBuilder Import completed successfully 47840 [Thread-15] INFO org.apache.solr.update.UpdateHandler start commit{,optimize=false,openSearcher=true,waitSearcher=true,expungeDeletes=false,softCommit=false,prepareCommit=false} 47840 [Thread-15] INFO org.apache.solr.update.UpdateHandler No uncommitted changes. Skipping IW.commit. 47841 [Thread-15] INFO org.apache.solr.core.SolrCore SolrIndexSearcher has not changed - not re-opening: org.apache.solr.search.SolrIndexSearcher 47841 [Thread-15] INFO org.apache.solr.update.UpdateHandler end_commit_flush Can someone please help me figure out why the data is not being imported? Perhaps I missed something? Regards, Joe
package org.apache.solr.handler.dataimport; import java.util.zip.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.net.URL; import java.net.URLConnection; import java.nio.charset.StandardCharsets; import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.io.EOFException; public class ZIPURLDataSource extends DataSource<Reader> { Logger LOG = LoggerFactory.getLogger(ZIPURLDataSource.class); private String baseUrl; private String encoding; private int connectionTimeout = CONNECTION_TIMEOUT; private int readTimeout = READ_TIMEOUT; private Context context; private Properties initProps; public ZIPURLDataSource(){ super(); } @Override public void init(Context context, Properties initProps) { this.context = context; this.initProps = initProps; baseUrl = getInitPropWithReplacements(BASE_URL); if (getInitPropWithReplacements(ENCODING) != null) encoding = getInitPropWithReplacements(ENCODING); String cTimeout = getInitPropWithReplacements(CONNECTION_TIMEOUT_FIELD_NAME); String rTimeout = getInitPropWithReplacements(READ_TIMEOUT_FIELD_NAME); if (cTimeout != null) { try { connectionTimeout = Integer.parseInt(cTimeout); } catch (NumberFormatException e) { LOG.warn("Invalid connection timeout: " + cTimeout); } } if (rTimeout != null) { try { readTimeout = Integer.parseInt(rTimeout); } catch (NumberFormatException e) { LOG.warn("Invalid read timeout: " + rTimeout); } } } @Override public Reader getData(String query) { URL url = null; try { if (URIMETHOD.matcher(query).find()) url = new URL(query); else url = new URL(baseUrl + query); LOG.debug("Accessing URL: " + url.toString()); URLConnection conn = url.openConnection(); conn.setConnectTimeout(connectionTimeout); conn.setReadTimeout(readTimeout); InputStream in = conn.getInputStream(); if (in == null){ LOG.info("Invalid InputStream {" + in + "}"); return null; } InputStream bis = unzip(in); in.close(); in = bis; if (in != null){ 
LOG.info("bytes available are {" + in.available() + "}"); }else{ LOG.info("Invalid InputStream {" + in + "}"); return null; } String enc = encoding; if (enc == null) { String cType = conn.getContentType(); if (cType != null) { Matcher m = CHARSET_PATTERN.matcher(cType); if (m.find()) { enc = m.group(1); } } } if (enc == null) enc = UTF_8; DataImporter.QUERY_COUNT.get().incrementAndGet(); return new InputStreamReader(in, enc); } catch (Exception e) { LOG.error("Exception thrown while getting data", e); throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "Exception in invoking url " + url, e); } } @Override public void close() { } private InputStream unzip(InputStream in) throws Exception{ ZipInputStream zin = null; ZipEntry entry = null; try{ zin = new ZipInputStream(in); //loop only once while((entry = zin.getNextEntry()) != null){ byte raw[] = new byte[1024]; int read = 0; ByteArrayOutputStream bos = new ByteArrayOutputStream(); while ((read = zin.read(raw)) != -1) { bos.write(raw, 0, read); } bos.close(); zin.closeEntry(); raw = bos.toByteArray(); LOG.info("raw bytes={" + raw.length + "}"); putBinaryToFile(raw, "raw.xml"); return new ByteArrayInputStream(raw); } }finally{ if (zin != null){ try{ zin.close(); }catch(Exception e){} } } return null; } public String getBaseUrl() { return baseUrl; } private String getInitPropWithReplacements(String propertyName) { final String expr = initProps.getProperty(propertyName); if (expr == null) { return null; } return context.replaceTokens(expr); } private void putBinaryToFile(byte[] buf, String fileName) throws IOException { putBinaryToFile(buf, 0, buf.length, fileName); } private void putBinaryToFile(byte[] buf, int off, int len, String fileName) throws IOException { FileOutputStream fos = null; BufferedOutputStream out = null; try{ fos = new FileOutputStream(fileName); out = new BufferedOutputStream(fos); out.write(buf, off, len); }finally{ if (out != null){ try{ out.close(); }catch(Exception e){ 
LOG.error(e.getMessage()); } } if (fos != null){ try{ fos.close(); }catch(Exception e){ LOG.error(e.getMessage()); } } } } static final Pattern URIMETHOD = Pattern.compile("\\w{3,}:/"); private static final Pattern CHARSET_PATTERN = Pattern.compile(".*?charset=(.*)$", Pattern.CASE_INSENSITIVE); public static final String ENCODING = "encoding"; public static final String BASE_URL = "baseUrl"; public static final String UTF_8 = StandardCharsets.UTF_8.name(); public static final String CONNECTION_TIMEOUT_FIELD_NAME = "connectionTimeout"; public static final String READ_TIMEOUT_FIELD_NAME = "readTimeout"; public static final int CONNECTION_TIMEOUT = 5000; public static final int READ_TIMEOUT = 10000; }