Hi,
I created a custom ZIPURLDataSource class that downloads a zipped XML file
from an HTTP(S) URL and unzips it. It seems to be working (at least I get
no errors), but no data is imported.
Here is my configuration in rss-data-config.xml:
<dataConfig>
<!-- Custom data source (ZIPURLDataSource). Its init() reads connectionTimeout
     and readTimeout, parses them as integers and hands them to
     URLConnection.setConnectTimeout / setReadTimeout (milliseconds). -->
<dataSource type="ZIPURLDataSource" connectionTimeout="15000"
readTimeout="30000"/>
<document>
<!-- One entity for the zipped NVD CVE 2002 feed; one record is expected per
     /nvd/entry element (forEach).
     NOTE(review): pk="link" names a column that no <field> below declares -
     confirm it is intended.
     NOTE(review): DateFormatTransformer is listed, but no field carries a
     dateTimeFormat attribute - confirm the transformer is actually needed. -->
<entity name="cve-2002"
pk="link"
url="https://nvd.nist.gov/feeds/xml/cve/nvdcve-2.0-2002.xml.zip"
processor="XPathEntityProcessor"
forEach="/nvd/entry"
transformer="DateFormatTransformer">
<!-- The xpaths are written without namespace prefixes, although the feed
     elements use vuln:/cpe-lang: prefixes.
     NOTE(review): id, cve and cwe are marked commonField="true" although they
     are located inside each /nvd/entry record - verify against the DIH
     documentation that carrying these values across records is intended. -->
<field column="id" xpath="/nvd/entry/@id" commonField="true" />
<field column="cve" xpath="/nvd/entry/cve-id" commonField="true" />
<field column="cwe" xpath="/nvd/entry/cwe/@id" commonField="true" />
<!-- Repeating elements: each entry can contain many fact-ref and product
     nodes (see the sample entry), so these columns are presumably
     multi-valued - confirm against the schema. -->
<field column="vulnerable-configuration"
xpath="/nvd/entry/vulnerable-configuration/logical-test/fact-ref/@name"
commonField="false" />
<field column="vulnerable-software"
xpath="/nvd/entry/vulnerable-software-list/product" commonField="false" />
<!-- Timestamps; in the sample they look like 1999-12-30T00:00:00.000-05:00. -->
<field column="published" xpath="/nvd/entry/published-datetime"
commonField="false" />
<field column="modified" xpath="/nvd/entry/last-modified-datetime"
commonField="false" />
<field column="summary" xpath="/nvd/entry/summary" commonField="false" />
</entity>
</document>
</dataConfig>
Attached is the ZIPURLDataSource.java file.
It actually unzips and saves the raw XML to disk, which I have verified to be a
valid XML file. The file has one or more entries (here is an excerpt with a
single entry; the closing </nvd> tag is omitted):
<nvd xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.1"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:patch="http://scap.nist.gov/schema/patch/0.1"
xmlns:vuln="http://scap.nist.gov/schema/vulnerability/0.4"
xmlns:cvss="http://scap.nist.gov/schema/cvss-v2/0.2"
xmlns:cpe-lang="http://cpe.mitre.org/language/2.0"
xmlns="http://scap.nist.gov/schema/feed/vulnerability/2.0"
pub_date="2015-01-10T05:37:05"
xsi:schemaLocation="http://scap.nist.gov/schema/patch/0.1
http://nvd.nist.gov/schema/patch_0.1.xsd
http://scap.nist.gov/schema/scap-core/0.1
http://nvd.nist.gov/schema/scap-core_0.1.xsd
http://scap.nist.gov/schema/feed/vulnerability/2.0
http://nvd.nist.gov/schema/nvd-cve-feed_2.0.xsd" nvd_xml_version="2.0">
<entry id="CVE-1999-0001">
<vuln:vulnerable-configuration id="http://nvd.nist.gov/">
<cpe-lang:logical-test operator="OR" negate="false">
<cpe-lang:fact-ref name="cpe:/o:bsdi:bsd_os:3.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.0"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.1.5.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.2"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0.5"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.5"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.6"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.6.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.7"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.7.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.3"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.4"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.5"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.6"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.8"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:3.0"/>
<cpe-lang:fact-ref name="cpe:/o:openbsd:openbsd:2.3"/>
<cpe-lang:fact-ref name="cpe:/o:openbsd:openbsd:2.4"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.2"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0.1"/>
</cpe-lang:logical-test>
</vuln:vulnerable-configuration>
<vuln:vulnerable-software-list>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.8</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:1.1.5.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.3</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.2</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.5</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.4</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.0.5</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.6</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.6.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.0.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.0</vuln:product>
<vuln:product>cpe:/o:openbsd:openbsd:2.3</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:3.0</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:1.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.6</vuln:product>
<vuln:product>cpe:/o:openbsd:openbsd:2.4</vuln:product>
<vuln:product>cpe:/o:bsdi:bsd_os:3.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:1.0</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.7</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:1.2</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.5</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.7.1</vuln:product>
</vuln:vulnerable-software-list>
<vuln:cve-id>CVE-1999-0001</vuln:cve-id>
<vuln:published-datetime>1999-12-30T00:00:00.000-05:00</vuln:published-datetime>
<vuln:last-modified-datetime>2010-12-16T00:00:00.000-05:00</vuln:last-modified-datetime>
<vuln:cvss>
<cvss:base_metrics>
<cvss:score>5.0</cvss:score>
<cvss:access-vector>NETWORK</cvss:access-vector>
<cvss:access-complexity>LOW</cvss:access-complexity>
<cvss:authentication>NONE</cvss:authentication>
<cvss:confidentiality-impact>NONE</cvss:confidentiality-impact>
<cvss:integrity-impact>NONE</cvss:integrity-impact>
<cvss:availability-impact>PARTIAL</cvss:availability-impact>
<cvss:source>http://nvd.nist.gov</cvss:source>
<cvss:generated-on-datetime>2004-01-01T00:00:00.000-05:00</cvss:generated-on-datetime>
</cvss:base_metrics>
</vuln:cvss>
<vuln:cwe id="CWE-20"/>
<vuln:references reference_type="UNKNOWN" xml:lang="en">
<vuln:source>OSVDB</vuln:source>
<vuln:reference href="http://www.osvdb.org/5707"
xml:lang="en">5707</vuln:reference>
</vuln:references>
<vuln:references reference_type="UNKNOWN" xml:lang="en">
<vuln:source>CONFIRM</vuln:source>
<vuln:reference href="http://www.openbsd.org/errata23.html#tcpfix"
xml:lang="en">http://www.openbsd.org/errata23.html#tcpfix</vuln:reference>
</vuln:references>
<vuln:summary>ip_input.c in BSD-derived TCP/IP implementations allows
remote attackers to cause a denial of service (crash or hang) via
crafted packets.</vuln:summary>
</entry>
Here is the curl command:
curl http://127.0.0.1:8983/solr/nvd-rss/dataimport?command=full-import
And here is the output from the console for Jetty:
main{StandardDirectoryReader(segments_1:1:nrt)}
2407 [coreLoadExecutor-5-thread-1] INFO
org.apache.solr.core.CoreContainer registering core: nvd-rss
2409 [main] INFO org.apache.solr.servlet.SolrDispatchFilter
user.dir=/Users/carlroberts/dev/solr-4.10.3/example
2409 [main] INFO org.apache.solr.servlet.SolrDispatchFilter
SolrDispatchFilter.init() done
2431 [main] INFO org.eclipse.jetty.server.AbstractConnector Started
[email protected]:8983
2450 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore
[nvd-rss] webapp=null path=null
params={event=firstSearcher&q=static+firstSearcher+warming+in+solrconfig.xml&distrib=false}
hits=0 status=0 QTime=43
2451 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore
QuerySenderListener done.
2451 [searcherExecutor-6-thread-1] INFO
org.apache.solr.handler.component.SpellCheckComponent Loading spell
index for spellchecker: default
2451 [searcherExecutor-6-thread-1] INFO
org.apache.solr.handler.component.SpellCheckComponent Loading spell
index for spellchecker: wordbreak
2452 [searcherExecutor-6-thread-1] INFO
org.apache.solr.handler.component.SuggestComponent Loading suggester
index for: mySuggester
2452 [searcherExecutor-6-thread-1] INFO
org.apache.solr.spelling.suggest.SolrSuggester reload()
2452 [searcherExecutor-6-thread-1] INFO
org.apache.solr.spelling.suggest.SolrSuggester build()
2459 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore
[nvd-rss] Registered new searcher Searcher@df9e84e[nvd-rss]
main{StandardDirectoryReader(segments_1:1:nrt)}
8371 [qtp1640586218-17] INFO
org.apache.solr.handler.dataimport.DataImporter Loading DIH
Configuration: rss-data-config.xml
8379 [qtp1640586218-17] INFO
org.apache.solr.handler.dataimport.DataImporter Data Configuration
loaded successfully
8383 [Thread-15] INFO org.apache.solr.handler.dataimport.DataImporter
Starting Full Import
8384 [qtp1640586218-17] INFO org.apache.solr.core.SolrCore [nvd-rss]
webapp=/solr path=/dataimport params={command=full-import} status=0 QTime=15
8396 [Thread-15] INFO
org.apache.solr.handler.dataimport.SimplePropertiesWriter Read
dataimport.properties
23431 [commitScheduler-8-thread-1] INFO
org.apache.solr.update.UpdateHandler start
commit{,optimize=false,openSearcher=false,waitSearcher=true,expungeDeletes=false,softCommit=false,prepareCommit=false}
23431 [commitScheduler-8-thread-1] INFO
org.apache.solr.update.UpdateHandler No uncommitted changes. Skipping
IW.commit.
23432 [commitScheduler-8-thread-1] INFO
org.apache.solr.update.UpdateHandler end_commit_flush
47189 [Thread-15] INFO
org.apache.solr.handler.dataimport.ZIPURLDataSource raw bytes={19485161}
47301 [Thread-15] INFO
org.apache.solr.handler.dataimport.ZIPURLDataSource bytes available
are {19485161}
47840 [Thread-15] INFO org.apache.solr.handler.dataimport.DocBuilder
Import completed successfully
47840 [Thread-15] INFO org.apache.solr.update.UpdateHandler start
commit{,optimize=false,openSearcher=true,waitSearcher=true,expungeDeletes=false,softCommit=false,prepareCommit=false}
47840 [Thread-15] INFO org.apache.solr.update.UpdateHandler No
uncommitted changes. Skipping IW.commit.
47841 [Thread-15] INFO org.apache.solr.core.SolrCore SolrIndexSearcher
has not changed - not re-opening: org.apache.solr.search.SolrIndexSearcher
47841 [Thread-15] INFO org.apache.solr.update.UpdateHandler
end_commit_flush
Can someone please help me figure out why the data is not being
imported? Perhaps I missed something?
Regards,
Joe
package org.apache.solr.handler.dataimport;
import java.util.zip.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.EOFException;
/**
 * A data source that fetches a ZIP archive from a URL and exposes the first
 * file entry of that archive as a {@link Reader}.
 *
 * <p>The archive entry is buffered completely in memory before it is handed to
 * the caller, so the heap must be able to hold the uncompressed entry.
 *
 * <p>Supported init properties: {@value #BASE_URL}, {@value #ENCODING},
 * {@value #CONNECTION_TIMEOUT_FIELD_NAME} and {@value #READ_TIMEOUT_FIELD_NAME}.
 */
public class ZIPURLDataSource extends DataSource<Reader> {
  // Shared, immutable logger. Package-private visibility is kept from the
  // original declaration so same-package code that referenced it still compiles.
  static final Logger LOG = LoggerFactory.getLogger(ZIPURLDataSource.class);

  /** Matches a leading URI scheme such as {@code http:/}; used to detect absolute URLs. */
  static final Pattern URIMETHOD = Pattern.compile("\\w{3,}:/");

  /** Extracts the value following {@code charset=} from a Content-Type header. */
  private static final Pattern CHARSET_PATTERN =
      Pattern.compile(".*?charset=(.*)$", Pattern.CASE_INSENSITIVE);

  public static final String ENCODING = "encoding";
  public static final String BASE_URL = "baseUrl";
  public static final String UTF_8 = StandardCharsets.UTF_8.name();
  public static final String CONNECTION_TIMEOUT_FIELD_NAME = "connectionTimeout";
  public static final String READ_TIMEOUT_FIELD_NAME = "readTimeout";
  public static final int CONNECTION_TIMEOUT = 5000;
  public static final int READ_TIMEOUT = 10000;

  /** Relative path (resolved against the process working directory) of the diagnostic dump. */
  private static final String DEBUG_DUMP_FILE = "raw.xml";

  private static final int COPY_BUFFER_SIZE = 8192;

  private String baseUrl;
  private String encoding;
  private int connectionTimeout = CONNECTION_TIMEOUT;
  private int readTimeout = READ_TIMEOUT;
  private Context context;
  private Properties initProps;

  public ZIPURLDataSource() {
    super();
  }

  /**
   * Reads the configuration of this data source.
   *
   * <p>Timeout values that are not non-negative integers are reported with a
   * warning and the previously effective value is kept.
   *
   * @param context   the import context, used to resolve tokens in property values
   * @param initProps the attributes configured for this data source
   */
  @Override
  public void init(Context context, Properties initProps) {
    this.context = context;
    this.initProps = initProps;

    baseUrl = getInitPropWithReplacements(BASE_URL);

    String configuredEncoding = getInitPropWithReplacements(ENCODING);
    if (configuredEncoding != null) {
      encoding = configuredEncoding;
    }

    connectionTimeout =
        readTimeoutProperty(CONNECTION_TIMEOUT_FIELD_NAME, "connection", connectionTimeout);
    readTimeout = readTimeoutProperty(READ_TIMEOUT_FIELD_NAME, "read", readTimeout);
  }

  /**
   * Downloads the ZIP archive addressed by {@code query}, unzips its first file
   * entry and returns the content as a character stream.
   *
   * <p>The character encoding is, in order of precedence: the configured
   * {@value #ENCODING} property, the {@code charset} parameter of the response
   * Content-Type header, UTF-8.
   *
   * @param query an absolute URL, or a suffix that is appended to the configured base URL
   * @return a reader over the unzipped entry, or {@code null} when no stream
   *         could be obtained or the archive contains no file entry
   * @throws DataImportHandlerException (severe) wrapping any failure while
   *         connecting, downloading or unzipping
   */
  @Override
  public Reader getData(String query) {
    URL url = null;
    try {
      url = resolveUrl(query);
      LOG.debug("Accessing URL: {}", url);

      URLConnection conn = url.openConnection();
      conn.setConnectTimeout(connectionTimeout);
      conn.setReadTimeout(readTimeout);

      InputStream remote = conn.getInputStream();
      if (remote == null) {
        LOG.warn("Invalid InputStream {null}");
        return null;
      }

      InputStream unzipped;
      try {
        unzipped = unzip(remote);
      } finally {
        // unzip() already closes the stream through its ZipInputStream; closing
        // again is harmless and guarantees release on every path.
        closeQuietly(remote);
      }
      if (unzipped == null) {
        LOG.warn("Invalid InputStream {null}");
        return null;
      }
      // String concatenation is kept here because the literal braces are part
      // of the established log message.
      LOG.info("bytes available are {" + unzipped.available() + "}");

      String enc = encoding;
      if (enc == null) {
        enc = charsetFromContentType(conn.getContentType());
      }
      if (enc == null) {
        enc = UTF_8;
      }

      DataImporter.QUERY_COUNT.get().incrementAndGet();
      return new InputStreamReader(unzipped, enc);
    } catch (Exception e) {
      LOG.error("Exception thrown while getting data", e);
      // When the URL itself could not be built, report the query instead of "null".
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "Exception in invoking url " + (url != null ? url.toString() : query), e);
    }
  }

  /** Nothing to release: every stream is closed or fully buffered inside {@link #getData}. */
  @Override
  public void close() {
  }

  /** @return the configured base URL, or {@code null} when none was configured */
  public String getBaseUrl() {
    return baseUrl;
  }

  /** Builds the URL to fetch: {@code query} itself when it carries a scheme, else base URL + query. */
  private URL resolveUrl(String query) throws IOException {
    if (URIMETHOD.matcher(query).find()) {
      return new URL(query);
    }
    // A missing base URL used to be concatenated as the text "null"; leave it out
    // so the resulting MalformedURLException names the real input.
    return new URL(baseUrl == null ? query : baseUrl + query);
  }

  /** Returns the charset named in a Content-Type header value, or {@code null} if there is none. */
  private static String charsetFromContentType(String contentType) {
    if (contentType == null) {
      return null;
    }
    Matcher m = CHARSET_PATTERN.matcher(contentType);
    return m.find() ? m.group(1) : null;
  }

  /**
   * Parses a timeout property.
   *
   * @param propertyName name of the init property
   * @param label        word used in the warning message ("connection" or "read")
   * @param fallback     value returned when the property is absent or invalid
   */
  private int readTimeoutProperty(String propertyName, String label, int fallback) {
    String value = getInitPropWithReplacements(propertyName);
    if (value == null) {
      return fallback;
    }
    try {
      int parsed = Integer.parseInt(value.trim());
      if (parsed >= 0) {
        return parsed;
      }
      // Negative values are rejected by URLConnection; fall through to the warning.
    } catch (NumberFormatException e) {
      // Reported below together with the negative-value case.
    }
    LOG.warn("Invalid " + label + " timeout: " + value);
    return fallback;
  }

  /**
   * Reads the first file entry of the ZIP archive in {@code in} into memory.
   * Directory entries are skipped because they carry no content.
   *
   * <p>The given stream is closed before this method returns.
   *
   * @return a stream over the uncompressed bytes, or {@code null} when the
   *         archive contains no file entry
   */
  private InputStream unzip(InputStream in) throws IOException {
    ZipInputStream zin = new ZipInputStream(in);
    try {
      ZipEntry entry;
      // getNextEntry() closes the current entry before moving to the next one.
      while ((entry = zin.getNextEntry()) != null) {
        if (entry.isDirectory()) {
          continue;
        }
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buffer = new byte[COPY_BUFFER_SIZE];
        int read;
        while ((read = zin.read(buffer)) != -1) {
          bos.write(buffer, 0, read);
        }
        byte[] raw = bos.toByteArray();
        LOG.info("raw bytes={" + raw.length + "}");
        dumpForDiagnostics(raw);
        return new ByteArrayInputStream(raw);
      }
      return null;
    } finally {
      closeQuietly(zin);
    }
  }

  /**
   * Writes the unzipped bytes to {@value #DEBUG_DUMP_FILE} so the content can
   * be inspected. This is a diagnostic aid only, so a write failure is logged
   * and must not abort the import.
   */
  private void dumpForDiagnostics(byte[] raw) {
    try {
      putBinaryToFile(raw, DEBUG_DUMP_FILE);
    } catch (IOException e) {
      LOG.warn("Could not write diagnostic dump " + DEBUG_DUMP_FILE, e);
    }
  }

  /** Looks up an init property and resolves tokens in it via the context; {@code null} if unset. */
  private String getInitPropWithReplacements(String propertyName) {
    final String expr = initProps.getProperty(propertyName);
    if (expr == null) {
      return null;
    }
    return context.replaceTokens(expr);
  }

  /** Writes the whole buffer to {@code fileName}, replacing any existing file. */
  private void putBinaryToFile(byte[] buf, String fileName) throws IOException {
    putBinaryToFile(buf, 0, buf.length, fileName);
  }

  /**
   * Writes {@code len} bytes of {@code buf}, starting at {@code off}, to
   * {@code fileName}. Errors raised while closing are propagated, because a
   * failed close can mean the file is incomplete.
   */
  private void putBinaryToFile(byte[] buf, int off, int len, String fileName)
      throws IOException {
    try (OutputStream out = new FileOutputStream(fileName)) {
      out.write(buf, off, len);
    }
  }

  /** Closes {@code resource} if non-null; a failure is logged at debug level and otherwise ignored. */
  private static void closeQuietly(Closeable resource) {
    if (resource == null) {
      return;
    }
    try {
      resource.close();
    } catch (IOException e) {
      LOG.debug("Ignoring failure while closing stream", e);
    }
  }
}