Hi,

I created a custom ZIPURLDataSource class to unzip the content from an
http URL for an XML ZIP file and it seems to be working (at least I have
no errors), but no data is imported.

Here is my configuration in rss-data-config.xml:

<dataConfig>
<dataSource type="ZIPURLDataSource" connectionTimeout="15000"
readTimeout="30000"/>
<document>
<entity name="cve-2002"
pk="link"
url="https://nvd.nist.gov/feeds/xml/cve/nvdcve-2.0-2002.xml.zip"
processor="XPathEntityProcessor"
forEach="/nvd/entry"
transformer="DateFormatTransformer">
<field column="id" xpath="/nvd/entry/@id" commonField="true" />
<field column="cve" xpath="/nvd/entry/cve-id" commonField="true" />
<field column="cwe" xpath="/nvd/entry/cwe/@id" commonField="true" />
<field column="vulnerable-configuration"
xpath="/nvd/entry/vulnerable-configuration/logical-test/fact-ref/@name"
commonField="false" />
<field column="vulnerable-software"
xpath="/nvd/entry/vulnerable-software-list/product" commonField="false" />
<field column="published" xpath="/nvd/entry/published-datetime"
commonField="false" />
<field column="modified" xpath="/nvd/entry/last-modified-datetime"
commonField="false" />
<field column="summary" xpath="/nvd/entry/summary" commonField="false" />
</entity>
</document>
</dataConfig>


Attached is the ZIPURLDataSource.java file.

It actually unzips and saves the raw XML to disk, which I have verified to be a 
valid XML file.  The file has one or more entries (here is an example):

<nvd xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.1"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:patch="http://scap.nist.gov/schema/patch/0.1"
xmlns:vuln="http://scap.nist.gov/schema/vulnerability/0.4"
xmlns:cvss="http://scap.nist.gov/schema/cvss-v2/0.2"
xmlns:cpe-lang="http://cpe.mitre.org/language/2.0"
xmlns="http://scap.nist.gov/schema/feed/vulnerability/2.0"
pub_date="2015-01-10T05:37:05"
xsi:schemaLocation="http://scap.nist.gov/schema/patch/0.1
http://nvd.nist.gov/schema/patch_0.1.xsd
http://scap.nist.gov/schema/scap-core/0.1
http://nvd.nist.gov/schema/scap-core_0.1.xsd
http://scap.nist.gov/schema/feed/vulnerability/2.0
http://nvd.nist.gov/schema/nvd-cve-feed_2.0.xsd" nvd_xml_version="2.0">
<entry id="CVE-1999-0001">
<vuln:vulnerable-configuration id="http://nvd.nist.gov/">
<cpe-lang:logical-test operator="OR" negate="false">
<cpe-lang:fact-ref name="cpe:/o:bsdi:bsd_os:3.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.0"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.1.5.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:1.2"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0.5"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.5"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.6"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.6.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.7"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.1.7.1"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.3"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.4"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.5"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.6"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.8"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:3.0"/>
<cpe-lang:fact-ref name="cpe:/o:openbsd:openbsd:2.3"/>
<cpe-lang:fact-ref name="cpe:/o:openbsd:openbsd:2.4"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.2.2"/>
<cpe-lang:fact-ref name="cpe:/o:freebsd:freebsd:2.0.1"/>
</cpe-lang:logical-test>
</vuln:vulnerable-configuration>
<vuln:vulnerable-software-list>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.8</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:1.1.5.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.3</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.2</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.5</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.4</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.0.5</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2.6</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.6.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.0.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.2</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.0</vuln:product>
<vuln:product>cpe:/o:openbsd:openbsd:2.3</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:3.0</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:1.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.6</vuln:product>
<vuln:product>cpe:/o:openbsd:openbsd:2.4</vuln:product>
<vuln:product>cpe:/o:bsdi:bsd_os:3.1</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:1.0</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.7</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:1.2</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.5</vuln:product>
<vuln:product>cpe:/o:freebsd:freebsd:2.1.7.1</vuln:product>
</vuln:vulnerable-software-list>
<vuln:cve-id>CVE-1999-0001</vuln:cve-id>
<vuln:published-datetime>1999-12-30T00:00:00.000-05:00</vuln:published-datetime>
<vuln:last-modified-datetime>2010-12-16T00:00:00.000-05:00</vuln:last-modified-datetime>
<vuln:cvss>
<cvss:base_metrics>
<cvss:score>5.0</cvss:score>
<cvss:access-vector>NETWORK</cvss:access-vector>
<cvss:access-complexity>LOW</cvss:access-complexity>
<cvss:authentication>NONE</cvss:authentication>
<cvss:confidentiality-impact>NONE</cvss:confidentiality-impact>
<cvss:integrity-impact>NONE</cvss:integrity-impact>
<cvss:availability-impact>PARTIAL</cvss:availability-impact>
<cvss:source>http://nvd.nist.gov</cvss:source>
<cvss:generated-on-datetime>2004-01-01T00:00:00.000-05:00</cvss:generated-on-datetime>
</cvss:base_metrics>
</vuln:cvss>
<vuln:cwe id="CWE-20"/>
<vuln:references reference_type="UNKNOWN" xml:lang="en">
<vuln:source>OSVDB</vuln:source>
<vuln:reference href="http://www.osvdb.org/5707"
xml:lang="en">5707</vuln:reference>
</vuln:references>
<vuln:references reference_type="UNKNOWN" xml:lang="en">
<vuln:source>CONFIRM</vuln:source>
<vuln:reference href="http://www.openbsd.org/errata23.html#tcpfix"
xml:lang="en">http://www.openbsd.org/errata23.html#tcpfix</vuln:reference>
</vuln:references>
<vuln:summary>ip_input.c in BSD-derived TCP/IP implementations allows
remote attackers to cause a denial of service (crash or hang) via
crafted packets.</vuln:summary>
</entry>


Here is the curl command:

curl http://127.0.0.1:8983/solr/nvd-rss/dataimport?command=full-import

And here is the output from the console for Jetty:

main{StandardDirectoryReader(segments_1:1:nrt)}
2407 [coreLoadExecutor-5-thread-1] INFO
org.apache.solr.core.CoreContainer – registering core: nvd-rss
2409 [main] INFO org.apache.solr.servlet.SolrDispatchFilter –
user.dir=/Users/carlroberts/dev/solr-4.10.3/example
2409 [main] INFO org.apache.solr.servlet.SolrDispatchFilter –
SolrDispatchFilter.init() done
2431 [main] INFO org.eclipse.jetty.server.AbstractConnector – Started
SocketConnector@0.0.0.0:8983
2450 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore –
[nvd-rss] webapp=null path=null
params={event=firstSearcher&q=static+firstSearcher+warming+in+solrconfig.xml&distrib=false}
hits=0 status=0 QTime=43
2451 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore –
QuerySenderListener done.
2451 [searcherExecutor-6-thread-1] INFO
org.apache.solr.handler.component.SpellCheckComponent – Loading spell
index for spellchecker: default
2451 [searcherExecutor-6-thread-1] INFO
org.apache.solr.handler.component.SpellCheckComponent – Loading spell
index for spellchecker: wordbreak
2452 [searcherExecutor-6-thread-1] INFO
org.apache.solr.handler.component.SuggestComponent – Loading suggester
index for: mySuggester
2452 [searcherExecutor-6-thread-1] INFO
org.apache.solr.spelling.suggest.SolrSuggester – reload()
2452 [searcherExecutor-6-thread-1] INFO
org.apache.solr.spelling.suggest.SolrSuggester – build()
2459 [searcherExecutor-6-thread-1] INFO org.apache.solr.core.SolrCore –
[nvd-rss] Registered new searcher Searcher@df9e84e[nvd-rss]
main{StandardDirectoryReader(segments_1:1:nrt)}
8371 [qtp1640586218-17] INFO
org.apache.solr.handler.dataimport.DataImporter – Loading DIH
Configuration: rss-data-config.xml
8379 [qtp1640586218-17] INFO
org.apache.solr.handler.dataimport.DataImporter – Data Configuration
loaded successfully
8383 [Thread-15] INFO org.apache.solr.handler.dataimport.DataImporter –
Starting Full Import
8384 [qtp1640586218-17] INFO org.apache.solr.core.SolrCore – [nvd-rss]
webapp=/solr path=/dataimport params={command=full-import} status=0 QTime=15
8396 [Thread-15] INFO
org.apache.solr.handler.dataimport.SimplePropertiesWriter – Read
dataimport.properties
23431 [commitScheduler-8-thread-1] INFO
org.apache.solr.update.UpdateHandler – start
commit{,optimize=false,openSearcher=false,waitSearcher=true,expungeDeletes=false,softCommit=false,prepareCommit=false}
23431 [commitScheduler-8-thread-1] INFO
org.apache.solr.update.UpdateHandler – No uncommitted changes. Skipping
IW.commit.
23432 [commitScheduler-8-thread-1] INFO
org.apache.solr.update.UpdateHandler – end_commit_flush
47189 [Thread-15] INFO
org.apache.solr.handler.dataimport.ZIPURLDataSource – raw bytes={19485161}
47301 [Thread-15] INFO
org.apache.solr.handler.dataimport.ZIPURLDataSource – bytes available
are {19485161}
47840 [Thread-15] INFO org.apache.solr.handler.dataimport.DocBuilder –
Import completed successfully
47840 [Thread-15] INFO org.apache.solr.update.UpdateHandler – start
commit{,optimize=false,openSearcher=true,waitSearcher=true,expungeDeletes=false,softCommit=false,prepareCommit=false}
47840 [Thread-15] INFO org.apache.solr.update.UpdateHandler – No
uncommitted changes. Skipping IW.commit.
47841 [Thread-15] INFO org.apache.solr.core.SolrCore – SolrIndexSearcher
has not changed - not re-opening: org.apache.solr.search.SolrIndexSearcher
47841 [Thread-15] INFO org.apache.solr.update.UpdateHandler –
end_commit_flush

Can someone please help me figure out why the data is not being
imported? Perhaps I missed something?

Regards,

Joe



package org.apache.solr.handler.dataimport;

import java.util.zip.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.EOFException;

/**
 * DataImportHandler DataSource that fetches a ZIP archive from an HTTP(S)
 * URL, decompresses the first file entry, and exposes its contents as a
 * {@link Reader} for an entity processor (e.g. XPathEntityProcessor).
 *
 * Configuration properties (read in {@link #init}): {@code baseUrl},
 * {@code encoding}, {@code connectionTimeout}, {@code readTimeout}.
 *
 * NOTE(review): the whole decompressed entry is buffered in memory before a
 * Reader is returned, so a feed of ~19MB (as seen in the logs) costs that
 * much heap per import. Streaming directly off the ZipInputStream would be
 * possible but changes close-ordering semantics, so buffering is kept.
 */
public class ZIPURLDataSource extends DataSource<Reader> {
  // One logger per class, not per instance.
  private static final Logger LOG = LoggerFactory.getLogger(ZIPURLDataSource.class);

  private String baseUrl;

  // Explicit charset for the decompressed XML; when null the HTTP
  // Content-Type charset (if any) is used, else UTF-8.
  private String encoding;

  private int connectionTimeout = CONNECTION_TIMEOUT;

  private int readTimeout = READ_TIMEOUT;

  private Context context;

  private Properties initProps;

  public ZIPURLDataSource() {
    super();
  }

  /**
   * Reads configuration from the data-config {@code <dataSource>} element.
   * Unparsable timeout values are logged and the defaults retained.
   */
  @Override
  public void init(Context context, Properties initProps) {
    this.context = context;
    this.initProps = initProps;

    baseUrl = getInitPropWithReplacements(BASE_URL);
    // Single lookup (the original resolved the property twice).
    encoding = getInitPropWithReplacements(ENCODING);
    connectionTimeout = parseTimeout(
        getInitPropWithReplacements(CONNECTION_TIMEOUT_FIELD_NAME),
        CONNECTION_TIMEOUT, "connection");
    readTimeout = parseTimeout(
        getInitPropWithReplacements(READ_TIMEOUT_FIELD_NAME),
        READ_TIMEOUT, "read");
  }

  /** Parses a timeout property; falls back to {@code defaultValue} when the
   *  value is absent or not a number (a warning is logged in that case). */
  private int parseTimeout(String value, int defaultValue, String label) {
    if (value == null) {
      return defaultValue;
    }
    try {
      return Integer.parseInt(value);
    } catch (NumberFormatException e) {
      LOG.warn("Invalid " + label + " timeout: " + value);
      return defaultValue;
    }
  }

  /**
   * Downloads the ZIP at {@code query} (resolved against {@code baseUrl}
   * when {@code query} is not already absolute), decompresses its first
   * file entry and returns a Reader over the XML.
   *
   * @return a Reader over the decompressed entry, or null when the archive
   *         contains no file entry
   * @throws DataImportHandlerException (SEVERE) on any I/O or URL failure
   */
  @Override
  public Reader getData(String query) {
    URL url = null;
    try {
      if (URIMETHOD.matcher(query).find()) {
        url = new URL(query);
      } else {
        url = new URL(baseUrl + query);
      }

      LOG.debug("Accessing URL: " + url);

      URLConnection conn = url.openConnection();
      conn.setConnectTimeout(connectionTimeout);
      conn.setReadTimeout(readTimeout);

      byte[] xml;
      // try-with-resources closes the HTTP stream exactly once (the
      // original closed it twice: once via ZipInputStream.close() inside
      // unzip(), then again explicitly).
      try (InputStream in = conn.getInputStream()) {
        xml = unzipFirstEntry(in);
      }
      if (xml == null) {
        LOG.warn("ZIP at " + url + " contained no file entry");
        return null;
      }
      LOG.info("bytes available are {" + xml.length + "}");

      String enc = encoding;
      if (enc == null) {
        // NOTE(review): this charset comes from the Content-Type of the ZIP
        // response, which describes the archive, not the inner XML — kept
        // for compatibility with the original behavior.
        String cType = conn.getContentType();
        if (cType != null) {
          Matcher m = CHARSET_PATTERN.matcher(cType);
          if (m.find()) {
            enc = m.group(1);
          }
        }
      }
      if (enc == null) {
        enc = UTF_8;
      }
      DataImporter.QUERY_COUNT.get().incrementAndGet();
      return new InputStreamReader(new ByteArrayInputStream(xml), enc);
    } catch (Exception e) {
      LOG.error("Exception thrown while getting data", e);
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
              "Exception in invoking url " + url, e);
    }
  }

  @Override
  public void close() {
  }

  /**
   * Decompresses the first regular file entry of the ZIP stream into a byte
   * array. Directory entries are skipped (the original returned the empty
   * content of a leading directory entry, which would have fed zero bytes
   * to the XML parser). Returns null when the archive has no file entries.
   */
  private byte[] unzipFirstEntry(InputStream in) throws IOException {
    try (ZipInputStream zin = new ZipInputStream(in)) {
      ZipEntry entry;
      while ((entry = zin.getNextEntry()) != null) {
        if (entry.isDirectory()) {
          zin.closeEntry();
          continue;
        }
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buf = new byte[8192];
        int read;
        while ((read = zin.read(buf)) != -1) {
          bos.write(buf, 0, read);
        }
        zin.closeEntry();
        byte[] raw = bos.toByteArray();
        LOG.info("raw bytes={" + raw.length + "}");
        if (LOG.isDebugEnabled()) {
          // Debug aid only: the original unconditionally dumped the whole
          // payload (~19MB) to the working directory on every import.
          putBinaryToFile(raw, "raw.xml");
        }
        return raw;
      }
    }
    return null;
  }

  public String getBaseUrl() {
    return baseUrl;
  }

  /** Resolves a config property with DIH variable substitution applied. */
  private String getInitPropWithReplacements(String propertyName) {
    final String expr = initProps.getProperty(propertyName);
    if (expr == null) {
      return null;
    }
    return context.replaceTokens(expr);
  }

  private void putBinaryToFile(byte[] buf, String fileName) throws IOException {
    putBinaryToFile(buf, 0, buf.length, fileName);
  }

  /** Writes {@code len} bytes of {@code buf} starting at {@code off} to
   *  {@code fileName}, replacing any existing file. */
  private void putBinaryToFile(byte[] buf, int off, int len, String fileName)
      throws IOException {
    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(fileName))) {
      out.write(buf, off, len);
    }
  }

  // Matches an absolute URI scheme (e.g. "http:/"), deciding whether the
  // query is used as-is or appended to baseUrl.
  static final Pattern URIMETHOD = Pattern.compile("\\w{3,}:/");

  private static final Pattern CHARSET_PATTERN =
      Pattern.compile(".*?charset=(.*)$", Pattern.CASE_INSENSITIVE);

  public static final String ENCODING = "encoding";

  public static final String BASE_URL = "baseUrl";

  public static final String UTF_8 = StandardCharsets.UTF_8.name();

  public static final String CONNECTION_TIMEOUT_FIELD_NAME =
      "connectionTimeout";

  public static final String READ_TIMEOUT_FIELD_NAME = "readTimeout";

  public static final int CONNECTION_TIMEOUT = 5000;

  public static final int READ_TIMEOUT = 10000;
}

Reply via email to