It looks like maybe you do not have "apache-solr-dataimporthandler-extras.jar" in your classpath.
James Dyer E-Commerce Systems Ingram Content Group (615) 213-4311 -----Original Message----- From: anarchos78 [mailto:rigasathanasio...@hotmail.com] Sent: Friday, May 11, 2012 11:00 AM To: solr-user@lucene.apache.org Subject: Re: Indexing data from pdf Now I am getting the following: *From Solr:* <response> <lst name="responseHeader"> <int name="status">0</int> <int name="QTime">1</int> </lst><lst name="initArgs"> <lst name="defaults"> <str name="config">data-config.xml</str> </lst> </lst><str name="command">full-import</str> <str name="status">idle</str> <str name="importResponse"/> <lst name="statusMessages"> <str name="Time Elapsed">0:0:4.231</str> <str name="Total Requests made to DataSource">0</str> <str name="Total Rows Fetched">1</str> <str name="Total Documents Processed">0</str> <str name="Total Documents Skipped">0</str> <str name="Full Dump Started">2012-05-11 18:43:30</str> <str name="">Indexing failed. Rolled back all changes.</str> <str name="Rolledback">2012-05-11 18:43:30</str></lst><str name="WARNING">This response format is experimental. It is likely to change in the future.</str> </response> *The log file:* org.apache.solr.update.processor.LogUpdateProcessor finish INFO: {deleteByQuery=*:*} 0 4 11 Μαϊ 2012 6:55:28 μμ org.apache.solr.common.SolrException log SEVERE: Full Import failed:java.lang.RuntimeException: java.lang.RuntimeException: org.apache.solr.handler.dataimport.DataImportHandlerException: Unable to load EntityProcessor implementation for entity:tika Processing Document # 1 at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:264) at org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:375) at org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:445) at org.apache.solr.handler.dataimport.DataImporter$1.run(DataImporter.java:426) Caused by: java.lang.RuntimeException: org.apache.solr.handler.dataimport.DataImportHandlerException: Unable to load EntityProcessor implementation for entity:tika Processing Document # 1 at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:621) at org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:327) at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:225) ... 3 more Caused by: org.apache.solr.handler.dataimport.DataImportHandlerException: Unable to load EntityProcessor implementation for entity:tika Processing Document # 1 at org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow(DataImportHandlerException.java:72) at org.apache.solr.handler.dataimport.DocBuilder.getEntityProcessor(DocBuilder.java:915) at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:635) at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:709) at org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:619) ... 5 more Caused by: java.lang.ClassNotFoundException: Unable to load TikaEntityProcessor or org.apache.solr.handler.dataimport.TikaEntityProcessor at org.apache.solr.handler.dataimport.DocBuilder.loadClass(DocBuilder.java:1110) at org.apache.solr.handler.dataimport.DocBuilder.getEntityProcessor(DocBuilder.java:912) ... 8 more Caused by: org.apache.solr.common.SolrException: Error loading class 'TikaEntityProcessor' at org.apache.solr.core.SolrResourceLoader.findClass(SolrResourceLoader.java:394) at org.apache.solr.handler.dataimport.DocBuilder.loadClass(DocBuilder.java:1100) ... 9 more Caused by: java.lang.ClassNotFoundException: TikaEntityProcessor at java.net.URLClassLoader$1.run(Unknown Source) at java.security.AccessController.doPrivileged(Native Method) at java.net.URLClassLoader.findClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at java.net.FactoryURLClassLoader.loadClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at java.lang.Class.forName0(Native Method) at java.lang.Class.forName(Unknown Source) at org.apache.solr.core.SolrResourceLoader.findClass(SolrResourceLoader.java:378) ... 10 more *The data-config.xml:* <?xml version="1.0" encoding="utf-8"?> <dataConfig> <dataSource type="BinFileDataSource" name="binary" /> <document> <entity name="f" dataSource="binary" rootEntity="false" processor="FileListEntityProcessor" baseDir="/solr/solr/docu/" fileName=".*pdf" recursive="true"> <entity name="tika" processor="TikaEntityProcessor" url="${f.fileAbsolutePath}" format="text"> <field column="id" name="id" meta="true" /> <field column="fake_id" name="fake_id" /> <field column="model" name="model" meta="true" /> <field column="text" name="biog" /> </entity> </entity> </document> </dataConfig> *The solrconfig.xml:* <?xml version="1.0" encoding="UTF-8" ?> <config> <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError> <luceneMatchVersion>LUCENE_36</luceneMatchVersion> <lib dir="lib/dist/" regex="apache-solr-cell-\d.*\.jar" /> <lib dir="lib/contrib/extraction/lib/" regex=".*\.jar" /> <lib dir="lib/dist/" regex="apache-solr-clustering-\d.*\.jar" /> <lib dir="lib/contrib/clustering/lib/" regex=".*\.jar" /> <lib dir="lib/dist/" regex="apache-solr-dataimporthandler-\d.*\.jar" /> <lib dir="lib/contrib/dataimporthandler/lib/" regex=".*\.jar" /> <lib dir="lib/dist/" regex="apache-solr-langid-\d.*\.jar" /> <lib dir="lib/contrib/langid/lib/" regex=".*\.jar" /> <lib dir="lib/dist/" regex="apache-solr-velocity-\d.*\.jar" /> <lib dir="lib/contrib/velocity/lib/" regex=".*\.jar" /> <lib dir="lib/contrib/extraction/lib/" /> guration. --> <dataDir>${solr.data.dir:}</dataDir> <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/> <indexConfig> </indexConfig> <jmx /> <updateHandler class="solr.DirectUpdateHandler2"> </updateHandler> <query> <maxBooleanClauses>1024</maxBooleanClauses> <filterCache class="solr.FastLRUCache" size="512" initialSize="512" autowarmCount="0"/> <queryResultCache class="solr.LRUCache" size="512" initialSize="512" autowarmCount="0"/> <documentCache class="solr.LRUCache" size="512" initialSize="512" autowarmCount="0"/> <enableLazyFieldLoading>true</enableLazyFieldLoading> <queryResultWindowSize>20</queryResultWindowSize> <queryResultMaxDocsCached>200</queryResultMaxDocsCached> <listener event="newSearcher" class="solr.QuerySenderListener"> <arr name="queries"> </arr> </listener> <listener event="firstSearcher" class="solr.QuerySenderListener"> <arr name="queries"> <lst> <str name="q">static firstSearcher warming in solrconfig.xml</str> </lst> </arr> </listener> <useColdSearcher>false</useColdSearcher> <maxWarmingSearchers>2</maxWarmingSearchers> </query> <requestDispatcher> <requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048000" /> <httpCaching never304="true" /> </requestDispatcher> <requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler"> <lst name="defaults"> <str name="config">data-config.xml</str> </lst> </requestHandler> <requestHandler name="/select" class="solr.SearchHandler"> <lst name="defaults"> <str name="echoParams">explicit</str> <int name="rows">100</int> <str name="df">biog</str> </lst> </requestHandler> <requestHandler name="/browse" class="solr.SearchHandler"> <lst name="defaults"> <str name="echoParams">explicit</str> <str name="wt">velocity</str> <str name="v.template">browse</str> <str name="v.layout">layout</str> <str name="title">Solritas</str> <str name="df">text</str> <str name="defType">edismax</str> <str name="q.alt">*:*</str> <str name="rows">10</str> <str name="fl">*,score</str> <str name="mlt.qf"> text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 </str> <str name="mlt.fl">text,features,name,sku,id,manu,cat</str> <int name="mlt.count">3</int> <str name="qf"> text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 </str> <str name="facet">on</str> <str name="facet.field">cat</str> <str name="facet.field">manu_exact</str> <str name="facet.query">ipod</str> <str name="facet.query">GB</str> <str name="facet.mincount">1</str> <str name="facet.pivot">cat,inStock</str> <str name="facet.range.other">after</str> <str name="facet.range">price</str> <int name="f.price.facet.range.start">0</int> <int name="f.price.facet.range.end">600</int> <int name="f.price.facet.range.gap">50</int> <str name="facet.range">popularity</str> <int name="f.popularity.facet.range.start">0</int> <int name="f.popularity.facet.range.end">10</int> <int name="f.popularity.facet.range.gap">3</int> <str name="facet.range">manufacturedate_dt</str> <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str> <str name="f.manufacturedate_dt.facet.range.end">NOW</str> <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str> <str name="f.manufacturedate_dt.facet.range.other">before</str> <str name="f.manufacturedate_dt.facet.range.other">after</str> <str name="hl">on</str> <str name="hl.fl">text features name</str> <str name="f.name.hl.fragsize">0</str> <str name="f.name.hl.alternateField">name</str> </lst> <arr name="last-components"> <str>spellcheck</str> </arr> </requestHandler> <requestHandler name="/update" class="solr.XmlUpdateRequestHandler"> </requestHandler> <requestHandler name="/update/javabin" class="solr.BinaryUpdateRequestHandler" /> <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" /> <requestHandler name="/update/json" class="solr.JsonUpdateRequestHandler" startup="lazy" /> <requestHandler name="/update/extract" startup="lazy" class="solr.extraction.ExtractingRequestHandler" > <lst name="defaults"> <str name="fmap.content">text</str> <str name="lowernames">true</str> <str name="uprefix">ignored_</str> <str name="captureAttr">true</str> <str name="fmap.a">links</str> <str name="fmap.div">ignored_</str> </lst> </requestHandler> <requestHandler name="/update/xslt" startup="lazy" class="solr.XsltUpdateRequestHandler"/> <requestHandler name="/analysis/field" startup="lazy" class="solr.FieldAnalysisRequestHandler" /> <requestHandler name="/analysis/document" class="solr.DocumentAnalysisRequestHandler" startup="lazy" /> <requestHandler name="/admin/" class="solr.admin.AdminHandlers" /> <requestHandler name="/admin/ping" class="solr.PingRequestHandler"> <lst name="invariants"> <str name="q">solrpingquery</str> </lst> <lst name="defaults"> <str name="echoParams">all</str> </lst> </requestHandler> <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" > <lst name="defaults"> <str name="echoParams">explicit</str> <str name="echoHandler">true</str> </lst> </requestHandler> <searchComponent name="spellcheck" class="solr.SpellCheckComponent"> <str name="queryAnalyzerFieldType">textSpell</str> <lst name="spellchecker"> <str name="name">default</str> <str name="field">name</str> <str name="spellcheckIndexDir">spellchecker</str> </lst> </searchComponent> <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy"> <lst name="defaults"> <str name="df">text</str> <str name="spellcheck.onlyMorePopular">false</str> <str name="spellcheck.extendedResults">false</str> <str name="spellcheck.count">1</str> </lst> <arr name="last-components"> <str>spellcheck</str> </arr> </requestHandler> <searchComponent name="tvComponent" class="solr.TermVectorComponent"/> <requestHandler name="/tvrh" class="solr.SearchHandler" startup="lazy"> <lst name="defaults"> <str name="df">text</str> <bool name="tv">true</bool> </lst> <arr name="last-components"> <str>tvComponent</str> </arr> </requestHandler> <searchComponent name="clustering" enable="${solr.clustering.enabled:false}" class="solr.clustering.ClusteringComponent" > <lst name="engine"> <str name="name">default</str> <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str> <str name="carrot.lexicalResourcesDir">clustering/carrot2</str> <str name="MultilingualClustering.defaultLanguage">ENGLISH</str> </lst> <lst name="engine"> <str name="name">stc</str> <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str> </lst> </searchComponent> <requestHandler name="/clustering" startup="lazy" enable="${solr.clustering.enabled:false}" class="solr.SearchHandler"> <lst name="defaults"> <bool name="clustering">true</bool> <str name="clustering.engine">default</str> <bool name="clustering.results">true</bool> <str name="carrot.title">name</str> <str name="carrot.url">id</str> <str name="carrot.snippet">features</str> <bool name="carrot.produceSummary">true</bool> <bool name="carrot.outputSubClusters">false</bool> <str name="df">text</str> <str name="defType">edismax</str> <str name="qf"> text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 </str> <str name="q.alt">*:*</str> <str name="rows">10</str> <str name="fl">*,score</str> </lst> <arr name="last-components"> <str>clustering</str> </arr> </requestHandler> <searchComponent name="terms" class="solr.TermsComponent"/> <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy"> <lst name="defaults"> <bool name="terms">true</bool> </lst> <arr name="components"> <str>terms</str> </arr> </requestHandler> <searchComponent name="elevator" class="solr.QueryElevationComponent" > <str name="queryFieldType">string</str> <str name="config-file">elevate.xml</str> </searchComponent> <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy"> <lst name="defaults"> <str name="echoParams">explicit</str> <str name="df">text</str> </lst> <arr name="last-components"> <str>elevator</str> </arr> </requestHandler> <searchComponent class="solr.HighlightComponent" name="highlight"> <highlighting> <fragmenter name="gap" default="true" class="solr.highlight.GapFragmenter"> <lst name="defaults"> <int name="hl.fragsize">100</int> </lst> </fragmenter> <fragmenter name="regex" class="solr.highlight.RegexFragmenter"> <lst name="defaults"> <int name="hl.fragsize">70</int> <float name="hl.regex.slop">0.5</float> <str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str> </lst> </fragmenter> <formatter name="html" default="true" class="solr.highlight.HtmlFormatter"> <lst name="defaults"> <str name="hl.simple.pre"></str> <str name="hl.simple.post"></str> </lst> </formatter> <encoder name="html" class="solr.highlight.HtmlEncoder" /> <fragListBuilder name="simple" default="true" class="solr.highlight.SimpleFragListBuilder"/> <fragListBuilder name="single" class="solr.highlight.SingleFragListBuilder"/> <fragmentsBuilder name="default" default="true" class="solr.highlight.ScoreOrderFragmentsBuilder"> </fragmentsBuilder> <fragmentsBuilder name="colored" class="solr.highlight.ScoreOrderFragmentsBuilder"> <lst name="defaults"> <str name="hl.tag.pre"></str> <str name="hl.tag.post"></str> </lst> </fragmentsBuilder> <boundaryScanner name="default" default="true" class="solr.highlight.SimpleBoundaryScanner"> <lst name="defaults"> <str name="hl.bs.maxScan">10</str> <str name="hl.bs.chars">.,!? 	 </str> </lst> </boundaryScanner> <boundaryScanner name="breakIterator" class="solr.highlight.BreakIteratorBoundaryScanner"> <lst name="defaults"> <str name="hl.bs.type">WORD</str> <str name="hl.bs.language">en</str> <str name="hl.bs.country">US</str> </lst> </boundaryScanner> </highlighting> </searchComponent> <queryResponseWriter name="json" class="solr.JSONResponseWriter"> <str name="content-type">text/plain; charset=UTF-8</str> </queryResponseWriter> <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/> <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter"> <int name="xsltCacheLifetimeSeconds">5</int> </queryResponseWriter> <admin> <defaultQuery>*:*</defaultQuery> </admin> </config> -- View this message in context: http://lucene.472066.n3.nabble.com/Indexing-data-from-pdf-tp3979876p3980346.html Sent from the Solr - User mailing list archive at Nabble.com.