Maybe you are confusing things by mixing instructions - there are SEPARATE
instructions for directly using SolrCell and implicitly using it via
post.jar. Pick which you want and stick with it. DO NOT MIX the
instructions.
You wrote: " I run that command:
java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
523387.pdf"
Was there a GOOD reason that you chose that URL?
Best to stay with what the post.jar wiki recommends:
Post all CSV, XML, JSON and PDF documents using AUTO mode which detects type
based on file name:
java -Dauto -jar post.jar *.csv *.xml *.json *.pdf
Or, stick with SolrCell directly, but follow its distinct instructions:
http://wiki.apache.org/solr/ExtractingRequestHandler
Again, DO NOT MIX the instructions from the two.
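post.jar is designed so that you do not need to know or care exactly how
rich document indexing works. In auto mode it adds the parameters SolrCell
needs (such as literal.id, which populates your required "id" field) for you;
posting straight to /update/extract without them is why the document arrives
with no uniqueKey.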
-- Jack Krupansky
-----Original Message-----
From: Furkan KAMACI
Sent: Friday, April 26, 2013 5:30 AM
To: solr-user@lucene.apache.org
Subject: Document is missing mandatory uniqueKey field: id for Solr PDF
indexing
I use Solr 4.2.1 and these are my fields:
<field name="id" type="string" indexed="true" stored="true" required="true"
multiValued="false" />
<field name="text" type="text_general" indexed="true" stored="true"/>
<!-- Common metadata fields, named specifically to match up with
SolrCell metadata when parsing rich documents such as Word, PDF.
Some fields are multiValued only because Tika currently may return
multiple values for them. Some metadata is parsed from the documents,
but there are some which come from the client context:
"content_type": From the HTTP headers of incoming stream
"resourcename": From SolrCell request param resource.name
-->
<field name="title" type="text_general" indexed="true" stored="true"
multiValued="true"/>
<field name="subject" type="text_general" indexed="true" stored="true"/>
<field name="description" type="text_general" indexed="true" stored="true"/>
<field name="comments" type="text_general" indexed="true" stored="true"/>
<field name="author" type="text_general" indexed="true" stored="true"/>
<field name="keywords" type="text_general" indexed="true" stored="true"/>
<field name="category" type="text_general" indexed="true" stored="true"/>
<field name="resourcename" type="text_general" indexed="true"
stored="true"/>
<field name="url" type="text_general" indexed="true" stored="true"/>
<field name="content_type" type="string" indexed="true" stored="true"
multiValued="true"/>
<field name="last_modified" type="date" indexed="true" stored="true"/>
<field name="links" type="string" indexed="true" stored="true"
multiValued="true"/>
<!-- Main body of document extracted by SolrCell.
NOTE: This field is not indexed by default, since it is also copied to "text"
using copyField below. This is to save space. Use this field for returning and
highlighting document content. Use the "text" field to search the content.
-->
<field name="content" type="text_general" indexed="false" stored="true"
multiValued="true"/>
<!-- catchall field, containing all other searchable text fields
(implemented via copyField further on in this schema) -->
<!--
<field name="text" type="text_general" indexed="true" stored="false"
multiValued="true"/>
-->
<!-- catchall text field that indexes tokens both normally and in reverse
for efficient
leading wildcard queries. -->
<field name="text_rev" type="text_general_rev" indexed="true"
stored="false" multiValued="true"/>
<!-- non-tokenized version of manufacturer to make it easier to sort or
group
results by manufacturer. copied from "manu" via copyField -->
<field name="manu_exact" type="string" indexed="true" stored="false"/>
<field name="payloads" type="payloads" indexed="true" stored="true"/>
<field name="_version_" type="long" indexed="true" stored="true"/>
I run that command:
java -Durl=http://localhost:8983/solr/update/extract -jar post.jar
523387.pdf
However, I get this error. Any ideas?
Apr 26, 2013 12:26:51 PM org.apache.solr.common.SolrException log
SEVERE: org.apache.solr.common.SolrException: Document is missing mandatory
uniqueKey field: id
at
org.apache.solr.update.AddUpdateCommand.getIndexedId(AddUpdateCommand.java:88)
at
org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:464)
at
org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
at
org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.doAdd(ExtractingDocumentLoader.java:121)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.addDoc(ExtractingDocumentLoader.java:126)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1817)
at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:639)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
at org.eclipse.jetty.server.Server.handle(Server.java:365)
at
org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
at
org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
at
org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937)
at
org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856)
at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
at
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
at
org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
at java.lang.Thread.run(Thread.java:722)