How to index only the pdf content/text

Croci Francesco Luigi (ID SWS) Tue, 25 Mar 2014 02:58:24 -0700

I was looking for a way to index only the content/text part of a PDF (without 
all the other fields Tika creates), and I found the "solution" of setting 
"uprefix" = ignored_ together with <dynamicField name="ignored_*" 
type="ignored" multiValued="true" indexed="false" stored="false" />.


The problem is that uprefix only applies to fields that are not defined in the 
schema. In my schema I defined two fields (id and rmDocumentTitle), and these 
two fields are added to the content too (which I want to avoid).

How can I prevent these two fields from being added to fullText?

Here are my config files:

schema.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Schema "simple": declares the fields signatureField, id, rmDocumentTitle
     and fullText, plus an ignored_* dynamic field. -->
<schema name="simple" version="1.1">
  <types>
    <fieldtype name="string" class="solr.StrField" postingsFormat="SimpleText" />
    <!-- Type used by the ignored_* dynamic field declared under <fields>. -->
    <fieldtype name="ignored" class="solr.TextField" />
    <fieldtype name="text" class="solr.TextField" postingsFormat="SimpleText">
      <!-- Analysis chain applied when documents are indexed. -->
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <!-- Disabled. Would map alphabetic, numeric and symbolic Unicode
             characters outside the first 127 ASCII characters to their ASCII
             equivalents where one exists. -->
        <!--<filter class="solr.ASCIIFoldingFilterFactory"/>-->
        <!-- Lowercase the letters of every token; non-letter tokens pass through. -->
        <filter class="solr.LowerCaseFilterFactory" />
        <!-- Strip whitespace from both ends of each token. -->
        <filter class="solr.TrimFilterFactory"/>
        <!-- Drop the common words listed in stopwords.txt. -->
        <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
        <filter class="solr.PorterStemFilterFactory"/>
        <!-- Disabled alternative stemmer. -->
        <!--<filter class="solr.SnowballPorterFilterFactory" language="German2" /> -->
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <!-- Analysis chain applied to query text. -->
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.TrimFilterFactory"/>
        <filter class="solr.PorterStemFilterFactory"/>
        <!-- Disabled alternative stemmer. -->
        <!--<filter class="solr.SnowballPorterFilterFactory" language="German2" /> -->
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldtype>
  </types>

  <fields>
    <field name="signatureField" type="string" indexed="true" stored="true" multiValued="false" />
    <!-- Any field whose name starts with ignored_ is neither indexed nor stored. -->
    <dynamicField name="ignored_*" type="ignored" multiValued="true" indexed="false" stored="false" />
    <field name="id" type="string" indexed="true" stored="true" multiValued="false" />
    <field name="rmDocumentTitle" type="string" indexed="true" stored="true" multiValued="true"/>
    <field name="fullText" indexed="true" type="text" multiValued="true" />
  </fields>

  <!-- Queries without an explicit field are run against fullText. -->
  <defaultSearchField>fullText</defaultSearchField>

  <solrQueryParser defaultOperator="OR" />
  <uniqueKey>id</uniqueKey>
</schema>


solrconfig.xml
<?xml version="1.0" encoding="UTF-8" ?>
<config>
  ...
  <!-- Extraction endpoint: default parameters applied to every
       /update/extract request, which is then handed to the "deduplication"
       update chain defined below. -->
  <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler">
    <lst name="defaults">
      <!-- Listed once; a second identical captureAttr entry is redundant. -->
      <str name="captureAttr">true</str>
      <str name="lowernames">false</str>
      <str name="overwrite">false</str>
      <str name="literalsOverride">true</str>
      <!-- Prefix for fields that are not defined in the schema, so that they
           fall under the ignored_* dynamic field. -->
      <str name="uprefix">ignored_</str>
      <str name="fmap.a">link</str>
      <!-- Send the extracted content into the fullText field. -->
      <str name="fmap.content">fullText</str>
      <!-- the configuration here could be useful for tests -->
      <str name="update.chain">deduplication</str>
    </lst>
  </requestHandler>

  <!-- Computes a signature per document and stores it in signatureField;
       overwriteDupes is false, so matching documents are not overwritten. -->
  <updateRequestProcessorChain name="deduplication">
    <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
      <bool name="overwriteDupes">false</bool>
      <str name="signatureField">signatureField</str>
      <bool name="enabled">true</bool>
      <!-- NOTE(review): the extract handler above maps content to fullText
           (fmap.content). Confirm that a field named "content" still reaches
           this chain; otherwise the signature has no input and this should
           probably name fullText instead. -->
      <str name="fields">content</str>
      <str name="minTokenLen">10</str>
      <str name="quantRate">.2</str>
      <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
    </processor>
    <processor class="solr.LogUpdateProcessorFactory" />
    <processor class="solr.RunUpdateProcessorFactory" />
  </updateRequestProcessorChain>

  <requestHandler name="/admin/"
                  class="org.apache.solr.handler.admin.AdminHandlers" />

  <lockType>none</lockType>

  <admin>
    <defaultQuery>*:*</defaultQuery>
  </admin>
</config>


Thank you for any help.
Francesco

How to index only the pdf content/text

Reply via email to