Never mind, I think I found my answer here:
http://www.mail-archive.com/solr-user@lucene.apache.org/msg34622.html

<http://www.mail-archive.com/solr-user@lucene.apache.org/msg34622.html>
I will add the HTML stripper to the data importer and see how that goes.

On Tue, Feb 15, 2011 at 3:43 PM, Tanner Postert <tanner.post...@gmail.com> wrote:

> I have several fields defined and one of the field types includes a
> solr.HTMLStripCharFilterFactory field in the analyzer but it doesn't
> appear to be affecting the field as I would expect.
> I have tried a simple:
>
> <charFilter class="solr.HTMLStripCharFilterFactory"/>
> followed by the tokenizer
> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>
> or the combined factory
>
> <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" />
>
> but neither seems to work.
>
> Search results returned from the webtitle and webdescription fields, as
> well as from text, still include the original HTML markup that the title
> and description fields contain.
>
> The relevant schema:
>
> <types>
> <fieldType name="string" class="solr.StrField" sortMissingLast="true"
> omitNorms="true"/>
>
> <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
>   <analyzer type="index">
>     <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" />
>
>     <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="stopwords.txt" enablePositionIncrements="true"/>
>
>     <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
> generateNumberParts="1" catenateWords="1" catenateNumbers="1"
> catenateAll="0" splitOnCaseChange="1"/>
>     <filter class="solr.LowerCaseFilterFactory"/>
>     <filter class="solr.SnowballPorterFilterFactory" language="English"
> protected="protwords.txt"/>
>   </analyzer>
>   <analyzer type="query">
>
>     <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" />
>
>     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>
>     <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="stopwords.txt" enablePositionIncrements="true"/>
>
>     <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
> generateNumberParts="1" catenateWords="0" catenateNumbers="0"
> catenateAll="0" splitOnCaseChange="1"/>
>
>     <filter class="solr.LowerCaseFilterFactory"/>
>     <filter class="solr.SnowballPorterFilterFactory" language="English"
> protected="protwords.txt"/>
>   </analyzer>
> </fieldType>
>
> <fieldType name="textSpell" class="solr.TextField"
> positionIncrementGap="100" omitNorms="true">
>   <analyzer type="index">
>     <tokenizer class="solr.HTMLStripStandardTokenizerFactory" />
>     <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="stopwords.txt"/>
>     <filter class="solr.LowerCaseFilterFactory"/>
>     <filter class="solr.StandardFilterFactory"/>
>   </analyzer>
>   <analyzer type="query">
>     <tokenizer class="solr.HTMLStripStandardTokenizerFactory" />
>     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>     <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="stopwords.txt"/>
>     <filter class="solr.LowerCaseFilterFactory"/>
>     <filter class="solr.StandardFilterFactory"/>
>   </analyzer>
> </fieldType>
> </types>
>
> <fields>
>   <field name="title"           type="string"   index="true"
>  stored="true"   multiValued="false" />
>   <field name="webtitle"        type="text"     index="true"
>  stored="true"   multiValued="false" />
>     <copyField source="title" dest="webtitle" />
>
>   <field name="description"     type="string"   index="true"
>  stored="true"   multiValued="false"     compressed="true" />
>   <field name="webdescription"  type="text"     index="true"
>  stored="true"   multiValued="false"     compressed="true" />
>     <copyField source="description" dest="webdescription" />
>
>   <field name="spell"           type="textSpell" index="true"
> stored="true"   multiValued="true" />
>     <copyField source="title" dest="spell" />
>     <copyField source="description" dest="spell" />
>
>   <field name="text"            type="text"   index="true" stored="true"
> multiValued="true" />
>     <copyField source="title" dest="text" />
>     <copyField source="description" dest="text" />
>
> </fields>
>
>

Reply via email to