Never mind, I think I found my answer here: http://www.mail-archive.com/solr-user@lucene.apache.org/msg34622.html
<http://www.mail-archive.com/solr-user@lucene.apache.org/msg34622.html> I will add the HTML stripper to the data importer and see how that goes. On Tue, Feb 15, 2011 at 3:43 PM, Tanner Postert <tanner.post...@gmail.com> wrote: > I have several fields defined and one of the field types includes a > solr.HTMLStripCharFilterFactory field in the analyzer but it doesn't > appear to be affecting the field as I would expect. > I have tried a simple: > > <charFilter class="solr.HTMLStripCharFilterFactory"> > followed by the tokenizer > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > > or the combined factory > > <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" /> > > but neither seems to work. > > Returned search results from the webtitle & webdescription as well as text > include the original HTML characters that the title & description fields > have. > > The relevant schema: > > <types> > <fieldType name="string" class="solr.StrField" sortMissingLast="true" > omitNorms="true"/> > > <fieldType name="text" class="solr.TextField" positionIncrementGap="100"> > <analyzer type="index"> > <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" /> > > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt" enablePositionIncrements="true"/> > > <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" > generateNumberParts="1" catenateWords="1" catenateNumbers="1" > catenateAll="0" splitOnCaseChange="1"/> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.SnowballPorterFilterFactory" language="English" > protected="protwords.txt"/> > </analyzer> > <analyzer type="query"> > > <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" /> > > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignoreCase="true" expand="true"/> > > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt" enablePositionIncrements="true"/> > > <filter 
class="solr.WordDelimiterFilterFactory" generateWordParts="1" > generateNumberParts="1" catenateWords="0" catenateNumbers="0" > catenateAll="0" splitOnCaseChange="1"/> > > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.SnowballPorterFilterFactory" language="English" > protected="protwords.txt"/> > </analyzer> > </fieldType> > > <fieldType name="textSpell" class="solr.TextField" > positionIncrementGap="100" omitNorms="true"> > <analyzer type="index"> > <tokenizer class="solr.HTMLStripStandardTokenizerFactory" /> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt"/> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.StandardFilterFactory"/> > </analyzer> > <analyzer type="query"> > <tokenizer class="solr.HTMLStripStandardTokenizerFactory" /> > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignoreCase="true" expand="true"/> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt"/> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.StandardFilterFactory"/> > </analyzer> > </fieldType> > </types> > > <fields> > <field name="title" type="string" index="true" > stored="true" multiValued="false" /> > <field name="webtitle" type="text" index="true" > stored="true" multiValued="false" /> > <copyField source="title" dest="webtitle" /> > > <field name="description" type="string" index="true" > stored="true" multiValued="false" compressed="true" /> > <field name="webdescription" type="text" index="true" > stored="true" mutliValued="false" compressed="true" /> > <copyField source="description" dest="webdescription" /> > > <field name="spell" type="textSpell" index="true" > stored="true" multiValued="true" /> > <copyField source="title" dest="spell" /> > <copyField source="description" dest="spell" /> > > <field name="text" type="text" index="true" stored="true" > multiValued="true" /> > <copyField source="title" dest="text" /> > <copyField 
source="description" dest="text" /> > > </fields> > >