I am using the data import handler, and using the HTMLStripTransformer
doesn't seem to be working either.

I've changed webtitle and webdescription to not be copied from title and
description in the schema.xml file, then set them both to just be duplicates
of title and description in the data importer query:

<document name="items">
 <entity dataSource="db" name="item" transformer="HTMLStripTransformer"
    query="select
      title as title,
      title as webtitle,
      description as description,
      description as webdescription
      FROM ... >
      <field column="webtitle" stripHTML="true" />
      <field column="webdescription" stripHTML="true" />
 </entity>
</document>

On Tue, Feb 15, 2011 at 3:49 PM, Tanner Postert <tanner.post...@gmail.com> wrote:

> nevermind, I think I found my answer here:
> http://www.mail-archive.com/solr-user@lucene.apache.org/msg34622.html
>
> <http://www.mail-archive.com/solr-user@lucene.apache.org/msg34622.html>
> I will add the HTML stripper to the data importer and see how that goes
>
>
> On Tue, Feb 15, 2011 at 3:43 PM, Tanner Postert 
> <tanner.post...@gmail.com> wrote:
>
>> I have several fields defined and one of the field types includes a
>> solr.HTMLStripCharFilterFactory field in the analyzer but it doesn't
>> appear to be affecting the field as I would expect.
>> I have tried a simple:
>>
>> <charFilter class="solr.HTMLStripCharFilterFactory">
>> followed by the tokenizer
>> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>>
>> or the combined factory
>>
>> <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" />
>>
>> but neither seems to work.
>>
>> Returned search results from the webtitle & webdescription as well as text
>> include the original HTML characters that the title & description fields
>> have.
>>
>> The relevant schema:
>>
>> <types>
>> <fieldType name="string" class="solr.StrField" sortMissingLast="true"
>> omitNorms="true"/>
>>
>> <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
>>   <analyzer type="index">
>>     <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" />
>>
>>     <filter class="solr.StopFilterFactory" ignoreCase="true"
>> words="stopwords.txt" enablePositionIncrements="true"/>
>>
>>     <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
>> generateNumberParts="1" catenateWords="1" catenateNumbers="1"
>> catenateAll="0" splitOnCaseChange="1"/>
>>     <filter class="solr.LowerCaseFilterFactory"/>
>>     <filter class="solr.SnowballPorterFilterFactory" language="English"
>> protected="protwords.txt"/>
>>   </analyzer>
>>   <analyzer type="query">
>>
>>     <tokenizer class="solr.HTMLStripWhitespaceTokenizerFactory" />
>>
>>     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
>> ignoreCase="true" expand="true"/>
>>
>>     <filter class="solr.StopFilterFactory" ignoreCase="true"
>> words="stopwords.txt" enablePositionIncrements="true"/>
>>
>>     <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
>> generateNumberParts="1" catenateWords="0" catenateNumbers="0"
>> catenateAll="0" splitOnCaseChange="1"/>
>>
>>     <filter class="solr.LowerCaseFilterFactory"/>
>>     <filter class="solr.SnowballPorterFilterFactory" language="English"
>> protected="protwords.txt"/>
>>   </analyzer>
>> </fieldType>
>>
>> <fieldType name="textSpell" class="solr.TextField"
>> positionIncrementGap="100" omitNorms="true">
>>   <analyzer type="index">
>>     <tokenizer class="solr.HTMLStripStandardTokenizerFactory" />
>>     <filter class="solr.StopFilterFactory" ignoreCase="true"
>> words="stopwords.txt"/>
>>     <filter class="solr.LowerCaseFilterFactory"/>
>>     <filter class="solr.StandardFilterFactory"/>
>>   </analyzer>
>>   <analyzer type="query">
>>     <tokenizer class="solr.HTMLStripStandardTokenizerFactory" />
>>     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
>> ignoreCase="true" expand="true"/>
>>     <filter class="solr.StopFilterFactory" ignoreCase="true"
>> words="stopwords.txt"/>
>>     <filter class="solr.LowerCaseFilterFactory"/>
>>     <filter class="solr.StandardFilterFactory"/>
>>   </analyzer>
>> </fieldType>
>> </types>
>>
>> <fields>
>>   <field name="title"           type="string"   index="true"
>>  stored="true"   multiValued="false" />
>>   <field name="webtitle"        type="text"     index="true"
>>  stored="true"   multiValued="false" />
>>     <copyField source="title" dest="webtitle" />
>>
>>   <field name="description"     type="string"   index="true"
>>  stored="true"   multiValued="false"     compressed="true" />
>>   <field name="webdescription"  type="text"     index="true"
>>  stored="true"   mutliValued="false"     compressed="true" />
>>     <copyField source="description" dest="webdescription" />
>>
>>   <field name="spell"           type="textSpell" index="true"
>> stored="true"   multiValued="true" />
>>     <copyField source="title" dest="spell" />
>>     <copyField source="description" dest="spell" />
>>
>>   <field name="text"            type="text"   index="true" stored="true"
>> multiValued="true" />
>>     <copyField source="title" dest="text" />
>>     <copyField source="description" dest="text" />
>>
>> </fields>
>>
>>
>

Reply via email to