Hi all, I have a database of e-commerce products (5M) and am trying to build a search solution for it.
I have used stemmer, edge n-gram and Double Metaphone phonetic fields to tolerate common typos in queries. This works quite well with the dismax QParser for queries longer than one word: "tv lc20", "sny psp 3001", "cannon 5d" etc. To avoid getting too many results I tuned the `mm` parameter. But when a user types a single word like "ipad" or "cannon", I always get a lot of results (~60000). This is unacceptable for my client. He would like to see only the `good` results, i.e. those that specifically match the query. That is hard for me to accomplish because of the Double Metaphone field, which converts words like "apt", "opt", "ipad" and even "ipod" to the same phonetic code - APT. All of these words then match equally well, which gives me a huge number of results. I have similar problems with other words like "canon", "canine" and "cannon", which are near neighbours (KNN) phonetically but lexically have different meanings: "canon" - camera, "canine" - cat food, "cannon" - may be a misspelling of canon or part of a book title about cannon weapons. My first idea was to make a second requestHandler that does not search the *_phonetic fields, and to use it for queries with only one word. But it didn't work, because sometimes I want to correct the user even if there is only one word and suggest something better. The query "cannon" is a good example. I'm fairly sure that most of the time when someone types "cannon" it is a typo for "canon", and I want to show the user CANON cameras as well. That's why I can't use a second requestHandler for one-word queries. I'm looking for any ideas on how I could change my requestHandler. My regular queries are: http://localhost:8983/solr/select?q=cannon Below I put my configuration for the requestHandler and schema.xml.
solrconfig.xml:

<!-- Default search handler. Runs a dismax query over the plain, stemmed
     (*_text), phonetic (*_phonetic), edge n-gram (*_ng) and n-gram (*_ngram)
     variants of the product fields declared in schema.xml. -->
<requestHandler name="search" class="solr.SearchHandler" default="true">
  <lst name="defaults">
    <!-- Used when no q parameter is supplied: match every document. -->
    <str name="q.alt">*:*</str>
    <str name="defType">dismax</str>
    <!-- Query fields with per-field boosts (whitespace separated). -->
    <str name="qf">
      title^1.3 title_text^0.9 title_phonetic^0.74 title_ng^0.17 title_ngram^0.54
      producer_name^0.9 producer_name_text^0.89
      category_path_text^0.8 category_path_phonetic^0.65
      description^0.60 description_text^0.56
    </str>
    <!-- Phrase boost fields; ps is the phrase slop applied to them. -->
    <str name="pf">title_text^1.1 title^1.2 description^0.3</str>
    <int name="ps">3</int>
    <str name="tie">0.1</str>
    <!-- Minimum-should-match spec. The '<' characters must be written as
         &lt; because a raw '<' in element text is not well-formed XML. -->
    <str name="mm">2&lt;100% 3&lt;-1 5&lt;85%</str>
    <str name="fl">*,score</str>
  </lst>
</requestHandler>

schema.xml:

<?xml version="1.0" encoding="UTF-8" ?>
<schema name="XX" version="1.2">
  <types>
    <!-- Primitive types. -->
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0" />
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0" />
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true" />
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true" />
    <fieldType name="decimal" class="solr.TrieFloatField" precisionStep="2" omitNorms="true" positionIncrementGap="0" />

    <!-- Stemmed text: same chain as text_gen plus the Stempel Polish stemmer.
         Backs the *_text dynamic fields. -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <charFilter class="solr.HTMLStripCharFilterFactory" />
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <!-- Case insensitive stop word removal.
             add enablePositionIncrements=true in both the index and query
             analyzers to leave a 'gap' for more accurate phrase queries. -->
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_pl.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"
                catenateWords="1" catenateNumbers="1" catenateAll="0"
                splitOnCaseChange="1" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.TrimFilterFactory" />
        <filter class="solr.StempelPolishStemFilterFactory" />
      </analyzer>
    </fieldType>

    <!-- General unstemmed text: stop words, word-delimiter splitting,
         lowercasing. Used by the title and description fields. -->
    <fieldType name="text_gen" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <charFilter class="solr.HTMLStripCharFilterFactory" />
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_pl.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"
                catenateWords="1" catenateNumbers="1" catenateAll="0"
                splitOnCaseChange="1" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.TrimFilterFactory" />
      </analyzer>
    </fieldType>

    <!-- The *_ascii dynamicField below references a "text_ascii" type that the
         posted schema never declared, so the schema could not be loaded.
         This is a minimal definition: the text_gen chain plus ASCII folding.
         NOTE(review): if a real text_ascii definition exists elsewhere,
         use that one instead. -->
    <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <charFilter class="solr.HTMLStripCharFilterFactory" />
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_pl.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"
                catenateWords="1" catenateNumbers="1" catenateAll="0"
                splitOnCaseChange="1" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.ASCIIFoldingFilterFactory" />
        <filter class="solr.TrimFilterFactory" />
      </analyzer>
    </fieldType>

    <!-- Phonetic matching. inject="false" means only the Double Metaphone
         codes are emitted, not the original tokens. -->
    <fieldType name="phonetic" stored="false" indexed="true" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory" />
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_pl.txt" enablePositionIncrements="true" />
        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false" maxCodeLength="8" />
      </analyzer>
    </fieldType>

    <!-- 2 to 3 character n-grams. The query analyzer also produces n-grams
         but skips the stop-word and word-delimiter filters. -->
    <fieldType name="ngram" class="solr.TextField">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_pl.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"
                catenateWords="1" catenateNumbers="1" catenateAll="0"
                splitOnCaseChange="1" />
        <filter class="solr.NGramFilterFactory" minGramSize="2" maxGramSize="3" />
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.NGramFilterFactory" minGramSize="2" maxGramSize="3" />
      </analyzer>
    </fieldType>

    <!-- Front edge n-grams of 2 to 15 characters; the same analyzer is used
         at index and query time. -->
    <fieldType name="edgengram" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_pl.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"
                catenateWords="1" catenateNumbers="1" catenateAll="0"
                splitOnCaseChange="1" />
        <filter class="solr.EdgeNGramFilterFactory" minGramSize="2" maxGramSize="15" side="front" />
      </analyzer>
    </fieldType>
  </types>

  <fields>
    <field name="id" type="string" indexed="true" stored="true" required="true" />
    <field name="title" type="text_gen" indexed="true" stored="true" required="true" />
    <!-- Destination of the title copyField below; the posted schema had no
         field or dynamicField matching "title_sort". -->
    <field name="title_sort" type="string" indexed="true" stored="false" />
    <field name="category_path" type="string" indexed="true" stored="true" />
    <field name="producer_name" type="string" indexed="true" stored="false" />
    <!-- Was indexed="false", which made the description clauses in the
         handler's qf and pf unable to match anything. Requires a reindex. -->
    <field name="description" type="text_gen" indexed="true" stored="true" />

    <dynamicField name="*_text" type="text" indexed="true" stored="false" />
    <dynamicField name="*_ascii" type="text_ascii" indexed="true" stored="false" />
    <dynamicField name="*_phonetic" type="phonetic" indexed="true" stored="false" />
    <dynamicField name="*_ng" type="edgengram" indexed="true" stored="false" />
    <dynamicField name="*_ngram" type="ngram" indexed="true" stored="false" />
  </fields>

  <uniqueKey>id</uniqueKey>
  <defaultSearchField>title</defaultSearchField>
  <solrQueryParser defaultOperator="AND" />

  <!-- Fan each source field out to its analysed variants. -->
  <copyField source="title" dest="title_sort" />
  <copyField source="title" dest="title_text" />
  <copyField source="title" dest="title_ascii" />
  <copyField source="title" dest="title_phonetic" />
  <copyField source="title" dest="title_ng" />
  <copyField source="title" dest="title_ngram" />

  <copyField source="producer_name" dest="producer_name_text" />
  <copyField source="producer_name" dest="producer_name_phonetic" />

  <copyField source="category_path" dest="category_path_text" />
  <copyField source="category_path" dest="category_path_phonetic" />

  <copyField source="description" dest="description_text" />
</schema>

--
Rafał "RaVbaker" Piekarski.
web: http://ja.ravbaker.net
mail: ravba...@gmail.com
jid/xmpp/aim: ravba...@gmail.com
mobile: +48-663-808-481