Hello Webster, It smells like KeywordRepeat. In general it is not a problem if all terms are scored twice. But you also have RemoveDuplicates, and as a result, in some cases a term is scored twice in one field but only once in another field, and then you have a problem.
Due to lack of replies, in the end I chose to remove the RemoveDuplicates filter, so that everything is always scored twice. This 'solution' at least solved the general scoring problem of searching across many fields. Thus far there is no real solution to this problem, as far as I know. Regards, Markus http://lucene.472066.n3.nabble.com/Multiple-languages-boosting-and-stemming-and-KeywordRepeat-td4389086.html -----Original message----- > From:Webster Homer <webster.ho...@milliporesigma.com> > Sent: Tuesday 30th October 2018 22:34 > To: solr-user@lucene.apache.org > Subject: Odd Scoring behavior > > I noticed that sometimes query matches seem to get counted twice when they > are scored. This will happen if the fieldtype is being stemmed, and there is > a matching synonym. > It seems that the score for the field is 2X higher than it should be. We see > this only when there is a matching synonym that has a stemmed term in it. > > > We have this synonym defined: > bsa, bovine serum albumin > > We have this fieldtype: > <fieldType name="text_general" class="solr.TextField" > positionIncrementGap="100"> > <analyzer type="index"> > <tokenizer class="solr.StandardTokenizerFactory"/> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="lang/stopwords_en.txt" /> > <!-- in this example, we will only use synonyms at query time > <filter class="solr.SynonymGraphFilterFactory" > synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> > --> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.KeywordRepeatFilterFactory"/> > <filter class="solr.SnowballPorterFilterFactory"/> > <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> > </analyzer> > <analyzer type="query"> > <tokenizer class="solr.StandardTokenizerFactory"/> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="lang/stopwords_en.txt" /> > <filter class="solr.SynonymGraphFilterFactory" > synonyms="synonyms.txt" ignoreCase="true" expand="true"/> > <filter 
class="solr.LowerCaseFilterFactory"/> > <filter class="solr.KeywordRepeatFilterFactory"/> > <filter class="solr.SnowballPorterFilterFactory"/> > <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> > </analyzer> > </fieldType> > > Which is used as: > <field name="search_en_root_name" type="text_general" indexed="true" > stored="true" required="false" multiValued="false" /> > > When we query this field using the eDismax query parser the field, > search_en_root_name seems to contribute twice to the score for this query: > bovine serum albumin > > once for the base query, and once for the stemmed form of the query: > bovin serum albumin > > If we remove the synonym it will only be counted once. We only see this > behavior If part of the synonym can be stemmed. This seems odd and has the > effect of overpowering boosts on other fields. > > The explain plan without synonym > { > "responseHeader":{ > "zkConnected":true, > "status":0, > "QTime":44, > "params":{ > "mm":"2<-25%", > "fl":"searchmv_pno, search_en_p_pri_name [explain style=nl]", > "group.limit":"1", > "q.op":"OR", > "sort":"score desc,sort_en_name asc ,sort_ds asc, search_pid asc", > "group.ngroups":"true", > "q":"bovine serum albumin", > "tie":".45", > "defType":"edismax", > "group.sort":"sort_ds asc, score desc", > "qf":"search_en_p_pri_name_min^7500 > search_en_root_name_min^12000 search_en_p_pri_name^3000 > search_pid^2500 searchmv_pno^2500 searchmv_cas_number^2500 > searchmv_p_skus^2500 search_lform_lc^2500 search_en_root_name^2500 > searchmv_en_s_pri_name^2500 searchmv_en_keywords^2500 > searchmv_lookahead_terms^2000 searchmv_user_term^2000 > searchmv_en_acronym^1500 searchmv_en_synonyms^1500 > searchmv_concat_sku^1000 search_concat_pno^1000 > searchmv_en_name_suf^1000 searchmv_component_cas^1000 > search_lform^1000 searchmv_pno_genr^500 search_concat_pno_genr^500 > searchmv_p_skus_genr^500 search_eform search_mol_form > searchmv_component_molform searchmv_en_descriptions searchmv_en_chem_comp > 
searchmv_en_attributes searchmv_en_page_title search_mdl_number > searchmv_xref_comparable_pno searchmv_xref_comparable_sku > searchmv_xref_equivalent_pno searchmv_xref_exact_pno searchmv_xref_exact_sku > searchmv_vendor_sku searchmv_material_number search_en_sortkey searchmv_rtecs > search_color_idx search_beilstein search_ecnumber search_egecnumber > search_femanumber searchmv_isbn", > "group.field":"id_s", > "_":"1540331449276", > "group":"true"}}, > "grouped":{ > "id_s":{ > "matches":4701, > "ngroups":4393, > "groups":[{ > "groupValue":"bovineserumalbumin123459048468", > "doclist":{"numFound":57,"start":0,"docs":[ > { > "search_en_p_pri_name":"Bovine Serum Albumin", > "searchmv_pno":["A2153"], > "[explain]":{ > "match":true, > "value":38145.117, > "description":"max plus 0.45 times others of:", > "details":[{ > "match":true, > "value":10434.111, > "description":"sum of:", > "details":[{ > "match":true, > "value":4042.5876, > > "description":"weight(Synonym(search_en_root_name:bovin > search_en_root_name:bovine) in 20407) [SialBM25Similarity], result of:", > "details":[{ > "match":true, > "value":4042.5876, > "description":"score(doc=20407,freq=2.0 > = termFreq=2.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":204.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > "description":"docCount"}]}, > { > "match":true, > "value":1.617035, > "description":"tfNorm, computed as (freq * > (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":2.0, > "description":"termFreq=2.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > "match":true, > "value":6.4128513, > 
"description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > "description":"fieldLength"}]}]}]}, > { > "match":true, > "value":3195.7617, > > "description":"weight(search_en_root_name:serum in 20407) > [SialBM25Similarity], result of:", > "details":[{ > "match":true, > "value":3195.7617, > "description":"score(doc=20407,freq=1.0 > = termFreq=1.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":245.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > "description":"docCount"}]}, > { > "match":true, > "value":1.2783047, > "description":"tfNorm, computed as (freq * > (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":1.0, > "description":"termFreq=1.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > "match":true, > "value":6.4128513, > "description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > "description":"fieldLength"}]}]}]}, > { > "match":true, > "value":3195.7617, > > "description":"weight(Synonym(search_en_root_name:albumin > search_en_root_name:albumina) in 20407) [SialBM25Similarity], result of:", > "details":[{ > "match":true, > "value":3195.7617, > "description":"score(doc=20407,freq=1.0 > = termFreq=1.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":167.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > "description":"docCount"}]}, > { > "match":true, > "value":1.2783047, > "description":"tfNorm, computed 
as (freq * > (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":1.0, > "description":"termFreq=1.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > "match":true, > "value":6.4128513, > "description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > "description":"fieldLength"}]}]}]}]}, > > The explain with the synonym > Scoring for Bovine Serum Albumin in search_en_root_name - With Synonym > { > "responseHeader":{ > "zkConnected":true, > "status":0, > "QTime":1391, > "params":{ > "mm":"2<-25%", > "fl":"searchmv_pno, search_en_p_pri_name [explain style=nl]", > "group.limit":"1", > "q.op":"OR", > "sort":"score desc,sort_en_name asc ,sort_ds asc, search_pid asc", > "group.ngroups":"true", > "q":"bovine serum albumin", > "tie":".45", > "defType":"edismax", > "group.sort":"sort_ds asc, score desc", > "qf":"search_en_p_pri_name_min^7500 > search_en_root_name_min^12000 search_en_p_pri_name^3000 > search_pid^2500 searchmv_pno^2500 searchmv_cas_number^2500 > searchmv_p_skus^2500 search_lform_lc^2500 search_en_root_name^2500 > searchmv_en_s_pri_name^2500 searchmv_en_keywords^2500 > searchmv_lookahead_terms^2000 searchmv_user_term^2000 > searchmv_en_acronym^1500 searchmv_en_synonyms^1500 > searchmv_concat_sku^1000 search_concat_pno^1000 > searchmv_en_name_suf^1000 searchmv_component_cas^1000 > search_lform^1000 searchmv_pno_genr^500 search_concat_pno_genr^500 > searchmv_p_skus_genr^500 search_eform search_mol_form > searchmv_component_molform searchmv_en_descriptions searchmv_en_chem_comp > searchmv_en_attributes searchmv_en_page_title search_mdl_number > searchmv_xref_comparable_pno searchmv_xref_comparable_sku > searchmv_xref_equivalent_pno searchmv_xref_exact_pno searchmv_xref_exact_sku > searchmv_vendor_sku searchmv_material_number search_en_sortkey searchmv_rtecs > search_color_idx search_beilstein 
search_ecnumber search_egecnumber > search_femanumber searchmv_isbn", > "group.field":"id_s", > "_":"1540331449276", > "group":"true"}}, > "grouped":{ > "id_s":{ > "matches":9368, > "ngroups":8552, > "groups":[{ > "groupValue":"bovineserumalbumin123459048468", > "doclist":{"numFound":57,"start":0,"docs":[ > { > "search_en_p_pri_name":"Bovine Serum Albumin", > "searchmv_pno":["A2153"], > "[explain]":{ > "match":true, > "value":64754.367, > "description":"max plus 0.45 times others of:", > "details":[{ > "match":true, > "value":19174.57, > "description":"sum of:", > "details":[{ > "match":true, > "value":9587.285, > "description":"sum of:", > "details":[{ > "match":true, > "value":3195.7617, > > "description":"weight(search_en_root_name:bovine in 20407) > [SialBM25Similarity], result of:", > "details":[{ > "match":true, > "value":3195.7617, > > "description":"score(doc=20407,freq=1.0 = termFreq=1.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":204.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > "description":"docCount"}]}, > { > "match":true, > "value":1.2783047, > "description":"tfNorm, computed as > (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":1.0, > "description":"termFreq=1.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > "match":true, > "value":6.4128513, > "description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > "description":"fieldLength"}]}]}]}, > { > "match":true, > "value":3195.7617, > > "description":"weight(search_en_root_name:serum in 20407) > [SialBM25Similarity], result of:", > "details":[{ > "match":true, > 
"value":3195.7617, > > "description":"score(doc=20407,freq=1.0 = termFreq=1.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":245.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > "description":"docCount"}]}, > { > "match":true, > "value":1.2783047, > "description":"tfNorm, computed as > (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":1.0, > "description":"termFreq=1.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > "match":true, > "value":6.4128513, > "description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > "description":"fieldLength"}]}]}]}, > { > "match":true, > "value":3195.7617, > > "description":"weight(search_en_root_name:albumin in 20407) > [SialBM25Similarity], result of:", > "details":[{ > "match":true, > "value":3195.7617, > > "description":"score(doc=20407,freq=1.0 = termFreq=1.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":167.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > "description":"docCount"}]}, > { > "match":true, > "value":1.2783047, > "description":"tfNorm, computed as > (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":1.0, > "description":"termFreq=1.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > 
"match":true, > "value":6.4128513, > "description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > > "description":"fieldLength"}]}]}]}]}, > { > "match":true, > "value":9587.285, > "description":"sum of:", > "details":[{ > "match":true, > "value":3195.7617, > > "description":"weight(search_en_root_name:bovin in 20407) > [SialBM25Similarity], result of:", > "details":[{ > "match":true, > "value":3195.7617, > > "description":"score(doc=20407,freq=1.0 = termFreq=1.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":204.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > "description":"docCount"}]}, > { > "match":true, > "value":1.2783047, > "description":"tfNorm, computed as > (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":1.0, > "description":"termFreq=1.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > "match":true, > "value":6.4128513, > "description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > "description":"fieldLength"}]}]}]}, > { > "match":true, > "value":3195.7617, > > "description":"weight(search_en_root_name:serum in 20407) > [SialBM25Similarity], result of:", > "details":[{ > "match":true, > "value":3195.7617, > > "description":"score(doc=20407,freq=1.0 = termFreq=1.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":245.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > 
"description":"docCount"}]}, > { > "match":true, > "value":1.2783047, > "description":"tfNorm, computed as > (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":1.0, > "description":"termFreq=1.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > "match":true, > "value":6.4128513, > "description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > "description":"fieldLength"}]}]}]}, > { > "match":true, > "value":3195.7617, > > "description":"weight(search_en_root_name:albumin in 20407) > [SialBM25Similarity], result of:", > "details":[{ > "match":true, > "value":3195.7617, > > "description":"score(doc=20407,freq=1.0 = termFreq=1.0\n), product of:", > "details":[{ > "match":true, > "value":2500.0, > "description":"boost"}, > { > "match":true, > "value":1.0, > "description":"idf, computed as > log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", > "details":[{ > "match":true, > "value":167.0, > "description":"docFreq"}, > { > "match":true, > "value":365301.0, > "description":"docCount"}]}, > { > "match":true, > "value":1.2783047, > "description":"tfNorm, computed as > (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / > avgFieldLength)) from:", > "details":[{ > "match":true, > "value":1.0, > "description":"termFreq=1.0"}, > { > "match":true, > "value":1.2, > "description":"parameter k1"}, > { > "match":true, > "value":0.75, > "description":"parameter b"}, > { > "match":true, > "value":6.4128513, > "description":"avgFieldLength"}, > { > "match":true, > "value":3.0, > > "description":"fieldLength"}]}]}]}]}]}, > { >