Hello again,
We have a problem with sorting on title field in Solr instance of our
production repository, we get the error message:
"HTTP Status 500 - there are more terms than documents in field
"titleStr", but it's impossible to sort on tokenized fields".
After some googling and searching in this listserv, we found that a
sorting field has to be untokenized but our sorting field "titleStr"
which is a copy of the "title" field has a string type.
What we did as configs in our schema.xml file :
1st config
++++++++++
<fieldtype name="string" class="solr.StrField"
sortMissingLast="true" omitNorms="true"/>
<fieldtype name="text" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="schema.UnicodeNormalizationFilterFactory"
version="icu4j" composed="false" remove_diacritics="true"
remove_modifiers="true" fold="true"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="schema.UnicodeNormalizationFilterFactory"
version="icu4j" composed="false" remove_diacritics="true"
remove_modifiers="true" fold="true"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldtype>
===============================
<field name="title" type="text" indexed="true" stored="true"
termVectors="true"/>
<field name="titleStr" type="string" indexed="true" stored="false"/>
=================================
<copyField source="title" dest="titleStr"/>
As you can see, the title field has the termVectors property as true, we
drop it in the second attempt of our config
2end attempt
++++++++++++
<fieldtype name="text" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="schema.UnicodeNormalizationFilterFactory"
version="icu4j" composed="false" remove_diacritics="true"
remove_modifiers="true" fold="true"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="schema.UnicodeNormalizationFilterFactory"
version="icu4j" composed="false" remove_diacritics="true"
remove_modifiers="true" fold="true"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldtype>
===============================
<field name="title" type="text" indexed="true" stored="true"/>
<field name="titleStr" type="string" indexed="true" stored="false"/>
=================================
<copyField source="title" dest="titleStr"/>
3rd attempt
+++++++++++
Create a new field type named 'text_exact' which doesn't use the
"WhitespaceTokenizer" tokenizer but instead uses the "KeywordTokenizer"
tokenizer.
<fieldtype name="text" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="schema.UnicodeNormalizationFilterFactory"
version="icu4j" composed="false" remove_diacritics="true"
remove_modifiers="true" fold="true"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="schema.UnicodeNormalizationFilterFactory"
version="icu4j" composed="false" remove_diacritics="true"
remove_modifiers="true" fold="true"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="text_exact" class="solr.TextField"
sortMissingLast="true" omitNorms="true">
<analyzer type="index">
<!-- KeywordTokenizer does no actual tokenizing, so the entire
input string
is preserved as a single token -->
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="schema.UnicodeNormalizationFilterFactory"
version="icu4j" composed="false" remove_diacritics="true"
remove_modifiers="true" fold="true"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<!-- KeywordTokenizer does no actual tokenizing, so the entire
input string
is preserved as a single token -->
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="schema.UnicodeNormalizationFilterFactory"
version="icu4j" composed="false" remove_diacritics="true"
remove_modifiers="true" fold="true"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"
protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldtype>
===============================
<field name="title" type="text" indexed="true" stored="true"/>
<field name="titleStr" type="text_exact" indexed="true"
stored="false"/>
=================================
No copyField.
4th attempt
+++++++++++
Same config as the 3rd attempt but adding explicitly that the property
'multiValued' of titleStr (text_exact type) field is false
<field name="titleStr" type="text_exact" indexed="true" stored="false"
multiValued="false"/>
For this last config, we noticed that the number of documents in our
index was downsized from ~22500 records to ~17800 records! We don't
understand this behavior of Sorl/Lucene?
For all these configs, we got the same error message, please note that
we encounter this issue on our production server
http://difusion.ulb.ac.be/vufind/Search/Home?lookfor=&sort=pubdate+desc&
submitButton=Recherche&type=general&sort=title (with ~22500 records),
or with the same config (the first one) on our test server
http://bib17.ulb.ac.be/vufind/Search/Home?lookfor=&sort=pubdate+desc&sub
mitButton=Find&type=general&sort=title (with ~57700 records), the
sorting on title is going well!
Thanks in advance for the time you can spend to have a look on this.
Best regards,
Hicham El Kasmi