Hello, We have a problem with sorting on title field in Solr instance of our production repository, we get the error message:
"HTTP Status 500 - there are more terms than documents in field "titleStr", but it's impossible to sort on tokenized fields". After some googling and searching in this listserv, we found that a sorting field has to be untokenized but our sorting field "titleStr" which is a copy of the "title" field has a string type. What we did as configs in our schema.xml file : 1st config ++++++++++ <fieldtype name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> <fieldtype name="text" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="schema.UnicodeNormalizationFilterFactory" version="icu4j" composed="false" remove_diacritics="true" remove_modifiers="true" fold="true"/> <filter class="solr.ISOLatin1AccentFilterFactory"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> <analyzer type="query"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="schema.UnicodeNormalizationFilterFactory" version="icu4j" composed="false" remove_diacritics="true" remove_modifiers="true" fold="true"/> <filter class="solr.ISOLatin1AccentFilterFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> </fieldtype> =============================== <field name="title" type="text" indexed="true" stored="true" termVectors="true"/> <field name="titleStr" type="string" indexed="true" stored="false"/> ================================= <copyField source="title" dest="titleStr"/> As you can see, the title field has the termVectors property as true, we drop it in the second attempt of our config 2end attempt ++++++++++++ <fieldtype name="text" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="schema.UnicodeNormalizationFilterFactory" version="icu4j" composed="false" remove_diacritics="true" remove_modifiers="true" fold="true"/> <filter class="solr.ISOLatin1AccentFilterFactory"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> <analyzer type="query"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="schema.UnicodeNormalizationFilterFactory" version="icu4j" composed="false" remove_diacritics="true" remove_modifiers="true" fold="true"/> <filter class="solr.ISOLatin1AccentFilterFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> </fieldtype> =============================== <field name="title" type="text" indexed="true" stored="true"/> <field name="titleStr" type="string" indexed="true" stored="false"/> ================================= <copyField source="title" dest="titleStr"/> 3rd attempt +++++++++++ Create a new field type named 'text_exact' which doesn't use the "WhitespaceTokenizer" tokenizer but instead uses the "KeywordTokenizer" tokenizer. <fieldtype name="text" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="schema.UnicodeNormalizationFilterFactory" version="icu4j" composed="false" remove_diacritics="true" remove_modifiers="true" fold="true"/> <filter class="solr.ISOLatin1AccentFilterFactory"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> <analyzer type="query"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="schema.UnicodeNormalizationFilterFactory" version="icu4j" composed="false" remove_diacritics="true" remove_modifiers="true" fold="true"/> <filter class="solr.ISOLatin1AccentFilterFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> </fieldtype> <fieldtype name="text_exact" class="solr.TextField" sortMissingLast="true" omitNorms="true"> <analyzer type="index"> <!-- KeywordTokenizer does no actual tokenizing, so the entire input string is preserved as a single token --> <tokenizer class="solr.KeywordTokenizerFactory"/> <filter class="schema.UnicodeNormalizationFilterFactory" version="icu4j" composed="false" remove_diacritics="true" remove_modifiers="true" fold="true"/> <filter class="solr.ISOLatin1AccentFilterFactory"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> <analyzer type="query"> <!-- KeywordTokenizer does no actual tokenizing, so the entire input string is preserved as a single token --> <tokenizer class="solr.KeywordTokenizerFactory"/> <filter class="schema.UnicodeNormalizationFilterFactory" version="icu4j" composed="false" remove_diacritics="true" remove_modifiers="true" fold="true"/> <filter class="solr.ISOLatin1AccentFilterFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> </analyzer> </fieldtype> =============================== <field name="title" type="text" indexed="true" stored="true"/> <field name="titleStr" type="text_exact" indexed="true" stored="false"/> ================================= No copyField. 4th attempt +++++++++++ Same config as the 3rd attempt but adding explicitly that the property 'multiValued' of titleStr (text_exact type) field is false <field name="titleStr" type="text_exact" indexed="true" stored="false" multiValued="false"/> For this last config, we noticed that the number of documents in our index was downsized from ~22500 records to ~17800 records! We don't understand this behavior of Sorl/Lucene? For all these configs, we got the same error message, please note that we encounter this issue on our production server http://difusion.ulb.ac.be/vufind/Search/Home?lookfor=&sort=pubdate+desc& submitButton=Recherche&type=general&sort=title (with ~22500 records), or with the same config (the first one) on our test server http://bib17.ulb.ac.be/vufind/Search/Home?lookfor=&sort=pubdate+desc&sub mitButton=Find&type=general&sort=title (with ~57700 records), the sorting on title is going well! Thanks in advance for the time you can spend to have a look on this. Best regards, Hicham El Kasmi