Thank you. It looks like the problem was SOLR-1667 (https://issues.apache.org/jira/browse/SOLR-1667). I backported the fix to the 1.4 branch: http://svn.apache.org/repos/asf/lucene/solr/branches/branch-1.4/
On Wed, Nov 17, 2010 at 4:48 AM, j...@nuatech.net <j...@nuatech.net> wrote: > Hi Richard, > My full schema.xml is below (and attached). Do you want me to raise this in > Jira? > Regards, > John > <?xml version="1.0" encoding="UTF-8" ?> > <schema name="example" version="1.2"> > <types> > <fieldType name="string" class="solr.StrField" sortMissingLast="true" > omitNorms="true"/> > <!-- boolean type: "true" or "false" --> > <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" > omitNorms="true"/> > <!--Binary data type. The data should be sent/retrieved in as Base64 > encoded Strings --> > <fieldtype name="binary" class="solr.BinaryField"/> > <fieldType name="int" class="solr.TrieIntField" precisionStep="0" > omitNorms="true" positionIncrementGap="0"/> > <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" > omitNorms="true" positionIncrementGap="0"/> > <fieldType name="long" class="solr.TrieLongField" precisionStep="0" > omitNorms="true" positionIncrementGap="0"/> > <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" > omitNorms="true" positionIncrementGap="0"/> > <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" > omitNorms="true" positionIncrementGap="0"/> > <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" > omitNorms="true" positionIncrementGap="0"/> > <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" > omitNorms="true" positionIncrementGap="0"/> > <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" > omitNorms="true" positionIncrementGap="0"/> > <fieldType name="date" class="solr.TrieDateField" omitNorms="true" > precisionStep="0" positionIncrementGap="0"/> > <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" > precisionStep="6" positionIncrementGap="0"/> > <fieldType name="pint" class="solr.IntField" omitNorms="true"/> > <fieldType name="plong" class="solr.LongField" omitNorms="true"/> > <fieldType name="pfloat" 
class="solr.FloatField" omitNorms="true"/> > <fieldType name="pdouble" class="solr.DoubleField" omitNorms="true"/> > <fieldType name="pdate" class="solr.DateField" sortMissingLast="true" > omitNorms="true"/> > <fieldType name="sint" class="solr.SortableIntField" > sortMissingLast="true" omitNorms="true"/> > <fieldType name="slong" class="solr.SortableLongField" > sortMissingLast="true" omitNorms="true"/> > <fieldType name="sfloat" class="solr.SortableFloatField" > sortMissingLast="true" omitNorms="true"/> > <fieldType name="sdouble" class="solr.SortableDoubleField" > sortMissingLast="true" omitNorms="true"/> > <fieldType name="random" class="solr.RandomSortField" indexed="true" /> > <fieldType name="text_ws" class="solr.TextField" > positionIncrementGap="100"> > <analyzer> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > </analyzer> > </fieldType> > <fieldType name="text" class="solr.TextField" > positionIncrementGap="100"> > <analyzer type="index"> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > <filter class="solr.ISOLatin1AccentFilterFactory"/> > <filter class="solr.StopFilterFactory" > ignoreCase="true" > words="stopwords.txt" > enablePositionIncrements="true" > /> > <filter class="solr.WordDelimiterFilterFactory" > generateWordParts="1" generateNumberParts="1" catenateWords="1" > catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.SnowballPorterFilterFactory" language="English" > protected="protwords.txt"/> > </analyzer> > <analyzer type="query"> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > <filter class="solr.ISOLatin1AccentFilterFactory"/> > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignoreCase="true" expand="true"/> > <filter class="solr.StopFilterFactory" > ignoreCase="true" > words="stopwords.txt" > enablePositionIncrements="true" > /> > <filter class="solr.WordDelimiterFilterFactory" > generateWordParts="1" 
generateNumberParts="1" catenateWords="0" > catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.SnowballPorterFilterFactory" language="English" > protected="protwords.txt"/> > </analyzer> > </fieldType> > > <!-- Less flexible matching, but less false matches. Probably not ideal > for product names, > but may be good for SKUs. Can insert dashes in the wrong place and > still match. --> > <fieldType name="textTight" class="solr.TextField" > positionIncrementGap="100" > > <analyzer> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignoreCase="true" expand="false"/> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt"/> > <filter class="solr.WordDelimiterFilterFactory" > generateWordParts="0" generateNumberParts="0" catenateWords="1" > catenateNumbers="1" catenateAll="0"/> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.SnowballPorterFilterFactory" language="English" > protected="protwords.txt"/> > <!-- this filter can remove any duplicate tokens that appear at the > same position - sometimes > possible with WordDelimiterFilter in conjuncton with stemming. 
> --> > <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> > </analyzer> > </fieldType> > > <!-- A general unstemmed text field - good if one does not know the > language of the field --> > <fieldType name="textgen" class="solr.TextField" > positionIncrementGap="100"> > <analyzer type="index"> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt" enablePositionIncrements="true" /> > <filter class="solr.WordDelimiterFilterFactory" > generateWordParts="1" generateNumberParts="1" catenateWords="1" > catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> > <filter class="solr.LowerCaseFilterFactory"/> > </analyzer> > <analyzer type="query"> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignoreCase="true" expand="true"/> > <filter class="solr.StopFilterFactory" > ignoreCase="true" > words="stopwords.txt" > enablePositionIncrements="true" > /> > <filter class="solr.WordDelimiterFilterFactory" > generateWordParts="1" generateNumberParts="1" catenateWords="0" > catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> > <filter class="solr.LowerCaseFilterFactory"/> > </analyzer> > </fieldType> > > <!-- unstemmed text field tokenized by comma - used for the level field > --> > <fieldType name="commaSeperated" class="solr.TextField" > positionIncrementGap="100"> > <analyzer type="index"> > <tokenizer class="solr.PatternTokenizerFactory" pattern=", *" /> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt" enablePositionIncrements="true" /> > <filter class="solr.LowerCaseFilterFactory"/> > </analyzer> > <analyzer type="query"> > <tokenizer class="solr.PatternTokenizerFactory" pattern=", *" /> > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignoreCase="true" expand="true"/> > <filter class="solr.StopFilterFactory" > ignoreCase="true" > 
words="stopwords.txt" > enablePositionIncrements="true" > /> > <filter class="solr.LowerCaseFilterFactory"/> > </analyzer> > </fieldType> > > <!-- unstemmed text field tokenized by comma and non lowercased - used > for the faceting? --> > <fieldType name="nolc_commaSeperated" class="solr.TextField" > positionIncrementGap="100"> > <analyzer type="index"> > <tokenizer class="solr.PatternTokenizerFactory" pattern=", *" /> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt" enablePositionIncrements="true" /> > </analyzer> > <analyzer type="query"> > <tokenizer class="solr.PatternTokenizerFactory" pattern=", *" /> > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignoreCase="true" expand="true"/> > <filter class="solr.StopFilterFactory" > ignoreCase="true" > words="stopwords.txt" > enablePositionIncrements="true" > /> > </analyzer> > </fieldType> > > <!-- A general unstemmed text field that indexes tokens normally and > also > reversed (via ReversedWildcardFilterFactory), to enable more > efficient > leading wildcard queries. 
--> > <fieldType name="text_rev" class="solr.TextField" > positionIncrementGap="100"> > <analyzer type="index"> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > <filter class="solr.ISOLatin1AccentFilterFactory"/> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt" enablePositionIncrements="true" /> > <filter class="solr.WordDelimiterFilterFactory" > generateWordParts="1" generateNumberParts="1" catenateWords="1" > catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.ReversedWildcardFilterFactory" > withOriginal="true" > maxPosAsterisk="3" maxPosQuestion="2" > maxFractionAsterisk="0.33"/> > </analyzer> > <analyzer type="query"> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > <filter class="solr.ISOLatin1AccentFilterFactory"/> > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignoreCase="true" expand="true"/> > <filter class="solr.StopFilterFactory" > ignoreCase="true" > words="stopwords.txt" > enablePositionIncrements="true" > /> > <filter class="solr.WordDelimiterFilterFactory" > generateWordParts="1" generateNumberParts="1" catenateWords="0" > catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> > <filter class="solr.LowerCaseFilterFactory"/> > </analyzer> > </fieldType> > <fieldType name="textSpell" class="solr.TextField" > positionIncrementGap="100" stored="false" multiValued="true"> > <analyzer type="index"> > <tokenizer class="solr.StandardTokenizerFactory"/> > <filter class="solr.LowerCaseFilterFactory"/> > <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" > ignorecase="true" expand="true"/> > <filter class="solr.StopFilterFactory" ignorecase="true" > words="stopwords.txt"/> > <filter class="solr.StandardFilterFactory"/> > <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> > </analyzer> > <analyzer type="query"> > <tokenizer class="solr.StandardTokenizerFactory"/> > <filter 
class="solr.LowerCaseFilterFactory"/> > <filter class="solr.StopFilterFactory" ignoreCase="true" > words="stopwords.txt"/> > <filter class="solr.StandardFilterFactory"/> > <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> > </analyzer> > </fieldType> > > <fieldType name="textSpellPhrase" class="solr.TextField" > positionIncrementGap="100" stored="false" multiValued="true"> > <analyzer> > <tokenizer class="solr.PatternTokenizerFactory" pattern=", *" /> > <filter class="solr.ISOLatin1AccentFilterFactory"/> > <filter class="solr.StopFilterFactory" ignorecase="true" > words="stopwords.txt"/> > <filter class="solr.LowerCaseFilterFactory"/> > </analyzer> > </fieldType> > > <!-- charFilter + WhitespaceTokenizer --> > <!-- > <fieldType name="textCharNorm" class="solr.TextField" > positionIncrementGap="100" > > <analyzer> > <charFilter class="solr.MappingCharFilterFactory" > mapping="mapping-ISOLatin1Accent.txt"/> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > </analyzer> > </fieldType> > --> > <!-- This is an example of using the KeywordTokenizer along > With various TokenFilterFactories to produce a sortable field > that does not include some properties of the source text > --> > <fieldType name="alphaOnlySort" class="solr.TextField" > sortMissingLast="true" omitNorms="true"> > <analyzer> > <!-- KeywordTokenizer does no actual tokenizing, so the entire > input string is preserved as a single token > --> > <tokenizer class="solr.KeywordTokenizerFactory"/> > <!-- The LowerCase TokenFilter does what you expect, which can be > when you want your sorting to be case insensitive > --> > <filter class="solr.LowerCaseFilterFactory" /> > <!-- The TrimFilter removes any leading or trailing whitespace --> > <filter class="solr.TrimFilterFactory" /> > <!-- The PatternReplaceFilter gives you the flexibility to use > Java Regular expression to replace any sequence of characters > matching a pattern with an arbitrary replacement string, > which may include back 
references to portions of the original > string matched by the pattern. > > See the Java Regular Expression documentation for more > information on pattern and replacement string syntax. > > > http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html > --> > <filter class="solr.PatternReplaceFilterFactory" > pattern="([^a-z])" replacement="" replace="all" > /> > </analyzer> > </fieldType> > > <fieldtype name="phonetic" stored="false" indexed="true" > class="solr.TextField" > > <analyzer> > <tokenizer class="solr.StandardTokenizerFactory"/> > <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> > </analyzer> > </fieldtype> > <fieldtype name="payloads" stored="false" indexed="true" > class="solr.TextField" > > <analyzer> > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > <!-- > The DelimitedPayloadTokenFilter can put payloads on tokens... for > example, > a token of "foo|1.4" would be indexed as "foo" with a payload of > 1.4f > Attributes of the DelimitedPayloadTokenFilterFactory : > "delimiter" - a one character delimiter. Default is | (pipe) > "encoder" - how to encode the following value into a playload > float -> org.apache.lucene.analysis.payloads.FloatEncoder, > integer -> o.a.l.a.p.IntegerEncoder > identity -> o.a.l.a.p.IdentityEncoder > Fully Qualified class name implementing PayloadEncoder, Encoder > must have a no arg constructor. > --> > <filter class="solr.DelimitedPayloadTokenFilterFactory" > encoder="float"/> > </analyzer> > </fieldtype> > <!-- lowercases the entire field value, keeping it as a single token. > --> > <fieldType name="lowercase" class="solr.TextField" > positionIncrementGap="100"> > <analyzer> > <tokenizer class="solr.KeywordTokenizerFactory"/> > <filter class="solr.LowerCaseFilterFactory" /> > </analyzer> > </fieldType> > > <!-- keeping it as a single token. 
--> > <fieldType name="ktf" class="solr.TextField" positionIncrementGap="100"> > <analyzer> > <tokenizer class="solr.KeywordTokenizerFactory"/> > </analyzer> > </fieldType> > > <!-- since fields of this type are by default not stored or indexed, > any data added to them will be ignored outright. --> > <fieldtype name="ignored" stored="false" indexed="false" > multiValued="true" class="solr.StrField" /> > </types> > > <fields> > <!-- Valid attributes for fields: > name: mandatory - the name for the field > type: mandatory - the name of a previously defined type from the > <types> section > indexed: true if this field should be indexed (searchable or sortable) > stored: true if this field should be retrievable > compressed: [false] if this field should be stored using gzip > compression > (this will only apply if the field type is compressable; among > the standard field types, only TextField and StrField are) > multiValued: true if this field may contain multiple values per > document > omitNorms: (expert) set to true to omit the norms associated with > this field (this disables length normalization and index-time > boosting for the field, and saves some memory). Only full-text > fields or fields that need an index-time boost need norms. > termVectors: [false] set to true to store the term vector for a > given field. > When using MoreLikeThis, fields used for similarity should be > stored for best performance. > termPositions: Store position information with the term vector. > This will increase storage costs. > termOffsets: Store offset information with the term vector. This > will increase storage costs. > default: a value that should be used if no value is specified > when adding a document. 
> --> > <field name="date" type="date" indexed="true" stored="true"/> > <field name="datemodified" type="date" indexed="true" stored="true"/> > <field name="level" type="commaSeperated" indexed="true" stored="false" > multiValed="false"/> > <field name="title" type="text" indexed="true" stored="true" > multiValed="false"/> > <field name="body" type="text" indexed="true" stored="false" > multiValed="false"/> > <field name="description" type="text" indexed="true" stored="true" > multiValed="false"/> > <field name="categories" type="commaSeperated" indexed="true" > stored="false" multiValed="false"/> > <field name="keywords" type="commaSeperated" indexed="true" > stored="false" multiValed="false"/> > <field name="form" type="string" indexed="true" stored="true" > multiValed="false"/> > <field name="thumbnail" type="string" indexed="true" stored="true" > multiValed="false"/> > <field name="duration" type="int" indexed="true" stored="true" > multiValed="false"/> > <field name="format" type="string" indexed="true" stored="true" > multiValed="false"/> > <field name="profile" type="string" indexed="true" stored="true" > multiValed="false"/> > <field name="identifier" type="string" indexed="true" stored="true" > multiValed="false"/> > <field name="type" type="string" indexed="true" stored="true" > multiValed="false"/> > <field name="av" type="string" indexed="true" stored="true" > multiValed="false"/> > <field name="pillar" type="string" indexed="true" stored="false" > multiValed="false"/> > <field name="url" type="string" indexed="true" stored="true" > required='true'/> > <field name="impish" type="int" indexed="true" stored="false" > multiValed="false"/> > <field name="imp" type="int" indexed="true" stored="true" > multiValed="false"/> > <field name="a_spell" type="textSpell"/> > <field name="aa_spellPhrase" type="textSpellPhrase"/> > > <copyField source="title" dest="a_spell" /> > <copyField source="description" dest="a_spell" /> > <copyField source="body" dest="a_spell" 
/> > <copyField source="keywords" dest="a_spell" /> > > <copyField source="keywords" dest="aa_spellPhrase" /> > <copyField source="level" dest="aa_spellPhrase" /> > <copyField source="categories" dest="aa_spellPhrase"/> > <copyField source="title" dest="notok_title" stored="false"/> > > <copyField source="keywords" dest="fac_keywords" stored="false"/> > <copyField source="categories" dest="fac_categories" stored="false"/> > <copyField source="categories" dest="nolc_categories" stored="false"/> > <copyField source="keywords" dest="nolc_keywords" stored="false"/> > > <copyField source="title" dest="ktf_title" stored="false"/> > <copyField source="title" dest="lc_title" stored="false"/> > <!--faceted fields--> > <dynamicField name="fac_*" type="string" indexed="true" stored="false"/> > <dynamicField name="nolc_*" type="nolc_commaSeperated" indexed="true" > stored="false"/> > <dynamicField name="ktf_*" type="ktf" indexed="true" stored="false"/> > <dynamicField name="lc_*" type="lowercase" indexed="true" > stored="false"/> > <dynamicField name="tok_*" type="text" indexed="true" stored="false"/> > <dynamicField name="notok_*" type="string" indexed="true" > stored="false"/> > > </fields> > <uniqueKey>url</uniqueKey> > <!-- field for the QueryParser to use when an explicit fieldname is absent > --> > <defaultSearchField>description</defaultSearchField> > <!-- SolrQueryParser configuration: defaultOperator="AND|OR" --> > <solrQueryParser defaultOperator="OR"/> > </schema> > On 16 November 2010 18:12, Robert Muir <rcm...@gmail.com> wrote: >> >> > Nov 14, 2010 2:41:46 AM org.apache.solr.common.SolrException log >> > SEVERE: java.lang.IllegalArgumentException: Increment must be zero or >> > greater: -2147483648 >> >> >> Hi John, this looks like a tokenizer/tokenstreams bug. >> what I think is happening is that clearAttributes() is not properly >> called, so for each token the position increment is never reset, and >> it just keeps building and building. 
>> at some point it overflows, then you get this exception from the indexer. >> >> can you provide your schema definition so I know which ones might be >> involved? I thought we fixed this problem everywhere and backported >> any such bugfixes to Solr 1.4.1, but it's possible there is either >> still a bug, or we forgot to backport the fix! > > > > -- > _____________ > John G. Moylan > >