On 10/13/2015 9:30 AM, Dyer, James wrote:
Mark,

The older spellcheck implementations create an n-gram sidecar index, which is 
why you're seeing your name split into 2-grams like this.  See the IR Book by 
Manning et al, section 3.3.4 for more information.  Based on the results you're 
getting, I think it is loading your file correctly.  You should now try a query 
against this spelling index, using words *not* in the file you loaded that are 
within 1 or 2 edits from something that is in the dictionary.  If it doesn't 
yield suggestions, then post the relevant sections of the solrconfig.xml, 
schema.xml and also the query string you are trying.

James Dyer
Ingram Content Group

James, I've already done this. My query string was "fenbers". This is my last name which does *not* occur in the linux.words file. It is only 1 edit distance from "fenders" which *is* in the linux.words file. Yet, it claimed it found my misspelled word to be "fenber" without the "s" and it gave me these 8 suggestions:
f en be r
f e nb er
f en b er
f e n be r
f en b e r
f e nb e r
f e n b er
f e n b e r

So I'm attaching the the entire solrconfig.xml and schema.xml that is in effect. These are in a single file with all the block comments removed.

I'm also puzzled that you say "older implementations create a sidecar index"... because I am using v5.3.0, which was the latest version as of my download a month or two ago. So, with my implementation being recent, why is an n-gram sidecar index still (seemingly) being produced?

thanks for the help!
Mark



<?xml version="1.0" encoding="UTF-8" ?>
<config>
  <luceneMatchVersion>5.3.0</luceneMatchVersion>
  <dataDir>${solr.data.dir:}</dataDir>
  <directoryFactory name="DirectoryFactory" 
                    
class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}">
  </directoryFactory> 
  <codecFactory class="solr.SchemaCodecFactory"/>

  <schemaFactory class="ClassicIndexSchemaFactory"/>
  <indexConfig>
    <lockType>${solr.lock.type:native}</lockType>
     <infoStream>true</infoStream>
  </indexConfig>
  <jmx />
  <updateHandler class="solr.DirectUpdateHandler2">
    <updateLog>
      <str name="dir">${solr.ulog.dir:}</str>
      <int name="numVersionBuckets">${solr.ulog.numVersionBuckets:65536}</int>
    </updateLog>
     <autoCommit> 
       <maxTime>${solr.autoCommit.maxTime:15000}</maxTime> 
       <openSearcher>false</openSearcher> 
     </autoCommit>
     <autoSoftCommit> 
       <maxTime>${solr.autoSoftCommit.maxTime:-1}</maxTime> 
     </autoSoftCommit>

  </updateHandler>
  <query>
    <maxBooleanClauses>1024</maxBooleanClauses>
    <filterCache class="solr.FastLRUCache"
                 size="512"
                 initialSize="512"
                 autowarmCount="0"/>

    <queryResultCache class="solr.LRUCache"
                     size="512"
                     initialSize="512"
                     autowarmCount="0"/>
   
     <documentCache class="solr.LRUCache"
                   size="512"
                   initialSize="512"
                   autowarmCount="0"/>
    
    <cache name="perSegFilter"
      class="solr.search.LRUCache"
      size="10"
      initialSize="0"
      autowarmCount="10"
      regenerator="solr.NoOpRegenerator" />
    <enableLazyFieldLoading>true</enableLazyFieldLoading>

   <queryResultWindowSize>20</queryResultWindowSize>
   <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
    <useColdSearcher>false</useColdSearcher>
    <maxWarmingSearchers>2</maxWarmingSearchers>

  </query>
  <requestDispatcher handleSelect="false" >
     <requestParsers enableRemoteStreaming="true" 
                    multipartUploadLimitInKB="2048000"
                    formdataUploadLimitInKB="2048"
                    addHttpRequestToContext="false"/>

     <httpCaching never304="true" />

  </requestDispatcher>

  <requestHandler name="/select" class="solr.SearchHandler">
     <lst name="defaults">
       <str name="echoParams">explicit</str>
       <int name="rows">10</int>
     </lst>

    </requestHandler>

  <!-- A request handler that returns indented JSON by default -->
  <requestHandler name="/query" class="solr.SearchHandler">
     <lst name="defaults">
       <str name="echoParams">explicit</str>
       <str name="wt">json</str>
       <str name="indent">true</str>
       <str name="df">text</str>
     </lst>
  </requestHandler>

  <requestHandler name="/export" class="solr.SearchHandler">
    <lst name="invariants">
      <str name="rq">{!xport}</str>
      <str name="wt">xsort</str>
      <str name="distrib">false</str>
    </lst>

    <arr name="components">
      <str>query</str>
    </arr>
  </requestHandler>
  
  <requestHandler name="/dataimport" class="solr.DataImportHandler" >
    <lst name="defaults">
        <str 
name="config">/localapps/dev/EventLog/solr/EventLog2/conf/data-config.xml</str> 
   
    </lst>
  </requestHandler>

  <initParams path="/update/**,/query,/select,/tvrh,/elevate,/spell">
    <lst name="defaults">
      <str name="df">text</str>
    </lst>
  </initParams>

  <requestHandler name="/analysis/field" 
                  startup="lazy"
                  class="solr.FieldAnalysisRequestHandler" />
  <requestHandler name="/analysis/document" 
                  class="solr.DocumentAnalysisRequestHandler" 
                  startup="lazy" />

  <!-- Echo the request contents back to the client -->
  <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
    <lst name="defaults">
     <str name="echoParams">explicit</str> 
     <str name="echoHandler">true</str>
    </lst>
  </requestHandler>
  
  <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
    <str name="queryAnalyzerFieldType">text_en</str>

    <lst name="spellchecker">
      <str name="name">WordBreak</str>
      <str name="classname">solr.WordBreakSolrSpellChecker</str>
      <str name="field">logtext</str>
      <str name="combineWords">true</str>
      <str name="breakWords">true</str>
      <int name="maxChanges">10</int>
    </lst>
    
    <lst name="spellchecker">
         <str name="classname">solr.FileBasedSpellChecker</str>
        <str name="field">logtext</str>         
        <str name="name">FileDict</str>
         <str name="sourceLocation">/usr/share/dict/linux.words</str>
         <str name="characterEncoding">UTF-8</str>
         <str 
name="spellcheckIndexDir">/localapps/dev/EventLog/solr/EventLog2/data/spFile</str>
         <str name="buildOnStartup">true</str>
       <str name="accuracy">0.5</str>

      <int name="maxEdits">2</int>

      <int name="minPrefix">1</int>

      <int name="maxInspections">5</int>

      <int name="minQueryLength">4</int>

      <float name="maxQueryFrequency">0.01</float>

  </lst>
      
  </searchComponent>

  <requestHandler name="/ELspell" class="solr.SearchHandler" startup="lazy">
    <lst name="defaults">

      <str name="spellcheck.dictionary">FileDict</str>
      <str name="spellcheck.dictionary">WordBreak</str>
      <str name="spellcheck">on</str>
      <str name="spellcheck.extendedResults">true</str>
      <str name="spellcheck.count">10</str>
      <str name="spellcheck.alternativeTermCount">5</str>
      <str name="spellcheck.maxResultsForSuggest">5</str>
      <str name="spellcheck.collate">true</str>
      <str name="spellcheck.collateExtendedResults">true</str>
      <str name="spellcheck.maxCollationTries">10</str>
      <str name="spellcheck.maxCollations">5</str>
    </lst>
    <arr name="last-components">
      <str>spellcheck</str>
    </arr>
  </requestHandler>
  
  <searchComponent name="terms" class="solr.TermsComponent"/>

  <!-- A request handler for demonstrating the terms component -->
  <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
     <lst name="defaults">
      <bool name="terms">true</bool>
      <bool name="distrib">false</bool>
    </lst>     
    <arr name="components">
      <str>terms</str>
    </arr>
  </requestHandler>

  <!-- Legacy config for the admin interface -->
  <admin>
    <defaultQuery>*:*</defaultQuery>
  </admin>

</config>









<!-- Schema.xml -->
<?xml version="1.0" encoding="UTF-8" ?>

<schema name="example" version="1.5">


   <field name="_version_" type="long" indexed="true" stored="true"/>
   <field name="_root_" type="string" indexed="true" stored="false"/>
   <field name="id" type="string" indexed="true" stored="true" required="true" 
multiValued="false" /> 
   <field name="logtext" type="string" indexed="true" stored="true" 
required="true" multiValued="false" /> 
   <field name="username" type="string" indexed="true" stored="true" 
required="true" multiValued="false" /> 
   <field name="category" type="int" indexed="true" stored="true" 
required="true" multiValued="false" /> 

   <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
   <dynamicField name="*_is" type="int"    indexed="true"  stored="true"  
multiValued="true"/>
   <dynamicField name="*_s"  type="text_en"  indexed="true"  stored="true" />
   <dynamicField name="*_ss" type="string"  indexed="true"  stored="true" 
multiValued="true"/>
   <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
   <dynamicField name="*_ls" type="long"   indexed="true"  stored="true"  
multiValued="true"/>
   <dynamicField name="*_t"  type="text_general"    indexed="true"  
stored="true"/>
   <dynamicField name="*_txt" type="text_general"   indexed="true"  
stored="true" multiValued="true"/>
   <dynamicField name="*_en"  type="text_en"    indexed="true"  stored="true" 
multiValued="true"/>
   <dynamicField name="*_b"  type="boolean" indexed="true" stored="true"/>
   <dynamicField name="*_bs" type="boolean" indexed="true" stored="true"  
multiValued="true"/>
   <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
   <dynamicField name="*_fs" type="float"  indexed="true"  stored="true"  
multiValued="true"/>
   <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
   <dynamicField name="*_ds" type="double" indexed="true"  stored="true"  
multiValued="true"/>

   <!-- Type used to index the lat and lon components for the "location" 
FieldType -->
   <dynamicField name="*_coordinate"  type="tdouble" indexed="true"  
stored="false" />

   <dynamicField name="*_dt"  type="date"    indexed="true"  stored="true"/>
   <dynamicField name="*_dts" type="date"    indexed="true"  stored="true" 
multiValued="true"/>
   <dynamicField name="*_p"  type="location" indexed="true" stored="true"/>

   <!-- some trie-coded dynamic fields for faster range queries -->
   <dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
   <dynamicField name="*_tl" type="tlong"   indexed="true"  stored="true"/>
   <dynamicField name="*_tf" type="tfloat"  indexed="true"  stored="true"/>
   <dynamicField name="*_td" type="tdouble" indexed="true"  stored="true"/>
   <dynamicField name="*_tdt" type="tdate"  indexed="true"  stored="true"/>

   <dynamicField name="*_c"   type="currency" indexed="true"  stored="true"/>

   <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
   <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" 
multiValued="true"/>

   <dynamicField name="random_*" type="random" />

 <uniqueKey>id</uniqueKey>

    <fieldType name="string" class="solr.StrField" sortMissingLast="true" />

    <!-- boolean type: "true" or "false" -->
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>

    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" 
positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" 
positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" 
positionIncrementGap="0"/>
    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" 
positionIncrementGap="0"/>

    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" 
positionIncrementGap="0"/>
    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" 
positionIncrementGap="0"/>
    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" 
positionIncrementGap="0"/>
    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" 
positionIncrementGap="0"/>

     <fieldType name="date" class="solr.TrieDateField" precisionStep="0" 
positionIncrementGap="0"/>

    <!-- A Trie based date field for faster date range queries and date 
faceting. -->
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" 
positionIncrementGap="0"/>


    <!--Binary data type. The data should be sent/retrieved in as Base64 
encoded Strings -->
    <fieldType name="binary" class="solr.BinaryField"/>
    <fieldType name="random" class="solr.RandomSortField" indexed="true" />
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>
    <fieldType name="text_general" class="solr.TextField" 
positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" />
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" />
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>


    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="lang/stopwords_en.txt"
                />
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPossessiveFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="lang/stopwords_en.txt"
                />
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPossessiveFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
    </fieldType>

    <fieldType name="text_en_splitting" class="solr.TextField" 
positionIncrementGap="100" autoGeneratePhraseQueries="true">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="lang/stopwords_en.txt"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" 
splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="lang/stopwords_en.txt"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
    </fieldType>

    <fieldType name="text_en_splitting_tight" class="solr.TextField" 
positionIncrementGap="100" autoGeneratePhraseQueries="true">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="false"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="lang/stopwords_en.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" 
generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
        <filter class="solr.EnglishMinimalStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <fieldType name="text_general_rev" class="solr.TextField" 
positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" />
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" />
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <fieldType name="alphaOnlySort" class="solr.TextField" 
sortMissingLast="true" omitNorms="true">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory" />
        <!-- The TrimFilter removes any leading or trailing whitespace -->
        <filter class="solr.TrimFilterFactory" />
        <filter class="solr.PatternReplaceFilterFactory"
                pattern="([^a-z])" replacement="" replace="all"
        />
      </analyzer>
    </fieldType>

    <!-- lowercases the entire field value, keeping it as a single token.  -->
    <fieldType name="lowercase" class="solr.TextField" 
positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory" />
      </analyzer>
    </fieldType>

    <fieldType name="ignored" stored="false" indexed="false" multiValued="true" 
class="solr.StrField" />

    <fieldType name="point" class="solr.PointType" dimension="2" 
subFieldSuffix="_d"/>

    <!-- A specialized field for geospatial search. If indexed, this fieldType 
must not be multivalued. -->
    <fieldType name="location" class="solr.LatLonType" 
subFieldSuffix="_coordinate"/>
    <fieldType name="location_rpt" 
class="solr.SpatialRecursivePrefixTreeFieldType"
        geo="true" distErrPct="0.025" maxDistErr="0.001" 
distanceUnits="kilometers" />

    <fieldType name="bbox" class="solr.BBoxField"
               geo="true" distanceUnits="kilometers" numberType="_bbox_coord" />
    <fieldType name="_bbox_coord" class="solr.TrieDoubleField" 
precisionStep="8" docValues="true" stored="false"/>
    <fieldType name="currency" class="solr.CurrencyField" precisionStep="8" 
defaultCurrency="USD" currencyConfig="currency.xml" />
</schema>

Reply via email to