Hi,
We are having some issues scaling Solr indexing and are looking for suggestions.
Setup: We have two SolrCloud (7.4) instances running in separate cloud
VMs with an external ZooKeeper ensemble.
We are sending async / non-blocking HTTP requests to index documents in Solr.
2 cloud VMs (4 cores, 32 GB RAM each)
16 GB allocated for the JVM
We are sending all types of documents to Solr, which extracts and indexes
them using the /update/extract request handler.
We have stopwords.txt and a dictionary (7 MB) for stemming.
Issue: Indexing is quite slow for us. It takes around 2 hours to index
around 3 GB of data, about 10,000 documents (PDF, XLS, Word, etc.). We are
planning to index approximately 10 TB of data.
Below are the Solr schema and config settings:
<fieldType name="g_phonetic" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordRepeatFilterFactory"/>
<filter class="solr.BeiderMorseFilterFactory" nameType="GENERIC"
languageSet="auto" ruleType="APPROX" concat="true"/>
</analyzer>
</fieldType>
<fieldType name="g_stemming" class="solr.TextField">
<analyzer>
<tokenizer class="solr.OpenNLPTokenizerFactory"
tokenizerModel="en-token.bin" sentenceModel="en-sent.bin"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.OpenNLPPOSFilterFactory"
posTaggerModel="en-pos-maxent.bin"/>
<filter class="solr.OpenNLPLemmatizerFilterFactory"
dictionary="en-lemmatizer-again.dict.txt"/>
<filter class="solr.KeywordRepeatFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<field name="_root_" type="string" docValues="false" indexed="true"
stored="false"/>
<field name="_version_" type="plong" indexed="false" stored="false"/>
<field name="domain" type="text_general" multiValued="false"
indexed="true" stored="true"/>
<field name="id" type="string" multiValued="false" indexed="true"
required="true" stored="true"/>
<field name="entitytype" type="text_general" multiValued="false"
indexed="true" stored="true"/>
<field name="entityvalue" type="text_general" multiValued="false"
indexed="true" stored="true"/>
<field name="org" type="text_general" multiValued="false" indexed="true"
stored="true"/>
<field name="repnum" type="text_general" multiValued="false"
indexed="true" stored="true"/>
<field name="stream_size" type="plongs" multiValued="false"
indexed="true" stored="true" />
<field name="date" type="pdates" multiValued="true" indexed="true"
stored="true"/>
<field name="creation_date" type="pdates" multiValued="true"
indexed="true" stored="true"/>
<field name="last_modified" type="pdates" multiValued="true"
indexed="true" stored="true"/>
<field name="content_phonetic" type="g_phonetic" multiValued="true"
indexed="true" stored="false"/>
<field name="content_stemming" type="g_stemming" multiValued="true"
indexed="true" stored="false"/>
<field name="content" type="text_general" multiValued="true"
indexed="true" stored="true"/>
<field name="author" type="text_general" multiValued="true"
indexed="true" stored="true"/>
<field name="application_version" type="pdoubles" multiValued="true"
indexed="true" stored="true"/>
<copyField source="content" dest="content_phonetic"/>
<copyField source="content" dest="content_stemming"/>
<dynamicField name="*" type="text_general" indexed="false" stored="false"
docValues="false" />
And below is the relevant part of solrconfig.xml:
<codecFactory class="solr.SchemaCodecFactory">
<str name="compressionMode">BEST_COMPRESSION</str>
</codecFactory>
<autoCommit>
<maxDocs>1000</maxDocs>
<maxTime>600000</maxTime>
<openSearcher>false</openSearcher>
</autoCommit>
<autoSoftCommit>
<maxTime>${solr.autoSoftCommit.maxTime:-1}</maxTime>
</autoSoftCommit>
<requestHandler name="/update/extract"
startup="lazy"
class="solr.extraction.ExtractingRequestHandler" >
<lst name="defaults">
<str name="lowernames">true</str>
<str name="fmap.meta">ignored_</str>
<str name="fmap.content">content</str>
</lst>
</requestHandler>
*Thanks,*
*Parmeshwor Thapa*