You may want to review https://cwiki.apache.org/confluence/display/SOLR/SolrPerformanceProblems#SolrPerformanceProblems-SlowIndexing for some hints.
Make sure to index with multiple parallel threads. Also remember that using /extract on the Solr side is resource intensive and may make your cluster slow and unstable. Better to use Tika or similar on the client side and send text docs to Solr. Jan Høydahl > 13. aug. 2019 kl. 16:52 skrev Parmeshwor Thapa <thapa.parmesh...@gmail.com>: > > Hi, > > We are having some issues scaling Solr indexing. Looking for suggestions. > > Setup: We have two SolrCloud (7.4) instances running in separate cloud > VMs with an external ZooKeeper ensemble. > > We are sending async / non-blocking HTTP requests to index documents in Solr. > 2 > > cloud VMs (4 core * 32 GB) > > 16 GB allocated for JVM > > We are sending all types of documents to Solr, which it would extract and > index, using the /update/extract request handler > > We have stopwords.txt and a dictionary (7 MB) for stemming. > > > > Issue: indexing speed is quite slow for us. It is taking around 2 hours to > index around 3 GB of data. 10,000 documents (PDF, xls, word, etc). We are > planning to index approximately 10 TB of data. 
> > Below is the solr config setting and schema, > > > > <fieldType name="g_phonetic" class="solr.TextField"> > > <analyzer> > > <tokenizer class="solr.WhitespaceTokenizerFactory"/> > > <filter class="solr.LowerCaseFilterFactory"/> > > <filter class="solr.KeywordRepeatFilterFactory"/> > > <filter class="solr.BeiderMorseFilterFactory" nameType="GENERIC" > languageSet="auto" ruleType="APPROX" concat="true"/> > > </analyzer> > > </fieldType> > > <fieldType name="g_stemming" class="solr.TextField"> > > <analyzer> > > <tokenizer class="solr.OpenNLPTokenizerFactory" > tokenizerModel="en-token.bin" sentenceModel="en-sent.bin"/> > > <filter class="solr.LowerCaseFilterFactory"/> > > <filter class="solr.OpenNLPPOSFilterFactory" > posTaggerModel="en-pos-maxent.bin"/> > > <filter class="solr.OpenNLPLemmatizerFilterFactory" > dictionary="en-lemmatizer-again.dict.txt"/> > > > > <filter class="solr.KeywordRepeatFilterFactory"/> > > <filter class="solr.PorterStemFilterFactory"/> > > <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> > > </analyzer> > > </fieldType> > > > > <field name="_root_" type="string" docValues="false" indexed="true" > stored="false"/> > > <field name="_version_" type="plong" indexed="false" stored="false"/> > > > > <field name="domain" type="text_general" multiValued="false" > indexed="true" stored="true"/> > > <field name="id" type="string" multiValued="false" indexed="true" > required="true" stored="true"/> > > <field name="entitytype" type="text_general" multiValued="false" > indexed="true" stored="true"/> > > <field name="entityvalue" type="text_general" multiValued="false" > indexed="true" stored="true"/> > > <field name="org" type="text_general" multiValued="false" indexed="true" > stored="true"/> > > <field name="repnum" type="text_general" multiValued="false" > indexed="true" stored="true"/> > > <field name="stream_size" type="plongs" multiValued="false" > indexed="true" stored="true" /> > > <field name="date" type="pdates" multiValued="true" 
indexed="true" > stored="true"/> > > <field name="creation_date" type="pdates" multiValued="true" > indexed="true" stored="true"/> > > <field name="last_modified" type="pdates" multiValued="true" > indexed="true" stored="true"/> > > <field name="content_phonetic" type="g_phonetic" multiValued="true" > indexed="true" stored="false"/> > > <field name="content_stemming" type="g_stemming" multiValued="true" > indexed="true" stored="false"/> > > <field name="content" type="text_general" multiValued="true" > indexed="true" stored="true"/> > > <field name="author" type="text_general" multiValued="true" > indexed="true" stored="true"/> > > <field name="application_version" type="pdoubles" multiValued="true" > indexed="true" stored="true"/> > > > > <copyField source="content" dest="content_phonetic"/> > > <copyField source="content" dest="content_stemming"/> > > > > <dynamicField name="*" type="text_general" indexed="false" stored="false" > docValues="false" /> > > > > And below is the solrConfig, > > > > <codecFactory class="solr.SchemaCodecFactory"> > > <str name="compressionMode">BEST_COMPRESSION</str> > > </codecFactory> > > > > <autoCommit> > > <maxDocs>1000</maxDocs> > > <maxTime>600000</maxTime> > > <openSearcher>false</openSearcher> > > </autoCommit> > > > > <autoSoftCommit> > > <maxTime>${solr.autoSoftCommit.maxTime:-1}</maxTime> > > </autoSoftCommit> > > > > <requestHandler name="/update/extract" > > startup="lazy" > > class="solr.extraction.ExtractingRequestHandler" > > > <lst name="defaults"> > > <str name="lowernames">true</str> > > <str name="fmap.meta">ignored_</str> > > <str name="fmap.content">content</str> > > </lst> > > </requestHandler> > > *Thanks,* > > *Parmeshwor Thapa*