full-import takes 4 days(48 hours) to complete where main db table size 700k only

srinalluri Wed, 24 Apr 2013 12:25:56 -0700

Hi,

Environment is Solr 3.6.1. The database is having enough indexes. The box is
having enough memory. The DB is performance is good. Auto commit is enabled
for every 1 minute. 
Please see the following entity. The full-import of this entity is taking
over 48 hours to complete on production environment. The number records in
the main table is around 700,000 only.  I tried materialized view, but that
view is having duplicate records. So I can't go with materialized view for
all these queries. 
Can someone please suggest how to improve the performance for full-import?


<entity name="oracle-article" dataSource="oracle" pk="VCMID"
preImportDeleteQuery="content_type:article AND repository:oracleqa"
query="select ID as VCMID from tab_story2 order by published_date desc"
deltaImportQuery="select '${dataimporter.delta.VCMID}' as VCMID from dual"
deltaQuery="select s2.ID as VCMID from tab_story2 s2, gnasmomap mms2,
gnasmometadata mmd where s2.id = mms2.keystring1 and mms2.recordid =
mmd.contentmgmtid and mmd.lastpublishdate > ((CAST(SYS_EXTRACT_UTC(TIMESTAMP
'${dataimporter.oracle-article.last_index_time}') AS DATE) -
TO_DATE('01/01/1970 00:00:00', 'MM-DD-YYYY HH24:MI:SS')) * 24 * 60 * 60 *
1000)-300000">
<entity name="recordid" dataSource="oracle"
transformer="TemplateTransformer" query="select RECORDID from gnasmomap
where keystring1 = '${oracle-article.VCMID}'">
<field column="content_type" template="article"/>
<field column="RECORDID" name="native_id"/>
<field column="repository" template="oracleqa"/>
</entity>
<entity name=&quot;article_details&quot; dataSource=&quot;oracle&quot;
transformer=&quot;ClobTransformer,RegexTransformer,script:trimTicker,script:hasBody,script:hasDeck&quot;
query=&quot;select STORY_TITLE, STORY_HEADLINE, SOURCE, DECK,
regexp_replace(body, '\&lt;p\>\[(pullquote|summary)\]\</p\>|\[video
[0-9]+?\]|\[youtube .+?\]', '') as BODY, PUBLISHED_DATE, MODIFIED_DATE,
DATELINE, REPORTER_NAME, TICKER_CODES,ADVERTORIAL_CONTENT from tab_story2
where id = '${oracle-article.VCMID}'">
<field column="STORY_TITLE" name="title"/>
<field column="DECK" name="description" clob="true"/>
<field column="PUBLISHED_DATE" name="date"/>
<field column="MODIFIED_DATE" name="last_modified_date"/>
<field column="BODY" name="body" clob="true"/>
<field column="SOURCE" name="source"/>
<field column="DATELINE" name="dateline"/>
<field column="STORY_HEADLINE" name="export_headline"/>
<field column="ticker" splitBy="," sourceColName="TICKER_CODES"/>
<field column="ADVERTORIAL_CONTENT" name="advertorial_content"/>
<field column="has_body" sourceColName="body"/>
<field column="has_description" sourceColName="description"/>
</entity>
<entity name="site" dataSource="oracle" query="select CASE WHEN
site.name='fq2' THEN 'fqn' WHEN site.name='fq' THEN 'sbc' WHEN
site.name='fq-lat' THEN 'latino' ELSE 'gc' END SITE, CASE WHEN
site.name='fq2' THEN 'v8-qa.tabbusiness.com' WHEN site.name='fb' THEN
'v8-qa.smallbusiness.tabbusiness.com' WHEN site.name='qn-latino' THEN
'v8-qa.latino.tabdays.com' ELSE 'v8-qa.tabdays.com' END SERVER from
gnasmomap mm, gnaschannelfileassociation cfa, gnaschannel ch, gnassite site
where mm.keystring1 = '${oracle-article.VCMID}' and mm.recordid =
cfa.vcmobjectid and cfa.channelid = ch.id and ch.siteid = site.id and rownum
= 1">
<field column="SITE" name="site"/>
<entity name="url" dataSource="oracle" query="select 'http://' ||
'${site.SERVER}' || furl as URL from tab_furl where parent_id =
'${oracle-article.VCMID}'">
<field column="URL" name="url"/>
</entity>
<entity name="image" dataSource="oracle" transformer="script:hasImageURL"
query="select distinct('http://qa.global.fqstatic.com' || sourcepath) as
IMAGE_URL from ( select mc.sourcepath from tab_rel_content rc, tab_story2
st, gnasmomap mm, dsx_media_common mc where rc.parent_id =
'${oracle-article.VCMID}' and rc.parent_id = st.id and (st.NO_FEATURED_MEDIA
!= 'yes' OR st.NO_FEATURED_MEDIA is null) and rc.ref_id = mm.recordid and
mm.keystring1 = mc.mediaid and rc.rank = 1 union all select mc.sourcepath
from tab_rel_content arm, tab_story2 st, gnasmomap cmm, tab_rel_media crm,
gnasmomap mmm, dsx_media_common mc where arm.parent_id =
'${oracle-article.VCMID}' and arm.parent_id = st.id and
(st.NO_FEATURED_MEDIA !='yes' OR st.NO_FEATURED_MEDIA is null) and
arm.ref_id = cmm.recordid and cmm.keystring1 = crm.parent_id and crm.rank =
1 and crm.ref_id = mmm.recordid and mmm.keystring1 = mc.mediaid and arm.rank
= 1)">
<field column="IMAGE_URL" name="image_url"/>
<field column="has_image_url" sourceColName="IMAGE_URL"/>
</entity>
</entity>
<entity name="taxonomy" dataSource="oracle" query="select tc.PATH from
gnasmomap mm, gndaassociation ass, gndataxonomycategory tc where mm.recordid
= ass.cmsobjectid and ass.categoryid = tc.id and mm.keystring1 =
'${oracle-article.VCMID}'">
<field column="PATH" name="taxonomy_path"/>
</entity>
<entity name="keyword" dataSource="oracle"
transformer="RegexTransformer,script:trimKeyword" query="select KEYWORDS
from tab_rel_metadata where parent_id = '${oracle-article.VCMID}'">
<field column="keyword" splitBy="," sourceColName="KEYWORDS"/>
</entity>
<entity name="author" dataSource="oracle" query="select pmm.recordid as
author_id, trim(trim(trailing ',' from trim(regexp_replace(p.first_name || '
' || p.last_name || ', ' || p.suffix,'( ){2,}', ' ')))) as author from
tab_person p, gnasmomap pmm, tab_rel_reporter rr where p.id = pmm.keystring1
and pmm.recordid = rr.ref_id and rr.parent_id =
'${oracle-article.VCMID}'"></entity>
<entity name="channels" dataSource="oracle" query="select CASE WHEN
cp.site='fb2' THEN 'fbn' WHEN cp.site='fb' THEN 'sbc' WHEN
cp.site='qn-latino' THEN 'latino' ELSE 'gc' END || cp.path as PATH,
cp.isa_path ISA_PATH from gnasmomap mm, gnaschannelfileassociation cfa,
tab_mview_channelpath cp where mm.recordid = cfa.vcmobjectid and
cfa.channelid = cp.vcmid and mm.keystring1 = '${oracle-article.VCMID}'">
<field column="PATH" name="section_path"/>
<field column="ISA_PATH" name="section_path"/>
</entity>
</entity>



--
View this message in context: 
http://lucene.472066.n3.nabble.com/full-import-takes-4-days-48-hours-to-complete-where-main-db-table-size-700k-only-tp4058690.html
Sent from the Solr - User mailing list archive at Nabble.com.

full-import takes 4 days(48 hours) to complete where main db table size 700k only

Reply via email to