Scrapy loop not working

bassamfarooq Mon, 30 Jan 2017 11:28:17 -0800

in the following code i need to loop through al thee <li> elements under ul 
and checking them if the <a> anchor tag is present in that <li> tag as per 
the xpath mentioned. But the spider stops crawling after only one <li> tag. 
Don't understand what's wrong.




           Here is the spider:


                       
Enter code here...import scrapy
from shinjukuproject.items import HandlingStoreInfo
from w3lib.html import remove_tags

class ShijukuHandlingStores(scrapy.Spider):
    name = "singlepagestores"
    start_urls = [
'https://suumo.jp/chintai/tokyo/sc_shinjuku/jnc_000013357603/']
    
    
    
            
            
    def parse(self, response):
        
        for li in response.xpath('/html/body/div[5]/div[6]/ul//li'):
        
            
            hsurl = li.xpath(
'.//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-object"]/div[@class="itemcassette_img"]/div[@class="itemcassette_img-desc"]/a/@href'
).extract_first()
            
            if(hsurl):
                item = HandlingStoreInfo()
                item['Room_ID'] = response.xpath(
'/html/head/link[@rel="canonical"]/@href').re('\w+\_\d+')
                request = scrapy.Request(response.urljoin(hsurl), callback=
self.parse_storeinfo, dont_filter = True)
                request.meta['item'] = item
                return request
            else:
                item = HandlingStoreInfo()
                item['Room_ID'] = response.xpath(
'/html/head/link[@rel="canonical"]/@href').re('\w+\_\d+')
                hsn = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-header"]/span[@class="itemcassette-header-ttl"]/text()'
).extract_first('Null').strip()
                item['Handling_Store_Name'] = remove_tags(hsn)
                item['Handling_Store_id'] = item['Room_ID']
                item['Location'] = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell01"]/text()'
).extract_first('Null').strip()
                item['Transportation_Facilities'] = "N/A"
                contact = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell04"]/span/text()'
).re('\d+\-\d+\-\d+')
                item['Contact'] = remove_tags(contact)
                item['Fax'] = "N/A"
                bh = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell02"]/text()'
).extract_first(' ').strip()
                item['Buisiness_Hours'] = remove_tags(bh)
                rh = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell03"]/text()'
).extract_first(' ').strip()
                item['Regular_Holidays'] = remove_tags(rh)
                item['License_Number'] = "N/A"
                item['Store_Characteristics'] = "N/A"
                return item        
                
                
    def parse_storeinfo(self, response):
        
        
        item = response.meta['item']
        item['Handling_Store_Name'] = response.css('html 
body.chintai.ch_leaf div#wrapper div#contents.ch-shdt h1::text').
extract_first(' ').strip()
        item['Handling_Store_id'] = response.css('html head 
link[rel=canonical]::attr(href)').re('\w+\_\d+\_\d+')
        item['Location'] = response.css('div#wrapper div#contents.ch-shdt 
div.section table.data_table.table_gaiyou tr:nth-of-type(1) 
td:nth-of-type(1)::text').extract_first(' ').strip()
        item['Transportation_Facilities'] = response.css('div#wrapper 
div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(1) td:nth-of-type(2) ul li::text').extract_first()
        item['Contact'] = response.css('html body.chintai.ch_leaf 
div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(2) td:nth-of-type(1) span.col-notice em::text').extract_first
(' ').strip()
        item['Fax'] = response.css('html body.chintai.ch_leaf div#wrapper 
div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(2) td:nth-of-type(2)::text').re('\d+\-\d+\-\d+')
        bh = response.css('html body.chintai.ch_leaf div#wrapper 
div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(3) td:nth-of-type(1)::text').extract_first(' ').strip()
        item['Buisiness_Hours'] = remove_tags(bh)
        rh = response.css('html body.chintai.ch_leaf div#wrapper 
div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(3) td:nth-of-type(2)::text').extract_first(' ').strip()
        item['Regular_Holidays'] = remove_tags(rh)
        item['License_Number'] = response.css('html body.chintai.ch_leaf 
div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(4) td:nth-of-type(2)::text').extract_first(' ').strip()
        item['Store_Characteristics'] = response.css('html 
body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section 
table.data_table.table_gaiyou tr:nth-of-type(5) td:nth-of-type(1)::text').
extract_first(' ').strip()
        return item            


-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

Scrapy loop not working

Reply via email to