in the following code i need to loop through al thee <li> elements under ul
and checking them if the <a> anchor tag is present in that <li> tag as per
the xpath mentioned. But the spider stops crawling after only one <li> tag.
Don't understand what's wrong.
Here is the spider:
Enter code here...import scrapy
from shinjukuproject.items import HandlingStoreInfo
from w3lib.html import remove_tags
class ShijukuHandlingStores(scrapy.Spider):
name = "singlepagestores"
start_urls = [
'https://suumo.jp/chintai/tokyo/sc_shinjuku/jnc_000013357603/']
def parse(self, response):
for li in response.xpath('/html/body/div[5]/div[6]/ul//li'):
hsurl = li.xpath(
'.//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-object"]/div[@class="itemcassette_img"]/div[@class="itemcassette_img-desc"]/a/@href'
).extract_first()
if(hsurl):
item = HandlingStoreInfo()
item['Room_ID'] = response.xpath(
'/html/head/link[@rel="canonical"]/@href').re('\w+\_\d+')
request = scrapy.Request(response.urljoin(hsurl), callback=
self.parse_storeinfo, dont_filter = True)
request.meta['item'] = item
return request
else:
item = HandlingStoreInfo()
item['Room_ID'] = response.xpath(
'/html/head/link[@rel="canonical"]/@href').re('\w+\_\d+')
hsn = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-header"]/span[@class="itemcassette-header-ttl"]/text()'
).extract_first('Null').strip()
item['Handling_Store_Name'] = remove_tags(hsn)
item['Handling_Store_id'] = item['Room_ID']
item['Location'] = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell01"]/text()'
).extract_first('Null').strip()
item['Transportation_Facilities'] = "N/A"
contact = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell04"]/span/text()'
).re('\d+\-\d+\-\d+')
item['Contact'] = remove_tags(contact)
item['Fax'] = "N/A"
bh = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell02"]/text()'
).extract_first(' ').strip()
item['Buisiness_Hours'] = remove_tags(bh)
rh = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell03"]/text()'
).extract_first(' ').strip()
item['Regular_Holidays'] = remove_tags(rh)
item['License_Number'] = "N/A"
item['Store_Characteristics'] = "N/A"
return item
def parse_storeinfo(self, response):
item = response.meta['item']
item['Handling_Store_Name'] = response.css('html
body.chintai.ch_leaf div#wrapper div#contents.ch-shdt h1::text').
extract_first(' ').strip()
item['Handling_Store_id'] = response.css('html head
link[rel=canonical]::attr(href)').re('\w+\_\d+\_\d+')
item['Location'] = response.css('div#wrapper div#contents.ch-shdt
div.section table.data_table.table_gaiyou tr:nth-of-type(1)
td:nth-of-type(1)::text').extract_first(' ').strip()
item['Transportation_Facilities'] = response.css('div#wrapper
div#contents.ch-shdt div.section table.data_table.table_gaiyou
tr:nth-of-type(1) td:nth-of-type(2) ul li::text').extract_first()
item['Contact'] = response.css('html body.chintai.ch_leaf
div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou
tr:nth-of-type(2) td:nth-of-type(1) span.col-notice em::text').extract_first
(' ').strip()
item['Fax'] = response.css('html body.chintai.ch_leaf div#wrapper
div#contents.ch-shdt div.section table.data_table.table_gaiyou
tr:nth-of-type(2) td:nth-of-type(2)::text').re('\d+\-\d+\-\d+')
bh = response.css('html body.chintai.ch_leaf div#wrapper
div#contents.ch-shdt div.section table.data_table.table_gaiyou
tr:nth-of-type(3) td:nth-of-type(1)::text').extract_first(' ').strip()
item['Buisiness_Hours'] = remove_tags(bh)
rh = response.css('html body.chintai.ch_leaf div#wrapper
div#contents.ch-shdt div.section table.data_table.table_gaiyou
tr:nth-of-type(3) td:nth-of-type(2)::text').extract_first(' ').strip()
item['Regular_Holidays'] = remove_tags(rh)
item['License_Number'] = response.css('html body.chintai.ch_leaf
div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou
tr:nth-of-type(4) td:nth-of-type(2)::text').extract_first(' ').strip()
item['Store_Characteristics'] = response.css('html
body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section
table.data_table.table_gaiyou tr:nth-of-type(5) td:nth-of-type(1)::text').
extract_first(' ').strip()
return item
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.