I am trying to loop over last page until next button is not present in the web page.
CODE:
import scrapy
from scrapy_splash import SplashRequest
class FuneralHomesSpider(scrapy.Spider):
name = 'funeral_homes'
#allowed_domains = ["https://www.watkinsfuneralhomes.com","https://www.lanefuneralhomes.com"]
allowed_domains = ["https://www.watkinsfuneralhomes.com"]
#start_urls = ['https://www.legacy.com/']
#j=2
def __init__(self):
self.j=1
#def file(self):
def start_requests(self):
#url_list=['https://www.watkinsfuneralhomes.com/obituaries/obituary-listings/js']
#url_list=['https://www.watkinsfuneralhomes.com/obituaries/obituary-listings/js']
yield SplashRequest(
url='https://www.watkinsfuneralhomes.com/obituaries/obituary-listings/js',
callback=self.parse,
endpoint="render.html",
args={"wait":2},
#meta = {"proxy": "http://lum-customer-hl_e8cc701c-zone-static:[email protected]:22225"}
)
def parse(self, response):
#for i in response.css("#obituariesResults"):
#for i in range(4,6):
#obituaryListBody > div > div:nth-child(1) > div.obituary-info > div.name > a
#obituaryListBody > div > div:nth-child(1) > div.obituary-info > div.dates
#obituaryListBody > div > div:nth-child(1) > div.link > a:nth-child(1)
#selector="#obituaryListBody > div > div:nth-child(1) > div.obituary-info > div."
#Nameselector=selector+"name > a"
#Name=selector+" > div > div.screen-title-copy > h3>"
#Date=selector+"dates"
#print(Nameselector)
ObituaryNewsselector=["body/main/div/div/div/div/div/div/form/div/div/div/div/a[1]/@href"]
NameSelector=["body/main/div/div/div/div/div/div/form/div/div/div/div/div/a/text()"]
yield{
#"Link":response.xpath("body/main/div/div/div/div/div/div/form/div/div/div/div/a[1]/@href").extract(),
"Link":response.xpath(str(ObituaryNewsselector[0])).extract(),
"Name":response.xpath(str(NameSelector[0])).extract(),
#"Name":response.xpath("body/main/div/div/div/div/div/div/form/div/div/div/div/div/a/text()").extract()
#body/div/div/div/div/div/div/div/form/div/div>div/div/a
#//*[@id='obituaryListBody']/div/div/div[3]/a[1]/@href
#"Name":response.css(Name).extract_first(),
#"Date":response.css(Date).extract_first(),
}
condition=response.xpath("(//*[@id='obituariesListPageItemsForm']/div[3]/span[2]/@class)[1]").extract()
next_page_url='https://www.watkinsfuneralhomes.com/obituaries/obituary-listings'+'#'+str(self.j)
print(next_page_url)
print(condition)
self.j+=1
if condition==['next fun-button primary']:
#next_page_url=response.urljoin(//*[@id="obituariesListPageItemsForm"]/div[3]/span[2])
yield scrapy.Request(url=next_page_url, callback=self.start_requests)
Here, its not able to extract data from next page and quitting.
Output:
2021-01-07 18:14:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.watkinsfuneralhomes.com/obituaries/obituary-listings/js>
{'Link': ['/obituaries/Connie-Jean-Brown?obId=19541919#/obituaryInfo', '/obituaries/Beverly--Anne-Cardona?obId=19541985#/obituaryInfo', '/obituaries/Frances-Jean-Strachan?obId=19542115#/obituaryInfo', '/obituaries/Thomas-John-Parker-Sr?obId=19543212#/obituaryInfo', '/obituaries/Gertrude-Sukie-E-Cooper?obId=19542013#/obituaryInfo'], 'Name': ['Connie Jean Brown', 'Beverly Anne Cardona', 'Frances Jean Strachan', 'Thomas John Parker, Sr.', 'Gertrude "Sukie" E. Cooper']}
https://www.watkinsfuneralhomes.com/obituaries/obituary-listings#1/js
['next fun-button primary']
2021-01-07 18:14:43 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.watkinsfuneralhomes.com': <GET https://www.watkinsfuneralhomes.com/obituaries/obituary-listings#1/js>
2021-01-07 18:14:43 [scrapy.core.engine] INFO: Closing spider (finished)
2021-01-07 18:14:43 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1009,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 2,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 148934,
'downloader/response_count': 3,
'downloader/response_status_count/200': 2,
'downloader/response_status_count/404': 1,
'elapsed_time_seconds': 7.106809,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 1, 7, 12, 44, 43, 695459),
'item_scraped_count': 1,
'log_count/DEBUG': 5,
'log_count/INFO': 10,
'log_count/WARNING': 2,
'offsite/domains': 1,
'offsite/filtered': 1,
'request_depth_max': 1,
'response_received_count': 3,
'robotstxt/request_count': 2,
'robotstxt/response_count': 2,
'robotstxt/response_status_count/404': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'splash/render.html/request_count': 1,
'splash/render.html/response_count/200': 1,
'start_time': datetime.datetime(2021, 1, 7, 12, 44, 36, 588650)}
2021-01-07 18:14:43 [scrapy.core.engine] INFO: Spider closed (finished)
In output we can see ['next fun-button primary'](this is used as condition for going to next page since href element not present for next button) meaning it executed yield scrapy.Request(url=next_page_url, callback=self.start_requests) but couldn't retrieve the data.
Assistance required.