I'm wondering if someone could help me change the logic of the script below so that it creates a CSV file and appends the results and status data to it directly, instead of collecting everything in dictionaries while it runs and then looping over each URL in the urls list at the end.

I'm aiming to improve the script's performance, as it currently seems to consume memory very heavily: the scan runs smoothly at the beginning and then gets slower and slower towards the end, which I assume is because of the many items piling up in the dictionaries.

Any suggestions on how I could achieve this? Thank you in advance. A rough sketch of the direction I'm imagining follows the script.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import io  # imported but not used below
import requests.exceptions
import time
from concurrent.futures import ProcessPoolExecutor, as_completed  # imported but not used yet
df = pd.read_csv('myScan2.csv')
urls = df.T.values.tolist()[2]
l = len(urls)
results = {}
status = {}
# function specifying the web-crawl criteria
def scrap(url):
    try:
        r = requests.get(url, timeout=30, headers={"User-Agent": "Python"})
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        if soup.body:
            data = {
                "coming soon": soup.body.find_all(string=re.compile("coming soon", re.I)),
                "Opening Soon": soup.body.find_all(string=re.compile("Opening Soon", re.I)),
                "Under Construction": soup.body.find_all(string=re.compile("Under Construction", re.I)),
                "AC_Button": soup.find_all(string=re.compile('_button.js')),
            }
            results[url] = data
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, requests.exceptions.MissingSchema):
        status[url] = "Connection Error"
    except requests.exceptions.HTTPError:
        status[url] = "Http Error"
    except requests.exceptions.TooManyRedirects:
        status[url] = "Redirects"
    except requests.exceptions.RequestException as err:
        # str(err) is required here; concatenating the exception object itself raises a TypeError
        status[url] = "Fatal Error: " + str(err) + " " + url
    else:
        status[url] = "OK"
for url in urls:
    scrap(url)
    time.sleep(0.1)
comingList = []
openingList = []
underList = []
AC_Button = []
statusList = []
# mark x if there are any hits for a specific keyword
for url in urls:
    if not results.get(url):
        statusList.append(status.get(url))
        comingList.append("-")
        openingList.append("-")
        underList.append("-")
        AC_Button.append("-")
    else:
        statusList.append(status.get(url))
        comingList.append("x" if len(results[url].get("coming soon")) > 0 else "-")
        openingList.append("x" if len(results[url].get("Opening Soon")) > 0 else "-")
        underList.append("x" if len(results[url].get("Under Construction")) > 0 else "-")
        AC_Button.append("x" if len(results[url].get("AC_Button")) > 0 else "-")
df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["AC_Button"] = pd.DataFrame(AC_Button, columns=['AC_Button'])
df['status'] = pd.DataFrame(statusList, columns=['Status'])
df.to_csv('URLcrawl.csv', index=False)
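
To show what I mean, here is a rough, untested sketch of the direction I'm thinking of: open the output CSV once, write a header row, and append one finished row per URL as soon as it's scraped, so nothing accumulates in memory. It assumes the same myScan2.csv layout as above; scrap_row and mark are just helper names I made up, and I've simplified the error handling to a single except clause:

import csv
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

df = pd.read_csv('myScan2.csv')
urls = df.T.values.tolist()[2]

def scrap_row(url):
    """Scrape one URL and return a finished CSV row; nothing is kept around."""
    try:
        r = requests.get(url, timeout=30, headers={"User-Agent": "Python"})
        r.raise_for_status()
    except requests.exceptions.RequestException as err:
        return [url, "-", "-", "-", "-", "Error: " + str(err)]
    soup = BeautifulSoup(r.content, 'html.parser')

    def mark(pattern, root):
        # "x" if the pattern appears anywhere under root, else "-"
        return "x" if root and root.find_all(string=re.compile(pattern, re.I)) else "-"

    return [url,
            mark("coming soon", soup.body),
            mark("Opening Soon", soup.body),
            mark("Under Construction", soup.body),
            mark("_button.js", soup),
            "OK"]

# open the output once, write the header, then append each row as soon as it is ready
with open('URLcrawl.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["url", "comingSoon", "openingSoon", "underConstruction", "AC_Button", "status"])
    for url in urls:
        writer.writerow(scrap_row(url))
        f.flush()  # optional: force each row to disk immediately

The idea is that writing each row as it is produced keeps memory flat no matter how many URLs there are, and the dictionaries and the second pass over urls disappear entirely. Is this a reasonable approach, or is there a better way?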