i built my first scrapy project and works perfect when i save it as csv but when i try to send it to mysql i get problems let me know what i am doing wrong so i can learn too thank you.
import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlencode
import json
from datetime import datetime
from ..items import InstascraperItem
API = 'inserapikeyhere'
user_accounts = ['omnesinfluencers']
def get_url(url):
payload = {'api_key': API, 'url': url}
proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
return proxy_url
class InstagramSpider(scrapy.Spider):
name = 'instagram'
allowed_domains = ['api.scraperapi.com']
custom_settings = {'CONCURRENT_REQUESTS_PER_DOMAIN': 5}
custom_settings = {'FEEDS':{'basic.csv':{'format':'csv'}}}
def start_requests(self):
for username in user_accounts:
url = f'https://www.instagram.com/{username}/?hl=en'
yield scrapy.Request(get_url(url), callback=self.parse)
def parse(self, response):
items = InstascraperItem()
x = response.xpath("//script[starts-with(.,'window._sharedData')]/text()").extract_first()
json_string = x.strip().split('= ')[1][:-1]
data = json.loads(json_string)
edges = data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']
for i in edges:
url = 'https://www.instagram.com/p/' + i['node']['shortcode']
mediatype = i['node']['is_video']
date_posted_timestamp = i['node']['taken_at_timestamp']
date_posted = datetime.fromtimestamp(date_posted_timestamp).strftime("%d/%m/%Y %H:%M:%S")
likeCount = i['node']['edge_media_preview_like']['count'] if "edge_media_preview_like" in i['node'].keys() else ''
commentCount = i['node']['edge_media_to_comment']['count'] if 'owner' in i['node'].keys() else ''
handleid = i['node']['owner']['id'] if 'owner' in i['node'].keys() else ''
usernameid = i['node']['owner']['username']
captions = ""
if i['node']['edge_media_to_caption']:
for i2 in i['node']['edge_media_to_caption']['edges']:
captions += i2['node']['text'] + "
"
if mediatype:
image_url = i['node']['display_url']
else:
image_url = i['node']['thumbnail_resources'][-1]['src']
items['handleid'] = handleid
items['usernameid'] = usernameid
items['url'] = url
items['mediatype'] = mediatype
items['date_posted'] = date_posted
items['date_posted_timestamp'] = date_posted_timestamp
items['likeCount'] = likeCount
items['commentCount'] = commentCount
items['image_url'] = image_url
items['captions'] = captions
if mediatype:
yield scrapy.Request(get_url(url), callback=self.get_video, meta={'items': items})
else:
items['videoURL'] = ''
yield items
def get_video(self, response):
# only from the first page
items = response.meta['items']
video_url = response.xpath('//meta[@property="og:video"]/@content').extract_first()
items['videoURL'] = video_url
yield items
# main driver
if __name__ == "__main__":
process = CrawlerProcess()
process.crawl(InstagramSpider)
process.start()
here is my items.py
import scrapy
class InstascraperItem(scrapy.Item):
# define the fields for your item here like:
handleid = scrapy.Field()
usernameid = scrapy.Field()
url = scrapy.Field()
mediatype = scrapy.Field()
date_posted = scrapy.Field()
date_posted_timestamp = scrapy.Field()
likeCount = scrapy.Field()
commentCount = scrapy.Field()
image_url = scrapy.Field()
captions = scrapy.Field()
videoURL = scrapy.Field()
my pipeline.py
import mysql.connector
class InstascraperPipeline:
def __init__(self):
self.create_connection()
self.create_table()
def create_connection(self):
self.conn = mysql.connector.connect(
host = 'localhost',
user = 'root',
passwd = 'gerdeh533',
database = 'instagram',
)
self.curr = self.conn.cursor()
def create_table(self):
self.curr.execute("""DROP TABLE IF EXISTS instagram_tb""")
self.curr.execute("""create table instagram_tb(
handleid text,
usernameid text,
url text,
mediatype text,
date_posted text,
date_posted_timestamp text,
likeCount text,
commentCount text,
image_url text,
captions text,
videoURL text,
)""")
def store_db(self, item):
self.curr.execute("""insert into instagram_tb values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""", (
item['handleid'],
item['usernameid'],
item['url'],
item['mediatype'],
item['date_posted'],
item['date_posted_timestamp'],
item['likeCount'],
item['commentCount'],
item['image_url'],
item['captions'],
item['videoURL']
))
self.conn.commit()
def process_item(self, item, spider):
self.store_db(item)
return item
for your note pipeline_Item is activated on setting.py and mysql-connector-pythong is installed on my pycharm and mysql is full version installed too when i run the code i get this Error
2021-01-12 22:59:34 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycrawler.py", line
192, in crawl
return self._crawl(crawler, *args, **kwargs)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycrawler.py", line
196, in _crawl
d = crawler.crawl(*args, **kwargs)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-
packageswistedinternetdefer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-
packageswistedinternetdefer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "c:userswannapycharmprojectspythonprojectvenvlibsite-
packageswistedinternetdefer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycrawler.py", line
87, in crawl
self.engine = self._create_engine()
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycrawler.py", line
101, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycoreengine.py",
line 70, in __init__
self.scraper = Scraper(crawler)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycorescraper.py",
line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapymiddleware.py",
line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapymiddleware.py",
line 35, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapyutilsmisc.py",
line 173, in create_instance
instance = objcls(*args, **kwargs)
File "C:UserswannaPycharmProjectspythonProjectinstascraperinstascraperpipelines.py", line
14, in __init__
self.create_table()
File "C:UserswannaPycharmProjectspythonProjectinstascraperinstascraperpipelines.py", line 27,
in
create_table
self.curr.execute("""create table instagram_tb(
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesmysqlconnectorcursor.py",
line 569, in execute
self._handle_result(self._connection.cmd_query(stmt))
File "c:userswannapycharmprojectspythonprojectvenvlibsite-
packagesmysqlconnectorconnection.py", line 651, in cmd_query
result = self._handle_result(self._send_cmd(ServerCmd.QUERY, query))
File "c:userswannapycharmprojectspythonprojectvenvlibsite-
packagesmysqlconnectorconnection.py", line 538, in _handle_result
raise errors.get_exception(packet)
mysql.connector.errors.ProgrammingError: 1064 (42000): You have an error in your SQL syntax; check
the manual that corresponds to your MySQL server version for the right syntax to use nea
r ')' at line 13
2021-01-12 22:59:34 [twisted] CRITICAL:
Traceback (most recent call last):
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packageswistedinternetdefer.py",
line 1418, in _inlineCallbacks
result = g.send(result)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycrawler.py", line
87, in crawl
self.engine = self._create_engine()
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycrawler.py", line
101, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycoreengine.py",
line 70, in __init__
self.scraper = Scraper(crawler)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapycorescraper.py",
line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapymiddleware.py", line
53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapymiddleware.py", line
35, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesscrapyutilsmisc.py", line
173, in create_instance
instance = objcls(*args, **kwargs)
File "C:UserswannaPycharmProjectspythonProjectinstascraperinstascraperpipelines.py", line 14,
in __init__
self.create_table()
File "C:UserswannaPycharmProjectspythonProjectinstascraperinstascraperpipelines.py", line 27,
in create_table
self.curr.execute("""create table instagram_tb(
File "c:userswannapycharmprojectspythonprojectvenvlibsite-packagesmysqlconnectorcursor.py",
line 569, in execute
self._handle_result(self._connection.cmd_query(stmt))
File "c:userswannapycharmprojectspythonprojectvenvlibsite-
packagesmysqlconnectorconnection.py", line 651, in cmd_query
result = self._handle_result(self._send_cmd(ServerCmd.QUERY, query))
File "c:userswannapycharmprojectspythonprojectvenvlibsite-
packagesmysqlconnectorconnection.py", line 538, in _handle_result
raise errors.get_exception(packet)
mysql.connector.errors.ProgrammingError: 1064 (42000): You have an error in your SQL syntax; check
the
manual that corresponds to your MySQL server version for the right syntax to use nea
r ')' at line 13
Thank you very much for any kind of help i can get :) Apptreciated from now!
after i deleted the comma in the end of create table i got a new error
2021-01-12 23:36:20 [scrapy.core.scraper] ERROR: Error processing
{'captions': 'Enjoy success among the best influencers on the OMNES '
'Influencers platform!
'
'The easiest way to connect to the biggest advertisers in the '
'business.
'
'Sig