
python - scrapy insert data to MySQL

I built my first Scrapy project and it works perfectly when I save the output as CSV, but when I try to send it to MySQL I run into problems. Let me know what I am doing wrong so I can learn too. Thank you.

import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlencode
import json
from datetime import datetime
# note: this relative import only works when the spider runs inside the
# Scrapy project (e.g. via "scrapy crawl instagram")
from ..items import InstascraperItem

API = 'inserapikeyhere'
user_accounts = ['omnesinfluencers']


def get_url(url):
    # wrap the target URL in a ScraperAPI proxy request
    payload = {'api_key': API, 'url': url}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url


class InstagramSpider(scrapy.Spider):
    name = 'instagram'
    allowed_domains = ['api.scraperapi.com']
    # assigning custom_settings twice would keep only the second dict,
    # so both settings belong in a single dict
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
        'FEEDS': {'basic.csv': {'format': 'csv'}},
    }

    def start_requests(self):
        for username in user_accounts:
            url = f'https://www.instagram.com/{username}/?hl=en'
            yield scrapy.Request(get_url(url), callback=self.parse)

    def parse(self, response):
        x = response.xpath("//script[starts-with(.,'window._sharedData')]/text()").extract_first()
        json_string = x.strip().split('= ')[1][:-1]
        data = json.loads(json_string)
        edges = data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']
        for i in edges:
            # create a fresh item per post, so queued video requests
            # don't all share (and overwrite) the same item
            items = InstascraperItem()
            url = 'https://www.instagram.com/p/' + i['node']['shortcode']
            mediatype = i['node']['is_video']
            date_posted_timestamp = i['node']['taken_at_timestamp']
            date_posted = datetime.fromtimestamp(date_posted_timestamp).strftime("%d/%m/%Y %H:%M:%S")
            likeCount = i['node']['edge_media_preview_like']['count'] if 'edge_media_preview_like' in i['node'] else ''
            commentCount = i['node']['edge_media_to_comment']['count'] if 'edge_media_to_comment' in i['node'] else ''
            handleid = i['node']['owner']['id'] if 'owner' in i['node'] else ''
            usernameid = i['node']['owner']['username']
            captions = ""
            if i['node']['edge_media_to_caption']:
                for i2 in i['node']['edge_media_to_caption']['edges']:
                    captions += i2['node']['text'] + "\n"
            if mediatype:
                image_url = i['node']['display_url']
            else:
                image_url = i['node']['thumbnail_resources'][-1]['src']
            items['handleid'] = handleid
            items['usernameid'] = usernameid
            items['url'] = url
            items['mediatype'] = mediatype
            items['date_posted'] = date_posted
            items['date_posted_timestamp'] = date_posted_timestamp
            items['likeCount'] = likeCount
            items['commentCount'] = commentCount
            items['image_url'] = image_url
            items['captions'] = captions
            if mediatype:
                yield scrapy.Request(get_url(url), callback=self.get_video, meta={'items': items})
            else:
                items['videoURL'] = ''
                yield items

    def get_video(self, response):
        # only from the first page
        items = response.meta['items']
        video_url = response.xpath('//meta[@property="og:video"]/@content').extract_first()
        items['videoURL'] = video_url
        yield items


# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(InstagramSpider)
    process.start()

Here is my items.py:

import scrapy


class InstascraperItem(scrapy.Item):
    # define the fields for your item here like:
    handleid = scrapy.Field()
    usernameid = scrapy.Field()
    url = scrapy.Field()
    mediatype = scrapy.Field()
    date_posted = scrapy.Field()
    date_posted_timestamp = scrapy.Field()
    likeCount = scrapy.Field()
    commentCount = scrapy.Field()
    image_url = scrapy.Field()
    captions = scrapy.Field()
    videoURL = scrapy.Field()

And my pipelines.py:

import mysql.connector


class InstascraperPipeline:

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='gerdeh533',
            database='instagram',
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS instagram_tb""")
        # the trailing comma after "videoURL text" below is what raises the
        # 1064 syntax error shown in the traceback further down
        self.curr.execute("""create table instagram_tb(
            handleid text,
            usernameid text,
            url text,
            mediatype text,
            date_posted text,
            date_posted_timestamp text,
            likeCount text,
            commentCount text,
            image_url text,
            captions text,
            videoURL text,
            )""")

    def store_db(self, item):
        self.curr.execute("""insert into instagram_tb values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""", (
            item['handleid'],
            item['usernameid'],
            item['url'],
            item['mediatype'],
            item['date_posted'],
            item['date_posted_timestamp'],
            item['likeCount'],
            item['commentCount'],
            item['image_url'],
            item['captions'],
            item['videoURL']
        ))
        self.conn.commit()

    def process_item(self, item, spider):
        self.store_db(item)
        return item

For your note: the pipeline is activated in settings.py, mysql-connector-python is installed in my PyCharm environment, and the full MySQL server is installed as well. The activation entry looks roughly like this (a sketch; the package name instascraper is taken from the traceback paths):
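    # settings.py
    ITEM_PIPELINES = {
        'instascraper.pipelines.InstascraperPipeline': 300,
    }

When I run the code I get this error: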

2021-01-12 22:59:34 [twisted] CRITICAL: Unhandled error in Deferred:

Traceback (most recent call last):
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\crawler.py", line 192, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\crawler.py", line 196, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\twisted\internet\defer.py", line 1613, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\twisted\internet\defer.py", line 1529, in _cancellableInlineCallbacks
    _inlineCallbacks(None, g, status)
--- <exception caught here> ---
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\twisted\internet\defer.py", line 1418, in _inlineCallbacks
    result = g.send(result)
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\crawler.py", line 87, in crawl
    self.engine = self._create_engine()
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\crawler.py", line 101, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\core\engine.py", line 70, in __init__
    self.scraper = Scraper(crawler)
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\core\scraper.py", line 71, in __init__
    self.itemproc = itemproc_cls.from_crawler(crawler)
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\middleware.py", line 35, in from_settings
    mw = create_instance(mwcls, settings, crawler)
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\scrapy\utils\misc.py", line 173, in create_instance
    instance = objcls(*args, **kwargs)
  File "C:\Users\wanna\PycharmProjects\pythonProject\instascraper\instascraper\pipelines.py", line 14, in __init__
    self.create_table()
  File "C:\Users\wanna\PycharmProjects\pythonProject\instascraper\instascraper\pipelines.py", line 27, in create_table
    self.curr.execute("""create table instagram_tb(
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\mysql\connector\cursor.py", line 569, in execute
    self._handle_result(self._connection.cmd_query(stmt))
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\mysql\connector\connection.py", line 651, in cmd_query
    result = self._handle_result(self._send_cmd(ServerCmd.QUERY, query))
  File "c:\users\wanna\pycharmprojects\pythonproject\venv\lib\site-packages\mysql\connector\connection.py", line 538, in _handle_result
    raise errors.get_exception(packet)
mysql.connector.errors.ProgrammingError: 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near ')' at line 13


Thank you very much for any kind of help I can get :) Appreciated in advance!

After I deleted the trailing comma at the end of the CREATE TABLE statement, I got a new error:

 2021-01-12 23:36:20 [scrapy.core.scraper] ERROR: Error processing
 {'captions': 'Enjoy success among the best influencers on the OMNES '
              'Influencers platform!\n'
              'The easiest way to connect to the biggest advertisers in the '
              'business.\n'
              'Sig


1 Answer


You have a typo here as well:

    def process_item(self, item, spider):
        self.sotre_db(item)
        return item

It should be store_db.
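For reference, the "Error processing" lines in your second log are what Scrapy prints when process_item raises an exception, which fits that typo. A minimal sketch of the two corrected methods, assuming the same table layout as in your question (note the trailing comma after videoURL also has to go, which is what the 1064 error in your first traceback pointed at):

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS instagram_tb""")
        # no comma after the last column definition
        self.curr.execute("""create table instagram_tb(
            handleid text,
            usernameid text,
            url text,
            mediatype text,
            date_posted text,
            date_posted_timestamp text,
            likeCount text,
            commentCount text,
            image_url text,
            captions text,
            videoURL text
            )""")

    def process_item(self, item, spider):
        self.store_db(item)  # was: self.sotre_db(item)
        return item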

