Storing scraped data with Scrapy in a MySQL database
I'm new here and this is my first time using Scrapy, so I need help. I know this has been asked before, and I did try a lot of the solutions, but none of them works.
My pipelines file:
# projetpfe/pipelines.py
import hashlib
import sys

import MySQLdb  # module name is case-sensitive: ``import mysqldb`` raises ImportError

from scrapy.exceptions import DropItem, NotConfigured
from scrapy.http import Request

from projetpfe.items import projetpfeitem


class mysqlstorepipeline(object):
    """Item pipeline that stores scraped articles in the MySQL
    ``pressebam`` database.

    The class name is kept lowercase because the project settings
    reference ``projetpfe.pipelines.mysqlstorepipeline``.
    """

    def __init__(self):
        """Open the MySQL connection and create the target table if missing.

        Raises:
            AttributeError, MySQLdb.OperationalError: re-raised so a
            misconfigured database is visible at startup instead of
            failing silently on the first item.
        """
        try:
            self.conn = MySQLdb.connect(
                user='root',
                passwd='root123',
                host='localhost',
                db='pressebam',
                use_unicode=True,  # was ``true`` -- a NameError at runtime
                charset='utf8',
            )
            self.cursor = self.conn.cursor()
            self.cursor.execute(
                "create table if not exists scrapeddata2("
                " idscrapeddata int not null auto_increment primary key,"
                " nomorganepresse varchar(200),"
                " titrearticle varchar(200),"
                " url varchar(200),"
                " nomjournaliste varchar(200),"
                " jour varchar(100),"
                " annee varchar(100),"
                " categorie varchar(100),"
                " contenuarticle varchar(5000),"
                " lienimage varchar(200)) "
            )
            self.conn.commit()
        except (AttributeError, MySQLdb.OperationalError) as e:
            raise e

    def process_item(self, item, spider):
        """Insert one scraped article row.

        Uses parameterized placeholders (``%s``), so values are escaped
        by the driver.  Returns the item so later pipeline stages keep
        working.
        """
        try:
            self.cursor.execute(
                "insert into scrapeddata2"
                " (nomorganepresse, titrearticle, url, jour,"
                " contenuarticle, lienimage)"
                " values (%s, %s, %s, %s, %s, %s)",
                (item['organepresse'], item['titre'], item['url'],
                 item['jour'], item['contenu'], item['lienimage']))
            self.conn.commit()
        except MySQLdb.Error as e:
            # Best-effort logging; the item is still passed on.
            print("error %d: %s" % (e.args[0], e.args[1]))
        return item
And my spider file:
# projetpfe/spiders/telquel.py
import urlparse  # Python 2 stdlib; on Python 3 this is urllib.parse

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from projetpfe.items import projetpfeitem


class projetpfespider(CrawlSpider):
    """Crawl http://telquel.ma and extract one item per article page."""

    name = 'telquel'
    start_urls = ['http://telquel.ma']  # URLs the spider starts crawling from

    rules = [
        # r'page/\d+': pagination pages such as http://telquel.ma/page/<x>
        Rule(SgmlLinkExtractor(allow=[r'page/\d+']), follow=True),
        # r'\d{4}/\d{2}/\d{2}/\w+': article pages such as
        # http://telquel.ma/yyyy/mm/dd/title
        Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/\w+']),
             callback='parse_telquel'),
    ]

    def parse_telquel(self, response):
        """Extract the stored fields from one article page.

        Each ``extract()`` call returns a list of matched strings
        (possibly empty), which the pipeline inserts as-is.
        """
        hxs = HtmlXPathSelector(response)
        item = projetpfeitem()
        # XPath selector for the article title
        item['titre'] = hxs.select(
            "//h1[@class='article-title']/text()").extract()
        item['lienimage'] = hxs.select(
            "//div[@class='main-article-content']"
            "//img[@class='setborder']/@src").extract()
        item['organepresse'] = hxs.select(
            "//img[@class='logo']/@alt").extract()
        item['jour'] = hxs.select(
            "//div[@class='calendar-date']/text()").extract()
        item['contenu'] = hxs.select(
            "//div[@class='shortcode-content']").extract()
        # Canonical URL taken from the page head links.
        item['url'] = hxs.select("/html/head/link[5]/@href").extract()
        return item
This is my settings file:
# projetpfe/settings.py
# Scrapy setting names are case-sensitive and must be UPPERCASE.
# A lowercase ``item_pipelines`` is silently ignored, so the pipeline
# never runs -- the most likely reason nothing reached the database.
BOT_NAME = 'projetpfe'

SPIDER_MODULES = ['projetpfe.spiders']
NEWSPIDER_MODULE = 'projetpfe.spiders'

ITEM_PIPELINES = {'projetpfe.pipelines.mysqlstorepipeline': 300}
And my items file:
# projetpfe/items.py
from scrapy.item import Item, Field  # class names are case-sensitive


class projetpfeitem(Item):
    """Container for one scraped press article.

    The class name is kept lowercase because the pipeline and spider
    import ``projetpfeitem`` by that exact name.
    """

    organepresse = Field()  # name of the press organ (site logo alt text)
    titre = Field()         # article title
    journaliste = Field()   # journalist name -- not filled by the spider yet
    jour = Field()          # publication day
    annee = Field()         # publication year -- not filled yet
    categorie = Field()     # article category -- not filled yet
    contenu = Field()       # article body HTML
    lienimage = Field()     # main image URL
    url = Field()           # canonical article URL
The spider itself runs fine, but nothing gets stored in the database. Help!
Comments
Post a Comment