So I have an item called ArtscraperItem that I declared in my items.py file.
import scrapy

class ArtscraperItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    date = scrapy.Field()
    date_str = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    art_content = scrapy.Field()
I run this spider and collect data from the XML file. However, I also need to get the URLs from the XML file, then scrape those URLs to obtain the article content and add it to the item as item['art_content']. I saw something similar on Stack Overflow, but they weren't using a previously declared item, so I didn't know how to apply it. In short, I need to fetch the content of the URL I scraped and add it to the ArtscraperItem that I created in my parse method.
Thanks in advance.
Below are the method in question and the second method, parse_article, which is supposed to scrape the collected URL and return the article content.
from datetime import datetime as dt

import scrapy

from ArtScraper.items import ArtscraperItem


class PostSpider(scrapy.Spider):
    article = ""
    name = 'crawly'
    allowed_domains = ['bbc.com/arabic']
    start_urls = ['http://feeds.bbci.co.uk/arabic/rss.xml']

    def parse(self, response):
        articles = response.xpath('//channel/item')
        for article in articles:
            item = ArtscraperItem()
            item['date'] = dt.today()
            item['date_str'] = article.xpath('pubDate/text()').extract_first()
            item['url'] = article.xpath('link/text()').extract_first()
            item['title'] = article.xpath('title/text()').extract_first()
            url = item['url']
            yield scrapy.Request(url, callback=self.parse_article)
            yield item

    def parse_article(self, response):
        pars = response.xpath("//div[@class='story-body']/div[@class='story-body__inner']/p/text()").extract()
        article = '-'.join(pars)
        yield {
            'art_content': article
        }
The settings.py file:
#Settings.py
BOT_NAME = 'ArtScraper'
SPIDER_MODULES = ['ArtScraper.spiders']
NEWSPIDER_MODULE = 'ArtScraper.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ArtScraper (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = .25
RANDOMIZE_DOWNLOAD_DELAY=True
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'ArtScraper.middlewares.ArtscraperSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'ArtScraper.middlewares.ArtscraperDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'ArtScraper.pipelines.MongoPipeline': 300,
}
MONGO_URI='localhost:27017'
MONGO_DATABASE='george'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
The pipelines.py file:
import logging

import pymongo


class MongoPipeline(object):

    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull the connection info from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## spider initialisation
        ## open the db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when the spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how each post is handled
        self.db[self.collection_name].insert(dict(item))
        logging.debug("Post added to MongoDB")
        return item