news-please-crawler/newsplease/crawler/spiders/recursive_sitemap_crawler.py
import logging

import scrapy

from ...helper_classes.url_extractor import UrlExtractor


class RecursiveSitemapCrawler(scrapy.spiders.SitemapSpider):
    name = "RecursiveSitemapCrawler"
    allowed_domains = None
    sitemap_urls = None
    original_url = None

    log = None

    config = None
    helper = None

    ignore_regex = None
    ignore_file_extensions = None
    def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
        self.log = logging.getLogger(__name__)

        self.config = config
        self.helper = helper

        self.ignore_regex = ignore_regex
        self.ignore_file_extensions = self.config.section(
            'Crawler')['ignore_file_extensions']

        self.original_url = url

        # Restrict the crawl to the target site's domain and start from the
        # sitemap derived for the given url.
        self.allowed_domains = [self.helper.url_extractor
                                .get_allowed_domain(url)]
        self.sitemap_urls = [self.helper.url_extractor.get_sitemap_url(
            url, config.section('Crawler')['sitemap_allow_subdomains'])]

        super(RecursiveSitemapCrawler, self).__init__(*args, **kwargs)
    def parse(self, response):
        """
        Checks any given response for being an article and, if positive,
        passes the response to the pipeline.

        :param obj response: The scrapy response
        """
        if not self.helper.parse_crawler.content_type(response):
            return

        # Follow links on the page recursively, then hand the page itself to
        # the pipeline if it qualifies as an article.
        for request in self.helper.parse_crawler \
                .recursive_requests(response, self, self.ignore_regex,
                                    self.ignore_file_extensions):
            yield request

        yield self.helper.parse_crawler.pass_to_pipeline_if_article(
            response, self.allowed_domains[0], self.original_url)
    @staticmethod
    def supports_site(url):
        """
        Sitemap crawlers are supported by every site that has a sitemap
        referenced in its robots.txt.

        Determines if this crawler works on the given url.

        :param str url: The url to test
        :return bool: Determines whether this crawler works on the given url
        """
        return UrlExtractor.sitemap_check(url)
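

# Usage sketch (not part of the original module): a rough idea of how this
# spider could be scheduled through Scrapy's CrawlerProcess. The `helper` and
# `config` objects are assumptions here; in news-please they are constructed by
# the framework's crawler setup rather than by hand, so the names below are
# illustrative only.
#
#     from scrapy.crawler import CrawlerProcess
#
#     process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
#     process.crawl(
#         RecursiveSitemapCrawler,
#         helper=helper,        # hypothetical framework-provided helper bundle
#         url='https://example.com/',
#         config=config,        # hypothetical wrapper around the loaded config
#         ignore_regex='',
#     )
#     process.start()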