try:
    # Python 2
    import urllib2
except ImportError:
    # Python 3
    import urllib.request as urllib2

import re
import logging

import scrapy
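
# Crawl flow: parse() requests the site's RSS feed, rss_parse() walks the
# feed's <item> elements and requests each linked article, and
# article_parse() hands article responses to the pipeline.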

class RssCrawler(scrapy.Spider):
    name = "RssCrawler"
    ignored_allowed_domain = None
    start_urls = None
    original_url = None

    log = None

    config = None
    helper = None

    def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
        self.log = logging.getLogger(__name__)

        self.config = config
        self.helper = helper

        self.original_url = url

        # The helper's url_extractor derives the allowed domain and the
        # start URL from the site URL passed in by the crawler framework.
        self.ignored_allowed_domain = self.helper.url_extractor \
            .get_allowed_domain(url)
        self.start_urls = [self.helper.url_extractor.get_start_url(url)]

        super(RssCrawler, self).__init__(*args, **kwargs)

    def parse(self, response):
        """
        Extracts the RSS feed and initiates crawling it.

        :param obj response: The scrapy response
        """
        yield scrapy.Request(self.helper.url_extractor.get_rss_url(response),
                             callback=self.rss_parse)

    def rss_parse(self, response):
        """
        Extracts all article links and initiates crawling them.

        :param obj response: The scrapy response
        """
        for item in response.xpath('//item'):
            for url in item.xpath('link/text()').extract():
                # Bind item per iteration: a bare lambda would close over
                # the loop variable and, by the time Scrapy invokes the
                # callback, would only ever see the feed's last <item>.
                yield scrapy.Request(
                    url,
                    lambda resp, item=item: self.article_parse(
                        resp, item.xpath('title/text()').extract()[0]))
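
    # For reference, the XPath above targets the conventional RSS item
    # layout (illustrative snippet, not taken from the original source):
    #
    #   <item>
    #     <title>Example headline</title>
    #     <link>https://example.com/story.html</link>
    #   </item>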

    def article_parse(self, response, rss_title=None):
        """
        Checks whether a given response is an article and, if so, passes
        the response to the pipeline.

        :param obj response: The scrapy response
        :param str rss_title: Title extracted from the rss feed
        """
        if not self.helper.parse_crawler.content_type(response):
            return

        yield self.helper.parse_crawler.pass_to_pipeline_if_article(
            response, self.ignored_allowed_domain, self.original_url,
            rss_title)

    @staticmethod
    def only_extracts_articles():
        """
        Meta-method: ensures that the heuristic
        "crawler_contains_only_article_alikes" returns True for this crawler.
        """
        return True

    @staticmethod
    def supports_site(url):
        """
        RSS crawlers are supported by every site that provides an RSS feed.

        Determines if this crawler works on the given url.

        :param str url: The url to test
        :return bool: Whether this crawler works on the given url
        """

        # Follow redirects
        opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
        redirect = opener.open(url).url
        response = urllib2.urlopen(redirect).read()

        # Check whether the page advertises a standard RSS feed, i.e. a
        # <link ... type="application/rss+xml" ... href=...> tag with the
        # attributes in either order
        return re.search(
            r'(<link[^>]*href[^>]*type ?= ?"application\/rss\+xml"|' +
            r'<link[^>]*type ?= ?"application\/rss\+xml"[^>]*href)',
            response.decode('utf-8')) is not None
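

# Minimal usage sketch (hypothetical; not part of the original module):
# probes a site for an advertised RSS feed before scheduling a crawl.
# The URL below is a placeholder.
if __name__ == '__main__':
    test_url = 'https://example.com'
    if RssCrawler.supports_site(test_url):
        print('%s advertises an RSS feed: RssCrawler applies' % test_url)
    else:
        print('%s does not advertise an RSS feed' % test_url)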