"""
This is a helper class for the crawler's parse methods
"""
import time
import re
import logging
import scrapy
from ..crawler.items import NewscrawlerItem


class ParseCrawler(object):
    """
    Helper class for the crawler's parse methods.
    """
    helper = None
    log = None

    def __init__(self, helper):
        self.helper = helper
        self.log = logging.getLogger(__name__)

    def pass_to_pipeline_if_article(
            self,
            response,
            source_domain,
            original_url,
            rss_title=None
    ):
        """
        Responsible for passing a NewscrawlerItem to the pipeline if the
        response contains an article.

        :param obj response: the scrapy response to work on
        :param str source_domain: the response's domain as set for the crawler
        :param str original_url: the url set in the json file
        :param str rss_title: the title extracted by an rssCrawler
        :return NewscrawlerItem: NewscrawlerItem to pass to the pipeline
        """
        if self.helper.heuristics.is_article(response, original_url):
            return self.pass_to_pipeline(
                response, source_domain, rss_title=rss_title)

    def pass_to_pipeline(
            self,
            response,
            source_domain,
            rss_title=None
    ):
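        """
        Creates a NewscrawlerItem from the response and fills it with the
        metadata that is available at crawl time.

        :param obj response: the scrapy response to work on
        :param str source_domain: the response's domain as set for the crawler
        :param str rss_title: the title extracted by an rssCrawler
        :return NewscrawlerItem: NewscrawlerItem to pass to the pipeline
        """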
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.gmtime(time.time()))
        relative_local_path = self.helper.savepath_parser \
            .get_savepath(response.url)
        article = NewscrawlerItem()
        article['local_path'] = self.helper.savepath_parser \
            .get_formatted_relative_path(relative_local_path)
        article['abs_local_path'] = self.helper.savepath_parser \
            .get_abs_path(relative_local_path)
        article['modified_date'] = timestamp
        article['download_date'] = timestamp
        article['source_domain'] = source_domain.encode("utf-8")
        article['url'] = response.url
        # extract_first may return None for pages without a <title> tag;
        # default to an empty string so encode does not fail
        article['html_title'] = response.selector.xpath('//title/text()') \
            .extract_first(default='').encode("utf-8")
        if rss_title is None:
            article['rss_title'] = 'NULL'
        else:
            article['rss_title'] = rss_title.encode("utf-8")
        article['spider_response'] = response
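        # The article_* fields start out as the placeholder 'NULL';
        # presumably later pipeline stages overwrite them with the
        # extracted values.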
        article['article_title'] = 'NULL'
        article['article_description'] = 'NULL'
        article['article_text'] = 'NULL'
        article['article_image'] = 'NULL'
        article['article_author'] = 'NULL'
        article['article_publish_date'] = 'NULL'
        article['article_language'] = 'NULL'
        return article

    @staticmethod
    def recursive_requests(response, spider, ignore_regex='',
                           ignore_file_extensions='pdf'):
        """
        Manages recursive requests.
        Determines the urls to crawl recursively if they do not match
        certain file extensions and do not match a certain regex set in
        the config file.

        :param obj response: the response to extract any urls from
        :param obj spider: the crawler the callback should be called on
        :param str ignore_regex: a regex that no extracted url may match
        :param str ignore_file_extensions: a regex of file extensions that
            the end of any url may not match
        :return list: Scrapy Requests
        """
        # Recursively crawl all URLs on the current page that do not
        # point to irrelevant file types and do not match the given
        # ignore_regex
        requests = []
        for href in response.css("a::attr('href')").extract():
            url = response.urljoin(href)
            # Skip urls ending in one of the ignored file extensions
            if re.match(r'.*\.(?:' + ignore_file_extensions + r')$',
                        url, re.IGNORECASE):
                continue
            # Skip urls matched (non-trivially) by the ignore_regex; a
            # regex that matches only the empty string, such as the
            # default, excludes nothing
            ignore_match = re.match(ignore_regex, url)
            if ignore_match and ignore_match.group(0):
                continue
            requests.append(scrapy.Request(url, callback=spider.parse))
        return requests

    def content_type(self, response):
        """
        Ensures the response is of type text/html.

        :param obj response: the scrapy response
        :return bool: whether the response is of the correct type
        """
        content_type = response.headers.get('Content-Type')
        if content_type is None or not re.match(
                'text/html', content_type.decode('utf-8')):
            self.log.warning("Dropped: %s's content is not of type "
                             "text/html but %s", response.url, content_type)
            return False
        return True
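

# A minimal usage sketch, not part of the original module: it shows how a
# spider might wire these helpers together. `ExampleSpider`, its domain,
# start url and the injected `helper` object are illustrative assumptions;
# the project's real crawlers are configured through their own config
# machinery.
class ExampleSpider(scrapy.Spider):
    name = 'parse_crawler_usage_example'
    start_urls = ['https://example.com/']

    def __init__(self, helper=None, *args, **kwargs):
        super(ExampleSpider, self).__init__(*args, **kwargs)
        # `helper` is assumed to provide the heuristics and
        # savepath_parser objects that ParseCrawler expects
        self.parse_crawler = ParseCrawler(helper)

    def parse(self, response):
        # Drop responses that are not html before doing any further work
        if not self.parse_crawler.content_type(response):
            return
        # Hand the page to the pipeline if the heuristics deem it an
        # article
        article = self.parse_crawler.pass_to_pipeline_if_article(
            response, 'example.com', response.url)
        if article is not None:
            yield article
        # Follow the remaining links on the page, skipping pdfs
        for request in self.parse_crawler.recursive_requests(response, self):
            yield request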