1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-27 00:44:24 +03:00
Files
news-please-crawler/newsplease/crawler/spiders/download_crawler.py
2016-11-09 18:33:45 +01:00

52 lines
1.2 KiB
Python

import logging
import scrapy
class Download(scrapy.Spider):
    """Spider that fetches a fixed set of explicitly supplied URLs and
    forwards every matching response to the news-please pipeline."""
    name = "Download"
    start_urls = None
    log = None
    config = None
    helper = None

    def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
        """Set up the spider.

        :param obj helper: Helper bundle (parse_crawler, url_extractor)
        :param str|list url: A single url or a list of urls to download
        :param dict config: The crawler configuration
        :param str ignore_regex: Unused here; accepted for interface parity
            with the other spiders
        """
        self.log = logging.getLogger(__name__)
        self.config = config
        self.helper = helper
        # Accept either one url or a list of urls transparently.
        self.start_urls = url if isinstance(url, list) else [url]
        super(Download, self).__init__(*args, **kwargs)

    def parse(self, response):
        """
        Passes the response to the pipeline.

        :param obj response: The scrapy response
        """
        # Only responses with an accepted content type enter the pipeline.
        if self.helper.parse_crawler.content_type(response):
            yield self.helper.parse_crawler.pass_to_pipeline(
                response,
                self.helper.url_extractor.get_allowed_domain(response.url)
            )

    @staticmethod
    def supports_site(url):
        """
        Determines if this crawler works on the given url.
        As long as the url exists, this crawler will work!

        :param str url: The url to test
        :return bool: Determines whether this crawler works on the given url
        """
        return True