diff --git a/README.md b/README.md
index 1b1c5e4..830149d 100644
--- a/README.md
+++ b/README.md
@@ -12,11 +12,17 @@ ## Features
 * **works out of the box**: install with pip, add URLs of your pages, run :-)
+* execute it conveniently with the **CLI** or use it as a **library** within your own software
+
+### CLI mode
 * stores extracted results in **JSON files or ElasticSearch** (other storages can be added easily)
 * **simple but extensive configuration** (if you want to tweak the results)
 * runs on your favorite Python version (2.7+ and 3+)
 * revisions: crawl articles multiple times and track changes
 
+### Library mode
+* crawl and extract information for a list of article URLs (currently, full-site crawling is only supported via the CLI)
+
 ## Getting started
 
 It's super easy, we promise!
@@ -27,7 +33,14 @@ It's super easy, we promise!
 $ sudo pip install news-please
 ```
 
-### Run the crawler
+### Use within your own code
+```
+from newsplease import NewsPleaseLib
+article = NewsPleaseLib.download_article('https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp')
+print(article['title'])
+```
+
+### Run the crawler (CLI)
 
 ```
 $ news-please
@@ -47,10 +60,11 @@ news-please also supports export to ElasticSearch. Using Elasticsearch will also
 
     [Scrapy]
 
-    ITEM_PIPELINES = {'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100,
-                      'newscrawler.pipeline.pipelines.LocalStorage':200,
-                      'newscrawler.pipeline.pipelines.ElasticSearchStorage':350
-                      }
+    ITEM_PIPELINES = {
+        'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100,
+        'newscrawler.pipeline.pipelines.LocalStorage':200,
+        'newscrawler.pipeline.pipelines.ElasticSearchStorage':350
+        }
 
 That's it! Except, if your Elasticsearch database is not located at `http://localhost:9200`, uses a different username / password or CA-certificate authentication. In these cases, you will also need to change the following.
 
diff --git a/newsplease/__main__.py b/newsplease/__main__.py
index f0150ed..d7e1813 100644
--- a/newsplease/__main__.py
+++ b/newsplease/__main__.py
@@ -25,7 +25,7 @@ if sys.version_info[0] < 3:
     ConnectionError = OSError
 
 
-class NewsPlease(object):
+class NewsPleaseLauncher(object):
     """
     This class is supposed to be called initially to start all processes. It
     sets up and manages all crawlers.
@@ -50,11 +50,12 @@ class NewsPlease(object):
     number_of_active_crawlers = 0
     config_directory_default_path = "~/news-please/config/"
     config_file_default_name = "config.cfg"
+    library_mode = None
     __single_crawler = False
 
     def __init__(self, cfg_directory_path, is_resume, is_reset_elasticsearch, is_reset_json, is_reset_mysql,
-                 is_no_confirm):
+                 is_no_confirm, library_mode=False):
         """
         The constructor of the main class, thus the real entry point to the tool.
         :param cfg_file_path:
@@ -64,13 +65,13 @@ class NewsPlease(object):
         :param is_resume:
         :param is_reset_elasticsearch:
         :param is_reset_json:
         :param is_reset_mysql:
         :param is_no_confirm:
         """
-        # print("newsplease is starting on Python " + sys.version)
         configure_logging({"LOG_LEVEL": "ERROR"})
         self.log = logging.getLogger(__name__)
 
         # other parameters
         self.shall_resume = is_resume
         self.no_confirm = is_no_confirm
+        self.library_mode = library_mode
 
         # Sets an environmental variable called 'CColon', so scripts can import
         # modules of this project in relation to this script's dir
@@ -120,8 +121,7 @@ class NewsPlease(object):
         self.crawler_list = self.CrawlerList()
         self.daemon_list = self.DaemonList()
 
-        self.__single_crawler = self.get_abs_file_path("./single_crawler.py",
-                                                       True, False)
+        self.__single_crawler = self.get_abs_file_path("./single_crawler.py", True, False)
 
         self.manage_crawlers()
 
@@ -284,9 +284,14 @@ class NewsPlease(object):
         if os.path.exists(self.cfg_directory_path):
             return
 
-        sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. "
-                         + "Should a default config directory be created at this path? [Y/n]")
-        user_choice = input().lower().replace("yes", "y").replace("no", "n")
+        user_choice = 'n'
+        if self.no_confirm:
+            user_choice = 'y'
+        else:
+            sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. "
+                             + "Should a default config directory be created at this path? [Y/n]")
+            user_choice = input().lower().replace("yes", "y").replace("no", "n")
+
         if not user_choice or user_choice == '':  # the default is yes
             user_choice = "y"
         if "y" not in user_choice and "n" not in user_choice:
@@ -621,7 +626,7 @@ def cli(cfg_file_path: ('path to the config file', 'option', 'c'),
     if cfg_file_path and not cfg_file_path.endswith(os.path.sep):
         cfg_file_path += os.path.sep
 
-    NewsPlease(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm)
+    NewsPleaseLauncher(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm)
 
     pass
 
diff --git a/newsplease/api/__init__.py b/newsplease/api/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/newsplease/api/server.py b/newsplease/api/server.py
deleted file mode 100644
index 865245d..0000000
--- a/newsplease/api/server.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from extractor.document import Document
-from extractor.five_w_extractor import FiveWExtractor
-from flask import Flask, request, jsonify
-import logging
-
-
-app = Flask(__name__)
-log = logging.getLogger(__name__)
-host = None
-port = 5001
-debug = False
-options = None
-extractor = FiveWExtractor()
-ch = logging.StreamHandler()
-ch.setLevel(logging.DEBUG)
-log.addHandler(ch)
-log.setLevel(logging.DEBUG)
-
-
-def run():
-    log.info("starting server on port %i", port)
-    app.run(host, port, debug)
-    log.info("server has stopped")
-
-
-@app.route('/crawl', methods=['GET', 'POST'])
-def extract():
-    json_article = request.get_json()
-    log.debug("retrieved raw article for extraction: %s", json_article['title'])
-
-    document = Document(json_article['title'], json_article['description'], json_article['text'])
-    extractor.parse(document)
-
-    return jsonify(document.questions)
-
-if __name__ == "__main__":
-    run()
diff --git a/newsplease/config/config_lib.cfg b/newsplease/config/config_lib.cfg
new file mode 100644
index 0000000..95d728d
--- /dev/null
+++ b/newsplease/config/config_lib.cfg
@@ -0,0 +1,324 @@
+# !!! DO NOT CHANGE THIS FILE !!!
+# if you want to change news-please's options, you should run it first and change
+# the config.cfg file that is created on the first run of news-please (by default the config file will be in
+# [HOMEDIR]/news-please/config/config.cfg)
+# !!! NEVER CHANGE THE config_lib.cfg FILE !!! news-please uses this when run in library mode
+
+
+# IMPORTANT
+# All variables get parsed to the correct python-types (unless declared otherwise)!
+# So bools have to be True or False (uppercase-first),
+# Floats need a dot . (not a comma)
+# Ints are just normal ints
+# dicts need to be like this { key: value }
+# arrays need to be like this [ value1, value2, value3 ]
+# All values in dicts and arrays will also be parsed.
+# Everything that does not match any of the above criteria will be parsed as a string.
+
+
+[Crawler]
+
+# GENERAL
+# -------
+
+# Crawling heuristics
+# Default Crawlers:
+# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir)
+# default: SitemapCrawler
+default = SitemapCrawler
+
+# default:
+# fallbacks = {
+#     "RssCrawler": None,
+#     "RecursiveSitemapCrawler": "RecursiveCrawler",
+#     "SitemapCrawler": "RecursiveCrawler",
+#     "RecursiveCrawler": None,
+#     "Download": None
+# }
+fallbacks = {
+    "RssCrawler": None,
+    "RecursiveSitemapCrawler": "RecursiveCrawler",
+    "SitemapCrawler": "RecursiveCrawler",
+    "RecursiveCrawler": None,
+    "Download": None
+    }
+
+# Determines how many hours need to pass since the last download of a webpage
+# before it may be downloaded again by the RssCrawler
+# default: 6
+hours_to_pass_for_redownload_by_rss_crawler = 6
+
+
+
+# PROCESSES
+# ---------
+
+# Number of crawlers that should crawl in parallel,
+# not counting daemonized crawlers
+# default: 5
+number_of_parallel_crawlers = 5
+
+# Number of daemons, will be added to daemons.
+# default: 10
+number_of_parallel_daemons = 10
+
+
+
+# SPECIAL CASES
+# -------------
+
+# URLs which end in any of the following file extensions are ignored during recursive crawling
+# default: "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
+ignore_file_extensions = "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
+
+# URLs which match the following regex are ignored during recursive crawling
+# default: ""
+ignore_regex = ""
+
+# Crawl the sitemaps of subdomains (if sitemap is enabled)
+# If True, any SitemapCrawler will try to crawl on the sitemap of the given domain including subdomains instead of a domain's main sitemap.
+# e.g. if True, a SitemapCrawler to be started on https://blog.zeit.de will try to crawl on the sitemap listed in http://blog.zeit.de/robots.txt. If not found, it will fall back to the False setting.
+# if False, a SitemapCrawler to be started on https://blog.zeit.de will try to crawl on the sitemap listed in http://zeit.de/robots.txt
+# default: True
+sitemap_allow_subdomains = True
+
+
+
+[Heuristics]
+
+# Enabled heuristics,
+# Currently:
+# - og_type
+# - linked_headlines
+# - self_linked_headlines
+# - is_not_from_subdomain (with this setting enabled, only pages that are not from a subdomain are downloaded)
+# - meta_contains_article_keyword
+# - crawler_contains_only_article_alikes
+# (maybe not up-to-date, see ./newsplease/helper_classes/heuristics.py:
+# Every method not starting with __ should be a heuristic, except is_article)
+# These heuristics can be overwritten by sitelist.json for each site
+# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
+enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
+
+# Heuristics can be combined with others
+# The heuristics need to have the same name as in enabled_heuristics
+# Possible condition-characters / literals are: (, ), not, and, or
+# All heuristics used here need to be enabled in enabled_heuristics as well!
+# Examples:
+# "og_type and (self_linked_headlines or linked_headlines)"
+# "og_type"
+# default: "og_type and (linked_headlines or self_linked_headlines)"
+pass_heuristics_condition = "og_type and (linked_headlines or self_linked_headlines)"
+
+# The maximum ratio of headlines divided by linked_headlines in a file
+
+# The minimum number of headlines in a file to check for the ratio
+# If fewer than this number are in the file, the file will pass the test.
+# default: 5
+min_headlines_for_linked_test = 5
+
+
+
+[Files]
+
+# GENERAL:
+# -------
+
+# Paths:
+# toggles relative paths to be relative to the start_processes.py script (True) or relative to this config file (False)
+# This does not work for this config's 'Scrapy' section, which is always relative to the dir the start_processes.py script is called from
+# Default: True
+relative_to_start_processes_file = True
+
+
+
+# INPUT:
+# -----
+
+# Here you can specify the input JSON filename
+# default: sitelist.hjson
+url_input_file_name = sitelist.hjson
+
+
+
+# OUTPUT:
+# ------
+
+# The base working path of news-please; other paths below can reference it via %working_path
+working_path = ~/news-please/
+
+# The following strings in local_data_directory will be replaced: (md5 hashes have a standard length of 32 chars)
+#
+# %working_path = the path specified in OUTPUT["working_path"]
+# %time_download() = current time at download; will be replaced with strftime() where is a string, explained further here: http://strftime.org/
+# %time_execution() = current time at execution; will be replaced with strftime() where is a string, explained further here: http://strftime.org/
+# %timestamp_download = current time at download; unix-timestamp
+# %timestamp_execution = current time at execution; unix-timestamp
+# %domain() = first chars of the domain of the crawled file (e.g. zeit.de)
+# %appendmd5_domain() = appends the md5 to %domain(< - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than
+# %md5_domain() = first chars of md5 hash of %domain
+# %full_domain() = first chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de)
+# %appendmd5_full_domain() = appends the md5 to %full_domain(< - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than
+# %md5_full_domain() = first chars of md5 hash of %full_domain
+# %subdomains() = first chars of the domain's subdomains
+# %appendmd5_subdomains() = appends the md5 to %subdomains(< - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than
+# %md5_subdomains() = first chars of md5 hash of %subdomains
+# %url_directory_string() = first chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename
+# %appendmd5_url_directory_string() = appends the md5 to %url_directory_string(< - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than
+# %md5_url_directory_string() = first chars of md5 hash of %url_directory_string()
+# %url_file_name() = first chars of the file name (without type) on the server (e.g. http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466). No filenames (indexes) will evaluate to index
+# %md5_url_file_name() = first chars of md5 hash of %url_file_name
+# %max_url_file_name = first x chars of %url_file_name, so the entire savepath has a length of the max possible length for a Windows file system (260 characters - 1)
+# %appendmd5_max_url_file_name = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire savepath has a length longer than the max possible length for a Windows file system (260 characters - 1)
+#
+# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent on all datasets.
+# To be able to use cleanup commands, it should also start with a static folder name like 'data'.
+#
+# default: %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
+local_data_directory = %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
+
+# Toggles whether leading './' or '.\' from above local_data_directory should be removed when saving the path into the Database
+# True: ./data would become data
+# default: True
+format_relative_path = True
+
+
+
+[MySQL]
+
+# MySQL connection, required for saving meta-information
+host = localhost
+port = 3306
+db = 'news-please'
+username = 'root'
+password = 'password'
+
+
+
+[Elasticsearch]
+
+# Elasticsearch connection, required for saving detailed meta-information
+host = localhost
+port = 9200
+index_current = 'news-please'
+index_archive = 'news-please-archive'
+
+# Elasticsearch supports user authentication by CA certificates. If your database is protected by a certificate,
+# fill in the following parameters, otherwise you can ignore them.
+use_ca_certificates = False
+ca_cert_path = /path/to/cacert.pem
+client_cert_path = /path/to/client_cert.pem
+client_key_path = /path/to/client_key.pem
+username = 'root'
+secret = 'password'
+
+# Properties of the document type used for storage.
+mapping = {
+    'url': {'type': 'string', 'index': 'not_analyzed'},
+    'sourceDomain': {'type': 'string', 'index': 'not_analyzed'},
+    'pageTitle': {'type': 'string'},
+    'rss_title': {'type': 'string'},
+    'localpath': {'type': 'string', 'index': 'not_analyzed'},
+    'ancestor': {'type': 'string'},
+    'descendant': {'type': 'string'},
+    'version': {'type': 'long'},
+    'downloadDate': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
+    'modifiedDate': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
+    'publish_date': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
+    'title': {'type': 'string'},
+    'description': {'type': 'string'},
+    'text': {'type': 'string'},
+    'author': {'type': 'string'},
+    'image': {'type': 'string', 'index': 'not_analyzed'},
+    'language': {'type': 'string', 'index': 'not_analyzed'}
+    }
+
+
+
+[ArticleMasterExtractor]
+
+# Choose which extractors you want to use.
+#
+# The default is ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'],
+# which are all integrated extractors right now.
+# Possible extractors are 'newspaper_extractor', 'readability_extractor', 'date_extractor' and 'lang_detect_extractor'
+# Examples: -Only Newspaper and date_extractor: extractors = ['newspaper', 'date_extractor']
+#           -Only Newspaper: extractors = ['newspaper']
+extractors = ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor']
+
+
+
+[DateFilter]
+
+# If added to the pipeline, this module provides the means to filter the extracted articles based on the publishing date.
+# Therefore this module has to be placed after the KM4 article extractor to access the publishing dates.
+#
+# All articles with a publishing date outside of the given time interval are dropped. The dates used to specify the
+# time interval are included and should follow this format: 'yyyy-mm-dd hh:mm:ss'.
+#
+# It is also possible to only define one date, assigning the other variable the value 'None' to create a half-bounded
+# interval.
+
+start_date = '1999-01-01 00:00:00'
+end_date = '2999-12-31 00:00:00'
+
+# If 'True', articles without a publishing date are dropped.
+strict_mode = False
+
+
+
+[Scrapy]
+
+# Possible levels (must be uppercase only): CRITICAL, ERROR, WARNING, INFO, DEBUG
+# default: WARNING
+LOG_LEVEL = ERROR
+
+# logformat, see https://docs.python.org/2/library/logging.html#logrecord-attributes
+# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s
+LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s
+
+# Can be a filename or None
+# default: None
+LOG_FILE = None
+
+LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S
+
+LOG_STDOUT = False
+
+LOG_ENCODING = utf-8
+
+BOT_NAME = 'news-please'
+
+SPIDER_MODULES = ['newsplease.crawler.spiders']
+NEWSPIDER_MODULE = 'newsplease.crawler.spiders'
+
+# Resume/Pause functionality activation
+# default: .resume_jobdir
+JOBDIRNAME = .resume_jobdir
+
+# Respect robots.txt activation
+# default: False
+ROBOTSTXT_OBEY=True
+
+# Maximum number of concurrent requests across all domains
+# default: 16
+# IMPORTANT: This setting does not work since each crawler has its own scrapy instance, but it might limit the concurrent_requests_per_domain if said setting has a higher number set than this one.
+CONCURRENT_REQUESTS=16
+
+# Maximum number of active requests per domain
+# default: 4
+CONCURRENT_REQUESTS_PER_DOMAIN=4
+
+# User-agent activation
+# default: 'news-please (+http://www.example.com/)'
+USER_AGENT = 'news-please (+http://www.example.com/)'
+
+# Pipeline activation
+# Syntax: 'module.class': priority
+# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300}
+# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350
+ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
+                  'newsplease.pipeline.pipelines.InMemoryStorage':200
+                  }
\ No newline at end of file
diff --git a/newsplease/newspleaselib.py b/newsplease/newspleaselib.py
new file mode 100644
index 0000000..83d39e1
--- /dev/null
+++ b/newsplease/newspleaselib.py
@@ -0,0 +1,45 @@
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+from newsplease.pipeline.pipelines import InMemoryStorage
+from newsplease.single_crawler import SingleCrawler
+
+
+class NewsPleaseLib:
+    """
+    Access news-please functionality via this interface
+    """
+
+    @staticmethod
+    def download_article(url):
+        """
+        Crawls the article from the URL and extracts relevant information.
+        :param url:
+        :return:
+        """
+        SingleCrawler.create_as_library(url)
+        results = InMemoryStorage.get_results()
+        article = results[url]
+        del results[url]
+        return article
+
+    @staticmethod
+    def download_articles(urls):
+        """
+        Crawls articles from the URLs and extracts relevant information.
+        :param urls:
+        :return:
+        """
+        SingleCrawler.create_as_library(urls)
+        results = InMemoryStorage.get_results()
+        articles = []
+        for url in urls:
+            article = results[url]
+            del results[url]
+            articles.append(article)
+            print(article['title'])
+        return articles
+
+if __name__ == '__main__':
+    NewsPleaseLib.download_article('http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')
\ No newline at end of file
diff --git a/newsplease/pipeline/extractor/extractors/newspaper_extractor.py b/newsplease/pipeline/extractor/extractors/newspaper_extractor.py
index 4d23b47..8dc29bc 100644
--- a/newsplease/pipeline/extractor/extractors/newspaper_extractor.py
+++ b/newsplease/pipeline/extractor/extractors/newspaper_extractor.py
@@ -1,7 +1,6 @@
 import logging
 from .abstract_extractor import AbstractExtractor
 from ..article_candidate import ArticleCandidate
-# Import Newspaper Article Extractor Library.
 from newspaper import Article
 
 
diff --git a/newsplease/pipeline/pipelines.py b/newsplease/pipeline/pipelines.py
index 4c2ee9f..e5b2b19 100644
--- a/newsplease/pipeline/pipelines.py
+++ b/newsplease/pipeline/pipelines.py
@@ -292,6 +292,7 @@ class ExtractedInformationStorage(object):
     def extract_relevant_info(item):
         """
         extracts from an item only fields that we want to output as extracted information
+        :rtype: object
         :param item:
         :return:
         """
@@ -314,6 +315,27 @@ class ExtractedInformationStorage(object):
         }
 
 
+class InMemoryStorage(ExtractedInformationStorage):
+    """
+    Stores extracted information in a dictionary in memory - for use with library mode.
+    """
+
+    results = {}  # class-level (static) storage of the extracted results
+
+    def process_item(self, item, spider):
+        # get the original URL, so that the library class (or whoever wants to read this) can access the article
+        if 'redirect_urls' in item._values['spider_response'].meta:
+            url = item._values['spider_response'].meta['redirect_urls'][0]
+        else:
+            url = item._values['url']
+        InMemoryStorage.results[url] = ExtractedInformationStorage.extract_relevant_info(item)
+        return item
+
+    @staticmethod
+    def get_results():
+        return InMemoryStorage.results
+
+
 class JsonFileStorage(ExtractedInformationStorage):
     """
     Handles remote storage of the data in Json files
 
diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py
index 33cba7d..a6c2260 100644
--- a/newsplease/single_crawler.py
+++ b/newsplease/single_crawler.py
@@ -34,7 +34,7 @@ class SingleCrawler(object):
     cfg = None
     json = None
     log = None
-    crawler = None
+    crawler_name = None
     process = None
     helper = None
     cfg_file_path = None
@@ -46,8 +46,21 @@ class SingleCrawler(object):
     shall_resume = False
     daemonize = False
 
+    @classmethod
+    def create_as_library(cls, url):
+        """
+        Creates a single crawler in library mode. Crawling will start immediately.
+        :param url:
+        :return:
+        """
+        site = {
+            "crawler": "Download",
+            "url": url
+        }
+        return cls('config/config_lib.cfg', site, 0, False, False, True)
+
     def __init__(self, cfg_file_path, json_file_path,
-                 site_index, shall_resume, daemonize):
+                 site_index, shall_resume, daemonize, library_mode=False):
         # set up logging before it's defined via the config file,
         # this will be overwritten and all other levels will be put out
         # as well, if it will be changed.
@@ -69,11 +82,16 @@ class SingleCrawler(object):
 
         self.cfg_crawler = self.cfg.section("Crawler")
 
-        # load the URL-input-json-file
-        self.json = JsonConfig.get_instance()
-        self.json.setup(self.json_file_path)
-
-        site = self.json.get_site_objects()[self.site_number]
+        # load the URL input JSON file or, if in library mode, take json_file_path as the site information
+        # (admittedly a bit hacky)
+        if not library_mode:
+            self.json = JsonConfig.get_instance()
+            self.json.setup(self.json_file_path)
+            sites = self.json.get_site_objects()
+            site = sites[self.site_number]
+        else:
+            sites = [json_file_path]
+            site = json_file_path
 
         if "ignore_regex" in site:
             ignore_regex = "(%s)|" % site["ignore_regex"]
@@ -83,13 +101,13 @@ class SingleCrawler(object):
 
         # Get the default crawler. The crawler can be overwritten by fallbacks.
         if "additional_rss_daemon" in site and self.daemonize:
-            self.crawler = "RssCrawler"
+            self.crawler_name = "RssCrawler"
         elif "crawler" in site:
-            self.crawler = site["crawler"]
+            self.crawler_name = site["crawler"]
         else:
-            self.crawler = self.cfg.section("Crawler")["default"]
+            self.crawler_name = self.cfg.section("Crawler")["default"]
         # Get the real crawler-class (already "fallen back")
-        crawler_class = self.get_crawler(self.crawler, site["url"])
+        crawler_class = self.get_crawler(self.crawler_name, site["url"])
 
         if not self.cfg.section('Files')['relative_to_start_processes_file']:
             relative_to_path = os.path.dirname(self.cfg_file_path)
@@ -101,7 +119,7 @@ class SingleCrawler(object):
             self.cfg.section("Files")["local_data_directory"],
             relative_to_path,
             self.cfg.section('Files')['format_relative_path'],
-            self.json.get_site_objects(),
+            sites,
             crawler_class,
             self.cfg.get_working_path())
 
@@ -116,7 +134,6 @@ class SingleCrawler(object):
 
         self.load_crawler(crawler_class, site["url"],
                           ignore_regex)
-
         self.process.start()
 
     def update_jobdir(self, site):
@@ -133,7 +150,7 @@ class SingleCrawler(object):
         if not jobdirname.endswith("/"):
             jobdirname += "/"
 
-        site_string = ''.join(site["url"]) + self.crawler
+        site_string = ''.join(site["url"]) + self.crawler_name
         hashed = hashlib.md5(site_string.encode('utf-8'))
 
         self.__scrapy_options["JOBDIR"] = working_path + jobdirname + hashed.hexdigest()
@@ -221,6 +238,7 @@ class SingleCrawler(object):
         self.log.info("Removed " + jobdir + " since '--resume' was not passed to"
                       " initial.py or this crawler was daemonized.")
 
+
 if __name__ == "__main__":
     SingleCrawler(cfg_file_path=sys.argv[1],
                   json_file_path=sys.argv[2],
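
For reference, here is a minimal usage sketch of the library mode introduced by this diff, mirroring the README example above. It assumes `NewsPleaseLib` is importable as `from newsplease import NewsPleaseLib` (as shown in the README) and that `'title'` is among the fields returned by `ExtractedInformationStorage.extract_relevant_info()`; treat it as an illustration, not as part of the diff itself.

```
# Illustrative sketch only (not part of the diff above).
# Assumes news-please is installed and NewsPleaseLib is exposed as in the README.
from newsplease import NewsPleaseLib

urls = [
    'https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp',
]

# Each call starts a single 'Download' crawler via SingleCrawler.create_as_library()
# and reads the results back from InMemoryStorage, keyed by the requested URL.
articles = NewsPleaseLib.download_articles(urls)
for article in articles:
    print(article['title'])
```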