From 32ba438a069cb3a83731547c0c03ac9ad75229e5 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Fri, 17 Feb 2017 15:47:43 +0100 Subject: [PATCH 1/9] Update README.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3e44d10..49e2a48 100644 --- a/README.md +++ b/README.md @@ -47,10 +47,11 @@ news-please also supports export to ElasticSearch. Using Elasticsearch will also [Scrapy] - ITEM_PIPELINES = {'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100, - 'newscrawler.pipeline.pipelines.LocalStorage':200, - 'newscrawler.pipeline.pipelines.ElasticSearchStorage':350 - } + ITEM_PIPELINES = { + 'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100, + 'newscrawler.pipeline.pipelines.LocalStorage':200, + 'newscrawler.pipeline.pipelines.ElasticSearchStorage':350 + } That's it! Except, if your Elasticsearch database is not located at `http://localhost:9200`, uses a different username / password or CA-certificate authentication. In these cases, you will also need to change the following. From 4f4570e32d47e70d7cb847c6423d85cbbfdd6b99 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 16:52:56 +0100 Subject: [PATCH 2/9] add library download --- newsplease/__main__.py | 19 +++++++----- newsplease/config/sitelist.hjson | 40 ++---------------------- newsplease/library.py | 42 +++++++++++++++++++++++++ newsplease/single_crawler.py | 53 ++++++++++++++++++++++++-------- 4 files changed, 96 insertions(+), 58 deletions(-) create mode 100644 newsplease/library.py diff --git a/newsplease/__main__.py b/newsplease/__main__.py index f0150ed..288d49c 100644 --- a/newsplease/__main__.py +++ b/newsplease/__main__.py @@ -50,11 +50,12 @@ class NewsPlease(object): number_of_active_crawlers = 0 config_directory_default_path = "~/news-please/config/" config_file_default_name = "config.cfg" + library_mode = None __single_crawler = False def __init__(self, cfg_directory_path, is_resume, is_reset_elasticsearch, is_reset_json, is_reset_mysql, - is_no_confirm): + is_no_confirm, library_mode=False): """ The constructor of the main class, thus the real entry point to the tool. :param cfg_file_path: @@ -64,13 +65,13 @@ class NewsPlease(object): :param is_reset_mysql: :param is_no_confirm: """ - # print("newsplease is starting on Python " + sys.version) configure_logging({"LOG_LEVEL": "ERROR"}) self.log = logging.getLogger(__name__) # other parameters self.shall_resume = is_resume self.no_confirm = is_no_confirm + self.library_mode = library_mode # Sets an environmental variable called 'CColon', so scripts can import # modules of this project in relation to this script's dir @@ -120,8 +121,7 @@ class NewsPlease(object): self.crawler_list = self.CrawlerList() self.daemon_list = self.DaemonList() - self.__single_crawler = self.get_abs_file_path("./single_crawler.py", - True, False) + self.__single_crawler = self.get_abs_file_path("./single_crawler.py", True, False) self.manage_crawlers() @@ -284,9 +284,14 @@ class NewsPlease(object): if os.path.exists(self.cfg_directory_path): return - sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. " - + "Should a default config directory be created at this path? [Y/n]") - user_choice = input().lower().replace("yes", "y").replace("no", "n") + user_choice = 'n' + if self.no_confirm: + user_choice = 'y' + else: + sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. 
" + + "Should a default config directory be created at this path? [Y/n]") + user_choice = input().lower().replace("yes", "y").replace("no", "n") + if not user_choice or user_choice == '': # the default is yes user_choice = "y" if "y" not in user_choice and "n" not in user_choice: diff --git a/newsplease/config/sitelist.hjson b/newsplease/config/sitelist.hjson index 3af27e9..1eac4c0 100644 --- a/newsplease/config/sitelist.hjson +++ b/newsplease/config/sitelist.hjson @@ -6,46 +6,10 @@ "base_urls" : [ { # Start crawling from faz.net - "url": "http://www.faz.net/", + "url": "https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html?hp", # Overwrite the default crawler and use th RecursiveCrawler instead - "crawler": "RecursiveCrawler", - - # Because this site is weirt, use the - # meta_contains_article_keyword-heuristic and disable all others because - # overwrite will merge the defaults from "newscrawler.cfg" with - # this - "overwrite_heuristics": { - "meta_contains_article_keyword": true, - "og_type": false, - "linked_headlines": false, - "self_linked_headlines": false - }, - # Also state that in the condition, all heuristics used in the condition - # have to be activated in "overwrite_heuristics" (or default) as well. - "pass_heuristics_condition": "meta_contains_article_keyword" - }, - { - # zeit.de has a blog which we do not want to crawl - "url": "http://www.zeit.de", - - "overwrite_heuristics": { - # because we do not want to crawl that blog, disable all downloads from - # subdomains - "is_not_from_subdomain": true - }, - # Update the condition as well, all the other heuristics are enabled in - # newscrawler.cfg - "pass_heuristics_condition": "is_not_from_subdomain and og_type and self_linked_headlines and linked_headlines" - }, - { - # nytimes.com should run pretty well with default config: - "url": "http://www.nytimes.com/" - - # to create an additional RssCrawler daemon for this site that runs every hour, we could either use - # "additional_rss_daemon": 3600 - # or create an additional array-object with "crawler": "RssCrawler" and "daemonize": 3600 - # it is not possible to create an additional_rss_daemon for a daemonized array-object + "crawler": "Download", } ] } diff --git a/newsplease/library.py b/newsplease/library.py new file mode 100644 index 0000000..4c95de2 --- /dev/null +++ b/newsplease/library.py @@ -0,0 +1,42 @@ +from newsplease.single_crawler import SingleCrawler +import os + + +class Library: + """ + Access news-please functionality via this interface + """ + crawler = None + + def __init__(self): + url = 'https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html' + SingleCrawler.create_as_library(url) + + def download_article(self, url): + """ + Crawls the article from the url and extracts relevant information. + :param url: + :return: + """ + # self.crawler.library_download_urls([url]) + pass + + def download_articles(self, urls): + """ + Crawls articles from the urls and extracts relevant information. 
+ :param urls: + :return: + """ + articles = [] + for url in urls: + articles.append(self.downloadArticle(url)) + return articles + + +if __name__ == '__main__': + lib = Library() + lib.download_article( + 'https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html') + print("hi") + lib.download_article( + 'http://www.faz.net/aktuell/gesellschaft/kenia-droht-hungerkatastrophe-wegen-el-ni-o-14890707.html') \ No newline at end of file diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py index 33cba7d..977f718 100644 --- a/newsplease/single_crawler.py +++ b/newsplease/single_crawler.py @@ -34,6 +34,7 @@ class SingleCrawler(object): cfg = None json = None log = None + crawler_name = None crawler = None process = None helper = None @@ -46,8 +47,16 @@ class SingleCrawler(object): shall_resume = False daemonize = False + @classmethod + def create_as_library(cls, url): + site = { + "crawler": "Download", + "url": url + } + return cls('config/config.cfg', site, 0, False, False, True) + def __init__(self, cfg_file_path, json_file_path, - site_index, shall_resume, daemonize): + site_index, shall_resume, daemonize, library_mode=False): # set up logging before it's defined via the config file, # this will be overwritten and all other levels will be put out # as well, if it will be changed. @@ -69,11 +78,15 @@ class SingleCrawler(object): self.cfg_crawler = self.cfg.section("Crawler") - # load the URL-input-json-file - self.json = JsonConfig.get_instance() - self.json.setup(self.json_file_path) - - site = self.json.get_site_objects()[self.site_number] + # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information (kind of hacky..) + if not library_mode: + self.json = JsonConfig.get_instance() + self.json.setup(self.json_file_path) + sites = self.json.get_site_objects() + site = sites[self.site_number] + else: + sites = [json_file_path] + site = json_file_path if "ignore_regex" in site: ignore_regex = "(%s)|" % site["ignore_regex"] @@ -83,13 +96,13 @@ class SingleCrawler(object): # Get the default crawler. The crawler can be overwritten by fallbacks. 
if "additional_rss_daemon" in site and self.daemonize: - self.crawler = "RssCrawler" + self.crawler_name = "RssCrawler" elif "crawler" in site: - self.crawler = site["crawler"] + self.crawler_name = site["crawler"] else: - self.crawler = self.cfg.section("Crawler")["default"] + self.crawler_name = self.cfg.section("Crawler")["default"] # Get the real crawler-class (already "fallen back") - crawler_class = self.get_crawler(self.crawler, site["url"]) + crawler_class = self.get_crawler(self.crawler_name, site["url"]) if not self.cfg.section('Files')['relative_to_start_processes_file']: relative_to_path = os.path.dirname(self.cfg_file_path) @@ -101,7 +114,7 @@ class SingleCrawler(object): self.cfg.section("Files")["local_data_directory"], relative_to_path, self.cfg.section('Files')['format_relative_path'], - self.json.get_site_objects(), + sites, crawler_class, self.cfg.get_working_path()) @@ -113,10 +126,13 @@ class SingleCrawler(object): # if not stated otherwise in the arguments passed to this script self.remove_jobdir_if_not_resume() + #if library_mode: + # self.crawler = crawler_class + # self.library_mode = library_mode + #else: self.load_crawler(crawler_class, site["url"], ignore_regex) - self.process.start() def update_jobdir(self, site): @@ -133,7 +149,7 @@ class SingleCrawler(object): if not jobdirname.endswith("/"): jobdirname += "/" - site_string = ''.join(site["url"]) + self.crawler + site_string = ''.join(site["url"]) + self.crawler_name hashed = hashlib.md5(site_string.encode('utf-8')) self.__scrapy_options["JOBDIR"] = working_path + jobdirname + hashed.hexdigest() @@ -221,6 +237,17 @@ class SingleCrawler(object): self.log.info("Removed " + jobdir + " since '--resume' was not passed to" " initial.py or this crawler was daemonized.") + def library_download_urls(self, urls): + """ + Downloads one or more articles given the urls + :param urls: + :return: + """ + if not self.library_mode: + sys.exit("invoked library_download_urls without being in library mode") + self.load_crawler(self.crawler, urls, False) + self.process.start() + if __name__ == "__main__": SingleCrawler(cfg_file_path=sys.argv[1], json_file_path=sys.argv[2], From 728c0f7f316d306e0a6a3d49fed9401a6cf69831 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:34:17 +0100 Subject: [PATCH 3/9] add library download --- newsplease/__main__.py | 4 +- newsplease/config/config.cfg | 3 +- newsplease/library.py | 42 ----------------- newsplease/newspleaselib.py | 45 +++++++++++++++++++ .../extractors/newspaper_extractor.py | 1 - newsplease/pipeline/pipelines.py | 22 +++++++++ newsplease/single_crawler.py | 22 +++------ 7 files changed, 77 insertions(+), 62 deletions(-) delete mode 100644 newsplease/library.py create mode 100644 newsplease/newspleaselib.py diff --git a/newsplease/__main__.py b/newsplease/__main__.py index 288d49c..d7e1813 100644 --- a/newsplease/__main__.py +++ b/newsplease/__main__.py @@ -25,7 +25,7 @@ if sys.version_info[0] < 3: ConnectionError = OSError -class NewsPlease(object): +class NewsPleaseLauncher(object): """ This class is supposed to be called initially to start all processes. It sets up and manages all crawlers. 
@@ -626,7 +626,7 @@ def cli(cfg_file_path: ('path to the config file', 'option', 'c'), if cfg_file_path and not cfg_file_path.endswith(os.path.sep): cfg_file_path += os.path.sep - NewsPlease(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm) + NewsPleaseLauncher(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm) pass diff --git a/newsplease/config/config.cfg b/newsplease/config/config.cfg index 31ef799..0af4246 100644 --- a/newsplease/config/config.cfg +++ b/newsplease/config/config.cfg @@ -314,6 +314,5 @@ USER_AGENT = 'news-please (+http://www.example.com/)' # default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300} # Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350 ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, - 'newsplease.pipeline.pipelines.LocalStorage':200, - 'newsplease.pipeline.pipelines.JsonFileStorage':300 + 'newsplease.pipeline.pipelines.InMemoryStorage':200 } \ No newline at end of file diff --git a/newsplease/library.py b/newsplease/library.py deleted file mode 100644 index 4c95de2..0000000 --- a/newsplease/library.py +++ /dev/null @@ -1,42 +0,0 @@ -from newsplease.single_crawler import SingleCrawler -import os - - -class Library: - """ - Access news-please functionality via this interface - """ - crawler = None - - def __init__(self): - url = 'https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html' - SingleCrawler.create_as_library(url) - - def download_article(self, url): - """ - Crawls the article from the url and extracts relevant information. - :param url: - :return: - """ - # self.crawler.library_download_urls([url]) - pass - - def download_articles(self, urls): - """ - Crawls articles from the urls and extracts relevant information. - :param urls: - :return: - """ - articles = [] - for url in urls: - articles.append(self.downloadArticle(url)) - return articles - - -if __name__ == '__main__': - lib = Library() - lib.download_article( - 'https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html') - print("hi") - lib.download_article( - 'http://www.faz.net/aktuell/gesellschaft/kenia-droht-hungerkatastrophe-wegen-el-ni-o-14890707.html') \ No newline at end of file diff --git a/newsplease/newspleaselib.py b/newsplease/newspleaselib.py new file mode 100644 index 0000000..babd329 --- /dev/null +++ b/newsplease/newspleaselib.py @@ -0,0 +1,45 @@ +import sys +import os + +sys.path.append(os.path.dirname(os.path.realpath(__file__))) +from newsplease.pipeline.pipelines import InMemoryStorage +from newsplease.single_crawler import SingleCrawler + + +class NewsPleaseLib: + """ + Access news-please functionality via this interface + """ + + @staticmethod + def download_article(url): + """ + Crawls the article from the url and extracts relevant information. + :param url: + :return: + """ + SingleCrawler.create_as_library(url) + results = InMemoryStorage.get_results() + article = results[url] + del results[url] + return article + + @staticmethod + def download_articles(urls): + """ + Crawls articles from the urls and extracts relevant information. 
+ :param urls: + :return: + """ + SingleCrawler.create_as_library(urls) + results = InMemoryStorage.get_results() + articles = [] + for url in urls: + article = results[url] + del results[url] + articles.append(article) + print(article['title']) + return articles + +if __name__ == '__main__': + NewsPleaseLib.download_article('www.zeit.de') diff --git a/newsplease/pipeline/extractor/extractors/newspaper_extractor.py b/newsplease/pipeline/extractor/extractors/newspaper_extractor.py index 4d23b47..8dc29bc 100644 --- a/newsplease/pipeline/extractor/extractors/newspaper_extractor.py +++ b/newsplease/pipeline/extractor/extractors/newspaper_extractor.py @@ -1,7 +1,6 @@ import logging from .abstract_extractor import AbstractExtractor from ..article_candidate import ArticleCandidate -# Import Newspaper Article Extractor Library. from newspaper import Article diff --git a/newsplease/pipeline/pipelines.py b/newsplease/pipeline/pipelines.py index 4c2ee9f..e5b2b19 100644 --- a/newsplease/pipeline/pipelines.py +++ b/newsplease/pipeline/pipelines.py @@ -292,6 +292,7 @@ class ExtractedInformationStorage(object): def extract_relevant_info(item): """ extracts from an item only fields that we want to output as extracted information + :rtype: object :param item: :return: """ @@ -314,6 +315,27 @@ class ExtractedInformationStorage(object): } +class InMemoryStorage(ExtractedInformationStorage): + """ + Stores extracted information in a dictionary in memory - for use with library mode. + """ + + results = {} # this is a static variable + + def process_item(self, item, spider): + # get the original url, so that the library class (or whoever wants to read this) can access the article + if 'redirect_urls' in item._values['spider_response'].meta: + url = item._values['spider_response'].meta['redirect_urls'][0] + else: + url = item._values['url'] + InMemoryStorage.results[url] = ExtractedInformationStorage.extract_relevant_info(item) + return item + + @staticmethod + def get_results(): + return InMemoryStorage.results + + class JsonFileStorage(ExtractedInformationStorage): """ Handles remote storage of the data in Json files diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py index 977f718..61558a0 100644 --- a/newsplease/single_crawler.py +++ b/newsplease/single_crawler.py @@ -49,6 +49,11 @@ class SingleCrawler(object): @classmethod def create_as_library(cls, url): + """ + Creates a single crawler as in library mode. Crawling will start immediately. + :param url: + :return: + """ site = { "crawler": "Download", "url": url @@ -78,7 +83,8 @@ class SingleCrawler(object): self.cfg_crawler = self.cfg.section("Crawler") - # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information (kind of hacky..) + # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information ( + # kind of hacky..) 
if not library_mode: self.json = JsonConfig.get_instance() self.json.setup(self.json_file_path) @@ -126,10 +132,6 @@ class SingleCrawler(object): # if not stated otherwise in the arguments passed to this script self.remove_jobdir_if_not_resume() - #if library_mode: - # self.crawler = crawler_class - # self.library_mode = library_mode - #else: self.load_crawler(crawler_class, site["url"], ignore_regex) @@ -237,16 +239,6 @@ class SingleCrawler(object): self.log.info("Removed " + jobdir + " since '--resume' was not passed to" " initial.py or this crawler was daemonized.") - def library_download_urls(self, urls): - """ - Downloads one or more articles given the urls - :param urls: - :return: - """ - if not self.library_mode: - sys.exit("invoked library_download_urls without being in library mode") - self.load_crawler(self.crawler, urls, False) - self.process.start() if __name__ == "__main__": SingleCrawler(cfg_file_path=sys.argv[1], From 9d26f8dbf2268993b806fa5e20a08dca23dc2dd4 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:49:23 +0100 Subject: [PATCH 4/9] add library download --- newsplease/config/config_lib.cfg | 324 +++++++++++++++++++++++++++++++ newsplease/newspleaselib.py | 3 - newsplease/single_crawler.py | 2 +- 3 files changed, 325 insertions(+), 4 deletions(-) create mode 100644 newsplease/config/config_lib.cfg diff --git a/newsplease/config/config_lib.cfg b/newsplease/config/config_lib.cfg new file mode 100644 index 0000000..62e3c98 --- /dev/null +++ b/newsplease/config/config_lib.cfg @@ -0,0 +1,324 @@ +# !!! DO NOT CHANGE THIS FILE !!! +# if you want to change news-please's options, you should run it first and change +# the config.cfg file that is created on the first run of news-please (by default the config file will be in +# [HOMEDIR]/news-please/config/config.cfg +# !!! NEVER CHANGE THE config_lib.cfg FILE !!! news-please uses this when run in library mode + + +# IMPORTANT +# All variables get parsed to the correct python-types (if not other declared)! +# So bools have to be True or False (uppercase-first), +# Floats need dots . (not comma) +# Ints are just normal ints +# dicts need to be like this { key: value } +# arrays need to be like this [ value1, value2, value3 ] +# All values in dicts and arrays will also be parsed. +# Everything that does not match any of the above criteria will be parsed as string. + + +[Crawler] + +# GENERAL +# ------- + +# Crawling heuristics +# Default Crawlers: +# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir) +# default: SitemapCrawler +default = SitemapCrawler + +# default: +# fallbacks = { +# "RssCrawler": None, +# "RecursiveSitemapCrawler": "RecursiveCrawler", +# "SitemapCrawler": "RecursiveCrawler", +# "RecursiveCrawler": None, +# "Download": None +# } +fallbacks = { + "RssCrawler": None, + "RecursiveSitemapCrawler": "RecursiveCrawler", + "SitemapCrawler": "RecursiveCrawler", + "RecursiveCrawler": None, + "Download": None + } + +# Determines how many hours need to pass since the last download of a webpage +# to be downloaded again by the RssCrawler +# default: 6 +hours_to_pass_for_redownload_by_rss_crawler = 6 + + + +# PROCESSES +# --------- + +# Number of crawlers, that should crawl parallel +# not counting in daemonized crawlers +# default: 5 +number_of_parallel_crawlers = 5 + +# Number of daemons, will be added to daemons. 
+# default: 10 +number_of_parallel_daemons = 10 + + + +# SPECIAL CASES +# ------------- + +# urls which end on any of the following file extensions are ignored for recursive crawling +# default: "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)" +ignore_file_extensions = "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)" + +# urls which match the following regex are ignored for recursive crawling +# default: "" +ignore_regex = "" + +# Crawl the sitemaps of subdomains (if sitemap is enabled) +# If True, any SitemapCrawler will try to crawl on the sitemap of the given domain including subdomains instead of a domain's main sitemap. +# e.g. if True, a SitemapCrawler to be started on https://blog.zeit.de will try to crawl on the sitemap listed in http://blog.zeit.de/robots.txt. If not found, it will fall back to the False setting. +# if False, a SitemapCrawler to be started on https://blog.zeit.de will try to crawl on the sitemap listed in http://zeit.de/robots.txt +# default: True +sitemap_allow_subdomains = True + + + +[Heuristics] + +# Enabled heuristics, +# Currently: +# - og_type +# - linked_headlines +# - self_linked_headlines +# - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded) +# - meta_contains_article_keyword +# - crawler_contains_only_article_alikes +# (maybe not up-to-date, see ./newsplease/helper_classes/heursitics.py: +# Every method not starting with __ should be a heuristic, except is_article) +# These heuristics can be overwritten by sitelist.json for each site +# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"} +enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"} + +# Heuristics can be combined with others +# The heuristics need to have the same name as in enabled_heuristics +# Possible condition-characters / literals are: (, ), not, and, or +# All heuristics used here need to be enabled in enabled_heuristics as well! +# Examples: +# "og_type and (self_linked_headlines or linked_headlines)" +# "og_type" +# default: "og_type and (linked_headlines or self_linked_headlines)" +pass_heuristics_condition = "og_type and (linked_headlines or self_linked_headlines)" + +# The maximum ratio of headlines divided by linked_headlines in a file + +# The minimum number of headlines in a file to check for the ratio +# If less then this number are in the file, the file will pass the test.
+# default: 5 +min_headlines_for_linked_test = 5 + + + +[Files] + +# GENERAL: +# ------- + +# Paths: +# toggles relative paths to be relative to the start_processes.py script (True) or relative to this config file (False) +# This does not work for this config's 'Scrapy' section which is always relative to the dir the start_processes.py script is called from +# Default: True +relative_to_start_processes_file = True + + + +# INPUT: +# ----- + +# Here you can specify the input JSON-Filename +# default: sitelist.hjson +url_input_file_name = sitelist.hjson + + + +# OUTPUT: +# ------ + +# Toggles whether leading './' or '.\' from above local_data_directory should be removed when saving the path into the Database +# True: ./data would become data +# default: True +working_path = ~/news-please/ + +# Following Strings in the local_data_directory will be replaced: (md5 hashes have a standard length of 32 chars) +# +# %working_path = the path specified in OUTPUT["working_path"] +# %time_download() = current time at download; will be replaced with strftime() where is a string, explained further here: http://strftime.org/ +# %time_execution() = current time at execution; will be replaced with strftime() where is a string, explained further here: http://strftime.org/ +# %timestamp_download = current time at download; unix-timestamp +# %timestamp_execution = current time at execution; unix-timestamp +# %domain() = first chars of the domain of the crawled file (e.g. zeit.de) +# %appendmd5_domain() = appends the md5 to %domain(< - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than +# %md5_domain() = first chars of md5 hash of %domain +# %full_domain() = first chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de) +# %appendmd5_full_domain() = appends the md5 to %full_domain(< - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than +# %md5_full_domain() = first chars of md5 hash of %full_domain +# %subdomains() = first chars of the domain's subdomains +# %appendmd5_subdomains() = appends the md5 to %subdomains(< - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than +# %md5_subdomains() = first chars of md5 hash of %subdomains +# %url_directory_string() = first chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename +# %appendmd5_url_directory_string() = appends the md5 to %url_directory_string(< - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than +# %md5_url_directory_string() = first chars of md5 hash of %url_directory_string() +# %url_file_name() = first chars of the file name (without type) on the server (e.g. 
http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466, No filenames (indexes) will evaluate to index +# %md5_url_file_name() = first chars of md5 hash of %url_file_name +# %max_url_file_name = first x chars of %url_file_name, so the entire savepath has a length of the max possible length for a windows file system (260 characters - 1 ) +# %appendmd5_max_url_file_name = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire savepath has a length longer than the max possible length for a windows file system (260 characters - 1 ) +# +# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent on all datasets. +# To be able to use cleanup commands, it should also start with a static folder name like 'data'. +# +# default: %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html +local_data_directory = %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html + +# Toggles whether leading './' or '.\' from above local_data_directory should be removed when saving the path into the Database +# True: ./data would become data +# default: True +format_relative_path = True + + + +[MySQL] + +# MySQL-Connection required for saving meta-informations +host = localhost +port = 3306 +db = 'news-please' +username = 'root' +password = 'password' + + + +[Elasticsearch] + +# Elasticsearch-Connection required for saving detailed meta-information +host = localhost +port = 9200 +index_current = 'news-please' +index_archive = 'news-please-archive' + +# Elasticsearch supports user authentication by CA certificates. If your database is protected by certificate +# fill in the following parameters, otherwise you can ignore them. +use_ca_certificates = False +ca_cert_path = /path/to/cacert.pem +client_cert_path = /path/to/client_cert.pem +client_key_path = /path/to/client_key.pem +username = 'root' +secret = 'password' + +# Properties of the document type used for storage. +mapping = { + 'url': {'type': 'string', 'index': 'not_analyzed'}, + 'sourceDomain': {'type': 'string', 'index': 'not_analyzed'}, + 'pageTitle': {'type': 'string'}, + 'rss_title': {'type': 'string'}, + 'localpath': {'type': 'string', 'index' : 'not_analyzed'}, + 'ancestor': {'type': 'string'}, + 'descendant': {'type': 'string'}, + 'version': {'type': 'long'}, + 'downloadDate': {'type': 'date', "format":"yyyy-MM-dd HH:mm:ss"}, + 'modifiedDate': {'type': 'date', "format":"yyyy-MM-dd HH:mm:ss"}, + 'publish_date': {'type': 'date', "format":"yyyy-MM-dd HH:mm:ss"}, + 'title': {'type': 'string'}, + 'description': {'type': 'string'}, + 'text': {'type': 'string'}, + 'author': {'type': 'string'}, + 'image': {'type': 'string', 'index' : 'not_analyzed'}, + 'language': {'type': 'string', 'index' : 'not_analyzed'} + } + + + +[ArticleMasterExtractor] + +# Choose which extractors you want to use. +# +# The Default is ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'], +# which are all integrated extractors right now. 
+# Possibly extractors are 'newspaper_extractor' , 'readability_extractor' , 'date_extractor_extractor and 'lang_detect_extractor' +# Examples: -Only Newspaper and date_extractor: extractors = ['newspaper', 'date_extractor'] +# -Only Newspaper: extractors = ['newspaper'] +extractors = ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'] + + + +[DateFilter] + +# If added to the pipeline, this module provides the means to filter the extracted articles based on the publishing date. +# Therefore this module has to be placed after the KM4 article extractor to access the publishing dates. +# +# All articles, with a publishing date outside of the given time interval are dropped. The dates used to specify the +# time interval are included and should follow this format: 'yyyy-mm-dd hh:mm:ss'. +# +# It is also possible to only define one date, assigning the other variable the value 'None' to create an half-bounded +# interval. + +start_date = '1999-01-01 00:00:00' +end_date = '2999-12-31 00:00:00' + +# If 'True' articles without a publishing date are dropped. +strict_mode = False + + + +[Scrapy] + +# Possible levels (must be UC-only): CRITICAL, ERROR, WARNING, INFO, DEBUG +# default: WARNING +LOG_LEVEL = INFO + +# logformat, see https://docs.python.org/2/library/logging.html#logrecord-attributes +# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s +LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s + +# Can be a filename or None +# default: None +LOG_FILE = None + +LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S + +LOG_STDOUT = False + +LOG_ENCODING = utf-8 + +BOT_NAME = 'news-please' + +SPIDER_MODULES = ['newsplease.crawler.spiders'] +NEWSPIDER_MODULE = 'newsplease.crawler.spiders' + +# Resume/Pause functionality activation +# default: .resume_jobdir +JOBDIRNAME = .resume_jobdir + +# Respect robots.txt activation +# default: False +ROBOTSTXT_OBEY=True + +# Maximum number of concurrent requests across all domains +# default: 16 +# IMPORTANT: This setting does not work since each crawler has its own scrapy instance, but it might limit the concurrent_requests_per_domain if said setting has a higher number set than this one.
+CONCURRENT_REQUESTS=16 + +# Maximum number of active requests per domain +# default: 4 +CONCURRENT_REQUESTS_PER_DOMAIN=4 + +# User-agent activation +# default: 'news-please (+http://www.example.com/)' +USER_AGENT = 'news-please (+http://www.example.com/)' + +# Pipeline activation +# Syntax: '.': +# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300} +# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350 +ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, + 'newsplease.pipeline.pipelines.InMemoryStorage':200 + } \ No newline at end of file diff --git a/newsplease/newspleaselib.py b/newsplease/newspleaselib.py index babd329..9b3a193 100644 --- a/newsplease/newspleaselib.py +++ b/newsplease/newspleaselib.py @@ -40,6 +40,3 @@ class NewsPleaseLib: articles.append(article) print(article['title']) return articles - -if __name__ == '__main__': - NewsPleaseLib.download_article('www.zeit.de') diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py index 61558a0..ec71bf8 100644 --- a/newsplease/single_crawler.py +++ b/newsplease/single_crawler.py @@ -58,7 +58,7 @@ class SingleCrawler(object): "crawler": "Download", "url": url } - return cls('config/config.cfg', site, 0, False, False, True) + return cls('config/config_lib.cfg', site, 0, False, False, True) def __init__(self, cfg_file_path, json_file_path, site_index, shall_resume, daemonize, library_mode=False): From 87816445d9b9a5192254e30b9ea29ef94af1615b Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:52:54 +0100 Subject: [PATCH 5/9] add library download --- newsplease/config/config_lib.cfg | 2 +- newsplease/config/sitelist.hjson | 40 ++++++++++++++++++++++++++++++-- newsplease/newspleaselib.py | 3 +++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/newsplease/config/config_lib.cfg b/newsplease/config/config_lib.cfg index 62e3c98..95d728d 100644 --- a/newsplease/config/config_lib.cfg +++ b/newsplease/config/config_lib.cfg @@ -273,7 +273,7 @@ strict_mode = False # Possible levels (must be UC-only): CRITICAL, ERROR, WARNING, INFO, DEBUG # default: WARNING -LOG_LEVEL = INFO +LOG_LEVEL = ERROR # logformat, see https://docs.python.org/2/library/logging.html#logrecord-attributes # default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s diff --git a/newsplease/config/sitelist.hjson b/newsplease/config/sitelist.hjson index 1eac4c0..3af27e9 100644 --- a/newsplease/config/sitelist.hjson +++ b/newsplease/config/sitelist.hjson @@ -6,10 +6,46 @@ "base_urls" : [ { # Start crawling from faz.net - "url": "https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html?hp", + "url": "http://www.faz.net/", # Overwrite the default crawler and use th RecursiveCrawler instead - "crawler": "Download", + "crawler": "RecursiveCrawler", + + # Because this site is weirt, use the + # meta_contains_article_keyword-heuristic and disable all others because + # overwrite will merge the defaults from "newscrawler.cfg" with + # this + "overwrite_heuristics": { + "meta_contains_article_keyword": true, + "og_type": false, + "linked_headlines": false, + "self_linked_headlines": false + }, + # Also state that in the condition, all heuristics used in the condition + # have to be activated in "overwrite_heuristics" (or default) as well. 
+ "pass_heuristics_condition": "meta_contains_article_keyword" + }, + { + # zeit.de has a blog which we do not want to crawl + "url": "http://www.zeit.de", + + "overwrite_heuristics": { + # because we do not want to crawl that blog, disable all downloads from + # subdomains + "is_not_from_subdomain": true + }, + # Update the condition as well, all the other heuristics are enabled in + # newscrawler.cfg + "pass_heuristics_condition": "is_not_from_subdomain and og_type and self_linked_headlines and linked_headlines" + }, + { + # nytimes.com should run pretty well with default config: + "url": "http://www.nytimes.com/" + + # to create an additional RssCrawler daemon for this site that runs every hour, we could either use + # "additional_rss_daemon": 3600 + # or create an additional array-object with "crawler": "RssCrawler" and "daemonize": 3600 + # it is not possible to create an additional_rss_daemon for a daemonized array-object } ] } diff --git a/newsplease/newspleaselib.py b/newsplease/newspleaselib.py index 9b3a193..83d39e1 100644 --- a/newsplease/newspleaselib.py +++ b/newsplease/newspleaselib.py @@ -40,3 +40,6 @@ class NewsPleaseLib: articles.append(article) print(article['title']) return articles + +if __name__ == '__main__': + NewsPleaseLib.download_article('http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan') \ No newline at end of file From 2a329740f87d0a84a385cf169aad41cd4d703bc5 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:54:44 +0100 Subject: [PATCH 6/9] add library download --- newsplease/config/config.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/newsplease/config/config.cfg b/newsplease/config/config.cfg index 0af4246..31ef799 100644 --- a/newsplease/config/config.cfg +++ b/newsplease/config/config.cfg @@ -314,5 +314,6 @@ USER_AGENT = 'news-please (+http://www.example.com/)' # default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300} # Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350 ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, - 'newsplease.pipeline.pipelines.InMemoryStorage':200 + 'newsplease.pipeline.pipelines.LocalStorage':200, + 'newsplease.pipeline.pipelines.JsonFileStorage':300 } \ No newline at end of file From c19f841074a99d01b395cc640e3bc4d714ca6bf7 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:56:28 +0100 Subject: [PATCH 7/9] add library download --- newsplease/single_crawler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py index ec71bf8..a6c2260 100644 --- a/newsplease/single_crawler.py +++ b/newsplease/single_crawler.py @@ -35,7 +35,6 @@ class SingleCrawler(object): json = None log = None crawler_name = None - crawler = None process = None helper = None cfg_file_path = None From 6af8f7c520bf4f3de29bada7c4c988cec79e5392 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Fri, 24 Feb 2017 11:58:30 +0100 Subject: [PATCH 8/9] remove api dir --- newsplease/api/__init__.py | 0 newsplease/api/server.py | 37 ------------------------------------- 2 files changed, 37 deletions(-) delete mode 100644 newsplease/api/__init__.py delete mode 100644 newsplease/api/server.py diff --git a/newsplease/api/__init__.py b/newsplease/api/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git 
a/newsplease/api/server.py b/newsplease/api/server.py deleted file mode 100644 index 865245d..0000000 --- a/newsplease/api/server.py +++ /dev/null @@ -1,37 +0,0 @@ -from extractor.document import Document -from extractor.five_w_extractor import FiveWExtractor -from flask import Flask, request, jsonify -import logging - - -app = Flask(__name__) -log = logging.getLogger(__name__) -host = None -port = 5001 -debug = False -options = None -extractor = FiveWExtractor() -ch = logging.StreamHandler() -ch.setLevel(logging.DEBUG) -log.addHandler(ch) -log.setLevel(logging.DEBUG) - - -def run(): - log.info("starting server on port %i", port) - app.run(host, port, debug) - log.info("server has stopped") - - -@app.route('/crawl', methods=['GET', 'POST']) -def extract(): - json_article = request.get_json() - log.debug("retrieved raw article for extraction: %s", json_article['title']) - - document = Document(json_article['title'], json_article['description'], json_article['text']) - extractor.parse(document) - - return jsonify(document.questions) - -if __name__ == "__main__": - run() From cdb173d08a0ba2e1b118e00a8ed2c1880bae6e56 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Fri, 24 Feb 2017 12:05:40 +0100 Subject: [PATCH 9/9] add library description --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dccee69..830149d 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,17 @@ ## Features * **works out of the box**: install with pip, add URLs of your pages, run :-) +* execute it conveniently with the **CLI** or use it as a **library** within your own software + +### CLI mode * stores extracted results in **JSON files or ElasticSearch** (other storages can be added easily) * **simple but extensive configuration** (if you want to tweak the results) * runs on your favorite Python version (2.7+ and 3+) * revisions: crawl articles multiple times and track changes +### Library mode +* crawl and extract information for a list of article URLs (currently the fullsite-crawling is only supported via the CLI) + ## Getting started It's super easy, we promise! @@ -27,7 +33,14 @@ It's super easy, we promise! $ sudo pip install news-please ``` -### Run the crawler +### Use within your own code +``` +from newsplease import NewsPleaseLib +article = NewsPleaseLib.download_article('https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp') +print(article['title']) +``` + +### Run the crawler (CLI) ``` $ news-please