From 32ba438a069cb3a83731547c0c03ac9ad75229e5 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Fri, 17 Feb 2017 15:47:43 +0100 Subject: [PATCH 1/9] Update README.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3e44d10..49e2a48 100644 --- a/README.md +++ b/README.md @@ -47,10 +47,11 @@ news-please also supports export to ElasticSearch. Using Elasticsearch will also [Scrapy] - ITEM_PIPELINES = {'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100, - 'newscrawler.pipeline.pipelines.LocalStorage':200, - 'newscrawler.pipeline.pipelines.ElasticSearchStorage':350 - } + ITEM_PIPELINES = { + 'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100, + 'newscrawler.pipeline.pipelines.LocalStorage':200, + 'newscrawler.pipeline.pipelines.ElasticSearchStorage':350 + } That's it! Except, if your Elasticsearch database is not located at `http://localhost:9200`, uses a different username / password or CA-certificate authentication. In these cases, you will also need to change the following. From 4f4570e32d47e70d7cb847c6423d85cbbfdd6b99 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 16:52:56 +0100 Subject: [PATCH 2/9] add library download --- newsplease/__main__.py | 19 +++++++----- newsplease/config/sitelist.hjson | 40 ++---------------------- newsplease/library.py | 42 +++++++++++++++++++++++++ newsplease/single_crawler.py | 53 ++++++++++++++++++++++++-------- 4 files changed, 96 insertions(+), 58 deletions(-) create mode 100644 newsplease/library.py diff --git a/newsplease/__main__.py b/newsplease/__main__.py index f0150ed..288d49c 100644 --- a/newsplease/__main__.py +++ b/newsplease/__main__.py @@ -50,11 +50,12 @@ class NewsPlease(object): number_of_active_crawlers = 0 config_directory_default_path = "~/news-please/config/" config_file_default_name = "config.cfg" + library_mode = None __single_crawler = False def __init__(self, cfg_directory_path, is_resume, is_reset_elasticsearch, is_reset_json, is_reset_mysql, - is_no_confirm): + is_no_confirm, library_mode=False): """ The constructor of the main class, thus the real entry point to the tool. :param cfg_file_path: @@ -64,13 +65,13 @@ class NewsPlease(object): :param is_reset_mysql: :param is_no_confirm: """ - # print("newsplease is starting on Python " + sys.version) configure_logging({"LOG_LEVEL": "ERROR"}) self.log = logging.getLogger(__name__) # other parameters self.shall_resume = is_resume self.no_confirm = is_no_confirm + self.library_mode = library_mode # Sets an environmental variable called 'CColon', so scripts can import # modules of this project in relation to this script's dir @@ -120,8 +121,7 @@ class NewsPlease(object): self.crawler_list = self.CrawlerList() self.daemon_list = self.DaemonList() - self.__single_crawler = self.get_abs_file_path("./single_crawler.py", - True, False) + self.__single_crawler = self.get_abs_file_path("./single_crawler.py", True, False) self.manage_crawlers() @@ -284,9 +284,14 @@ class NewsPlease(object): if os.path.exists(self.cfg_directory_path): return - sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. " - + "Should a default config directory be created at this path? [Y/n]") - user_choice = input().lower().replace("yes", "y").replace("no", "n") + user_choice = 'n' + if self.no_confirm: + user_choice = 'y' + else: + sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. 
" + + "Should a default config directory be created at this path? [Y/n]") + user_choice = input().lower().replace("yes", "y").replace("no", "n") + if not user_choice or user_choice == '': # the default is yes user_choice = "y" if "y" not in user_choice and "n" not in user_choice: diff --git a/newsplease/config/sitelist.hjson b/newsplease/config/sitelist.hjson index 3af27e9..1eac4c0 100644 --- a/newsplease/config/sitelist.hjson +++ b/newsplease/config/sitelist.hjson @@ -6,46 +6,10 @@ "base_urls" : [ { # Start crawling from faz.net - "url": "http://www.faz.net/", + "url": "https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html?hp", # Overwrite the default crawler and use th RecursiveCrawler instead - "crawler": "RecursiveCrawler", - - # Because this site is weirt, use the - # meta_contains_article_keyword-heuristic and disable all others because - # overwrite will merge the defaults from "newscrawler.cfg" with - # this - "overwrite_heuristics": { - "meta_contains_article_keyword": true, - "og_type": false, - "linked_headlines": false, - "self_linked_headlines": false - }, - # Also state that in the condition, all heuristics used in the condition - # have to be activated in "overwrite_heuristics" (or default) as well. - "pass_heuristics_condition": "meta_contains_article_keyword" - }, - { - # zeit.de has a blog which we do not want to crawl - "url": "http://www.zeit.de", - - "overwrite_heuristics": { - # because we do not want to crawl that blog, disable all downloads from - # subdomains - "is_not_from_subdomain": true - }, - # Update the condition as well, all the other heuristics are enabled in - # newscrawler.cfg - "pass_heuristics_condition": "is_not_from_subdomain and og_type and self_linked_headlines and linked_headlines" - }, - { - # nytimes.com should run pretty well with default config: - "url": "http://www.nytimes.com/" - - # to create an additional RssCrawler daemon for this site that runs every hour, we could either use - # "additional_rss_daemon": 3600 - # or create an additional array-object with "crawler": "RssCrawler" and "daemonize": 3600 - # it is not possible to create an additional_rss_daemon for a daemonized array-object + "crawler": "Download", } ] } diff --git a/newsplease/library.py b/newsplease/library.py new file mode 100644 index 0000000..4c95de2 --- /dev/null +++ b/newsplease/library.py @@ -0,0 +1,42 @@ +from newsplease.single_crawler import SingleCrawler +import os + + +class Library: + """ + Access news-please functionality via this interface + """ + crawler = None + + def __init__(self): + url = 'https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html' + SingleCrawler.create_as_library(url) + + def download_article(self, url): + """ + Crawls the article from the url and extracts relevant information. + :param url: + :return: + """ + # self.crawler.library_download_urls([url]) + pass + + def download_articles(self, urls): + """ + Crawls articles from the urls and extracts relevant information. 
+ :param urls: + :return: + """ + articles = [] + for url in urls: + articles.append(self.downloadArticle(url)) + return articles + + +if __name__ == '__main__': + lib = Library() + lib.download_article( + 'https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html') + print("hi") + lib.download_article( + 'http://www.faz.net/aktuell/gesellschaft/kenia-droht-hungerkatastrophe-wegen-el-ni-o-14890707.html') \ No newline at end of file diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py index 33cba7d..977f718 100644 --- a/newsplease/single_crawler.py +++ b/newsplease/single_crawler.py @@ -34,6 +34,7 @@ class SingleCrawler(object): cfg = None json = None log = None + crawler_name = None crawler = None process = None helper = None @@ -46,8 +47,16 @@ class SingleCrawler(object): shall_resume = False daemonize = False + @classmethod + def create_as_library(cls, url): + site = { + "crawler": "Download", + "url": url + } + return cls('config/config.cfg', site, 0, False, False, True) + def __init__(self, cfg_file_path, json_file_path, - site_index, shall_resume, daemonize): + site_index, shall_resume, daemonize, library_mode=False): # set up logging before it's defined via the config file, # this will be overwritten and all other levels will be put out # as well, if it will be changed. @@ -69,11 +78,15 @@ class SingleCrawler(object): self.cfg_crawler = self.cfg.section("Crawler") - # load the URL-input-json-file - self.json = JsonConfig.get_instance() - self.json.setup(self.json_file_path) - - site = self.json.get_site_objects()[self.site_number] + # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information (kind of hacky..) + if not library_mode: + self.json = JsonConfig.get_instance() + self.json.setup(self.json_file_path) + sites = self.json.get_site_objects() + site = sites[self.site_number] + else: + sites = [json_file_path] + site = json_file_path if "ignore_regex" in site: ignore_regex = "(%s)|" % site["ignore_regex"] @@ -83,13 +96,13 @@ class SingleCrawler(object): # Get the default crawler. The crawler can be overwritten by fallbacks. 
if "additional_rss_daemon" in site and self.daemonize: - self.crawler = "RssCrawler" + self.crawler_name = "RssCrawler" elif "crawler" in site: - self.crawler = site["crawler"] + self.crawler_name = site["crawler"] else: - self.crawler = self.cfg.section("Crawler")["default"] + self.crawler_name = self.cfg.section("Crawler")["default"] # Get the real crawler-class (already "fallen back") - crawler_class = self.get_crawler(self.crawler, site["url"]) + crawler_class = self.get_crawler(self.crawler_name, site["url"]) if not self.cfg.section('Files')['relative_to_start_processes_file']: relative_to_path = os.path.dirname(self.cfg_file_path) @@ -101,7 +114,7 @@ class SingleCrawler(object): self.cfg.section("Files")["local_data_directory"], relative_to_path, self.cfg.section('Files')['format_relative_path'], - self.json.get_site_objects(), + sites, crawler_class, self.cfg.get_working_path()) @@ -113,10 +126,13 @@ class SingleCrawler(object): # if not stated otherwise in the arguments passed to this script self.remove_jobdir_if_not_resume() + #if library_mode: + # self.crawler = crawler_class + # self.library_mode = library_mode + #else: self.load_crawler(crawler_class, site["url"], ignore_regex) - self.process.start() def update_jobdir(self, site): @@ -133,7 +149,7 @@ class SingleCrawler(object): if not jobdirname.endswith("/"): jobdirname += "/" - site_string = ''.join(site["url"]) + self.crawler + site_string = ''.join(site["url"]) + self.crawler_name hashed = hashlib.md5(site_string.encode('utf-8')) self.__scrapy_options["JOBDIR"] = working_path + jobdirname + hashed.hexdigest() @@ -221,6 +237,17 @@ class SingleCrawler(object): self.log.info("Removed " + jobdir + " since '--resume' was not passed to" " initial.py or this crawler was daemonized.") + def library_download_urls(self, urls): + """ + Downloads one or more articles given the urls + :param urls: + :return: + """ + if not self.library_mode: + sys.exit("invoked library_download_urls without being in library mode") + self.load_crawler(self.crawler, urls, False) + self.process.start() + if __name__ == "__main__": SingleCrawler(cfg_file_path=sys.argv[1], json_file_path=sys.argv[2], From 728c0f7f316d306e0a6a3d49fed9401a6cf69831 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:34:17 +0100 Subject: [PATCH 3/9] add library download --- newsplease/__main__.py | 4 +- newsplease/config/config.cfg | 3 +- newsplease/library.py | 42 ----------------- newsplease/newspleaselib.py | 45 +++++++++++++++++++ .../extractors/newspaper_extractor.py | 1 - newsplease/pipeline/pipelines.py | 22 +++++++++ newsplease/single_crawler.py | 22 +++------ 7 files changed, 77 insertions(+), 62 deletions(-) delete mode 100644 newsplease/library.py create mode 100644 newsplease/newspleaselib.py diff --git a/newsplease/__main__.py b/newsplease/__main__.py index 288d49c..d7e1813 100644 --- a/newsplease/__main__.py +++ b/newsplease/__main__.py @@ -25,7 +25,7 @@ if sys.version_info[0] < 3: ConnectionError = OSError -class NewsPlease(object): +class NewsPleaseLauncher(object): """ This class is supposed to be called initially to start all processes. It sets up and manages all crawlers. 
@@ -626,7 +626,7 @@ def cli(cfg_file_path: ('path to the config file', 'option', 'c'), if cfg_file_path and not cfg_file_path.endswith(os.path.sep): cfg_file_path += os.path.sep - NewsPlease(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm) + NewsPleaseLauncher(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm) pass diff --git a/newsplease/config/config.cfg b/newsplease/config/config.cfg index 31ef799..0af4246 100644 --- a/newsplease/config/config.cfg +++ b/newsplease/config/config.cfg @@ -314,6 +314,5 @@ USER_AGENT = 'news-please (+http://www.example.com/)' # default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300} # Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350 ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, - 'newsplease.pipeline.pipelines.LocalStorage':200, - 'newsplease.pipeline.pipelines.JsonFileStorage':300 + 'newsplease.pipeline.pipelines.InMemoryStorage':200 } \ No newline at end of file diff --git a/newsplease/library.py b/newsplease/library.py deleted file mode 100644 index 4c95de2..0000000 --- a/newsplease/library.py +++ /dev/null @@ -1,42 +0,0 @@ -from newsplease.single_crawler import SingleCrawler -import os - - -class Library: - """ - Access news-please functionality via this interface - """ - crawler = None - - def __init__(self): - url = 'https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html' - SingleCrawler.create_as_library(url) - - def download_article(self, url): - """ - Crawls the article from the url and extracts relevant information. - :param url: - :return: - """ - # self.crawler.library_download_urls([url]) - pass - - def download_articles(self, urls): - """ - Crawls articles from the urls and extracts relevant information. - :param urls: - :return: - """ - articles = [] - for url in urls: - articles.append(self.downloadArticle(url)) - return articles - - -if __name__ == '__main__': - lib = Library() - lib.download_article( - 'https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html') - print("hi") - lib.download_article( - 'http://www.faz.net/aktuell/gesellschaft/kenia-droht-hungerkatastrophe-wegen-el-ni-o-14890707.html') \ No newline at end of file diff --git a/newsplease/newspleaselib.py b/newsplease/newspleaselib.py new file mode 100644 index 0000000..babd329 --- /dev/null +++ b/newsplease/newspleaselib.py @@ -0,0 +1,45 @@ +import sys +import os + +sys.path.append(os.path.dirname(os.path.realpath(__file__))) +from newsplease.pipeline.pipelines import InMemoryStorage +from newsplease.single_crawler import SingleCrawler + + +class NewsPleaseLib: + """ + Access news-please functionality via this interface + """ + + @staticmethod + def download_article(url): + """ + Crawls the article from the url and extracts relevant information. + :param url: + :return: + """ + SingleCrawler.create_as_library(url) + results = InMemoryStorage.get_results() + article = results[url] + del results[url] + return article + + @staticmethod + def download_articles(urls): + """ + Crawls articles from the urls and extracts relevant information. 
+ :param urls: + :return: + """ + SingleCrawler.create_as_library(urls) + results = InMemoryStorage.get_results() + articles = [] + for url in urls: + article = results[url] + del results[url] + articles.append(article) + print(article['title']) + return articles + +if __name__ == '__main__': + NewsPleaseLib.download_article('www.zeit.de') diff --git a/newsplease/pipeline/extractor/extractors/newspaper_extractor.py b/newsplease/pipeline/extractor/extractors/newspaper_extractor.py index 4d23b47..8dc29bc 100644 --- a/newsplease/pipeline/extractor/extractors/newspaper_extractor.py +++ b/newsplease/pipeline/extractor/extractors/newspaper_extractor.py @@ -1,7 +1,6 @@ import logging from .abstract_extractor import AbstractExtractor from ..article_candidate import ArticleCandidate -# Import Newspaper Article Extractor Library. from newspaper import Article diff --git a/newsplease/pipeline/pipelines.py b/newsplease/pipeline/pipelines.py index 4c2ee9f..e5b2b19 100644 --- a/newsplease/pipeline/pipelines.py +++ b/newsplease/pipeline/pipelines.py @@ -292,6 +292,7 @@ class ExtractedInformationStorage(object): def extract_relevant_info(item): """ extracts from an item only fields that we want to output as extracted information + :rtype: object :param item: :return: """ @@ -314,6 +315,27 @@ class ExtractedInformationStorage(object): } +class InMemoryStorage(ExtractedInformationStorage): + """ + Stores extracted information in a dictionary in memory - for use with library mode. + """ + + results = {} # this is a static variable + + def process_item(self, item, spider): + # get the original url, so that the library class (or whoever wants to read this) can access the article + if 'redirect_urls' in item._values['spider_response'].meta: + url = item._values['spider_response'].meta['redirect_urls'][0] + else: + url = item._values['url'] + InMemoryStorage.results[url] = ExtractedInformationStorage.extract_relevant_info(item) + return item + + @staticmethod + def get_results(): + return InMemoryStorage.results + + class JsonFileStorage(ExtractedInformationStorage): """ Handles remote storage of the data in Json files diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py index 977f718..61558a0 100644 --- a/newsplease/single_crawler.py +++ b/newsplease/single_crawler.py @@ -49,6 +49,11 @@ class SingleCrawler(object): @classmethod def create_as_library(cls, url): + """ + Creates a single crawler as in library mode. Crawling will start immediately. + :param url: + :return: + """ site = { "crawler": "Download", "url": url @@ -78,7 +83,8 @@ class SingleCrawler(object): self.cfg_crawler = self.cfg.section("Crawler") - # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information (kind of hacky..) + # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information ( + # kind of hacky..) 
if not library_mode: self.json = JsonConfig.get_instance() self.json.setup(self.json_file_path) @@ -126,10 +132,6 @@ class SingleCrawler(object): # if not stated otherwise in the arguments passed to this script self.remove_jobdir_if_not_resume() - #if library_mode: - # self.crawler = crawler_class - # self.library_mode = library_mode - #else: self.load_crawler(crawler_class, site["url"], ignore_regex) @@ -237,16 +239,6 @@ class SingleCrawler(object): self.log.info("Removed " + jobdir + " since '--resume' was not passed to" " initial.py or this crawler was daemonized.") - def library_download_urls(self, urls): - """ - Downloads one or more articles given the urls - :param urls: - :return: - """ - if not self.library_mode: - sys.exit("invoked library_download_urls without being in library mode") - self.load_crawler(self.crawler, urls, False) - self.process.start() if __name__ == "__main__": SingleCrawler(cfg_file_path=sys.argv[1], From 9d26f8dbf2268993b806fa5e20a08dca23dc2dd4 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:49:23 +0100 Subject: [PATCH 4/9] add library download --- newsplease/config/config_lib.cfg | 324 +++++++++++++++++++++++++++++++ newsplease/newspleaselib.py | 3 - newsplease/single_crawler.py | 2 +- 3 files changed, 325 insertions(+), 4 deletions(-) create mode 100644 newsplease/config/config_lib.cfg diff --git a/newsplease/config/config_lib.cfg b/newsplease/config/config_lib.cfg new file mode 100644 index 0000000..62e3c98 --- /dev/null +++ b/newsplease/config/config_lib.cfg @@ -0,0 +1,324 @@ +# !!! DO NOT CHANGE THIS FILE !!! +# if you want to change news-please's options, you should run it first and change +# the config.cfg file that is created on the first run of news-please (by default the config file will be in +# [HOMEDIR]/news-please/config/config.cfg +# !!! NEVER CHANGE THE config_lib.cfg FILE !!! news-please uses this when run in library mode + + +# IMPORTANT +# All variables get parsed to the correct python-types (if not other declared)! +# So bools have to be True or False (uppercase-first), +# Floats need dots . (not comma) +# Ints are just normal ints +# dicts need to be like this { key: value } +# arrays need to be like this [ value1, value2, value3 ] +# All values in dicts and arrays will also be parsed. +# Everything that does not match any of the above criteria will be parsed as string. + + +[Crawler] + +# GENERAL +# ------- + +# Crawling heuristics +# Default Crawlers: +# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir) +# default: SitemapCrawler +default = SitemapCrawler + +# default: +# fallbacks = { +# "RssCrawler": None, +# "RecursiveSitemapCrawler": "RecursiveCrawler", +# "SitemapCrawler": "RecursiveCrawler", +# "RecursiveCrawler": None, +# "Download": None +# } +fallbacks = { + "RssCrawler": None, + "RecursiveSitemapCrawler": "RecursiveCrawler", + "SitemapCrawler": "RecursiveCrawler", + "RecursiveCrawler": None, + "Download": None + } + +# Determines how many hours need to pass since the last download of a webpage +# to be downloaded again by the RssCrawler +# default: 6 +hours_to_pass_for_redownload_by_rss_crawler = 6 + + + +# PROCESSES +# --------- + +# Number of crawlers, that should crawl parallel +# not counting in daemonized crawlers +# default: 5 +number_of_parallel_crawlers = 5 + +# Number of daemons, will be added to daemons. 
+# default: 10 +number_of_parallel_daemons = 10 + + + +# SPECIAL CASES +# ------------- + +# urls which end on any of the following file extensions are ignored for recursive crawling +# default: "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)" +ignore_file_extensions = "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)" + +# urls which match the following regex are ignored for recursive crawling +# default: "" +ignore_regex = "" + +# Crawl the sitemaps of subdomains (if sitemap is enabled) +# If True, any SitemapCrawler will try to crawl on the sitemap of the given domain including subdomains instead of a domain's main sitemap. +# e.g. if True, a SitemapCrawler to be started on https://blog.zeit.de will try to crawl on the sitemap listed in http://blog.zeit.de/robots.txt. If not found, it will fall back to the False setting. +# if False, a SitemapCrawler to be started on https://blog.zeit.de will try to crawl on the sitemap listed in http://zeit.de/robots.txt +# default: True +sitemap_allow_subdomains = True + + + +[Heuristics] + +# Enabled heuristics, +# Currently: +# - og_type +# - linked_headlines +# - self_linked_headlines +# - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded) +# - meta_contains_article_keyword +# - crawler_contains_only_article_alikes +# (maybe not up-to-date, see ./newsplease/helper_classes/heursitics.py: +# Every method not starting with __ should be a heuristic, except is_article) +# These heuristics can be overwritten by sitelist.json for each site +# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"} +enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"} + +# Heuristics can be combined with others +# The heuristics need to have the same name as in enabled_heuristics +# Possible condition-characters / literals are: (, ), not, and, or +# All heuristics used here need to be enabled in enabled_heuristics as well! +# Examples: +# "og_type and (self_linked_headlines or linked_headlines)" +# "og_type" +# default: "og_type and (linked_headlines or self_linked_headlines)" +pass_heuristics_condition = "og_type and (linked_headlines or self_linked_headlines)" + +# The maximum ratio of headlines divided by linked_headlines in a file + +# The minimum number of headlines in a file to check for the ratio +# If less then this number are in the file, the file will pass the test.
+# default: 5 +min_headlines_for_linked_test = 5 + + + +[Files] + +# GENERAL: +# ------- + +# Paths: +# toggles relative paths to be relative to the start_processes.py script (True) or relative to this config file (False) +# This does not work for this config's 'Scrapy' section which is always relative to the dir the start_processes.py script is called from +# Default: True +relative_to_start_processes_file = True + + + +# INPUT: +# ----- + +# Here you can specify the input JSON-Filename +# default: sitelist.hjson +url_input_file_name = sitelist.hjson + + + +# OUTPUT: +# ------ + +# Toggles whether leading './' or '.\' from above local_data_directory should be removed when saving the path into the Database +# True: ./data would become data +# default: True +working_path = ~/news-please/ + +# Following Strings in the local_data_directory will be replaced: (md5 hashes have a standard length of 32 chars) +# +# %working_path = the path specified in OUTPUT["working_path"] +# %time_download() = current time at download; will be replaced with strftime() where is a string, explained further here: http://strftime.org/ +# %time_execution() = current time at execution; will be replaced with strftime() where is a string, explained further here: http://strftime.org/ +# %timestamp_download = current time at download; unix-timestamp +# %timestamp_execution = current time at execution; unix-timestamp +# %domain() = first chars of the domain of the crawled file (e.g. zeit.de) +# %appendmd5_domain() = appends the md5 to %domain(< - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than +# %md5_domain() = first chars of md5 hash of %domain +# %full_domain() = first chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de) +# %appendmd5_full_domain() = appends the md5 to %full_domain(< - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than +# %md5_full_domain() = first chars of md5 hash of %full_domain +# %subdomains() = first chars of the domain's subdomains +# %appendmd5_subdomains() = appends the md5 to %subdomains(< - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than +# %md5_subdomains() = first chars of md5 hash of %subdomains +# %url_directory_string() = first chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename +# %appendmd5_url_directory_string() = appends the md5 to %url_directory_string(< - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than +# %md5_url_directory_string() = first chars of md5 hash of %url_directory_string() +# %url_file_name() = first chars of the file name (without type) on the server (e.g. 
http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466, No filenames (indexes) will evaluate to index +# %md5_url_file_name() = first chars of md5 hash of %url_file_name +# %max_url_file_name = first x chars of %url_file_name, so the entire savepath has a length of the max possible length for a windows file system (260 characters - 1 ) +# %appendmd5_max_url_file_name = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire savepath has a length longer than the max possible length for a windows file system (260 characters - 1 ) +# +# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent on all datasets. +# To be able to use cleanup commands, it should also start with a static folder name like 'data'. +# +# default: %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html +local_data_directory = %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html + +# Toggles whether leading './' or '.\' from above local_data_directory should be removed when saving the path into the Database +# True: ./data would become data +# default: True +format_relative_path = True + + + +[MySQL] + +# MySQL-Connection required for saving meta-informations +host = localhost +port = 3306 +db = 'news-please' +username = 'root' +password = 'password' + + + +[Elasticsearch] + +# Elasticsearch-Connection required for saving detailed meta-information +host = localhost +port = 9200 +index_current = 'news-please' +index_archive = 'news-please-archive' + +# Elasticsearch supports user authentication by CA certificates. If your database is protected by certificate +# fill in the following parameters, otherwise you can ignore them. +use_ca_certificates = False +ca_cert_path = /path/to/cacert.pem +client_cert_path = /path/to/client_cert.pem +client_key_path = /path/to/client_key.pem +username = 'root' +secret = 'password' + +# Properties of the document type used for storage. +mapping = { + 'url': {'type': 'string', 'index': 'not_analyzed'}, + 'sourceDomain': {'type': 'string', 'index': 'not_analyzed'}, + 'pageTitle': {'type': 'string'}, + 'rss_title': {'type': 'string'}, + 'localpath': {'type': 'string', 'index' : 'not_analyzed'}, + 'ancestor': {'type': 'string'}, + 'descendant': {'type': 'string'}, + 'version': {'type': 'long'}, + 'downloadDate': {'type': 'date', "format":"yyyy-MM-dd HH:mm:ss"}, + 'modifiedDate': {'type': 'date', "format":"yyyy-MM-dd HH:mm:ss"}, + 'publish_date': {'type': 'date', "format":"yyyy-MM-dd HH:mm:ss"}, + 'title': {'type': 'string'}, + 'description': {'type': 'string'}, + 'text': {'type': 'string'}, + 'author': {'type': 'string'}, + 'image': {'type': 'string', 'index' : 'not_analyzed'}, + 'language': {'type': 'string', 'index' : 'not_analyzed'} + } + + + +[ArticleMasterExtractor] + +# Choose which extractors you want to use. +# +# The Default is ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'], +# which are all integrated extractors right now. 
+# Possibly extractors are 'newspaper_extractor' , 'readability_extractor' , 'date_extractor_extractor and 'lang_detect_extractor' +# Examples: -Only Newspaper and date_extractor: extractors = ['newspaper', 'date_extractor'] +# -Only Newspaper: extractors = ['newspaper'] +extractors = ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'] + + + +[DateFilter] + +# If added to the pipeline, this module provides the means to filter the extracted articles based on the publishing date. +# Therefore this module has to be placed after the KM4 article extractor to access the publishing dates. +# +# All articles, with a publishing date outside of the given time interval are dropped. The dates used to specify the +# time interval are included and should follow this format: 'yyyy-mm-dd hh:mm:ss'. +# +# It is also possible to only define one date, assigning the other variable the value 'None' to create an half-bounded +# interval. + +start_date = '1999-01-01 00:00:00' +end_date = '2999-12-31 00:00:00' + +# If 'True' articles without a publishing date are dropped. +strict_mode = False + + + +[Scrapy] + +# Possible levels (must be UC-only): CRITICAL, ERROR, WARNING, INFO, DEBUG +# default: WARNING +LOG_LEVEL = INFO + +# logformat, see https://docs.python.org/2/library/logging.html#logrecord-attributes +# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s +LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s + +# Can be a filename or None +# default: None +LOG_FILE = None + +LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S + +LOG_STDOUT = False + +LOG_ENCODING = utf-8 + +BOT_NAME = 'news-please' + +SPIDER_MODULES = ['newsplease.crawler.spiders'] +NEWSPIDER_MODULE = 'newsplease.crawler.spiders' + +# Resume/Pause functionality activation +# default: .resume_jobdir +JOBDIRNAME = .resume_jobdir + +# Respect robots.txt activation +# default: False +ROBOTSTXT_OBEY=True + +# Maximum number of concurrent requests across all domains +# default: 16 +# IMPORTANT: This setting does not work since each crawler has its own scrapy instance, but it might limit the concurrent_requests_per_domain if said setting has a higher number set than this one.
+CONCURRENT_REQUESTS=16 + +# Maximum number of active requests per domain +# default: 4 +CONCURRENT_REQUESTS_PER_DOMAIN=4 + +# User-agent activation +# default: 'news-please (+http://www.example.com/)' +USER_AGENT = 'news-please (+http://www.example.com/)' + +# Pipeline activation +# Syntax: '.': +# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300} +# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350 +ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, + 'newsplease.pipeline.pipelines.InMemoryStorage':200 + } \ No newline at end of file diff --git a/newsplease/newspleaselib.py b/newsplease/newspleaselib.py index babd329..9b3a193 100644 --- a/newsplease/newspleaselib.py +++ b/newsplease/newspleaselib.py @@ -40,6 +40,3 @@ class NewsPleaseLib: articles.append(article) print(article['title']) return articles - -if __name__ == '__main__': - NewsPleaseLib.download_article('www.zeit.de') diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py index 61558a0..ec71bf8 100644 --- a/newsplease/single_crawler.py +++ b/newsplease/single_crawler.py @@ -58,7 +58,7 @@ class SingleCrawler(object): "crawler": "Download", "url": url } - return cls('config/config.cfg', site, 0, False, False, True) + return cls('config/config_lib.cfg', site, 0, False, False, True) def __init__(self, cfg_file_path, json_file_path, site_index, shall_resume, daemonize, library_mode=False): From 87816445d9b9a5192254e30b9ea29ef94af1615b Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:52:54 +0100 Subject: [PATCH 5/9] add library download --- newsplease/config/config_lib.cfg | 2 +- newsplease/config/sitelist.hjson | 40 ++++++++++++++++++++++++++++++-- newsplease/newspleaselib.py | 3 +++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/newsplease/config/config_lib.cfg b/newsplease/config/config_lib.cfg index 62e3c98..95d728d 100644 --- a/newsplease/config/config_lib.cfg +++ b/newsplease/config/config_lib.cfg @@ -273,7 +273,7 @@ strict_mode = False # Possible levels (must be UC-only): CRITICAL, ERROR, WARNING, INFO, DEBUG # default: WARNING -LOG_LEVEL = INFO +LOG_LEVEL = ERROR # logformat, see https://docs.python.org/2/library/logging.html#logrecord-attributes # default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s diff --git a/newsplease/config/sitelist.hjson b/newsplease/config/sitelist.hjson index 1eac4c0..3af27e9 100644 --- a/newsplease/config/sitelist.hjson +++ b/newsplease/config/sitelist.hjson @@ -6,10 +6,46 @@ "base_urls" : [ { # Start crawling from faz.net - "url": "https://www.nytimes.com/2017/02/22/us/politics/devos-sessions-transgender-students-rights.html?hp", + "url": "http://www.faz.net/", # Overwrite the default crawler and use th RecursiveCrawler instead - "crawler": "Download", + "crawler": "RecursiveCrawler", + + # Because this site is weirt, use the + # meta_contains_article_keyword-heuristic and disable all others because + # overwrite will merge the defaults from "newscrawler.cfg" with + # this + "overwrite_heuristics": { + "meta_contains_article_keyword": true, + "og_type": false, + "linked_headlines": false, + "self_linked_headlines": false + }, + # Also state that in the condition, all heuristics used in the condition + # have to be activated in "overwrite_heuristics" (or default) as well. 
+ "pass_heuristics_condition": "meta_contains_article_keyword" + }, + { + # zeit.de has a blog which we do not want to crawl + "url": "http://www.zeit.de", + + "overwrite_heuristics": { + # because we do not want to crawl that blog, disable all downloads from + # subdomains + "is_not_from_subdomain": true + }, + # Update the condition as well, all the other heuristics are enabled in + # newscrawler.cfg + "pass_heuristics_condition": "is_not_from_subdomain and og_type and self_linked_headlines and linked_headlines" + }, + { + # nytimes.com should run pretty well with default config: + "url": "http://www.nytimes.com/" + + # to create an additional RssCrawler daemon for this site that runs every hour, we could either use + # "additional_rss_daemon": 3600 + # or create an additional array-object with "crawler": "RssCrawler" and "daemonize": 3600 + # it is not possible to create an additional_rss_daemon for a daemonized array-object } ] } diff --git a/newsplease/newspleaselib.py b/newsplease/newspleaselib.py index 9b3a193..83d39e1 100644 --- a/newsplease/newspleaselib.py +++ b/newsplease/newspleaselib.py @@ -40,3 +40,6 @@ class NewsPleaseLib: articles.append(article) print(article['title']) return articles + +if __name__ == '__main__': + NewsPleaseLib.download_article('http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan') \ No newline at end of file From 2a329740f87d0a84a385cf169aad41cd4d703bc5 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:54:44 +0100 Subject: [PATCH 6/9] add library download --- newsplease/config/config.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/newsplease/config/config.cfg b/newsplease/config/config.cfg index 0af4246..31ef799 100644 --- a/newsplease/config/config.cfg +++ b/newsplease/config/config.cfg @@ -314,5 +314,6 @@ USER_AGENT = 'news-please (+http://www.example.com/)' # default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300} # Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350 ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, - 'newsplease.pipeline.pipelines.InMemoryStorage':200 + 'newsplease.pipeline.pipelines.LocalStorage':200, + 'newsplease.pipeline.pipelines.JsonFileStorage':300 } \ No newline at end of file From c19f841074a99d01b395cc640e3bc4d714ca6bf7 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Thu, 23 Feb 2017 18:56:28 +0100 Subject: [PATCH 7/9] add library download --- newsplease/single_crawler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/newsplease/single_crawler.py b/newsplease/single_crawler.py index ec71bf8..a6c2260 100644 --- a/newsplease/single_crawler.py +++ b/newsplease/single_crawler.py @@ -35,7 +35,6 @@ class SingleCrawler(object): json = None log = None crawler_name = None - crawler = None process = None helper = None cfg_file_path = None From 6af8f7c520bf4f3de29bada7c4c988cec79e5392 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Fri, 24 Feb 2017 11:58:30 +0100 Subject: [PATCH 8/9] remove api dir --- newsplease/api/__init__.py | 0 newsplease/api/server.py | 37 ------------------------------------- 2 files changed, 37 deletions(-) delete mode 100644 newsplease/api/__init__.py delete mode 100644 newsplease/api/server.py diff --git a/newsplease/api/__init__.py b/newsplease/api/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git 
a/newsplease/api/server.py b/newsplease/api/server.py deleted file mode 100644 index 865245d..0000000 --- a/newsplease/api/server.py +++ /dev/null @@ -1,37 +0,0 @@ -from extractor.document import Document -from extractor.five_w_extractor import FiveWExtractor -from flask import Flask, request, jsonify -import logging - - -app = Flask(__name__) -log = logging.getLogger(__name__) -host = None -port = 5001 -debug = False -options = None -extractor = FiveWExtractor() -ch = logging.StreamHandler() -ch.setLevel(logging.DEBUG) -log.addHandler(ch) -log.setLevel(logging.DEBUG) - - -def run(): - log.info("starting server on port %i", port) - app.run(host, port, debug) - log.info("server has stopped") - - -@app.route('/crawl', methods=['GET', 'POST']) -def extract(): - json_article = request.get_json() - log.debug("retrieved raw article for extraction: %s", json_article['title']) - - document = Document(json_article['title'], json_article['description'], json_article['text']) - extractor.parse(document) - - return jsonify(document.questions) - -if __name__ == "__main__": - run() From cdb173d08a0ba2e1b118e00a8ed2c1880bae6e56 Mon Sep 17 00:00:00 2001 From: Felix Hamborg Date: Fri, 24 Feb 2017 12:05:40 +0100 Subject: [PATCH 9/9] add library description --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dccee69..830149d 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,17 @@ ## Features * **works out of the box**: install with pip, add URLs of your pages, run :-) +* execute it conveniently with the **CLI** or use it as a **library** within your own software + +### CLI mode * stores extracted results in **JSON files or ElasticSearch** (other storages can be added easily) * **simple but extensive configuration** (if you want to tweak the results) * runs on your favorite Python version (2.7+ and 3+) * revisions: crawl articles multiple times and track changes +### Library mode +* crawl and extract information for a list of article URLs (currently the fullsite-crawling is only supported via the CLI) + ## Getting started It's super easy, we promise! @@ -27,7 +33,14 @@ It's super easy, we promise! $ sudo pip install news-please ``` -### Run the crawler +### Use within your own code +``` +from newsplease import NewsPleaseLib +article = NewsPleaseLib.download_article('https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp') +print(article['title']) +``` + +### Run the crawler (CLI) ``` $ news-please