Mirror of https://github.com/fhamborg/news-please.git (synced 2021-09-19 22:26:00 +03:00)

Merge branch 'master' of https://github.com/fhamborg/news-please

README.md (24 changed lines)
@@ -12,11 +12,17 @@

## Features
* **works out of the box**: install with pip, add URLs of your pages, run :-)
* execute it conveniently with the **CLI** or use it as a **library** within your own software

### CLI mode
* stores extracted results in **JSON files or ElasticSearch** (other storages can be added easily)
* **simple but extensive configuration** (if you want to tweak the results)
* runs on your favorite Python version (2.7+ and 3+)
* revisions: crawl articles multiple times and track changes

### Library mode
* crawl and extract information for a list of article URLs (currently, full-site crawling is only supported via the CLI)

## Getting started

It's super easy, we promise!
@@ -27,7 +33,14 @@ It's super easy, we promise!

$ sudo pip install news-please
```

### Run the crawler
### Use within your own code

```
from newsplease import NewsPleaseLib
article = NewsPleaseLib.download_article('https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp')
print(article['title'])
```
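
The commit also adds `NewsPleaseLib.download_articles` (see `newsplease/newspleaselib.py` further down in this diff) for fetching several article URLs at once. A minimal sketch of how that could be used — this is an editorial example, not part of the README change, and the URL list is just a placeholder:

```
from newsplease import NewsPleaseLib

# hypothetical list of article URLs; any list works the same way
urls = [
    'https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp',
    'http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan',
]
articles = NewsPleaseLib.download_articles(urls)
for article in articles:
    print(article['title'])
```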

### Run the crawler (CLI)

```
$ news-please
@@ -47,10 +60,11 @@ news-please also supports export to ElasticSearch. Using Elasticsearch will also

[Scrapy]

ITEM_PIPELINES = {'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100,
                  'newscrawler.pipeline.pipelines.LocalStorage':200,
                  'newscrawler.pipeline.pipelines.ElasticSearchStorage':350
                  }
ITEM_PIPELINES = {
    'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100,
    'newscrawler.pipeline.pipelines.LocalStorage':200,
    'newscrawler.pipeline.pipelines.ElasticSearchStorage':350
}

That's it, unless your Elasticsearch database is not located at `http://localhost:9200`, uses a different username / password, or requires CA-certificate authentication. In these cases, you will also need to change the following.
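
The settings referred to here are the `host`, `port`, `username`, `secret`, and certificate keys of the `[Elasticsearch]` config section (shown in `config_lib.cfg` below). As an editorial aside — not part of this commit, and assuming the `elasticsearch` Python client is installed — you can sanity-check the same connection values outside of news-please like this:

```
from elasticsearch import Elasticsearch

# Placeholder values: use the same host/port/credentials/CA paths
# that you put into the [Elasticsearch] section of your config.
es = Elasticsearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    http_auth=('root', 'password'),
    use_ssl=True,                     # only if your cluster requires TLS
    ca_certs='/path/to/cacert.pem',   # only with CA-certificate authentication
)
print(es.info())
```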
@@ -25,7 +25,7 @@ if sys.version_info[0] < 3:
    ConnectionError = OSError


class NewsPlease(object):
class NewsPleaseLauncher(object):
    """
    This class is supposed to be called initially to start all processes. It
    sets up and manages all crawlers.
@@ -50,11 +50,12 @@ class NewsPlease(object):
    number_of_active_crawlers = 0
    config_directory_default_path = "~/news-please/config/"
    config_file_default_name = "config.cfg"
    library_mode = None

    __single_crawler = False

    def __init__(self, cfg_directory_path, is_resume, is_reset_elasticsearch, is_reset_json, is_reset_mysql,
                 is_no_confirm):
                 is_no_confirm, library_mode=False):
        """
        The constructor of the main class, thus the real entry point to the tool.
        :param cfg_file_path:
@@ -64,13 +65,13 @@ class NewsPlease(object):
        :param is_reset_mysql:
        :param is_no_confirm:
        """
        # print("newsplease is starting on Python " + sys.version)
        configure_logging({"LOG_LEVEL": "ERROR"})
        self.log = logging.getLogger(__name__)

        # other parameters
        self.shall_resume = is_resume
        self.no_confirm = is_no_confirm
        self.library_mode = library_mode

        # Sets an environmental variable called 'CColon', so scripts can import
        # modules of this project in relation to this script's dir
@@ -120,8 +121,7 @@ class NewsPlease(object):
        self.crawler_list = self.CrawlerList()
        self.daemon_list = self.DaemonList()

        self.__single_crawler = self.get_abs_file_path("./single_crawler.py",
                                                       True, False)
        self.__single_crawler = self.get_abs_file_path("./single_crawler.py", True, False)

        self.manage_crawlers()

@@ -284,9 +284,14 @@ class NewsPlease(object):
        if os.path.exists(self.cfg_directory_path):
            return

        sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. "
                         + "Should a default config directory be created at this path? [Y/n]")
        user_choice = input().lower().replace("yes", "y").replace("no", "n")
        user_choice = 'n'
        if self.no_confirm:
            user_choice = 'y'
        else:
            sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. "
                             + "Should a default config directory be created at this path? [Y/n]")
            user_choice = input().lower().replace("yes", "y").replace("no", "n")

        if not user_choice or user_choice == '':  # the default is yes
            user_choice = "y"
        if "y" not in user_choice and "n" not in user_choice:
@@ -621,7 +626,7 @@ def cli(cfg_file_path: ('path to the config file', 'option', 'c'),
    if cfg_file_path and not cfg_file_path.endswith(os.path.sep):
        cfg_file_path += os.path.sep

    NewsPlease(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm)
    NewsPleaseLauncher(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm)

    pass
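
The renamed launcher now takes a `library_mode` flag and honours `is_no_confirm` when the config directory has to be created. As an editorial sketch of the new signature shown above — the import path is an assumption (the class lives in the file the log messages call `initial.py`), and this code is not part of the commit:

```
# Assumption: NewsPleaseLauncher is importable from the module that used to expose NewsPlease.
from newsplease.initial import NewsPleaseLauncher  # hypothetical import path

launcher = NewsPleaseLauncher(
    "~/news-please/config/",  # cfg_directory_path (the documented default)
    False,                    # is_resume
    False,                    # is_reset_elasticsearch
    False,                    # is_reset_json
    False,                    # is_reset_mysql
    True,                     # is_no_confirm: create a missing config dir without prompting
    library_mode=True,
)
# Note: per the hunk above, the constructor ends by calling manage_crawlers(),
# so crawling starts as soon as the launcher is constructed.
```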
@@ -1,37 +0,0 @@
from extractor.document import Document
from extractor.five_w_extractor import FiveWExtractor
from flask import Flask, request, jsonify
import logging


app = Flask(__name__)
log = logging.getLogger(__name__)
host = None
port = 5001
debug = False
options = None
extractor = FiveWExtractor()
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
log.addHandler(ch)
log.setLevel(logging.DEBUG)


def run():
    log.info("starting server on port %i", port)
    app.run(host, port, debug)
    log.info("server has stopped")


@app.route('/crawl', methods=['GET', 'POST'])
def extract():
    json_article = request.get_json()
    log.debug("retrieved raw article for extraction: %s", json_article['title'])

    document = Document(json_article['title'], json_article['description'], json_article['text'])
    extractor.parse(document)

    return jsonify(document.questions)


if __name__ == "__main__":
    run()
newsplease/config/config_lib.cfg (new file, 324 lines)

@@ -0,0 +1,324 @@
# !!! DO NOT CHANGE THIS FILE !!!
# if you want to change news-please's options, you should run it first and change
# the config.cfg file that is created on the first run of news-please (by default the config file will be in
# [HOMEDIR]/news-please/config/config.cfg)
# !!! NEVER CHANGE THE config_lib.cfg FILE !!! news-please uses this when run in library mode


# IMPORTANT
# All variables get parsed to the correct Python types (unless declared otherwise)!
# So bools have to be True or False (uppercase first),
# floats need dots . (not commas),
# ints are just normal ints,
# dicts need to be like this: { key: value }
# arrays need to be like this: [ value1, value2, value3 ]
# All values in dicts and arrays will also be parsed.
# Everything that does not match any of the above criteria will be parsed as a string.

[Crawler]

# GENERAL
# -------

# Crawling heuristics
# Default Crawlers:
# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (see ./newsplease/crawler/spiders/)
# default: SitemapCrawler
default = SitemapCrawler

# default:
# fallbacks = {
#     "RssCrawler": None,
#     "RecursiveSitemapCrawler": "RecursiveCrawler",
#     "SitemapCrawler": "RecursiveCrawler",
#     "RecursiveCrawler": None,
#     "Download": None
# }
fallbacks = {
    "RssCrawler": None,
    "RecursiveSitemapCrawler": "RecursiveCrawler",
    "SitemapCrawler": "RecursiveCrawler",
    "RecursiveCrawler": None,
    "Download": None
    }

# Determines how many hours need to pass since the last download of a webpage
# before the RssCrawler downloads it again
# default: 6
hours_to_pass_for_redownload_by_rss_crawler = 6



# PROCESSES
# ---------

# Number of crawlers that should crawl in parallel,
# not counting daemonized crawlers
# default: 5
number_of_parallel_crawlers = 5

# Number of daemonized crawlers that may run in parallel
# default: 10
number_of_parallel_daemons = 10

# SPECIAL CASES
# -------------

# URLs which end in any of the following file extensions are ignored for recursive crawling
# default: "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
ignore_file_extensions = "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"

# URLs which match the following regex are ignored for recursive crawling
# default: ""
ignore_regex = ""

# Crawl the sitemaps of subdomains (if sitemap crawling is enabled)
# If True, any SitemapCrawler will try to crawl the sitemap of the given domain including subdomains instead of the domain's main sitemap.
# e.g. if True, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://blog.zeit.de/robots.txt. If that is not found, it will fall back to the False behaviour.
# If False, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://zeit.de/robots.txt.
# default: True
sitemap_allow_subdomains = True

[Heuristics]

# Enabled heuristics.
# Currently:
#   - og_type
#   - linked_headlines
#   - self_linked_headlines
#   - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded)
#   - meta_contains_article_keyword
#   - crawler_contains_only_article_alikes
# (maybe not up-to-date, see ./newsplease/helper_classes/heuristics.py:
#  every method not starting with __ should be a heuristic, except is_article)
# These heuristics can be overwritten by sitelist.json for each site.
# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}

# Heuristics can be combined with others.
# The heuristics need to have the same name as in enabled_heuristics.
# Possible condition characters / literals are: (, ), not, and, or
# All heuristics used here need to be enabled in enabled_heuristics as well!
# Examples:
#   "og_type and (self_linked_headlines or linked_headlines)"
#   "og_type"
# default: "og_type and (linked_headlines or self_linked_headlines)"
pass_heuristics_condition = "og_type and (linked_headlines or self_linked_headlines)"

# The maximum ratio of headlines divided by linked_headlines in a file

# The minimum number of headlines in a file to check for the ratio.
# If fewer than this number are in the file, the file will pass the test.
# default: 5
min_headlines_for_linked_test = 5

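The condition string above only combines heuristic names with `and`, `or`, `not`, and parentheses. As an editorial illustration of those semantics — not the project's actual implementation — such a condition can be evaluated once each enabled heuristic has produced a boolean result for a page:

```
# Editor's sketch, not news-please code.
heuristic_results = {           # hypothetical outcome for one crawled page
    "og_type": True,
    "linked_headlines": False,
    "self_linked_headlines": True,
}

condition = "og_type and (linked_headlines or self_linked_headlines)"

# The condition only uses heuristic names plus and/or/not/parentheses,
# so the results dict can serve as the evaluation namespace.
passes = eval(condition, {"__builtins__": {}}, heuristic_results)
print(passes)  # True
```
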
[Files]

# GENERAL:
# -------

# Paths:
# Toggles whether relative paths are relative to the start_processes.py script (True) or relative to this config file (False).
# This does not work for this config's 'Scrapy' section, which is always relative to the dir the start_processes.py script is called from.
# default: True
relative_to_start_processes_file = True



# INPUT:
# -----

# Here you can specify the input JSON filename
# default: sitelist.hjson
url_input_file_name = sitelist.hjson



# OUTPUT:
# ------

# The working path of news-please; settings below can reference it as %working_path
working_path = ~/news-please/

# The following strings in local_data_directory will be replaced (md5 hashes have a standard length of 32 chars):
#
# %working_path = the path specified in OUTPUT["working_path"]
# %time_download(<code>) = current time at download; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %time_execution(<code>) = current time at execution; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %timestamp_download = current time at download; unix timestamp
# %timestamp_execution = current time at execution; unix timestamp
# %domain(<size>) = first <size> chars of the domain of the crawled file (e.g. zeit.de)
# %appendmd5_domain(<size>) = appends the md5 to %domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than <size>
# %md5_domain(<size>) = first <size> chars of the md5 hash of %domain
# %full_domain(<size>) = first <size> chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de)
# %appendmd5_full_domain(<size>) = appends the md5 to %full_domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than <size>
# %md5_full_domain(<size>) = first <size> chars of the md5 hash of %full_domain
# %subdomains(<size>) = first <size> chars of the domain's subdomains
# %appendmd5_subdomains(<size>) = appends the md5 to %subdomains(<<size> - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than <size>
# %md5_subdomains(<size>) = first <size> chars of the md5 hash of %subdomains
# %url_directory_string(<size>) = first <size> chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename
# %appendmd5_url_directory_string(<size>) = appends the md5 to %url_directory_string(<<size> - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than <size>
# %md5_url_directory_string(<size>) = first <size> chars of the md5 hash of %url_directory_string(<size>)
# %url_file_name(<size>) = first <size> chars of the file name (without type) on the server (e.g. http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466); URLs without a filename (indexes) evaluate to index
# %md5_url_file_name(<size>) = first <size> chars of the md5 hash of %url_file_name
# %max_url_file_name = first x chars of %url_file_name, so that the entire save path has the maximum possible length for a Windows file system (260 characters - 1 <NUL>)
# %appendmd5_max_url_file_name = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire save path is longer than the maximum possible length for a Windows file system (260 characters - 1 <NUL>)
#
# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent on all datasets.
# To be able to use cleanup commands, it should also start with a static folder name like 'data'.
#
# default: %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
local_data_directory = %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html

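As an editorial illustration (not part of the config file), the individual placeholder pieces described above map onto standard Python calls; zeit.de is just a hypothetical crawled domain:

```
import hashlib
import time

domain = "zeit.de"  # hypothetical crawled domain

print(time.strftime("%Y"), time.strftime("%m"), time.strftime("%d"))  # %time_execution(%Y) / (%m) / (%d)
print(int(time.time()))                                               # %timestamp_download / %timestamp_execution
print(hashlib.md5(domain.encode("utf-8")).hexdigest()[:32])           # %md5_domain(32)
```
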
# Toggles whether a leading './' or '.\' from the above local_data_directory should be removed when saving the path into the database
# True: ./data would become data
# default: True
format_relative_path = True

[MySQL]

# MySQL connection, required for saving meta-information
host = localhost
port = 3306
db = 'news-please'
username = 'root'
password = 'password'

[Elasticsearch]

# Elasticsearch connection, required for saving detailed meta-information
host = localhost
port = 9200
index_current = 'news-please'
index_archive = 'news-please-archive'

# Elasticsearch supports user authentication by CA certificates. If your database is protected by certificates,
# fill in the following parameters, otherwise you can ignore them.
use_ca_certificates = False
ca_cert_path = /path/to/cacert.pem
client_cert_path = /path/to/client_cert.pem
client_key_path = /path/to/client_key.pem
username = 'root'
secret = 'password'

# Properties of the document type used for storage.
mapping = {
    'url': {'type': 'string', 'index': 'not_analyzed'},
    'sourceDomain': {'type': 'string', 'index': 'not_analyzed'},
    'pageTitle': {'type': 'string'},
    'rss_title': {'type': 'string'},
    'localpath': {'type': 'string', 'index': 'not_analyzed'},
    'ancestor': {'type': 'string'},
    'descendant': {'type': 'string'},
    'version': {'type': 'long'},
    'downloadDate': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
    'modifiedDate': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
    'publish_date': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
    'title': {'type': 'string'},
    'description': {'type': 'string'},
    'text': {'type': 'string'},
    'author': {'type': 'string'},
    'image': {'type': 'string', 'index': 'not_analyzed'},
    'language': {'type': 'string', 'index': 'not_analyzed'}
    }

[ArticleMasterExtractor]

# Choose which extractors you want to use.
#
# The default is ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'],
# which are all of the currently integrated extractors.
# Possible extractors are 'newspaper_extractor', 'readability_extractor', 'date_extractor' and 'lang_detect_extractor'.
# Examples: - only Newspaper and the date extractor: extractors = ['newspaper', 'date_extractor']
#           - only Newspaper: extractors = ['newspaper']
extractors = ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor']

[DateFilter]

# If added to the pipeline, this module provides the means to filter the extracted articles based on the publishing date.
# Therefore this module has to be placed after the ArticleMasterExtractor in the pipeline to access the publishing dates.
#
# All articles with a publishing date outside of the given time interval are dropped. The dates used to specify the
# time interval are inclusive and should follow this format: 'yyyy-mm-dd hh:mm:ss'.
#
# It is also possible to only define one date, assigning the other variable the value 'None' to create a half-bounded
# interval.

start_date = '1999-01-01 00:00:00'
end_date = '2999-12-31 00:00:00'

# If 'True', articles without a publishing date are dropped.
strict_mode = False

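As an editorial illustration of the interval semantics described above (inclusive bounds, 'None' for a half-bounded interval) — this is not the module's actual code:

```
from datetime import datetime

# Hypothetical publishing date of one extracted article
publish_date = datetime.strptime('2017-02-23 10:30:00', '%Y-%m-%d %H:%M:%S')

start_date = datetime.strptime('1999-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
end_date = None  # None = half-bounded interval, no upper limit

keep = (start_date is None or publish_date >= start_date) and \
       (end_date is None or publish_date <= end_date)
print(keep)  # True
```
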
[Scrapy]

# Possible levels (must be uppercase): CRITICAL, ERROR, WARNING, INFO, DEBUG
# default: WARNING
LOG_LEVEL = ERROR

# Log format, see https://docs.python.org/2/library/logging.html#logrecord-attributes
# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s
LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s

# Can be a filename or None
# default: None
LOG_FILE = None

LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S

LOG_STDOUT = False

LOG_ENCODING = utf-8

BOT_NAME = 'news-please'

SPIDER_MODULES = ['newsplease.crawler.spiders']
NEWSPIDER_MODULE = 'newsplease.crawler.spiders'

# Resume/pause functionality activation
# default: .resume_jobdir
JOBDIRNAME = .resume_jobdir

# Respect robots.txt activation
# default: False
ROBOTSTXT_OBEY = True

# Maximum number of concurrent requests across all domains
# default: 16
# IMPORTANT: This setting does not work since each crawler has its own scrapy instance, but it might limit concurrent_requests_per_domain if that setting has a higher number set than this one.
CONCURRENT_REQUESTS = 16

# Maximum number of active requests per domain
# default: 4
CONCURRENT_REQUESTS_PER_DOMAIN = 4

# User-agent activation
# default: 'news-please (+http://www.example.com/)'
USER_AGENT = 'news-please (+http://www.example.com/)'

# Pipeline activation
# Syntax: '<relative location>.<pipeline name>': <order of execution from 0-1000>
# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.pipeline.pipelines.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300}
# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350
ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
                  'newsplease.pipeline.pipelines.InMemoryStorage':200
                  }

newsplease/newspleaselib.py (new file, 45 lines)

@@ -0,0 +1,45 @@
import sys
import os

sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from newsplease.pipeline.pipelines import InMemoryStorage
from newsplease.single_crawler import SingleCrawler


class NewsPleaseLib:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def download_article(url):
        """
        Crawls the article from the url and extracts relevant information.
        :param url:
        :return:
        """
        SingleCrawler.create_as_library(url)
        results = InMemoryStorage.get_results()
        article = results[url]
        del results[url]
        return article

    @staticmethod
    def download_articles(urls):
        """
        Crawls articles from the urls and extracts relevant information.
        :param urls:
        :return:
        """
        SingleCrawler.create_as_library(urls)
        results = InMemoryStorage.get_results()
        articles = []
        for url in urls:
            article = results[url]
            del results[url]
            articles.append(article)
            print(article['title'])
        return articles


if __name__ == '__main__':
    NewsPleaseLib.download_article('http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')

@@ -1,7 +1,6 @@
import logging
from .abstract_extractor import AbstractExtractor
from ..article_candidate import ArticleCandidate
# Import Newspaper Article Extractor Library.
from newspaper import Article



@@ -292,6 +292,7 @@ class ExtractedInformationStorage(object):
    def extract_relevant_info(item):
        """
        extracts from an item only fields that we want to output as extracted information
        :rtype: object
        :param item:
        :return:
        """
@@ -314,6 +315,27 @@ class ExtractedInformationStorage(object):
        }


class InMemoryStorage(ExtractedInformationStorage):
    """
    Stores extracted information in a dictionary in memory - for use with library mode.
    """

    results = {}  # this is a static variable

    def process_item(self, item, spider):
        # get the original url, so that the library class (or whoever wants to read this) can access the article
        if 'redirect_urls' in item._values['spider_response'].meta:
            url = item._values['spider_response'].meta['redirect_urls'][0]
        else:
            url = item._values['url']
        InMemoryStorage.results[url] = ExtractedInformationStorage.extract_relevant_info(item)
        return item

    @staticmethod
    def get_results():
        return InMemoryStorage.results


class JsonFileStorage(ExtractedInformationStorage):
    """
    Handles remote storage of the data in Json files
@@ -34,7 +34,7 @@ class SingleCrawler(object):
    cfg = None
    json = None
    log = None
    crawler = None
    crawler_name = None
    process = None
    helper = None
    cfg_file_path = None
@@ -46,8 +46,21 @@
    shall_resume = False
    daemonize = False

    @classmethod
    def create_as_library(cls, url):
        """
        Creates a single crawler in library mode. Crawling will start immediately.
        :param url:
        :return:
        """
        site = {
            "crawler": "Download",
            "url": url
        }
        return cls('config/config_lib.cfg', site, 0, False, False, True)

    def __init__(self, cfg_file_path, json_file_path,
                 site_index, shall_resume, daemonize):
                 site_index, shall_resume, daemonize, library_mode=False):
        # set up logging before it's defined via the config file,
        # this will be overwritten and all other levels will be put out
        # as well, if it will be changed.
@@ -69,11 +82,16 @@

        self.cfg_crawler = self.cfg.section("Crawler")

        # load the URL-input-json-file
        self.json = JsonConfig.get_instance()
        self.json.setup(self.json_file_path)

        site = self.json.get_site_objects()[self.site_number]
        # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information
        # (kind of hacky..)
        if not library_mode:
            self.json = JsonConfig.get_instance()
            self.json.setup(self.json_file_path)
            sites = self.json.get_site_objects()
            site = sites[self.site_number]
        else:
            sites = [json_file_path]
            site = json_file_path

        if "ignore_regex" in site:
            ignore_regex = "(%s)|" % site["ignore_regex"]
@@ -83,13 +101,13 @@

        # Get the default crawler. The crawler can be overwritten by fallbacks.
        if "additional_rss_daemon" in site and self.daemonize:
            self.crawler = "RssCrawler"
            self.crawler_name = "RssCrawler"
        elif "crawler" in site:
            self.crawler = site["crawler"]
            self.crawler_name = site["crawler"]
        else:
            self.crawler = self.cfg.section("Crawler")["default"]
            self.crawler_name = self.cfg.section("Crawler")["default"]
        # Get the real crawler-class (already "fallen back")
        crawler_class = self.get_crawler(self.crawler, site["url"])
        crawler_class = self.get_crawler(self.crawler_name, site["url"])

        if not self.cfg.section('Files')['relative_to_start_processes_file']:
            relative_to_path = os.path.dirname(self.cfg_file_path)
@@ -101,7 +119,7 @@
            self.cfg.section("Files")["local_data_directory"],
            relative_to_path,
            self.cfg.section('Files')['format_relative_path'],
            self.json.get_site_objects(),
            sites,
            crawler_class,
            self.cfg.get_working_path())

@@ -116,7 +134,6 @@
        self.load_crawler(crawler_class,
                          site["url"],
                          ignore_regex)

        self.process.start()

    def update_jobdir(self, site):
@@ -133,7 +150,7 @@
        if not jobdirname.endswith("/"):
            jobdirname += "/"

        site_string = ''.join(site["url"]) + self.crawler
        site_string = ''.join(site["url"]) + self.crawler_name
        hashed = hashlib.md5(site_string.encode('utf-8'))

        self.__scrapy_options["JOBDIR"] = working_path + jobdirname + hashed.hexdigest()
@@ -221,6 +238,7 @@
        self.log.info("Removed " + jobdir + " since '--resume' was not passed to"
                      " initial.py or this crawler was daemonized.")


if __name__ == "__main__":
    SingleCrawler(cfg_file_path=sys.argv[1],
                  json_file_path=sys.argv[2],