Mirror of https://github.com/fhamborg/news-please.git (synced 2021-09-19 22:26:00 +03:00)

Merge branch 'master' of https://github.com/fhamborg/news-please

README.md (24 changed lines)
@@ -12,11 +12,17 @@

## Features
* **works out of the box**: install with pip, add URLs of your pages, run :-)
* execute it conveniently with the **CLI** or use it as a **library** within your own software

### CLI mode
* stores extracted results in **JSON files or ElasticSearch** (other storages can be added easily)
* **simple but extensive configuration** (if you want to tweak the results)
* runs on your favorite Python version (2.7+ and 3+)
* revisions: crawl articles multiple times and track changes

### Library mode
* crawl and extract information for a list of article URLs (currently, full-site crawling is only supported via the CLI)

## Getting started

It's super easy, we promise!
@@ -27,7 +33,14 @@ It's super easy, we promise!

$ sudo pip install news-please
```

### Run the crawler
### Use within your own code

```
from newsplease import NewsPleaseLib
article = NewsPleaseLib.download_article('https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp')
print(article['title'])
```
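
The commit also adds `NewsPleaseLib.download_articles` (see `newsplease/newspleaselib.py` further down in this diff) for fetching several article URLs at once. A minimal sketch of how that could be used — this is an editorial example, not part of the README change, and the URL list is just a placeholder:

```
from newsplease import NewsPleaseLib

# hypothetical list of article URLs; any list works the same way
urls = [
    'https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp',
    'http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan',
]
articles = NewsPleaseLib.download_articles(urls)
for article in articles:
    print(article['title'])
```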

### Run the crawler (CLI)

```
$ news-please
@@ -47,10 +60,11 @@ news-please also supports export to ElasticSearch. Using Elasticsearch will also

[Scrapy]

ITEM_PIPELINES = {'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100,
                  'newscrawler.pipeline.pipelines.LocalStorage':200,
                  'newscrawler.pipeline.pipelines.ElasticSearchStorage':350
                  }
ITEM_PIPELINES = {
    'newscrawler.pipeline.pipelines.ArticleMasterExtractor':100,
    'newscrawler.pipeline.pipelines.LocalStorage':200,
    'newscrawler.pipeline.pipelines.ElasticSearchStorage':350
}

That's it, unless your Elasticsearch database is not located at `http://localhost:9200`, uses a different username / password, or requires CA-certificate authentication. In these cases, you will also need to change the following.
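
The settings referred to here are the `host`, `port`, `username`, `secret`, and certificate keys of the `[Elasticsearch]` config section (shown in `config_lib.cfg` below). As an editorial aside — not part of this commit, and assuming the `elasticsearch` Python client is installed — you can sanity-check the same connection values outside of news-please like this:

```
from elasticsearch import Elasticsearch

# Placeholder values: use the same host/port/credentials/CA paths
# that you put into the [Elasticsearch] section of your config.
es = Elasticsearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    http_auth=('root', 'password'),
    use_ssl=True,                     # only if your cluster requires TLS
    ca_certs='/path/to/cacert.pem',   # only with CA-certificate authentication
)
print(es.info())
```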
@@ -25,7 +25,7 @@ if sys.version_info[0] < 3:
    ConnectionError = OSError


class NewsPlease(object):
class NewsPleaseLauncher(object):
    """
    This class is supposed to be called initially to start all processes. It
    sets up and manages all crawlers.
@@ -50,11 +50,12 @@ class NewsPlease(object):
    number_of_active_crawlers = 0
    config_directory_default_path = "~/news-please/config/"
    config_file_default_name = "config.cfg"
    library_mode = None

    __single_crawler = False

    def __init__(self, cfg_directory_path, is_resume, is_reset_elasticsearch, is_reset_json, is_reset_mysql,
                 is_no_confirm):
                 is_no_confirm, library_mode=False):
        """
        The constructor of the main class, thus the real entry point to the tool.
        :param cfg_file_path:
@@ -64,13 +65,13 @@ class NewsPlease(object):
        :param is_reset_mysql:
        :param is_no_confirm:
        """
        # print("newsplease is starting on Python " + sys.version)
        configure_logging({"LOG_LEVEL": "ERROR"})
        self.log = logging.getLogger(__name__)

        # other parameters
        self.shall_resume = is_resume
        self.no_confirm = is_no_confirm
        self.library_mode = library_mode

        # Sets an environmental variable called 'CColon', so scripts can import
        # modules of this project in relation to this script's dir
@@ -120,8 +121,7 @@ class NewsPlease(object):
        self.crawler_list = self.CrawlerList()
        self.daemon_list = self.DaemonList()

        self.__single_crawler = self.get_abs_file_path("./single_crawler.py",
                                                       True, False)
        self.__single_crawler = self.get_abs_file_path("./single_crawler.py", True, False)

        self.manage_crawlers()

@@ -284,9 +284,14 @@ class NewsPlease(object):
        if os.path.exists(self.cfg_directory_path):
            return

        sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. "
                         + "Should a default config directory be created at this path? [Y/n]")
        user_choice = input().lower().replace("yes", "y").replace("no", "n")
        user_choice = 'n'
        if self.no_confirm:
            user_choice = 'y'
        else:
            sys.stdout.write("Config directory or file does not exist at '" + os.path.abspath(self.cfg_directory_path) + "'. "
                             + "Should a default config directory be created at this path? [Y/n]")
            user_choice = input().lower().replace("yes", "y").replace("no", "n")

        if not user_choice or user_choice == '':  # the default is yes
            user_choice = "y"
        if "y" not in user_choice and "n" not in user_choice:
@@ -621,7 +626,7 @@ def cli(cfg_file_path: ('path to the config file', 'option', 'c'),
    if cfg_file_path and not cfg_file_path.endswith(os.path.sep):
        cfg_file_path += os.path.sep

    NewsPlease(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm)
    NewsPleaseLauncher(cfg_file_path, resume, reset_elasticsearch, reset_json, reset_mysql, no_confirm)

    pass
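
The renamed launcher now takes a `library_mode` flag and honours `is_no_confirm` when the config directory has to be created. As an editorial sketch of the new signature shown above — the import path is an assumption (the class lives in the file the log messages call `initial.py`), and this code is not part of the commit:

```
# Assumption: NewsPleaseLauncher is importable from the module that used to expose NewsPlease.
from newsplease.initial import NewsPleaseLauncher  # hypothetical import path

launcher = NewsPleaseLauncher(
    "~/news-please/config/",  # cfg_directory_path (the documented default)
    False,                    # is_resume
    False,                    # is_reset_elasticsearch
    False,                    # is_reset_json
    False,                    # is_reset_mysql
    True,                     # is_no_confirm: create a missing config dir without prompting
    library_mode=True,
)
# Note: per the hunk above, the constructor ends by calling manage_crawlers(),
# so crawling starts as soon as the launcher is constructed.
```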
@@ -1,37 +0,0 @@
from extractor.document import Document
from extractor.five_w_extractor import FiveWExtractor
from flask import Flask, request, jsonify
import logging


app = Flask(__name__)
log = logging.getLogger(__name__)
host = None
port = 5001
debug = False
options = None
extractor = FiveWExtractor()
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
log.addHandler(ch)
log.setLevel(logging.DEBUG)


def run():
    log.info("starting server on port %i", port)
    app.run(host, port, debug)
    log.info("server has stopped")


@app.route('/crawl', methods=['GET', 'POST'])
def extract():
    json_article = request.get_json()
    log.debug("retrieved raw article for extraction: %s", json_article['title'])

    document = Document(json_article['title'], json_article['description'], json_article['text'])
    extractor.parse(document)

    return jsonify(document.questions)


if __name__ == "__main__":
    run()
newsplease/config/config_lib.cfg (new file, 324 lines)

@@ -0,0 +1,324 @@
# !!! DO NOT CHANGE THIS FILE !!!
# if you want to change news-please's options, you should run it first and change
# the config.cfg file that is created on the first run of news-please (by default the config file will be in
# [HOMEDIR]/news-please/config/config.cfg)
# !!! NEVER CHANGE THE config_lib.cfg FILE !!! news-please uses this when run in library mode


# IMPORTANT
# All variables get parsed to the correct Python types (unless declared otherwise)!
# So bools have to be True or False (uppercase first),
# floats need dots . (not commas),
# ints are just normal ints,
# dicts need to be like this: { key: value }
# arrays need to be like this: [ value1, value2, value3 ]
# All values in dicts and arrays will also be parsed.
# Everything that does not match any of the above criteria will be parsed as a string.

[Crawler]

# GENERAL
# -------

# Crawling heuristics
# Default Crawlers:
# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (see ./newsplease/crawler/spiders/)
# default: SitemapCrawler
default = SitemapCrawler

# default:
# fallbacks = {
#     "RssCrawler": None,
#     "RecursiveSitemapCrawler": "RecursiveCrawler",
#     "SitemapCrawler": "RecursiveCrawler",
#     "RecursiveCrawler": None,
#     "Download": None
# }
fallbacks = {
    "RssCrawler": None,
    "RecursiveSitemapCrawler": "RecursiveCrawler",
    "SitemapCrawler": "RecursiveCrawler",
    "RecursiveCrawler": None,
    "Download": None
    }

# Determines how many hours need to pass since the last download of a webpage
# before the RssCrawler downloads it again
# default: 6
hours_to_pass_for_redownload_by_rss_crawler = 6



# PROCESSES
# ---------

# Number of crawlers that should crawl in parallel,
# not counting daemonized crawlers
# default: 5
number_of_parallel_crawlers = 5

# Number of daemonized crawlers that may run in parallel
# default: 10
number_of_parallel_daemons = 10

# SPECIAL CASES
# -------------

# URLs which end in any of the following file extensions are ignored for recursive crawling
# default: "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
ignore_file_extensions = "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"

# URLs which match the following regex are ignored for recursive crawling
# default: ""
ignore_regex = ""

# Crawl the sitemaps of subdomains (if sitemap crawling is enabled)
# If True, any SitemapCrawler will try to crawl the sitemap of the given domain including subdomains instead of the domain's main sitemap.
# e.g. if True, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://blog.zeit.de/robots.txt. If that is not found, it will fall back to the False behaviour.
# If False, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://zeit.de/robots.txt.
# default: True
sitemap_allow_subdomains = True

[Heuristics]

# Enabled heuristics.
# Currently:
#   - og_type
#   - linked_headlines
#   - self_linked_headlines
#   - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded)
#   - meta_contains_article_keyword
#   - crawler_contains_only_article_alikes
# (maybe not up-to-date, see ./newsplease/helper_classes/heuristics.py:
#  every method not starting with __ should be a heuristic, except is_article)
# These heuristics can be overwritten by sitelist.json for each site.
# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}

# Heuristics can be combined with others.
# The heuristics need to have the same name as in enabled_heuristics.
# Possible condition characters / literals are: (, ), not, and, or
# All heuristics used here need to be enabled in enabled_heuristics as well!
# Examples:
#   "og_type and (self_linked_headlines or linked_headlines)"
#   "og_type"
# default: "og_type and (linked_headlines or self_linked_headlines)"
pass_heuristics_condition = "og_type and (linked_headlines or self_linked_headlines)"

# The maximum ratio of headlines divided by linked_headlines in a file

# The minimum number of headlines in a file to check for the ratio.
# If fewer than this number are in the file, the file will pass the test.
# default: 5
min_headlines_for_linked_test = 5

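The condition string above only combines heuristic names with `and`, `or`, `not`, and parentheses. As an editorial illustration of those semantics — not the project's actual implementation — such a condition can be evaluated once each enabled heuristic has produced a boolean result for a page:

```
# Editor's sketch, not news-please code.
heuristic_results = {           # hypothetical outcome for one crawled page
    "og_type": True,
    "linked_headlines": False,
    "self_linked_headlines": True,
}

condition = "og_type and (linked_headlines or self_linked_headlines)"

# The condition only uses heuristic names plus and/or/not/parentheses,
# so the results dict can serve as the evaluation namespace.
passes = eval(condition, {"__builtins__": {}}, heuristic_results)
print(passes)  # True
```
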
[Files]

# GENERAL:
# -------

# Paths:
# Toggles whether relative paths are relative to the start_processes.py script (True) or relative to this config file (False).
# This does not work for this config's 'Scrapy' section, which is always relative to the dir the start_processes.py script is called from.
# default: True
relative_to_start_processes_file = True



# INPUT:
# -----

# Here you can specify the input JSON filename
# default: sitelist.hjson
url_input_file_name = sitelist.hjson



# OUTPUT:
# ------

# The working path of news-please; settings below can reference it as %working_path
working_path = ~/news-please/

# The following strings in local_data_directory will be replaced (md5 hashes have a standard length of 32 chars):
#
# %working_path = the path specified in OUTPUT["working_path"]
# %time_download(<code>) = current time at download; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %time_execution(<code>) = current time at execution; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %timestamp_download = current time at download; unix timestamp
# %timestamp_execution = current time at execution; unix timestamp
# %domain(<size>) = first <size> chars of the domain of the crawled file (e.g. zeit.de)
# %appendmd5_domain(<size>) = appends the md5 to %domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than <size>
# %md5_domain(<size>) = first <size> chars of the md5 hash of %domain
# %full_domain(<size>) = first <size> chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de)
# %appendmd5_full_domain(<size>) = appends the md5 to %full_domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than <size>
# %md5_full_domain(<size>) = first <size> chars of the md5 hash of %full_domain
# %subdomains(<size>) = first <size> chars of the domain's subdomains
# %appendmd5_subdomains(<size>) = appends the md5 to %subdomains(<<size> - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than <size>
# %md5_subdomains(<size>) = first <size> chars of the md5 hash of %subdomains
# %url_directory_string(<size>) = first <size> chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename
# %appendmd5_url_directory_string(<size>) = appends the md5 to %url_directory_string(<<size> - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than <size>
# %md5_url_directory_string(<size>) = first <size> chars of the md5 hash of %url_directory_string(<size>)
# %url_file_name(<size>) = first <size> chars of the file name (without type) on the server (e.g. http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466); URLs without a filename (indexes) evaluate to index
# %md5_url_file_name(<size>) = first <size> chars of the md5 hash of %url_file_name
# %max_url_file_name = first x chars of %url_file_name, so that the entire save path has the maximum possible length for a Windows file system (260 characters - 1 <NUL>)
# %appendmd5_max_url_file_name = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire save path is longer than the maximum possible length for a Windows file system (260 characters - 1 <NUL>)
#
# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent on all datasets.
# To be able to use cleanup commands, it should also start with a static folder name like 'data'.
#
# default: %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
local_data_directory = %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html

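As an editorial illustration (not part of the config file), the individual placeholder pieces described above map onto standard Python calls; zeit.de is just a hypothetical crawled domain:

```
import hashlib
import time

domain = "zeit.de"  # hypothetical crawled domain

print(time.strftime("%Y"), time.strftime("%m"), time.strftime("%d"))  # %time_execution(%Y) / (%m) / (%d)
print(int(time.time()))                                               # %timestamp_download / %timestamp_execution
print(hashlib.md5(domain.encode("utf-8")).hexdigest()[:32])           # %md5_domain(32)
```
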
# Toggles whether a leading './' or '.\' from the above local_data_directory should be removed when saving the path into the database
# True: ./data would become data
# default: True
format_relative_path = True

[MySQL]

# MySQL connection, required for saving meta-information
host = localhost
port = 3306
db = 'news-please'
username = 'root'
password = 'password'

[Elasticsearch]

# Elasticsearch connection, required for saving detailed meta-information
host = localhost
port = 9200
index_current = 'news-please'
index_archive = 'news-please-archive'

# Elasticsearch supports user authentication by CA certificates. If your database is protected by certificates,
# fill in the following parameters, otherwise you can ignore them.
use_ca_certificates = False
ca_cert_path = /path/to/cacert.pem
client_cert_path = /path/to/client_cert.pem
client_key_path = /path/to/client_key.pem
username = 'root'
secret = 'password'

# Properties of the document type used for storage.
mapping = {
    'url': {'type': 'string', 'index': 'not_analyzed'},
    'sourceDomain': {'type': 'string', 'index': 'not_analyzed'},
    'pageTitle': {'type': 'string'},
    'rss_title': {'type': 'string'},
    'localpath': {'type': 'string', 'index': 'not_analyzed'},
    'ancestor': {'type': 'string'},
    'descendant': {'type': 'string'},
    'version': {'type': 'long'},
    'downloadDate': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
    'modifiedDate': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
    'publish_date': {'type': 'date', "format": "yyyy-MM-dd HH:mm:ss"},
    'title': {'type': 'string'},
    'description': {'type': 'string'},
    'text': {'type': 'string'},
    'author': {'type': 'string'},
    'image': {'type': 'string', 'index': 'not_analyzed'},
    'language': {'type': 'string', 'index': 'not_analyzed'}
    }

[ArticleMasterExtractor]

# Choose which extractors you want to use.
#
# The default is ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'],
# which are all of the currently integrated extractors.
# Possible extractors are 'newspaper_extractor', 'readability_extractor', 'date_extractor' and 'lang_detect_extractor'.
# Examples: - only Newspaper and the date extractor: extractors = ['newspaper', 'date_extractor']
#           - only Newspaper: extractors = ['newspaper']
extractors = ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor']

[DateFilter]

# If added to the pipeline, this module provides the means to filter the extracted articles based on the publishing date.
# Therefore this module has to be placed after the ArticleMasterExtractor in the pipeline to access the publishing dates.
#
# All articles with a publishing date outside of the given time interval are dropped. The dates used to specify the
# time interval are inclusive and should follow this format: 'yyyy-mm-dd hh:mm:ss'.
#
# It is also possible to only define one date, assigning the other variable the value 'None' to create a half-bounded
# interval.

start_date = '1999-01-01 00:00:00'
end_date = '2999-12-31 00:00:00'

# If 'True', articles without a publishing date are dropped.
strict_mode = False

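As an editorial illustration of the interval semantics described above (inclusive bounds, 'None' for a half-bounded interval) — this is not the module's actual code:

```
from datetime import datetime

# Hypothetical publishing date of one extracted article
publish_date = datetime.strptime('2017-02-23 10:30:00', '%Y-%m-%d %H:%M:%S')

start_date = datetime.strptime('1999-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
end_date = None  # None = half-bounded interval, no upper limit

keep = (start_date is None or publish_date >= start_date) and \
       (end_date is None or publish_date <= end_date)
print(keep)  # True
```
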
[Scrapy]

# Possible levels (must be uppercase): CRITICAL, ERROR, WARNING, INFO, DEBUG
# default: WARNING
LOG_LEVEL = ERROR

# Log format, see https://docs.python.org/2/library/logging.html#logrecord-attributes
# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s
LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s

# Can be a filename or None
# default: None
LOG_FILE = None

LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S

LOG_STDOUT = False

LOG_ENCODING = utf-8

BOT_NAME = 'news-please'

SPIDER_MODULES = ['newsplease.crawler.spiders']
NEWSPIDER_MODULE = 'newsplease.crawler.spiders'

# Resume/pause functionality activation
# default: .resume_jobdir
JOBDIRNAME = .resume_jobdir

# Respect robots.txt activation
# default: False
ROBOTSTXT_OBEY = True

# Maximum number of concurrent requests across all domains
# default: 16
# IMPORTANT: This setting does not work since each crawler has its own scrapy instance, but it might limit concurrent_requests_per_domain if that setting has a higher number set than this one.
CONCURRENT_REQUESTS = 16

# Maximum number of active requests per domain
# default: 4
CONCURRENT_REQUESTS_PER_DOMAIN = 4

# User-agent activation
# default: 'news-please (+http://www.example.com/)'
USER_AGENT = 'news-please (+http://www.example.com/)'

# Pipeline activation
# Syntax: '<relative location>.<pipeline name>': <order of execution from 0-1000>
# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.pipeline.pipelines.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300}
# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350
ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
                  'newsplease.pipeline.pipelines.InMemoryStorage':200
                  }

newsplease/newspleaselib.py (new file, 45 lines)

@@ -0,0 +1,45 @@
import sys
import os

sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from newsplease.pipeline.pipelines import InMemoryStorage
from newsplease.single_crawler import SingleCrawler


class NewsPleaseLib:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def download_article(url):
        """
        Crawls the article from the url and extracts relevant information.
        :param url:
        :return:
        """
        SingleCrawler.create_as_library(url)
        results = InMemoryStorage.get_results()
        article = results[url]
        del results[url]
        return article

    @staticmethod
    def download_articles(urls):
        """
        Crawls articles from the urls and extracts relevant information.
        :param urls:
        :return:
        """
        SingleCrawler.create_as_library(urls)
        results = InMemoryStorage.get_results()
        articles = []
        for url in urls:
            article = results[url]
            del results[url]
            articles.append(article)
            print(article['title'])
        return articles


if __name__ == '__main__':
    NewsPleaseLib.download_article('http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')

@@ -1,7 +1,6 @@
import logging
from .abstract_extractor import AbstractExtractor
from ..article_candidate import ArticleCandidate
# Import Newspaper Article Extractor Library.
from newspaper import Article



@@ -292,6 +292,7 @@ class ExtractedInformationStorage(object):
    def extract_relevant_info(item):
        """
        extracts from an item only fields that we want to output as extracted information
        :rtype: object
        :param item:
        :return:
        """
@@ -314,6 +315,27 @@ class ExtractedInformationStorage(object):
        }


class InMemoryStorage(ExtractedInformationStorage):
    """
    Stores extracted information in a dictionary in memory - for use with library mode.
    """

    results = {}  # this is a static variable

    def process_item(self, item, spider):
        # get the original url, so that the library class (or whoever wants to read this) can access the article
        if 'redirect_urls' in item._values['spider_response'].meta:
            url = item._values['spider_response'].meta['redirect_urls'][0]
        else:
            url = item._values['url']
        InMemoryStorage.results[url] = ExtractedInformationStorage.extract_relevant_info(item)
        return item

    @staticmethod
    def get_results():
        return InMemoryStorage.results


class JsonFileStorage(ExtractedInformationStorage):
    """
    Handles remote storage of the data in Json files
@@ -34,7 +34,7 @@ class SingleCrawler(object):
    cfg = None
    json = None
    log = None
    crawler = None
    crawler_name = None
    process = None
    helper = None
    cfg_file_path = None
@@ -46,8 +46,21 @@
    shall_resume = False
    daemonize = False

    @classmethod
    def create_as_library(cls, url):
        """
        Creates a single crawler in library mode. Crawling will start immediately.
        :param url:
        :return:
        """
        site = {
            "crawler": "Download",
            "url": url
        }
        return cls('config/config_lib.cfg', site, 0, False, False, True)

    def __init__(self, cfg_file_path, json_file_path,
                 site_index, shall_resume, daemonize):
                 site_index, shall_resume, daemonize, library_mode=False):
        # set up logging before it's defined via the config file,
        # this will be overwritten and all other levels will be put out
        # as well, if it will be changed.
@@ -69,11 +82,16 @@

        self.cfg_crawler = self.cfg.section("Crawler")

        # load the URL-input-json-file
        self.json = JsonConfig.get_instance()
        self.json.setup(self.json_file_path)

        site = self.json.get_site_objects()[self.site_number]
        # load the URL-input-json-file or - if in library mode - take the json_file_path as the site information
        # (kind of hacky..)
        if not library_mode:
            self.json = JsonConfig.get_instance()
            self.json.setup(self.json_file_path)
            sites = self.json.get_site_objects()
            site = sites[self.site_number]
        else:
            sites = [json_file_path]
            site = json_file_path

        if "ignore_regex" in site:
            ignore_regex = "(%s)|" % site["ignore_regex"]
@@ -83,13 +101,13 @@

        # Get the default crawler. The crawler can be overwritten by fallbacks.
        if "additional_rss_daemon" in site and self.daemonize:
            self.crawler = "RssCrawler"
            self.crawler_name = "RssCrawler"
        elif "crawler" in site:
            self.crawler = site["crawler"]
            self.crawler_name = site["crawler"]
        else:
            self.crawler = self.cfg.section("Crawler")["default"]
            self.crawler_name = self.cfg.section("Crawler")["default"]
        # Get the real crawler-class (already "fallen back")
        crawler_class = self.get_crawler(self.crawler, site["url"])
        crawler_class = self.get_crawler(self.crawler_name, site["url"])

        if not self.cfg.section('Files')['relative_to_start_processes_file']:
            relative_to_path = os.path.dirname(self.cfg_file_path)
@@ -101,7 +119,7 @@
            self.cfg.section("Files")["local_data_directory"],
            relative_to_path,
            self.cfg.section('Files')['format_relative_path'],
            self.json.get_site_objects(),
            sites,
            crawler_class,
            self.cfg.get_working_path())

@@ -116,7 +134,6 @@
        self.load_crawler(crawler_class,
                          site["url"],
                          ignore_regex)

        self.process.start()

    def update_jobdir(self, site):
@@ -133,7 +150,7 @@
        if not jobdirname.endswith("/"):
            jobdirname += "/"

        site_string = ''.join(site["url"]) + self.crawler
        site_string = ''.join(site["url"]) + self.crawler_name
        hashed = hashlib.md5(site_string.encode('utf-8'))

        self.__scrapy_options["JOBDIR"] = working_path + jobdirname + hashed.hexdigest()
@@ -221,6 +238,7 @@
        self.log.info("Removed " + jobdir + " since '--resume' was not passed to"
                      " initial.py or this crawler was daemonized.")


if __name__ == "__main__":
    SingleCrawler(cfg_file_path=sys.argv[1],
                  json_file_path=sys.argv[2],