# IMPORTANT
# All variables get parsed to the correct Python types (unless declared otherwise)!
# So bools have to be True or False (uppercase first letter),
# floats need a dot . (not a comma),
# ints are just normal ints,
# dicts need to be like this: { key: value }
# arrays need to be like this: [ value1, value2, value3 ]
# All values in dicts and arrays will also be parsed.
# Everything that does not match any of the above criteria will be parsed as a string.
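
# Illustrative example (hypothetical keys, not actual settings) of how values would be parsed:
# example_bool = True                   -> bool
# example_float = 0.65                  -> float
# example_int = 5                       -> int
# example_dict = {"key": "value"}       -> dict (values parsed as well)
# example_array = [1, 2.5, "three"]     -> array (values parsed as well)
# example_string = just some text       -> string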


[Crawler]

# GENERAL
# -------

# Crawling heuristics
# Default crawlers:
# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir)
# default: SitemapCrawler
default = SitemapCrawler
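
# e.g. to crawl RSS feeds by default instead (illustrative, not the shipped default):
# default = RssCrawler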

# default:
# fallbacks = {
#     "RssCrawler": None,
#     "RecursiveSitemapCrawler": "RecursiveCrawler",
#     "SitemapCrawler": "RecursiveCrawler",
#     "RecursiveCrawler": None,
#     "Download": None
# }
fallbacks = {
    "RssCrawler": None,
    "RecursiveSitemapCrawler": "RecursiveCrawler",
    "SitemapCrawler": "RecursiveCrawler",
    "RecursiveCrawler": None,
    "Download": None
    }

# Determines how many hours need to pass since the last download of a webpage
# for it to be downloaded again by the RssCrawler
# default: 6
hours_to_pass_for_redownload_by_rss_crawler = 6
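
# e.g. with the default of 6, a page that was downloaded at 09:00 will not be
# fetched again by the RssCrawler before 15:00 on the same day.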


# PROCESSES
# ---------

# Number of crawlers that should crawl in parallel,
# not counting daemonized crawlers
# default: 5
number_of_parallel_crawlers = 5

# Number of daemons; daemonized crawlers will be added to these daemons.
# default: 10
number_of_parallel_daemons = 10


# SPECIAL CASES
# -------------

# URLs which end in any of the following file extensions are ignored for recursive crawling
# default: "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
ignore_file_extensions = "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"

# URLs which match the following regex are ignored for recursive crawling
# default: ""
ignore_regex = ""

# Crawl the sitemaps of subdomains (if a sitemap crawler is enabled)
# If True, any SitemapCrawler will try to crawl the sitemap of the given domain, including subdomains, instead of the domain's main sitemap.
# e.g. if True, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://blog.zeit.de/robots.txt. If none is found, it will fall back to the False behaviour.
# If False, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://zeit.de/robots.txt
# default: True
sitemap_allow_subdomains = True


[Heuristics]

# Enabled heuristics.
# Currently:
# - og_type
# - linked_headlines
# - self_linked_headlines
# - is_not_from_subdomain (with this heuristic enabled, it can be assured that only pages that are not from a subdomain are downloaded)
# - meta_contains_article_keyword
# - crawler_contains_only_article_alikes
# (maybe not up-to-date, see ./newsplease/helper_classes/heuristics.py:
# every method not starting with __ should be a heuristic, except is_article)
# These heuristics can be overwritten by sitelist.json for each site
# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
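
# e.g. to additionally require the article keyword in the page's meta tags (illustrative, not the default):
# enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56", "meta_contains_article_keyword": True}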

# Heuristics can be combined with others
# The heuristics need to have the same name as in enabled_heuristics
# Possible condition characters / literals are: (, ), not, and, or
# All heuristics used here need to be enabled in enabled_heuristics as well!
# Examples:
# "og_type and (self_linked_headlines or linked_headlines)"
# "og_type"
# default: "og_type and (linked_headlines or self_linked_headlines)"
pass_heuristics_condition = "og_type and (linked_headlines or self_linked_headlines)"
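
# e.g. to also exclude pages served from subdomains (is_not_from_subdomain must then be listed in enabled_heuristics):
# pass_heuristics_condition = "og_type and is_not_from_subdomain and (linked_headlines or self_linked_headlines)"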

# The maximum ratio of headlines divided by linked_headlines in a file

# The minimum number of headlines in a file to check for the ratio
# If fewer than this number are in the file, the file will pass the test.
# default: 5
min_headlines_for_linked_test = 5
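
# e.g. with the default of 5, a page containing only 3 headlines is not checked
# against the linked-headlines ratio and therefore passes this test.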


[Files]

# GENERAL:
# -------

# Paths:
# Toggles whether relative paths are relative to the start_processes.py script (True) or relative to this config file (False)
# This does not work for this config's 'Scrapy' section, which is always relative to the dir the start_processes.py script is called from
# Default: True
relative_to_start_processes_file = True


# INPUT:
# -----

# Here you can specify the input JSON filename
# default: sitelist.hjson
url_input_file_name = sitelist.hjson


# OUTPUT:
# ------

# Base working path; it can be referenced in local_data_directory below via %working_path
working_path = ~/news-please-repo/

# The following strings in local_data_directory will be replaced (md5 hashes have a standard length of 32 chars):
#
# %working_path = the path specified in OUTPUT["working_path"]
# %time_download(<code>) = current time at download; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %time_execution(<code>) = current time at execution; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %timestamp_download = current time at download; unix timestamp
# %timestamp_execution = current time at execution; unix timestamp
# %domain(<size>) = first <size> chars of the domain of the crawled file (e.g. zeit.de)
# %appendmd5_domain(<size>) = appends the md5 to %domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than <size>
# %md5_domain(<size>) = first <size> chars of the md5 hash of %domain
# %full_domain(<size>) = first <size> chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de)
# %appendmd5_full_domain(<size>) = appends the md5 to %full_domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than <size>
# %md5_full_domain(<size>) = first <size> chars of the md5 hash of %full_domain
# %subdomains(<size>) = first <size> chars of the domain's subdomains
# %appendmd5_subdomains(<size>) = appends the md5 to %subdomains(<<size> - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than <size>
# %md5_subdomains(<size>) = first <size> chars of the md5 hash of %subdomains
# %url_directory_string(<size>) = first <size> chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename
# %appendmd5_url_directory_string(<size>) = appends the md5 to %url_directory_string(<<size> - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than <size>
# %md5_url_directory_string(<size>) = first <size> chars of the md5 hash of %url_directory_string(<size>)
# %url_file_name(<size>) = first <size> chars of the file name (without type) on the server (e.g. http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466); URLs without a filename (indexes) will evaluate to index
# %md5_url_file_name(<size>) = first <size> chars of the md5 hash of %url_file_name
# %max_url_file_name = first x chars of %url_file_name, so the entire savepath has a length of the max possible length for a Windows file system (260 characters - 1 <NUL>)
# %appendmd5_max_url_file_name = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire savepath has a length longer than the max possible length for a Windows file system (260 characters - 1 <NUL>)
#
# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent across all data sets.
# To be able to use cleanup commands, it should also start with a static folder name like 'data'.
#
# default: %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
local_data_directory = %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
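
# Illustrative expansion (hypothetical date and timestamp): with the default pattern above, an article crawled
# from http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ on 2021-05-03 might be stored as
# something like ~/news-please-repo/data/2021/05/03/panamapapers.sueddeutsche.de/articles_56f2c00da1bb8d3c3495aa0a_index_1620000000.html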

# Toggles whether a leading './' or '.\' from the above local_data_directory should be removed when saving the path into the database
# True: ./data would become data
# default: True
format_relative_path = True


[MySQL]

# MySQL connection, required for saving meta-information
host = localhost
port = 3306
db = 'news-please'
username = 'root'
password = 'password'


[Postgresql]

# PostgreSQL connection, required for saving meta-information
host = localhost
port = 5432
database = 'news-please'
user = 'root'
password = 'password'


[Elasticsearch]

# Elasticsearch connection, required for saving detailed meta-information
host = localhost
port = 9200
index_current = 'news-please'
index_archive = 'news-please-archive'

# Elasticsearch supports user authentication via CA certificates. If your database is protected by a certificate,
# fill in the following parameters, otherwise you can ignore them.
use_ca_certificates = False
ca_cert_path = /path/to/cacert.pem
client_cert_path = /path/to/client_cert.pem
client_key_path = /path/to/client_key.pem
username = 'root'
secret = 'password'

# Properties of the document type used for storage.
mapping = {"properties": {
    "url": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "source_domain": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "title_page": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "title_rss": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "localpath": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "filename": {"type": "keyword"},
    "ancestor": {"type": "keyword"},
    "descendant": {"type": "keyword"},
    "version": {"type": "long"},
    "date_download": {"type": "date", "format":"yyyy-MM-dd HH:mm:ss"},
    "date_modify": {"type": "date", "format":"yyyy-MM-dd HH:mm:ss"},
    "date_publish": {"type": "date", "format":"yyyy-MM-dd HH:mm:ss"},
    "title": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "description": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "text": {"type": "text"},
    "authors": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "image_url": {"type": "text","fields":{"keyword":{"type":"keyword"}}},
    "language": {"type": "keyword"}
    }}


[ArticleMasterExtractor]

# Choose which extractors you want to use.
#
# The default is ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'],
# which are all of the integrated extractors right now.
# Possible extractors are 'newspaper_extractor', 'readability_extractor', 'date_extractor' and 'lang_detect_extractor'
# Examples: - Only Newspaper and date_extractor: extractors = ['newspaper_extractor', 'date_extractor']
#           - Only Newspaper: extractors = ['newspaper_extractor']
extractors = ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor']


[DateFilter]

# If added to the pipeline, this module provides the means to filter the extracted articles based on the publishing date.
# Therefore this module has to be placed after the article extractor (ArticleMasterExtractor) to access the publishing dates.
#
# All articles with a publishing date outside of the given time interval are dropped. The dates used to specify the
# time interval are inclusive and should follow this format: 'yyyy-mm-dd hh:mm:ss'.
#
# It is also possible to only define one date, assigning the other variable the value 'None' to create a half-bounded
# interval.

start_date = '1999-01-01 00:00:00'
end_date = '2999-12-31 00:00:00'
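
# e.g. to keep only articles published from 2018 onwards (illustrative values):
# start_date = '2018-01-01 00:00:00'
# end_date = None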

# If 'True', articles without a publishing date are dropped.
strict_mode = False


[Scrapy]

# Possible levels (must be uppercase only): CRITICAL, ERROR, WARNING, INFO, DEBUG
# default: WARNING
LOG_LEVEL = INFO

# Log format, see https://docs.python.org/2/library/logging.html#logrecord-attributes
# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s
LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s

# Can be a filename or None
# default: None
LOG_FILE = None

LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S

LOG_STDOUT = False

LOG_ENCODING = utf-8

BOT_NAME = 'news-please'

SPIDER_MODULES = ['newsplease.crawler.spiders']
NEWSPIDER_MODULE = 'newsplease.crawler.spiders'

# Resume/pause functionality activation
# default: .resume_jobdir
JOBDIRNAME = .resume_jobdir

# Respect robots.txt activation
# default: True
ROBOTSTXT_OBEY = True

# Maximum number of concurrent requests across all domains
# default: 16
# IMPORTANT: This setting does not take effect globally since each crawler has its own Scrapy instance, but it may limit CONCURRENT_REQUESTS_PER_DOMAIN if that setting is set to a higher number than this one.
CONCURRENT_REQUESTS = 16

# Maximum number of active requests per domain
# default: 4
CONCURRENT_REQUESTS_PER_DOMAIN = 4

# User agent activation
# default: 'news-please (+http://www.example.com/)'
USER_AGENT = 'news-please (+http://www.example.com/)'

# Pipeline activation
# Syntax: '<relative location>.<Pipeline name>': <Order of execution from 0-1000>
# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.pipeline.pipelines.HtmlFileStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage':300}
# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350
ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
                  'newsplease.pipeline.pipelines.HtmlFileStorage':200,
                  'newsplease.pipeline.pipelines.JsonFileStorage':300
                  }
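
# e.g. to additionally store articles in Elasticsearch (configure the [Elasticsearch] section above first):
# ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
#                   'newsplease.pipeline.pipelines.HtmlFileStorage':200,
#                   'newsplease.pipeline.pipelines.JsonFileStorage':300,
#                   'newsplease.pipeline.pipelines.ElasticsearchStorage':350
#                   }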

ITEM_CLASS = 'newsplease.crawler.items.NewscrawlerItem'


[Pandas]

file_name = "PandasStorage"