mirror of
https://github.com/fhamborg/news-please.git
synced 2021-09-19 22:26:00 +03:00
add warc file filtering
This commit is contained in:
@@ -54,6 +54,10 @@ my_filter_start_date = None # datetime.datetime(2016, 1, 1)
|
||||
# end date (if None, any date is OK as end date), as datetime
|
||||
my_filter_end_date = None # datetime.datetime(2016, 12, 31)
|
||||
# Only .warc files published within [my_warc_files_start_date, my_warc_files_end_date) will be downloaded.
|
||||
# Note that the date a warc file has been published does not imply it contains only news
|
||||
# articles from that date. Instead, you must assume that the warc file can contain articles
|
||||
# from ANY time before the warc file was published, e.g., a warc file published in August 2020
|
||||
# may contain news articles from December 2016.
|
||||
my_warc_files_start_date = None # example: datetime.datetime(2020, 3, 1)
|
||||
my_warc_files_end_date = None # example: datetime.datetime(2020, 3, 2)
|
||||
# if date filtering is strict and news-please could not detect the date of an article, the article will be discarded
|
||||
|
||||
2
setup.py
2
setup.py
@@ -1,7 +1,7 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(name='news-please',
|
||||
version='1.5.18',
|
||||
version='1.5.20',
|
||||
description="news-please is an open source easy-to-use news extractor that just works.",
|
||||
long_description="""\
|
||||
news-please is an open source, easy-to-use news crawler that extracts structured information from almost any news website. It can recursively follow internal hyperlinks and read RSS feeds to fetch both most recent and also old, archived articles. You only need to provide the root URL of the news website. Furthermore, its API allows developers to access the extraction functionality within their software. news-please also implements a workflow optimized for the news archive provided by commoncrawl.org, allowing users to efficiently crawl and extract news articles including various filter options.""",
|
||||
|
||||
Reference in New Issue
Block a user