mirror of https://github.com/fhamborg/news-please.git synced 2021-09-19 22:26:00 +03:00

Support warc_files_end_date for the Common Crawl crawler.

This commit is contained in:
Shang Wang
2021-04-19 00:17:33 -04:00
parent 44a196f1eb
commit 1bea565ca1
2 changed files with 40 additions and 10 deletions

newsplease/crawler/commoncrawl_crawler.py

@@ -43,6 +43,8 @@ __counter_warc_skipped = 0
 __counter_warc_processed = 0
 __start_time = time.time()
+# When Common Crawl started.
+__common_crawl_start_date = datetime.datetime(2016, 8, 26)
 
 
 def __setup(local_download_dir_warc, log_level):
     """
@@ -90,7 +92,14 @@ def __get_download_url(name):
     """
     return __cc_base_url + name
 
 
-def __iterate_by_month(start_date, end_date, month_step=1):
+def __iterate_by_month(start_date=None, end_date=None, month_step=1):
+    if start_date is None:
+        # The starting month of Common Crawl.
+        start_date = __common_crawl_start_date
+    if end_date is None:
+        # Until now.
+        end_date = datetime.datetime.today()
     current_date = start_date
     while current_date < end_date:
         yield current_date
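The month-stepping arithmetic that advances current_date sits below the context shown in this hunk. For orientation, here is a minimal self-contained sketch of such a generator; the divmod roll-over and the reset to day 1 are assumptions about the unshown code, not the commit's exact implementation:

import datetime

COMMON_CRAWL_START_DATE = datetime.datetime(2016, 8, 26)  # first CC-NEWS data

def iterate_by_month(start_date=None, end_date=None, month_step=1):
    # Default to the full CC-NEWS history: from the first crawl month
    # up to (but not including) today.
    if start_date is None:
        start_date = COMMON_CRAWL_START_DATE
    if end_date is None:
        end_date = datetime.datetime.today()
    current_date = start_date
    while current_date < end_date:
        yield current_date
        # Advance month_step months, carrying overflow into the year.
        carry, new_month = divmod(current_date.month - 1 + month_step, 12)
        current_date = current_date.replace(
            year=current_date.year + carry, month=new_month + 1, day=1)

With both bounds left at None, this yields 2016-08-26 and then the first of every following month up to the present, matching the folder-per-month layout the crawler lists.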
@@ -109,11 +118,22 @@ def __extract_date_from_warc_filename(path):
     return datetime.datetime.strptime(dt, '%Y%m%d%H%M%S')
 
 
-def __get_remote_index(warc_files_start_date):
+def __date_within_period(date, start_date=None, end_date=None):
+    if start_date is None:
+        # The starting month of Common Crawl.
+        start_date = __common_crawl_start_date
+    if end_date is None:
+        # Until now.
+        end_date = datetime.datetime.today()
+    return start_date <= date < end_date
+
+
+def __get_remote_index(warc_files_start_date, warc_files_end_date):
     """
     Gets the index of news crawl files from commoncrawl.org and returns an array of names
     :param warc_files_start_date: only list .warc files with greater or equal date in
            their filename
+    :param warc_files_end_date: only list .warc files with smaller date in their filename
     :return:
     """
@@ -128,7 +148,7 @@ def __get_remote_index(warc_files_start_date):
     # get the remote info
     cmd = ''
-    if warc_files_start_date:
+    if warc_files_start_date or warc_files_end_date:
         # cleanup
         try:
             os.remove(temp_filename)
@@ -136,7 +156,7 @@ def __get_remote_index(warc_files_start_date):
             pass
 
         # The news files are grouped per year and month in separate folders
-        warc_dates = __iterate_by_month(warc_files_start_date, datetime.datetime.today())
+        warc_dates = __iterate_by_month(start_date=warc_files_start_date, end_date=warc_files_end_date)
         for date in warc_dates:
             year = date.strftime('%Y')
             month = date.strftime('%m')
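Passing the end date through means the crawler now builds one remote listing per month inside [warc_files_start_date, warc_files_end_date) instead of listing every month up to today. A sketch of the per-month work this loop sets up; the aws s3 ls command string is an assumption about the surrounding, unshown code:

def monthly_listing_commands(warc_dates, temp_filename):
    for date in warc_dates:
        year = date.strftime('%Y')   # e.g. '2020'
        month = date.strftime('%m')  # e.g. '03'
        # One listing command per month folder of the public CC-NEWS bucket.
        yield ('aws s3 ls --recursive '
               's3://commoncrawl/crawl-data/CC-NEWS/%s/%s/ '
               '--no-sign-request >> %s' % (year, month, temp_filename))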
@@ -155,10 +175,15 @@ def __get_remote_index(warc_files_start_date):
     lines = stdout_data.splitlines()
 
-    if warc_files_start_date:
+    if warc_files_start_date or warc_files_end_date:
         # Now filter further on day of month, hour, minute
-        lines = [p for p in lines
-                 if __extract_date_from_warc_filename(p) >= warc_files_start_date]
+        lines = [
+            p for p in lines if __date_within_period(
+                __extract_date_from_warc_filename(p),
+                start_date=warc_files_start_date,
+                end_date=warc_files_end_date,
+            )
+        ]
 
     return lines
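The per-month listing is only month-granular, so this comprehension re-filters individual files by the timestamp embedded in each CC-NEWS filename. A sketch of the extraction step it relies on; the regex and the example filename are assumptions consistent with the strptime format shown at the top of this file's earlier hunk:

import datetime
import os
import re

def extract_date_from_warc_filename(path):
    # e.g. 'crawl-data/CC-NEWS/2020/03/CC-NEWS-20200301025006-00522.warc.gz'
    fn = os.path.basename(path)
    dt = re.search(r'(\d{14})', fn).group(1)  # YYYYMMDDhhmmss
    return datetime.datetime.strptime(dt, '%Y%m%d%H%M%S')

print(extract_date_from_warc_filename(
    'crawl-data/CC-NEWS/2020/03/CC-NEWS-20200301025006-00522.warc.gz'))
# -> 2020-03-01 02:50:06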
@@ -269,7 +294,7 @@ def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extract
 def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_completed=None, valid_hosts=None,
-                           start_date=None, end_date=None, warc_files_start_date=None, strict_date=True,
+                           start_date=None, end_date=None, warc_files_start_date=None, warc_files_end_date=None, strict_date=True,
                            reuse_previously_downloaded_files=True, local_download_dir_warc=None,
                            continue_after_error=True, show_download_progress=False,
                            number_of_extraction_processes=4, log_level=logging.ERROR,
@@ -286,6 +311,8 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
     :param valid_hosts:
     :param start_date:
     :param end_date:
+    :param warc_files_start_date:
+    :param warc_files_end_date:
     :param strict_date:
     :param reuse_previously_downloaded_files:
     :param local_download_dir_warc:
@@ -300,7 +327,7 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
     global __extern_callback_on_warc_completed
     __extern_callback_on_warc_completed = callback_on_warc_completed
 
-    cc_news_crawl_names = __get_remote_index(warc_files_start_date)
+    cc_news_crawl_names = __get_remote_index(warc_files_start_date, warc_files_end_date)
 
     global __number_of_warc_files_on_cc
     __number_of_warc_files_on_cc = len(cc_news_crawl_names)
     __logger.info('found %i files at commoncrawl.org', __number_of_warc_files_on_cc)

newsplease/examples/commoncrawl.py

@@ -53,8 +53,10 @@ my_filter_valid_hosts = []  # example: ['elrancaguino.cl']
 my_filter_start_date = None  # datetime.datetime(2016, 1, 1)
 # end date (if None, any date is OK as end date), as datetime
 my_filter_end_date = None  # datetime.datetime(2016, 12, 31)
-# if date filtering is strict and news-please could not detect the date of an article, the article will be discarded
+# Only .warc files published within [my_warc_files_start_date, my_warc_files_end_date) will be downloaded.
 my_warc_files_start_date = None  # example: datetime.datetime(2020, 3, 1)
+my_warc_files_end_date = None  # example: datetime.datetime(2020, 3, 2)
+# if date filtering is strict and news-please could not detect the date of an article, the article will be discarded
 my_filter_strict_date = True
 # if True, the script checks whether a file has been downloaded already and uses that file instead of downloading
 # again. Note that there is no check whether the file has been downloaded completely or is valid!
@@ -167,6 +169,7 @@ def main():
         start_date=my_filter_start_date,
         end_date=my_filter_end_date,
         warc_files_start_date=my_warc_files_start_date,
+        warc_files_end_date=my_warc_files_end_date,
         strict_date=my_filter_strict_date,
         reuse_previously_downloaded_files=my_reuse_previously_downloaded_files,
         local_download_dir_warc=my_local_download_dir_warc,
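Putting both files together, a hedged end-to-end sketch of the new parameter from a caller's perspective; the import path is an assumption about the package layout, and on_article is a placeholder callback:

import datetime
from newsplease.crawler import commoncrawl_crawler

def on_article(article):
    # Called once per successfully extracted article.
    print(article.source_domain, article.title)

# Download and process only WARC files from March 2020; the end bound
# is exclusive, matching the half-open interval used by the crawler.
commoncrawl_crawler.crawl_from_commoncrawl(
    on_article,
    warc_files_start_date=datetime.datetime(2020, 3, 1),
    warc_files_end_date=datetime.datetime(2020, 4, 1),
)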