
add fetch_images option to ccnc script (default=false)

This commit is contained in:
Felix Hamborg
2021-02-24 08:53:24 +01:00
parent 14b9cef89c
commit 145cb0b641
3 changed files with 23 additions and 11 deletions

View File

@@ -224,7 +224,7 @@ def __callback_on_warc_completed(warc_path, counter_article_passed, counter_arti
 def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extracted=None,
                                   callback_on_warc_completed=None, valid_hosts=None,
-                                  start_date=None, end_date=None,
+                                  start_date=None, end_date=None,
                                   strict_date=True, reuse_previously_downloaded_files=True,
                                   local_download_dir_warc=None,
                                   continue_after_error=True, show_download_progress=False,
@@ -232,7 +232,8 @@ def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extract
                                   delete_warc_after_extraction=True,
                                   continue_process=True,
                                   log_pathname_fully_extracted_warcs=None,
-                                  extractor_cls=CommonCrawlExtractor):
+                                  extractor_cls=CommonCrawlExtractor,
+                                  fetch_images=False):
     """
     Starts a single CommonCrawlExtractor
     :param warc_download_url:
@@ -263,16 +264,17 @@ def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extract
                                            show_download_progress=show_download_progress,
                                            log_level=log_level,
                                            delete_warc_after_extraction=delete_warc_after_extraction,
-                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs)
+                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
+                                           fetch_images=fetch_images)


 def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_completed=None, valid_hosts=None,
-                           start_date=None, end_date=None, warc_files_start_date=None, strict_date=True,
-                           reuse_previously_downloaded_files=True, local_download_dir_warc=None,
+                           start_date=None, end_date=None, warc_files_start_date=None, strict_date=True,
+                           reuse_previously_downloaded_files=True, local_download_dir_warc=None,
                            continue_after_error=True, show_download_progress=False,
                            number_of_extraction_processes=4, log_level=logging.ERROR,
                            delete_warc_after_extraction=True, continue_process=True,
-                           extractor_cls=CommonCrawlExtractor):
+                           extractor_cls=CommonCrawlExtractor, fetch_images=False):
     """
     Crawl and extract articles from the news crawl provided by commoncrawl.org. For each article that was extracted
     successfully the callback function callback_on_article_extracted is invoked where the first parameter is the
@@ -340,7 +342,8 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
                                                 log_level=log_level,
                                                 delete_warc_after_extraction=delete_warc_after_extraction,
                                                 log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
-                                                extractor_cls=extractor_cls),
+                                                extractor_cls=extractor_cls,
+                                                fetch_images=fetch_images),
                                         warc_download_urls)
     else:
         for warc_download_url in warc_download_urls:
@@ -357,4 +360,5 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
                                           log_level=log_level,
                                           delete_warc_after_extraction=delete_warc_after_extraction,
                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
-                                          extractor_cls=extractor_cls)
+                                          extractor_cls=extractor_cls,
+                                          fetch_images=fetch_images)
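With this change, callers of the public API can opt in to image fetching. As a minimal usage sketch (not part of this commit; it assumes the module path newsplease.crawler.commoncrawl_crawler used by the bundled example script, and a user-supplied callback named on_article):

import logging

from newsplease.crawler import commoncrawl_crawler


def on_article(article):
    # invoked once per successfully extracted article
    print(article.title)


# fetch_images=True asks news-please to fetch each article's main image
# from the live webpage, since the WARC files themselves contain no images
commoncrawl_crawler.crawl_from_commoncrawl(
    on_article,
    log_level=logging.ERROR,
    fetch_images=True)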

View File

@@ -325,9 +325,9 @@ class CommonCrawlExtractor:
                                  valid_hosts=None,
                                  start_date=None, end_date=None,
                                  strict_date=True, reuse_previously_downloaded_files=True, local_download_dir_warc=None,
-                                 continue_after_error=True, ignore_unicode_errors=False, fetch_images=False,
+                                 continue_after_error=True, ignore_unicode_errors=False,
                                  show_download_progress=False, log_level=logging.ERROR, delete_warc_after_extraction=True,
-                                 log_pathname_fully_extracted_warcs=None):
+                                 log_pathname_fully_extracted_warcs=None, fetch_images=False):
        """
        Crawl and extract articles from the news crawl provided by commoncrawl.org. For each article that was extracted
        successfully the callback function callback_on_article_extracted is invoked where the first parameter is the

View File

@@ -23,6 +23,9 @@ that might have been installed with pip. Hence, you must run this script followi
     git clone https://github.com/fhamborg/news-please.git
     cd news-please
     python3 -m newsplease.examples.commoncrawl
+Note that by default the script does not extract main images since they are not contained in
+WARC files. You can enable extraction of main images by setting my_fetch_images=True in the
+configuration section below.
 """
 import hashlib
 import json
@@ -71,6 +74,10 @@ my_delete_warc_after_extraction = True
 # if True, will continue extraction from the latest fully downloaded but not fully extracted WARC files and then
 # crawling new WARC files. This assumes that the filter criteria have not been changed since the previous run!
 my_continue_process = True
+# if True, will crawl and extract the main image of each article. Note that WARC files
+# do not contain any images, so news-please will fetch the current image from the
+# article's online webpage if this option is enabled.
+my_fetch_images = False
 ############ END YOUR CONFIG #########
@@ -168,7 +175,8 @@ def main():
         number_of_extraction_processes=my_number_of_extraction_processes,
         log_level=my_log_level,
         delete_warc_after_extraction=my_delete_warc_after_extraction,
-        continue_process=True)
+        continue_process=True,
+        fetch_images=my_fetch_images)


 if __name__ == "__main__":
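To get the same behavior from the bundled script itself, the new config flag is the only switch that needs to change; a minimal sketch (the rest of the script stays as shipped):

# in the YOUR CONFIG section of newsplease/examples/commoncrawl.py
my_fetch_images = True  # main images are fetched from the live article pages

# then run the script as a module, as its header describes:
#   python3 -m newsplease.examples.commoncrawl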