Mirror of https://github.com/fhamborg/news-please.git, synced 2021-09-19 22:26:00 +03:00
add fetch_images option to ccnc script (default=false)
@@ -224,7 +224,7 @@ def __callback_on_warc_completed(warc_path, counter_article_passed, counter_arti
 def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extracted=None,
                                   callback_on_warc_completed=None, valid_hosts=None,
-                                  start_date=None, end_date=None,
+                                  start_date=None, end_date=None,
                                   strict_date=True, reuse_previously_downloaded_files=True,
                                   local_download_dir_warc=None,
                                   continue_after_error=True, show_download_progress=False,
@@ -232,7 +232,8 @@ def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extract
                                   delete_warc_after_extraction=True,
                                   continue_process=True,
                                   log_pathname_fully_extracted_warcs=None,
-                                  extractor_cls=CommonCrawlExtractor):
+                                  extractor_cls=CommonCrawlExtractor,
+                                  fetch_images=fetch_images):
     """
     Starts a single CommonCrawlExtractor
     :param warc_download_url:
@@ -263,16 +264,17 @@ def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extract
                                            show_download_progress=show_download_progress,
                                            log_level=log_level,
                                            delete_warc_after_extraction=delete_warc_after_extraction,
-                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs)
+                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
+                                           fetch_images=fetch_images)


 def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_completed=None, valid_hosts=None,
-                           start_date=None, end_date=None, warc_files_start_date=None, strict_date=True,
-                           reuse_previously_downloaded_files=True, local_download_dir_warc=None,
+                           start_date=None, end_date=None, warc_files_start_date=None, strict_date=True,
+                           reuse_previously_downloaded_files=True, local_download_dir_warc=None,
                            continue_after_error=True, show_download_progress=False,
                            number_of_extraction_processes=4, log_level=logging.ERROR,
                            delete_warc_after_extraction=True, continue_process=True,
-                           extractor_cls=CommonCrawlExtractor):
+                           extractor_cls=CommonCrawlExtractor, fetch_images=False):
     """
     Crawl and extract articles form the news crawl provided by commoncrawl.org. For each article that was extracted
     successfully the callback function callback_on_article_extracted is invoked where the first parameter is the
@@ -340,7 +342,8 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
                                                 log_level=log_level,
                                                 delete_warc_after_extraction=delete_warc_after_extraction,
                                                 log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
-                                                extractor_cls=extractor_cls),
+                                                extractor_cls=extractor_cls,
+                                                fetch_images=fetch_images),
                                         warc_download_urls)
     else:
         for warc_download_url in warc_download_urls:
@@ -357,4 +360,5 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
                                           log_level=log_level,
                                           delete_warc_after_extraction=delete_warc_after_extraction,
                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
-                                          extractor_cls=extractor_cls)
+                                          extractor_cls=extractor_cls,
+                                          fetch_images=fetch_images)
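The hunks above thread the new flag from the public crawl_from_commoncrawl entry point down to each extractor it spawns. A minimal sketch of how a caller might opt in, assuming only the signatures shown in this diff; the callback body and host filter below are illustrative, not part of the commit:

```python
import logging

from newsplease.crawler import commoncrawl_crawler


def on_article(article):
    # Illustrative callback: with fetch_images=True the extractor also
    # retrieves each article's main image from its live webpage.
    print(article.title)


commoncrawl_crawler.crawl_from_commoncrawl(
    on_article,
    valid_hosts=['example.com'],       # illustrative filter, not from the commit
    number_of_extraction_processes=1,  # keep it single-process for a quick test
    log_level=logging.ERROR,
    fetch_images=True,                 # the option this commit introduces (default False)
)
```

One caveat visible in the diff: the new default in the __start_commoncrawl_extractor signature is fetch_images=fetch_images rather than a literal, which only resolves if that name already exists when the module is loaded; passing the keyword explicitly, as above, sidesteps the question.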
@@ -325,9 +325,9 @@ class CommonCrawlExtractor:
                                  valid_hosts=None,
                                  start_date=None, end_date=None,
                                  strict_date=True, reuse_previously_downloaded_files=True, local_download_dir_warc=None,
-                                 continue_after_error=True, ignore_unicode_errors=False, fetch_images=False,
+                                 continue_after_error=True, ignore_unicode_errors=False,
                                  show_download_progress=False, log_level=logging.ERROR, delete_warc_after_extraction=True,
-                                 log_pathname_fully_extracted_warcs=None):
+                                 log_pathname_fully_extracted_warcs=None, fetch_images=False):
     """
     Crawl and extract articles form the news crawl provided by commoncrawl.org. For each article that was extracted
     successfully the callback function callback_on_article_extracted is invoked where the first parameter is the
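The hunk above is on the extractor side: fetch_images moves from the middle of the CommonCrawlExtractor signature to the end, after log_pathname_fully_extracted_warcs, so existing keyword-based callers keep working and the parameter order matches the crawler module. For completeness, a sketch of driving an extractor directly instead of going through crawl_from_commoncrawl; the method name extract_from_commoncrawl, its positional arguments, and the WARC URL are assumptions, since the hunk shows only a parameter list:

```python
import logging

from newsplease.crawler.commoncrawl_extractor import CommonCrawlExtractor


def on_article(article):
    print(article.url)


extractor = CommonCrawlExtractor()
# Method name assumed from context; this hunk shows only its parameters.
extractor.extract_from_commoncrawl(
    'https://example.org/CC-NEWS-sample.warc.gz',  # hypothetical WARC URL
    on_article,
    log_level=logging.ERROR,
    fetch_images=True,  # now the last keyword parameter
)
```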
@@ -23,6 +23,9 @@ that might have been installed with pip. Hence, you must run this script followi
 git clone https://github.com/fhamborg/news-please.git
 cd news-please
 python3 -m newsplease.examples.commoncrawl
+
+Note that by default the script does not extract main images since they are not contained in
+WARC files. You can enable extraction of main images by setting my_fetch_images=True.
 """
 import hashlib
 import json
@@ -71,6 +74,10 @@ my_delete_warc_after_extraction = True
 # if True, will continue extraction from the latest fully downloaded but not fully extracted WARC files and then
 # crawling new WARC files. This assumes that the filter criteria have not been changed since the previous run!
 my_continue_process = True
+# if True, will crawl and extract the main image of each article. Note that the WARC files
+# do not contain any images, so news-please will crawl the current image from
+# the article's online webpage, if this option is enabled.
+my_fetch_images = False
 ############ END YOUR CONFIG #########


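The docstring note and this config switch belong to the bundled example script; the final hunk below passes the switch through to crawl_from_commoncrawl in main(). Enabling the feature is then a one-line change; since the main image is fetched from each article's live webpage rather than from the WARC file, extraction becomes slower and needs network access:

```python
# In the YOUR CONFIG section of newsplease/examples/commoncrawl.py:
my_fetch_images = True  # fetch each article's main image from its live webpage
```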
@@ -168,7 +175,8 @@ def main():
                                        number_of_extraction_processes=my_number_of_extraction_processes,
                                        log_level=my_log_level,
                                        delete_warc_after_extraction=my_delete_warc_after_extraction,
-                                       continue_process=True)
+                                       continue_process=True,
+                                       fetch_images=my_fetch_images)


 if __name__ == "__main__":
||||