
add fetch_images option to ccnc script (default=false)

This commit is contained in:
Felix Hamborg
2021-02-24 08:53:24 +01:00
parent 14b9cef89c
commit 145cb0b641
3 changed files with 23 additions and 11 deletions

View File

@@ -224,7 +224,7 @@ def __callback_on_warc_completed(warc_path, counter_article_passed, counter_arti
 def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extracted=None,
                                   callback_on_warc_completed=None, valid_hosts=None,
-                                  start_date=None, end_date=None,
+                                  start_date=None, end_date=None,
                                   strict_date=True, reuse_previously_downloaded_files=True,
                                   local_download_dir_warc=None,
                                   continue_after_error=True, show_download_progress=False,
@@ -232,7 +232,8 @@ def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extract
                                   delete_warc_after_extraction=True,
                                   continue_process=True,
                                   log_pathname_fully_extracted_warcs=None,
-                                  extractor_cls=CommonCrawlExtractor):
+                                  extractor_cls=CommonCrawlExtractor,
+                                  fetch_images=False):
     """
     Starts a single CommonCrawlExtractor
     :param warc_download_url:
@@ -263,16 +264,17 @@ def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extract
                                            show_download_progress=show_download_progress,
                                            log_level=log_level,
                                            delete_warc_after_extraction=delete_warc_after_extraction,
-                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs)
+                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
+                                           fetch_images=fetch_images)


 def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_completed=None, valid_hosts=None,
-                           start_date=None, end_date=None, warc_files_start_date=None, strict_date=True,
-                           reuse_previously_downloaded_files=True, local_download_dir_warc=None,
+                           start_date=None, end_date=None, warc_files_start_date=None, strict_date=True,
+                           reuse_previously_downloaded_files=True, local_download_dir_warc=None,
                            continue_after_error=True, show_download_progress=False,
                            number_of_extraction_processes=4, log_level=logging.ERROR,
                            delete_warc_after_extraction=True, continue_process=True,
-                           extractor_cls=CommonCrawlExtractor):
+                           extractor_cls=CommonCrawlExtractor, fetch_images=False):
     """
     Crawl and extract articles from the news crawl provided by commoncrawl.org. For each article that was extracted
     successfully the callback function callback_on_article_extracted is invoked where the first parameter is the
@@ -340,7 +342,8 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
                                                 log_level=log_level,
                                                 delete_warc_after_extraction=delete_warc_after_extraction,
                                                 log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
-                                                extractor_cls=extractor_cls),
+                                                extractor_cls=extractor_cls,
+                                                fetch_images=fetch_images),
                                         warc_download_urls)
     else:
         for warc_download_url in warc_download_urls:
@@ -357,4 +360,5 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
                                           log_level=log_level,
                                           delete_warc_after_extraction=delete_warc_after_extraction,
                                           log_pathname_fully_extracted_warcs=__log_pathname_fully_extracted_warcs,
-                                          extractor_cls=extractor_cls)
+                                          extractor_cls=extractor_cls,
+                                          fetch_images=fetch_images)
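With this change, callers of the public API can opt in to image fetching. As a minimal usage sketch (not part of this commit; it assumes the module path newsplease.crawler.commoncrawl_crawler used by the bundled example script, and a user-supplied callback named on_article):

import logging

from newsplease.crawler import commoncrawl_crawler


def on_article(article):
    # invoked once per successfully extracted article
    print(article.title)


# fetch_images=True asks news-please to fetch each article's main image
# from the live webpage, since the WARC files themselves contain no images
commoncrawl_crawler.crawl_from_commoncrawl(
    on_article,
    log_level=logging.ERROR,
    fetch_images=True)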

View File

@@ -325,9 +325,9 @@ class CommonCrawlExtractor:
                                  valid_hosts=None,
                                  start_date=None, end_date=None,
                                  strict_date=True, reuse_previously_downloaded_files=True, local_download_dir_warc=None,
-                                 continue_after_error=True, ignore_unicode_errors=False, fetch_images=False,
+                                 continue_after_error=True, ignore_unicode_errors=False,
                                  show_download_progress=False, log_level=logging.ERROR, delete_warc_after_extraction=True,
-                                 log_pathname_fully_extracted_warcs=None):
+                                 log_pathname_fully_extracted_warcs=None, fetch_images=False):
        """
        Crawl and extract articles from the news crawl provided by commoncrawl.org. For each article that was extracted
        successfully the callback function callback_on_article_extracted is invoked where the first parameter is the

View File

@@ -23,6 +23,9 @@ that might have been installed with pip. Hence, you must run this script followi
     git clone https://github.com/fhamborg/news-please.git
     cd news-please
     python3 -m newsplease.examples.commoncrawl
+Note that by default the script does not extract main images since they are not contained in
+WARC files. You can enable extraction of main images by setting my_fetch_images=True in the
+configuration section below.
 """
 import hashlib
 import json
@@ -71,6 +74,10 @@ my_delete_warc_after_extraction = True
 # if True, will continue extraction from the latest fully downloaded but not fully extracted WARC files and then
 # crawling new WARC files. This assumes that the filter criteria have not been changed since the previous run!
 my_continue_process = True
+# if True, will crawl and extract the main image of each article. Note that WARC files
+# do not contain any images, so news-please will fetch the current image from the
+# article's online webpage if this option is enabled.
+my_fetch_images = False
 ############ END YOUR CONFIG #########
@@ -168,7 +175,8 @@ def main():
         number_of_extraction_processes=my_number_of_extraction_processes,
         log_level=my_log_level,
         delete_warc_after_extraction=my_delete_warc_after_extraction,
-        continue_process=True)
+        continue_process=True,
+        fetch_images=my_fetch_images)


 if __name__ == "__main__":
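To get the same behavior from the bundled script itself, the new config flag is the only switch that needs to change; a minimal sketch (the rest of the script stays as shipped):

# in the YOUR CONFIG section of newsplease/examples/commoncrawl.py
my_fetch_images = True  # main images are fetched from the live article pages

# then run the script as a module, as its header describes:
#   python3 -m newsplease.examples.commoncrawl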