mirror of https://github.com/fhamborg/news-please.git synced 2021-09-19 22:26:00 +03:00

Support warc_files_end_date for the Common Crawl crawler.

This commit is contained in:
Shang Wang
2021-04-19 00:17:33 -04:00
parent 44a196f1eb
commit 1bea565ca1
2 changed files with 40 additions and 10 deletions

newsplease/crawler/commoncrawl_crawler.py

@@ -43,6 +43,8 @@ __counter_warc_skipped = 0
 __counter_warc_processed = 0
 __start_time = time.time()
+# When Common Crawl started.
+__common_crawl_start_date = datetime.datetime(2016, 8, 26)
 
 
 def __setup(local_download_dir_warc, log_level):
     """
@@ -90,7 +92,14 @@ def __get_download_url(name):
     """
     return __cc_base_url + name
 
 
-def __iterate_by_month(start_date, end_date, month_step=1):
+def __iterate_by_month(start_date=None, end_date=None, month_step=1):
+    if start_date is None:
+        # The starting month of Common Crawl.
+        start_date = __common_crawl_start_date
+    if end_date is None:
+        # Until now.
+        end_date = datetime.datetime.today()
     current_date = start_date
     while current_date < end_date:
         yield current_date
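The month-stepping arithmetic that advances current_date sits below the context shown in this hunk. For orientation, here is a minimal self-contained sketch of such a generator; the divmod roll-over and the reset to day 1 are assumptions about the unshown code, not the commit's exact implementation:

import datetime

COMMON_CRAWL_START_DATE = datetime.datetime(2016, 8, 26)  # first CC-NEWS data

def iterate_by_month(start_date=None, end_date=None, month_step=1):
    # Default to the full CC-NEWS history: from the first crawl month
    # up to (but not including) today.
    if start_date is None:
        start_date = COMMON_CRAWL_START_DATE
    if end_date is None:
        end_date = datetime.datetime.today()
    current_date = start_date
    while current_date < end_date:
        yield current_date
        # Advance month_step months, carrying overflow into the year.
        carry, new_month = divmod(current_date.month - 1 + month_step, 12)
        current_date = current_date.replace(
            year=current_date.year + carry, month=new_month + 1, day=1)

With both bounds left at None, this yields 2016-08-26 and then the first of every following month up to the present, matching the folder-per-month layout the crawler lists.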
@@ -109,11 +118,22 @@ def __extract_date_from_warc_filename(path):
     return datetime.datetime.strptime(dt, '%Y%m%d%H%M%S')
 
 
-def __get_remote_index(warc_files_start_date):
+def __date_within_period(date, start_date=None, end_date=None):
+    if start_date is None:
+        # The starting month of Common Crawl.
+        start_date = __common_crawl_start_date
+    if end_date is None:
+        # Until now.
+        end_date = datetime.datetime.today()
+    return start_date <= date < end_date
+
+
+def __get_remote_index(warc_files_start_date, warc_files_end_date):
     """
     Gets the index of news crawl files from commoncrawl.org and returns an array of names
     :param warc_files_start_date: only list .warc files with greater or equal date in
            their filename
+    :param warc_files_end_date: only list .warc files with smaller date in their filename
     :return:
     """
@@ -128,7 +148,7 @@ def __get_remote_index(warc_files_start_date):
     # get the remote info
     cmd = ''
-    if warc_files_start_date:
+    if warc_files_start_date or warc_files_end_date:
         # cleanup
         try:
             os.remove(temp_filename)
@@ -136,7 +156,7 @@ def __get_remote_index(warc_files_start_date):
             pass
 
         # The news files are grouped per year and month in separate folders
-        warc_dates = __iterate_by_month(warc_files_start_date, datetime.datetime.today())
+        warc_dates = __iterate_by_month(start_date=warc_files_start_date, end_date=warc_files_end_date)
         for date in warc_dates:
             year = date.strftime('%Y')
             month = date.strftime('%m')
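Passing the end date through means the crawler now builds one remote listing per month inside [warc_files_start_date, warc_files_end_date) instead of listing every month up to today. A sketch of the per-month work this loop sets up; the aws s3 ls command string is an assumption about the surrounding, unshown code:

def monthly_listing_commands(warc_dates, temp_filename):
    for date in warc_dates:
        year = date.strftime('%Y')   # e.g. '2020'
        month = date.strftime('%m')  # e.g. '03'
        # One listing command per month folder of the public CC-NEWS bucket.
        yield ('aws s3 ls --recursive '
               's3://commoncrawl/crawl-data/CC-NEWS/%s/%s/ '
               '--no-sign-request >> %s' % (year, month, temp_filename))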
@@ -155,10 +175,15 @@ def __get_remote_index(warc_files_start_date):
     lines = stdout_data.splitlines()
 
-    if warc_files_start_date:
+    if warc_files_start_date or warc_files_end_date:
         # Now filter further on day of month, hour, minute
-        lines = [p for p in lines
-                 if __extract_date_from_warc_filename(p) >= warc_files_start_date]
+        lines = [
+            p for p in lines if __date_within_period(
+                __extract_date_from_warc_filename(p),
+                start_date=warc_files_start_date,
+                end_date=warc_files_end_date,
+            )
+        ]
 
     return lines
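The per-month listing is only month-granular, so this comprehension re-filters individual files by the timestamp embedded in each CC-NEWS filename. A sketch of the extraction step it relies on; the regex and the example filename are assumptions consistent with the strptime format shown at the top of this file's earlier hunk:

import datetime
import os
import re

def extract_date_from_warc_filename(path):
    # e.g. 'crawl-data/CC-NEWS/2020/03/CC-NEWS-20200301025006-00522.warc.gz'
    fn = os.path.basename(path)
    dt = re.search(r'(\d{14})', fn).group(1)  # YYYYMMDDhhmmss
    return datetime.datetime.strptime(dt, '%Y%m%d%H%M%S')

print(extract_date_from_warc_filename(
    'crawl-data/CC-NEWS/2020/03/CC-NEWS-20200301025006-00522.warc.gz'))
# -> 2020-03-01 02:50:06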
@@ -269,7 +294,7 @@ def __start_commoncrawl_extractor(warc_download_url, callback_on_article_extract
 def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_completed=None, valid_hosts=None,
-                           start_date=None, end_date=None, warc_files_start_date=None, strict_date=True,
+                           start_date=None, end_date=None, warc_files_start_date=None, warc_files_end_date=None, strict_date=True,
                            reuse_previously_downloaded_files=True, local_download_dir_warc=None,
                            continue_after_error=True, show_download_progress=False,
                            number_of_extraction_processes=4, log_level=logging.ERROR,
@@ -286,6 +311,8 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
     :param valid_hosts:
     :param start_date:
     :param end_date:
+    :param warc_files_start_date:
+    :param warc_files_end_date:
     :param strict_date:
     :param reuse_previously_downloaded_files:
     :param local_download_dir_warc:
@@ -300,7 +327,7 @@ def crawl_from_commoncrawl(callback_on_article_extracted, callback_on_warc_compl
     global __extern_callback_on_warc_completed
     __extern_callback_on_warc_completed = callback_on_warc_completed
 
-    cc_news_crawl_names = __get_remote_index(warc_files_start_date)
+    cc_news_crawl_names = __get_remote_index(warc_files_start_date, warc_files_end_date)
 
     global __number_of_warc_files_on_cc
     __number_of_warc_files_on_cc = len(cc_news_crawl_names)
     __logger.info('found %i files at commoncrawl.org', __number_of_warc_files_on_cc)

newsplease/examples/commoncrawl.py

@@ -53,8 +53,10 @@ my_filter_valid_hosts = []  # example: ['elrancaguino.cl']
 my_filter_start_date = None  # datetime.datetime(2016, 1, 1)
 # end date (if None, any date is OK as end date), as datetime
 my_filter_end_date = None  # datetime.datetime(2016, 12, 31)
-# if date filtering is strict and news-please could not detect the date of an article, the article will be discarded
+# Only .warc files published within [my_warc_files_start_date, my_warc_files_end_date) will be downloaded.
 my_warc_files_start_date = None  # example: datetime.datetime(2020, 3, 1)
+my_warc_files_end_date = None  # example: datetime.datetime(2020, 3, 2)
+# if date filtering is strict and news-please could not detect the date of an article, the article will be discarded
 my_filter_strict_date = True
 # if True, the script checks whether a file has been downloaded already and uses that file instead of downloading
 # again. Note that there is no check whether the file has been downloaded completely or is valid!
@@ -167,6 +169,7 @@ def main():
         start_date=my_filter_start_date,
         end_date=my_filter_end_date,
         warc_files_start_date=my_warc_files_start_date,
+        warc_files_end_date=my_warc_files_end_date,
         strict_date=my_filter_strict_date,
         reuse_previously_downloaded_files=my_reuse_previously_downloaded_files,
         local_download_dir_warc=my_local_download_dir_warc,
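Putting both files together, a hedged end-to-end sketch of the new parameter from a caller's perspective; the import path is an assumption about the package layout, and on_article is a placeholder callback:

import datetime
from newsplease.crawler import commoncrawl_crawler

def on_article(article):
    # Called once per successfully extracted article.
    print(article.source_domain, article.title)

# Download and process only WARC files from March 2020; the end bound
# is exclusive, matching the half-open interval used by the crawler.
commoncrawl_crawler.crawl_from_commoncrawl(
    on_article,
    warc_files_start_date=datetime.datetime(2020, 3, 1),
    warc_files_end_date=datetime.datetime(2020, 4, 1),
)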