Mirror of https://github.com/fhamborg/news-please.git

Merge pull request #198 from mood-mapping-muppets/robust-unicode-warc

Add option of replacing unicode decode errors in WARC/common crawl extraction
Felix Hamborg, 2021-02-08 16:20:04 +01:00 (committed via GitHub)
2 changed files with 24 additions and 7 deletions
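From user code, the new keyword is used roughly like this (a minimal sketch; the file name is illustrative, and only the decode_errors keyword comes from this change):

from warcio.archiveiterator import ArchiveIterator
from newsplease import NewsPlease

# File name is illustrative; any local WARC file with 'response' records works.
with open('CC-NEWS-sample.warc.gz', 'rb') as stream:
    for record in ArchiveIterator(stream):
        if record.rec_type != 'response':
            continue
        # 'replace' swaps undecodable bytes for U+FFFD instead of raising
        # UnicodeDecodeError; 'strict' keeps the previous behaviour.
        article = NewsPlease.from_warc(record, decode_errors='replace')
        print(article.source_domain, article.date_publish)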


@@ -21,7 +21,7 @@ class NewsPlease:
"""
@staticmethod
def from_warc(warc_record):
def from_warc(warc_record, decode_errors="replace"):
"""
Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the article
extractor.
@@ -39,7 +39,11 @@ class NewsPlease:
         if not encoding:
             # assume utf-8
             encoding = 'utf-8'
-        html = raw_stream.decode(encoding)
+        try:
+            html = raw_stream.decode(encoding, errors=decode_errors)
+        except LookupError:
+            # non-existent encoding: fall back to utf-8
+            html = raw_stream.decode('utf-8', errors=decode_errors)
         url = warc_record.rec_headers.get_header('WARC-Target-URI')
         download_date = warc_record.rec_headers.get_header('WARC-Date')
         article = NewsPlease.from_html(html, url=url, download_date=download_date)
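The two failure modes handled here can be reproduced in plain Python (a standalone sketch, independent of news-please):

raw = b'caf\xe9'  # Latin-1 bytes; the last byte is not valid UTF-8

try:
    raw.decode('utf-8')  # errors defaults to 'strict'
except UnicodeDecodeError as err:
    print('strict decoding fails:', err)

print(raw.decode('utf-8', errors='replace'))  # 'caf\ufffd' - the bad byte becomes U+FFFD

try:
    raw.decode('no-such-codec', errors='replace')
except LookupError as err:
    print('unknown encoding raises LookupError:', err)  # hence the utf-8 fallback above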


@@ -42,6 +42,8 @@ class CommonCrawlExtractor:
     __reuse_previously_downloaded_files = True
     # continue after error
     __continue_after_error = False
+    # ignore unicode errors
+    __ignore_unicode_errors = False
     # log level
     __log_level = logging.INFO
     __delete_warc_after_extraction = True
@@ -116,7 +118,7 @@ class CommonCrawlExtractor:
         # filter by date
         if self.__filter_start_date or self.__filter_end_date:
             if not article:
-                article = NewsPlease.from_warc(warc_record)
+                article = self._from_warc(warc_record)
 
             publishing_date = self.__get_publishing_date(warc_record, article)
             if not publishing_date:
@@ -227,6 +229,9 @@ class CommonCrawlExtractor:
         self.__logger.info('download completed, local file: %s', local_filepath)
         return local_filepath
 
+    def _from_warc(self, record):
+        return NewsPlease.from_warc(record, decode_errors="replace" if self.__ignore_unicode_errors else "strict")
+
     def __process_warc_gz_file(self, path_name):
         """
         Iterates all transactions in one WARC file and for each transaction tries to extract an article object.
@@ -248,10 +253,17 @@ class CommonCrawlExtractor:
                 counter_article_total += 1
 
                 # if the article passes filter tests, we notify the user
-                filter_pass, article = self.filter_record(record)
+                try:
+                    filter_pass, article = self.filter_record(record)
+                except UnicodeDecodeError:
+                    filter_pass = False
                 if filter_pass:
-                    if not article:
-                        article = NewsPlease.from_warc(record)
+                    try:
+                        if not article:
+                            article = self._from_warc(record)
+                    except UnicodeDecodeError:
+                        filter_pass = False
+                if filter_pass:
                     counter_article_passed += 1
 
                     self.__logger.info('article pass (%s; %s; %s)', article.source_domain, article.date_publish,
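The net effect of the flag inside this loop, in isolation (a schematic sketch; decoding a raw byte string stands in for whatever raises inside filter_record or _from_warc):

def handle_record(raw_bytes, ignore_unicode_errors):
    # Mirrors _from_warc: the flag selects the codec error handler.
    errors = 'replace' if ignore_unicode_errors else 'strict'
    try:
        return raw_bytes.decode('utf-8', errors=errors)
    except UnicodeDecodeError:
        # Like filter_pass = False above: drop the record, keep the job running.
        return None

assert handle_record(b'\xff', False) is None       # strict: record is skipped
assert handle_record(b'\xff', True) == '\ufffd'    # replace: record is kept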
@@ -311,7 +323,7 @@ class CommonCrawlExtractor:
                                 valid_hosts=None,
                                 start_date=None, end_date=None,
                                 strict_date=True, reuse_previously_downloaded_files=True, local_download_dir_warc=None,
-                                continue_after_error=True, show_download_progress=False,
+                                continue_after_error=True, ignore_unicode_errors=False, show_download_progress=False,
                                 log_level=logging.ERROR, delete_warc_after_extraction=True,
                                 log_pathname_fully_extracted_warcs=None):
         """
@@ -343,6 +355,7 @@ class CommonCrawlExtractor:
         self.__local_download_dir_warc = local_download_dir_warc
         self.__reuse_previously_downloaded_files = reuse_previously_downloaded_files
         self.__continue_after_error = continue_after_error
+        self.__ignore_unicode_errors = ignore_unicode_errors
         self.__callback_on_article_extracted = callback_on_article_extracted
         self.__callback_on_warc_completed = callback_on_warc_completed
         self.__show_download_progress = show_download_progress