1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-19 22:26:00 +03:00

Merge pull request #201 from mood-mapping-muppets/empty-warc-newspaper

Filter empty responses from WARC to avoid spurious exceptions from `newspaper`
This commit is contained in:
Felix Hamborg
2021-02-16 11:17:23 +01:00
committed by GitHub
2 changed files with 9 additions and 3 deletions

View File

@@ -15,6 +15,10 @@ from newsplease.pipeline.pipelines import ExtractedInformationStorage
from newsplease.crawler.simple_crawler import SimpleCrawler
# Raised when the decoded HTML of a WARC record is empty, so that callers can
# catch it and skip the record instead of passing empty input on to extraction.
class EmptyResponseError(ValueError):
pass
class NewsPlease:
"""
Access news-please functionality via this interface
@@ -44,6 +48,8 @@ class NewsPlease:
except LookupError:
# non-existent encoding: fallback to utf-8
html = raw_stream.decode('utf-8', errors=decode_errors)
if not html:
raise EmptyResponseError()
url = warc_record.rec_headers.get_header('WARC-Target-URI')
download_date = warc_record.rec_headers.get_header('WARC-Date')
article = NewsPlease.from_html(html, url=url, download_date=download_date)

View File

@@ -17,7 +17,7 @@ from scrapy.utils.log import configure_logging
from six.moves import urllib
from warcio.archiveiterator import ArchiveIterator
from .. import NewsPlease
from .. import NewsPlease, EmptyResponseError
__author__ = "Felix Hamborg"
__copyright__ = "Copyright 2017"
@@ -255,13 +255,13 @@ class CommonCrawlExtractor:
# if the article passes filter tests, we notify the user
try:
filter_pass, article = self.filter_record(record)
except UnicodeDecodeError:
except (UnicodeDecodeError, EmptyResponseError):
filter_pass = False
if filter_pass:
try:
if not article:
article = self._from_warc(record)
except UnicodeDecodeError:
except (UnicodeDecodeError, EmptyResponseError):
filter_pass = False
if filter_pass:
counter_article_passed += 1