mirror of
https://github.com/fhamborg/news-please.git
synced 2021-09-19 22:26:00 +03:00
Merge pull request #201 from mood-mapping-muppets/empty-warc-newspaper
Filter empty responses from WARC to avoid spurious exceptions from `newspaper`
This commit is contained in:
@@ -15,6 +15,10 @@ from newsplease.pipeline.pipelines import ExtractedInformationStorage
|
||||
from newsplease.crawler.simple_crawler import SimpleCrawler
|
||||
|
||||
|
||||
class EmptyResponseError(ValueError):
|
||||
pass
|
||||
|
||||
|
||||
class NewsPlease:
|
||||
"""
|
||||
Access news-please functionality via this interface
|
||||
@@ -44,6 +48,8 @@ class NewsPlease:
|
||||
except LookupError:
|
||||
# non-existent encoding: fallback to utf-8
|
||||
html = raw_stream.decode('utf-8', errors=decode_errors)
|
||||
if not html:
|
||||
raise EmptyResponseError()
|
||||
url = warc_record.rec_headers.get_header('WARC-Target-URI')
|
||||
download_date = warc_record.rec_headers.get_header('WARC-Date')
|
||||
article = NewsPlease.from_html(html, url=url, download_date=download_date)
|
||||
|
||||
@@ -17,7 +17,7 @@ from scrapy.utils.log import configure_logging
|
||||
from six.moves import urllib
|
||||
from warcio.archiveiterator import ArchiveIterator
|
||||
|
||||
from .. import NewsPlease
|
||||
from .. import NewsPlease, EmptyResponseError
|
||||
|
||||
__author__ = "Felix Hamborg"
|
||||
__copyright__ = "Copyright 2017"
|
||||
@@ -255,13 +255,13 @@ class CommonCrawlExtractor:
|
||||
# if the article passes filter tests, we notify the user
|
||||
try:
|
||||
filter_pass, article = self.filter_record(record)
|
||||
except UnicodeDecodeError:
|
||||
except (UnicodeDecodeError, EmptyResponseError):
|
||||
filter_pass = False
|
||||
if filter_pass:
|
||||
try:
|
||||
if not article:
|
||||
article = self._from_warc(record)
|
||||
except UnicodeDecodeError:
|
||||
except (UnicodeDecodeError, EmptyResponseError):
|
||||
filter_pass = False
|
||||
if filter_pass:
|
||||
counter_article_passed += 1
|
||||
|
||||
Reference in New Issue
Block a user