1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-19 22:26:00 +03:00

Fallback to utf-8 when document gives unknown encoding

This commit is contained in:
Frankie Robertson
2021-02-08 07:18:05 +02:00
parent 2ade640525
commit 5ed9b804bf

View File

@@ -39,7 +39,11 @@ class NewsPlease:
# assume utf-8
encoding = 'utf-8'
html = raw_stream.decode(encoding, errors=decode_errors)
try:
html = raw_stream.decode(encoding, errors=decode_errors)
except LookupError:
# non-existent encoding: fall back to utf-8
html = raw_stream.decode('utf-8', errors=decode_errors)
url = warc_record.rec_headers.get_header('WARC-Target-URI')
download_date = warc_record.rec_headers.get_header('WARC-Date')
article = NewsPlease.from_html(html, url=url, download_date=download_date)