mirror of
https://github.com/michaelharms/comcrawl.git
synced 2021-09-27 00:43:48 +03:00
Merge pull request #25 from michaelharms/fix/crash-no-html
Fix/crash no html
This commit is contained in:
@@ -45,7 +45,8 @@ def download_single_result(result: Result) -> Result:
|
||||
result["html"] = ""
|
||||
|
||||
if len(data) > 0:
|
||||
__, ___, result["html"] = data.strip().split("\r\n\r\n", 2)
|
||||
data_parts = data.strip().split("\r\n\r\n", 2)
|
||||
result["html"] = data_parts[2] if len(data_parts) == 3 else ""
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@@ -17,12 +17,31 @@ KNOWN_RESULT = {
|
||||
'CC-MAIN-20191207160050-20191207184050-00394.warc.gz',
|
||||
'status': '200'}
|
||||
|
||||
# Index entry for a capture recorded with status 301 (a redirect) whose
# record is stored under a "crawldiagnostics" path. It is the input of
# test_download_single_result_without_html, which expects an empty "html".
KNOWN_RESULT_NO_HTML = {
    'urlkey': 'org,wikipedia,de)/wiki/%20vaterl%c3%a4ndische_front',
    'timestamp': '20191211090655',
    'digest': '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ',
    'redirect': 'https://de.wikipedia.org/wiki/Vaterl%C3%A4ndische_Front',
    'mime-detected': 'text/html',
    'offset': '27349613',
    'length': '1075',
    # Two adjacent literals joined by implicit concatenation to keep the
    # long path within the line-length limit.
    'filename': ('crawl-data/CC-MAIN-2019-51/segments/1575540530452.95/'
                 'crawldiagnostics/'
                 'CC-MAIN-20191211074417-20191211102417-00094.warc.gz'),
    'url': 'https://de.wikipedia.org/wiki/%20Vaterl%C3%A4ndische_Front',
    'status': '301',
    'mime': 'text/html',
}
|
||||
|
||||
|
||||
def test_download_single_result(snapshot):
    """Check the HTML downloaded for KNOWN_RESULT against a snapshot.

    Args:
        snapshot: Object exposing ``assert_match``; presumably a pytest
            snapshot fixture -- confirm against the test configuration.
    """
    result = download_single_result(KNOWN_RESULT)
    snapshot.assert_match(result["html"])
|
||||
|
||||
|
||||
def test_download_single_result_without_html():
    """Check that a result without an HTML part yields an empty string.

    KNOWN_RESULT_NO_HTML is an index entry with status 301; the download
    is expected to set ``"html"`` to ``""`` for it.
    """
    result = download_single_result(KNOWN_RESULT_NO_HTML)
    assert result["html"] == ""
|
||||
|
||||
|
||||
KNOWN_RESULTS = [{'charset': 'UTF-8',
|
||||
'digest': '745JGUNVPWB4L3TWJIGUQRQFTFSREJ5J',
|
||||
'filename': ('crawl-data/CC-MAIN-2019-51/segments/1575540500637.40/'
|
||||
|
||||
Reference in New Issue
Block a user