1
0
mirror of https://github.com/michaelharms/comcrawl.git synced 2021-09-27 00:43:48 +03:00

Merge pull request #25 from michaelharms/fix/crash-no-html

Fix/crash no html
This commit is contained in:
michaelharms
2020-03-06 14:30:53 +01:00
committed by GitHub
2 changed files with 21 additions and 1 deletions

View File

@@ -45,7 +45,8 @@ def download_single_result(result: Result) -> Result:
result["html"] = ""
if len(data) > 0:
__, ___, result["html"] = data.strip().split("\r\n\r\n", 2)
data_parts = data.strip().split("\r\n\r\n", 2)
result["html"] = data_parts[2] if len(data_parts) == 3 else ""
return result

View File

@@ -17,12 +17,31 @@ KNOWN_RESULT = {
'CC-MAIN-20191207160050-20191207184050-00394.warc.gz',
'status': '200'}
# Index record with status '301' and a 'redirect' target; the test that uses
# it expects download_single_result to yield an empty "html" value.
KNOWN_RESULT_NO_HTML = {
    "urlkey": "org,wikipedia,de)/wiki/%20vaterl%c3%a4ndische_front",
    "timestamp": "20191211090655",
    "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ",
    "redirect": "https://de.wikipedia.org/wiki/Vaterl%C3%A4ndische_Front",
    "mime-detected": "text/html",
    "offset": "27349613",
    "length": "1075",
    "filename": (
        "crawl-data/CC-MAIN-2019-51/segments/1575540530452.95/"
        "crawldiagnostics/CC-MAIN-20191211074417-20191211102417-00094.warc.gz"
    ),
    "url": "https://de.wikipedia.org/wiki/%20Vaterl%C3%A4ndische_Front",
    "status": "301",
    "mime": "text/html",
}
def test_download_single_result(snapshot):
    """Compare the HTML downloaded for KNOWN_RESULT with the stored snapshot."""
    html = download_single_result(KNOWN_RESULT)["html"]
    snapshot.assert_match(html)
def test_download_single_result_without_html():
    """Downloading KNOWN_RESULT_NO_HTML must leave "html" as an empty string."""
    html = download_single_result(KNOWN_RESULT_NO_HTML)["html"]
    assert html == ""
KNOWN_RESULTS = [{'charset': 'UTF-8',
'digest': '745JGUNVPWB4L3TWJIGUQRQFTFSREJ5J',
'filename': ('crawl-data/CC-MAIN-2019-51/segments/1575540500637.40/'