1
0
mirror of https://github.com/michaelharms/comcrawl.git synced 2021-09-27 00:43:48 +03:00
Files
comcrawl-common-crawl/comcrawl/utils/_search_single_index.py
2020-01-15 07:57:16 +01:00

33 lines
811 B
Python

from typing import List, Dict
import json
import requests
# URL template for querying one Common Crawl index; fill in with
# ``.format(index=..., url=...)``. The query asks for JSON output.
SEARCH_URL_TEMPLATE = (
    "https://index.commoncrawl.org/"
    "CC-MAIN-{index}-index"
    "?url={url}&output=json"
)
def _search_single_index(index: str,
                         url: str,
                         timeout: float = 60.0) -> List[Dict]:
    """Searches single Common Crawl index for given URL pattern.

    The index is queried over HTTP and the response body is parsed as
    one JSON document per line.

    Args:
        index: Common Crawl index to search in.
        url: URL pattern to search for. It is inserted into the query
            string as-is (no percent-encoding is applied here).
        timeout: Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block the
            caller indefinitely.

    Returns:
        List of results found in specified Common Crawl index. The list
        is empty when the server answers with any status other than 200.

    Raises:
        requests.exceptions.RequestException: If the request fails at
            the transport level (including exceeding ``timeout``).
        json.JSONDecodeError: If a non-blank response line is not
            valid JSON.

    """
    search_url = SEARCH_URL_TEMPLATE.format(index=index, url=url)
    response = requests.get(search_url, timeout=timeout)

    # Any non-200 answer is treated as "nothing found" rather than an error.
    if response.status_code != 200:
        return []

    # Each non-blank line of the body is a standalone JSON document; blank
    # lines are skipped because json.loads would raise on them.
    return [
        json.loads(line)
        for line in response.content.splitlines()
        if line.strip()
    ]