1
0
mirror of https://github.com/michaelharms/comcrawl.git synced 2021-09-27 00:43:48 +03:00
Files
comcrawl-common-crawl/comcrawl/search_index.py
2020-01-10 09:12:44 +01:00

29 lines
728 B
Python

import json
from typing import List, Dict
from urllib.parse import quote

import requests
# Query URL for one Common Crawl index; the ``{index}`` and ``{url}``
# placeholders are filled in with ``str.format``.
search_url_template = (
    "https://index.commoncrawl.org/"
    "CC-MAIN-{index}-index"
    "?url={url}&output=json"
)
def search_index(index: str, url: str) -> List[Dict]:
    """Searches single Common Crawl index for given URL pattern.

    The URL pattern is percent-encoded before it is placed in the query
    string, so characters such as ``&``, ``#`` or ``+`` inside the pattern
    are transmitted as part of the pattern instead of altering the request.

    Args:
        index: Common Crawl index to search in; substituted into the
            ``CC-MAIN-{index}-index`` part of the request URL.
        url: URL pattern to search for.

    Returns:
        List of results found in specified Common Crawl index, one entry
        decoded from each non-blank line of the response body. An empty
        list is returned when the response status code is not 200.

    """
    # Encode every reserved character: the pattern is a single query
    # parameter value and must not be able to inject further parameters.
    search_url = search_url_template.format(index=index,
                                            url=quote(url, safe=""))
    response = requests.get(search_url)

    if response.status_code != 200:
        return []

    # The body holds one JSON document per line; blank lines are skipped
    # because json.loads would raise on them.
    return [json.loads(line)
            for line in response.content.splitlines()
            if line.strip()]