1
0
mirror of https://github.com/michaelharms/comcrawl.git synced 2021-09-27 00:43:48 +03:00

beginning to implement search api

This commit is contained in:
Michael Harms
2020-01-10 09:12:44 +01:00
parent d237268572
commit 64c8fda352
8 changed files with 149 additions and 4 deletions

View File

@@ -1 +1,3 @@
__version__ = '0.1.0'
__version__ = "0.1.0"
from .search import search

View File

39
comcrawl/search.py Normal file
View File

@@ -0,0 +1,39 @@
import concurrent.futures
from typing import Dict, List, Optional

from .search_index import search_index
def search(
    url: str,
    indices: List[str],
    threads: Optional[int] = None
) -> List[Dict[str, Dict]]:
    """Search multiple Common Crawl indices for a URL pattern.

    Args:
        url: The URL pattern to search for.
        indices: List of Common Crawl indices to search in.
        threads: Number of worker threads used to search the indices
            concurrently. When ``None`` or ``0`` the indices are searched
            one after another in the calling thread.

    Returns:
        List of all results found throughout the specified Common Crawl
        indices. Results are grouped by index, in the order the indices
        were passed in, regardless of whether threads are used.
    """
    if threads:
        # multi-threaded search
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            # Executor.map yields results in input order (unlike as_completed),
            # so the output is deterministic and matches the single-threaded path.
            per_index = list(executor.map(lambda index: search_index(index, url), indices))
    else:
        # single-threaded search
        per_index = [search_index(index, url) for index in indices]

    # Flatten the per-index result lists into one list.
    return [result for index_results in per_index for result in index_results]

28
comcrawl/search_index.py Normal file
View File

@@ -0,0 +1,28 @@
from typing import List, Dict
import json
import requests
search_url_template = "https://index.commoncrawl.org/CC-MAIN-{index}-index?url={url}&output=json"


def search_index(index: str, url: str, timeout: float = 60.0) -> List[Dict]:
    """Search a single Common Crawl index for the given URL pattern.

    Args:
        index: Common Crawl index to search in; it is substituted into
            the ``CC-MAIN-{index}-index`` part of the request URL.
        url: URL pattern to search for. Pass it unencoded; it is
            percent-encoded here before being placed in the query string.
        timeout: Seconds to wait for the index server before giving up.

    Returns:
        List of results found in the specified Common Crawl index, one
        decoded JSON document per non-empty line of the response body.
        An empty list is returned for any non-200 response.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Local import keeps the module's top-level import block untouched.
    from urllib.parse import quote

    # Encode the pattern so characters such as "&", "#" or "?" cannot break
    # out of the "url" query parameter.
    search_url = search_url_template.format(index=index, url=quote(url, safe=""))

    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(search_url, timeout=timeout)

    if response.status_code != 200:
        return []

    # The body is parsed line by line; blank lines are skipped because
    # json.loads would raise on them.
    return [json.loads(line) for line in response.content.splitlines() if line.strip()]

77
poetry.lock generated
View File

@@ -64,6 +64,22 @@ colorama = ">=0.3.9"
six = ">=1.10.0"
stevedore = ">=1.20.0"
[[package]]
category = "main"
description = "Python package for providing Mozilla's CA Bundle."
name = "certifi"
optional = false
python-versions = "*"
version = "2019.11.28"
[[package]]
category = "main"
description = "Universal encoding detector for Python 2 and 3"
name = "chardet"
optional = false
python-versions = "*"
version = "3.0.4"
[[package]]
category = "dev"
description = "Cross-platform colored terminal text."
@@ -106,6 +122,14 @@ version = "3.0.5"
[package.dependencies]
gitdb2 = ">=2.0.0"
[[package]]
category = "main"
description = "Internationalized Domain Names in Applications (IDNA)"
name = "idna"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
version = "2.8"
[[package]]
category = "dev"
description = "Read metadata from Python packages"
@@ -306,6 +330,24 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
version = "5.3"
[[package]]
category = "main"
description = "Python HTTP for Humans."
name = "requests"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
version = "2.22.0"
[package.dependencies]
certifi = ">=2017.4.17"
chardet = ">=3.0.2,<3.1.0"
idna = ">=2.5,<2.9"
urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26"
[package.extras]
security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"]
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"]
[[package]]
category = "dev"
description = "Python 2 and 3 compatibility utilities"
@@ -350,6 +392,19 @@ optional = false
python-versions = "*"
version = "3.7.4.1"
[[package]]
category = "main"
description = "HTTP library with thread-safe connection pooling, file post, and more."
name = "urllib3"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
version = "1.25.7"
[package.extras]
brotli = ["brotlipy (>=0.6.0)"]
secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"]
[[package]]
category = "dev"
description = "Measures number of Terminal column cells of wide-character codes"
@@ -383,7 +438,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
testing = ["pathlib2", "contextlib2", "unittest2"]
[metadata]
content-hash = "3c5394e15ed1d821eea22c4ee68e816a079b593a732bdc93c7eba2588dbbfed4"
content-hash = "aa92ddd8019bcaf825a3e667a9eb1b36023bb1f66f3edffc2fe2c222d69f16e1"
python-versions = "^3.7"
[metadata.files]
@@ -406,6 +461,14 @@ bandit = [
{file = "bandit-1.6.2-py2.py3-none-any.whl", hash = "sha256:336620e220cf2d3115877685e264477ff9d9abaeb0afe3dc7264f55fa17a3952"},
{file = "bandit-1.6.2.tar.gz", hash = "sha256:41e75315853507aa145d62a78a2a6c5e3240fe14ee7c601459d0df9418196065"},
]
certifi = [
{file = "certifi-2019.11.28-py2.py3-none-any.whl", hash = "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3"},
{file = "certifi-2019.11.28.tar.gz", hash = "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"},
]
chardet = [
{file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
{file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
]
colorama = [
{file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"},
{file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"},
@@ -451,6 +514,10 @@ gitpython = [
{file = "GitPython-3.0.5-py3-none-any.whl", hash = "sha256:c155c6a2653593ccb300462f6ef533583a913e17857cfef8fc617c246b6dc245"},
{file = "GitPython-3.0.5.tar.gz", hash = "sha256:9c2398ffc3dcb3c40b27324b316f08a4f93ad646d5a6328cafbb871aa79f5e42"},
]
idna = [
{file = "idna-2.8-py2.py3-none-any.whl", hash = "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"},
{file = "idna-2.8.tar.gz", hash = "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407"},
]
importlib-metadata = [
{file = "importlib_metadata-1.3.0-py2.py3-none-any.whl", hash = "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"},
{file = "importlib_metadata-1.3.0.tar.gz", hash = "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45"},
@@ -559,6 +626,10 @@ pyyaml = [
{file = "PyYAML-5.3-cp38-cp38-win_amd64.whl", hash = "sha256:cb1f2f5e426dc9f07a7681419fe39cee823bb74f723f36f70399123f439e9b20"},
{file = "PyYAML-5.3.tar.gz", hash = "sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615"},
]
requests = [
{file = "requests-2.22.0-py2.py3-none-any.whl", hash = "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"},
{file = "requests-2.22.0.tar.gz", hash = "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4"},
]
six = [
{file = "six-1.13.0-py2.py3-none-any.whl", hash = "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd"},
{file = "six-1.13.0.tar.gz", hash = "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"},
@@ -598,6 +669,10 @@ typing-extensions = [
{file = "typing_extensions-3.7.4.1-py3-none-any.whl", hash = "sha256:cf8b63fedea4d89bab840ecbb93e75578af28f76f66c35889bd7065f5af88575"},
{file = "typing_extensions-3.7.4.1.tar.gz", hash = "sha256:091ecc894d5e908ac75209f10d5b4f118fbdb2eb1ede6a63544054bb1edb41f2"},
]
urllib3 = [
{file = "urllib3-1.25.7-py2.py3-none-any.whl", hash = "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293"},
{file = "urllib3-1.25.7.tar.gz", hash = "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"},
]
wcwidth = [
{file = "wcwidth-0.1.8-py2.py3-none-any.whl", hash = "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603"},
{file = "wcwidth-0.1.8.tar.gz", hash = "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8"},

View File

@@ -6,6 +6,7 @@ authors = ["Michael Harms <michaelharms95@icloud.com>"]
[tool.poetry.dependencies]
python = "^3.7"
requests = "^2.22.0"
[tool.poetry.dev-dependencies]
pytest = "^5.2"

View File

@@ -2,7 +2,7 @@
max-line-length = 120
[pydocstyle]
convention = numpy
convention = google
add-ignore = D205
[mypy]

View File

@@ -2,4 +2,4 @@ from comcrawl import __version__
def test_version():
    """Check that the ``__version__`` imported from ``comcrawl`` is "0.1.0"."""
    # Both assertions compare against the same value; the literals differ
    # only in quote style.
    assert __version__ == '0.1.0'
    assert __version__ == "0.1.0"