mirror of
https://github.com/michaelharms/comcrawl.git
synced 2021-09-27 00:43:48 +03:00
beginning to implement search api
This commit is contained in:
@@ -1 +1,3 @@
|
||||
__version__ = '0.1.0'
|
||||
__version__ = "0.1.0"
|
||||
|
||||
from .search import search
|
||||
|
||||
0
comcrawl/default_indices.py
Normal file
0
comcrawl/default_indices.py
Normal file
39
comcrawl/search.py
Normal file
39
comcrawl/search.py
Normal file
@@ -0,0 +1,39 @@
|
||||
from typing import List, Dict
|
||||
import concurrent.futures
|
||||
from .search_index import search_index
|
||||
|
||||
|
||||
def search(
    url: str,
    indices: List[str],
    threads: int = None
) -> List[Dict[str, Dict]]:
    """Search several Common Crawl indices for a URL pattern.

    Every index is queried through ``search_index`` and the per-index result
    lists are concatenated in the order given by ``indices``, regardless of
    whether the search runs on one thread or on many.

    Args:
        url: The URL pattern to search for.
        indices: List of Common Crawl indices to search in.
        threads: Maximum number of worker threads to use. When falsy
            (``None`` or ``0``) the indices are searched one after another
            on the calling thread.

    Returns:
        List of all results found throughout the specified Common Crawl indices.

    """
    results = []

    if threads:
        # multi-threaded search: submit every index up front so the requests
        # overlap, then collect in submission order so the output order is
        # deterministic and matches the single-threaded path.
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            futures = [executor.submit(search_index, index, url) for index in indices]

            for future in futures:
                # result() re-raises any exception raised inside the worker.
                results.extend(future.result())
    else:
        # single-threaded search
        for index in indices:
            results.extend(search_index(index, url))

    return results
|
||||
28
comcrawl/search_index.py
Normal file
28
comcrawl/search_index.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from typing import List, Dict
|
||||
import json
|
||||
import requests
|
||||
|
||||
# URL template for querying a single Common Crawl index. It is filled in with
# the ``index`` and ``url`` placeholders and asks the server for JSON output.
search_url_template = "https://index.commoncrawl.org/CC-MAIN-{index}-index?url={url}&output=json"
|
||||
|
||||
|
||||
def search_index(index: str, url: str, timeout: float = 60.0) -> List[Dict]:
    """Search a single Common Crawl index for the given URL pattern.

    The pattern is percent-encoded before it is placed in the query string, so
    characters such as ``&``, ``#``, ``?`` or spaces inside the pattern cannot
    be mistaken for part of the request's own query syntax.

    Args:
        index: Common Crawl index to search in.
        url: URL pattern to search for.
        timeout: Value passed to ``requests.get`` as its ``timeout`` argument
            (in seconds), so that an unresponsive server cannot block the
            caller indefinitely.

    Returns:
        List of results found in specified Common Crawl index, one decoded
        JSON document per non-blank line of the response body. An empty list
        is returned when the response status code is not 200.

    """
    # Imported locally so this function does not depend on a change to the
    # module's top-level import block.
    from urllib.parse import quote

    # "/", ":" and "*" are left readable; they are legal inside a query string
    # and commonly appear in URL patterns.
    encoded_url = quote(url, safe="/:*")
    search_url = search_url_template.format(index=index, url=encoded_url)

    response = requests.get(search_url, timeout=timeout)

    if response.status_code != 200:
        return []

    # The body is parsed line by line; blank lines are skipped because
    # json.loads would raise on them.
    return [json.loads(line) for line in response.content.splitlines() if line.strip()]
|
||||
77
poetry.lock
generated
77
poetry.lock
generated
@@ -64,6 +64,22 @@ colorama = ">=0.3.9"
|
||||
six = ">=1.10.0"
|
||||
stevedore = ">=1.20.0"
|
||||
|
||||
[[package]]
|
||||
category = "main"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
name = "certifi"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
version = "2019.11.28"
|
||||
|
||||
[[package]]
|
||||
category = "main"
|
||||
description = "Universal encoding detector for Python 2 and 3"
|
||||
name = "chardet"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
version = "3.0.4"
|
||||
|
||||
[[package]]
|
||||
category = "dev"
|
||||
description = "Cross-platform colored terminal text."
|
||||
@@ -106,6 +122,14 @@ version = "3.0.5"
|
||||
[package.dependencies]
|
||||
gitdb2 = ">=2.0.0"
|
||||
|
||||
[[package]]
|
||||
category = "main"
|
||||
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||
name = "idna"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
version = "2.8"
|
||||
|
||||
[[package]]
|
||||
category = "dev"
|
||||
description = "Read metadata from Python packages"
|
||||
@@ -306,6 +330,24 @@ optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
version = "5.3"
|
||||
|
||||
[[package]]
|
||||
category = "main"
|
||||
description = "Python HTTP for Humans."
|
||||
name = "requests"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
version = "2.22.0"
|
||||
|
||||
[package.dependencies]
|
||||
certifi = ">=2017.4.17"
|
||||
chardet = ">=3.0.2,<3.1.0"
|
||||
idna = ">=2.5,<2.9"
|
||||
urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26"
|
||||
|
||||
[package.extras]
|
||||
security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"]
|
||||
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"]
|
||||
|
||||
[[package]]
|
||||
category = "dev"
|
||||
description = "Python 2 and 3 compatibility utilities"
|
||||
@@ -350,6 +392,19 @@ optional = false
|
||||
python-versions = "*"
|
||||
version = "3.7.4.1"
|
||||
|
||||
[[package]]
|
||||
category = "main"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
name = "urllib3"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
|
||||
version = "1.25.7"
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotlipy (>=0.6.0)"]
|
||||
secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
|
||||
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"]
|
||||
|
||||
[[package]]
|
||||
category = "dev"
|
||||
description = "Measures number of Terminal column cells of wide-character codes"
|
||||
@@ -383,7 +438,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
|
||||
testing = ["pathlib2", "contextlib2", "unittest2"]
|
||||
|
||||
[metadata]
|
||||
content-hash = "3c5394e15ed1d821eea22c4ee68e816a079b593a732bdc93c7eba2588dbbfed4"
|
||||
content-hash = "aa92ddd8019bcaf825a3e667a9eb1b36023bb1f66f3edffc2fe2c222d69f16e1"
|
||||
python-versions = "^3.7"
|
||||
|
||||
[metadata.files]
|
||||
@@ -406,6 +461,14 @@ bandit = [
|
||||
{file = "bandit-1.6.2-py2.py3-none-any.whl", hash = "sha256:336620e220cf2d3115877685e264477ff9d9abaeb0afe3dc7264f55fa17a3952"},
|
||||
{file = "bandit-1.6.2.tar.gz", hash = "sha256:41e75315853507aa145d62a78a2a6c5e3240fe14ee7c601459d0df9418196065"},
|
||||
]
|
||||
certifi = [
|
||||
{file = "certifi-2019.11.28-py2.py3-none-any.whl", hash = "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3"},
|
||||
{file = "certifi-2019.11.28.tar.gz", hash = "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"},
|
||||
]
|
||||
chardet = [
|
||||
{file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
|
||||
{file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
|
||||
]
|
||||
colorama = [
|
||||
{file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"},
|
||||
{file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"},
|
||||
@@ -451,6 +514,10 @@ gitpython = [
|
||||
{file = "GitPython-3.0.5-py3-none-any.whl", hash = "sha256:c155c6a2653593ccb300462f6ef533583a913e17857cfef8fc617c246b6dc245"},
|
||||
{file = "GitPython-3.0.5.tar.gz", hash = "sha256:9c2398ffc3dcb3c40b27324b316f08a4f93ad646d5a6328cafbb871aa79f5e42"},
|
||||
]
|
||||
idna = [
|
||||
{file = "idna-2.8-py2.py3-none-any.whl", hash = "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"},
|
||||
{file = "idna-2.8.tar.gz", hash = "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407"},
|
||||
]
|
||||
importlib-metadata = [
|
||||
{file = "importlib_metadata-1.3.0-py2.py3-none-any.whl", hash = "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"},
|
||||
{file = "importlib_metadata-1.3.0.tar.gz", hash = "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45"},
|
||||
@@ -559,6 +626,10 @@ pyyaml = [
|
||||
{file = "PyYAML-5.3-cp38-cp38-win_amd64.whl", hash = "sha256:cb1f2f5e426dc9f07a7681419fe39cee823bb74f723f36f70399123f439e9b20"},
|
||||
{file = "PyYAML-5.3.tar.gz", hash = "sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615"},
|
||||
]
|
||||
requests = [
|
||||
{file = "requests-2.22.0-py2.py3-none-any.whl", hash = "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"},
|
||||
{file = "requests-2.22.0.tar.gz", hash = "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4"},
|
||||
]
|
||||
six = [
|
||||
{file = "six-1.13.0-py2.py3-none-any.whl", hash = "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd"},
|
||||
{file = "six-1.13.0.tar.gz", hash = "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"},
|
||||
@@ -598,6 +669,10 @@ typing-extensions = [
|
||||
{file = "typing_extensions-3.7.4.1-py3-none-any.whl", hash = "sha256:cf8b63fedea4d89bab840ecbb93e75578af28f76f66c35889bd7065f5af88575"},
|
||||
{file = "typing_extensions-3.7.4.1.tar.gz", hash = "sha256:091ecc894d5e908ac75209f10d5b4f118fbdb2eb1ede6a63544054bb1edb41f2"},
|
||||
]
|
||||
urllib3 = [
|
||||
{file = "urllib3-1.25.7-py2.py3-none-any.whl", hash = "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293"},
|
||||
{file = "urllib3-1.25.7.tar.gz", hash = "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"},
|
||||
]
|
||||
wcwidth = [
|
||||
{file = "wcwidth-0.1.8-py2.py3-none-any.whl", hash = "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603"},
|
||||
{file = "wcwidth-0.1.8.tar.gz", hash = "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8"},
|
||||
|
||||
@@ -6,6 +6,7 @@ authors = ["Michael Harms <michaelharms95@icloud.com>"]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.7"
|
||||
requests = "^2.22.0"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^5.2"
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
max-line-length = 120
|
||||
|
||||
[pydocstyle]
|
||||
convention = numpy
|
||||
convention = google
|
||||
add-ignore = D205
|
||||
|
||||
[mypy]
|
||||
|
||||
@@ -2,4 +2,4 @@ from comcrawl import __version__
|
||||
|
||||
|
||||
def test_version():
|
||||
assert __version__ == '0.1.0'
|
||||
assert __version__ == "0.1.0"
|
||||
|
||||
Reference in New Issue
Block a user