# Python sources added under comcrawl/ (__init__.py, search_index.py, search.py),
# laid out as one properly formatted module.

# --- comcrawl/__init__.py ---
__version__ = "0.1.0"
# The package __init__ additionally re-exports `search` from the .search module.

import concurrent.futures
import functools
import json
from typing import Dict, List, Optional
from urllib.parse import quote

# --- comcrawl/search_index.py ---
search_url_template = "https://index.commoncrawl.org/CC-MAIN-{index}-index?url={url}&output=json"


def search_index(index: str, url: str, timeout: Optional[float] = None) -> List[Dict]:
    """Searches single Common Crawl index for given URL pattern.

    Args:
        index: Common Crawl index to search in.
        url: URL pattern to search for.
        timeout: Seconds to wait for the index server, passed through to
            ``requests.get``. ``None`` (the default) waits indefinitely.

    Returns:
        List of results found in specified Common Crawl index, one dict per
        JSON line of the response. Any response whose status code is not 200
        yields an empty list.

    """
    # Third-party HTTP client, imported at call time: only this function uses it.
    import requests

    # Percent-encode the pattern so characters such as "&", "#" or "?" remain
    # part of the ``url`` value instead of altering the query string.
    # "*", "/" and ":" are left untouched.
    search_url = search_url_template.format(index=index, url=quote(url, safe="*/:"))
    response = requests.get(search_url, timeout=timeout)

    if response.status_code != 200:
        return []

    # The body holds one JSON document per line. The raw bytes are parsed
    # (json.loads accepts bytes) and blank lines are skipped so that they do
    # not raise a decode error.
    return [json.loads(line) for line in response.content.splitlines() if line.strip()]


# --- comcrawl/search.py ---
def search(
    url: str,
    indices: List[str],
    threads: Optional[int] = None,
    timeout: Optional[float] = None,
) -> List[Dict]:
    """Searches multiple Common Crawl indices for URL pattern.

    Args:
        url: The URL pattern to search for.
        indices: List of Common Crawl indices to search in.
        threads: Number of threads to use for faster search on multiple
            threads. A falsy value (``None`` or ``0``) searches the indices
            one after another in the calling thread.
        timeout: Seconds to wait for each index request; forwarded to
            ``search_index``. ``None`` (the default) waits indefinitely.

    Returns:
        List of all results found throughout the specified Common Crawl
        indices, grouped in the order the indices were given (in both the
        single-threaded and the multi-threaded mode).

    """
    fetch = functools.partial(search_index, url=url, timeout=timeout)

    if threads:
        # multi-threaded search; Executor.map yields results in submission
        # order, so the output does not depend on which request finishes first
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            per_index = list(executor.map(fetch, indices))
    else:
        # single-threaded search
        per_index = [fetch(index) for index in indices]

    results: List[Dict] = []
    for index_results in per_index:
        results.extend(index_results)

    return results
+name = "requests" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "2.22.0" + +[package.dependencies] +certifi = ">=2017.4.17" +chardet = ">=3.0.2,<3.1.0" +idna = ">=2.5,<2.9" +urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26" + +[package.extras] +security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"] +socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"] + [[package]] category = "dev" description = "Python 2 and 3 compatibility utilities" @@ -350,6 +392,19 @@ optional = false python-versions = "*" version = "3.7.4.1" +[[package]] +category = "main" +description = "HTTP library with thread-safe connection pooling, file post, and more." +name = "urllib3" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" +version = "1.25.7" + +[package.extras] +brotli = ["brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"] + [[package]] category = "dev" description = "Measures number of Terminal column cells of wide-character codes" @@ -383,7 +438,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] testing = ["pathlib2", "contextlib2", "unittest2"] [metadata] -content-hash = "3c5394e15ed1d821eea22c4ee68e816a079b593a732bdc93c7eba2588dbbfed4" +content-hash = "aa92ddd8019bcaf825a3e667a9eb1b36023bb1f66f3edffc2fe2c222d69f16e1" python-versions = "^3.7" [metadata.files] @@ -406,6 +461,14 @@ bandit = [ {file = "bandit-1.6.2-py2.py3-none-any.whl", hash = "sha256:336620e220cf2d3115877685e264477ff9d9abaeb0afe3dc7264f55fa17a3952"}, {file = "bandit-1.6.2.tar.gz", hash = "sha256:41e75315853507aa145d62a78a2a6c5e3240fe14ee7c601459d0df9418196065"}, ] +certifi = [ + {file = "certifi-2019.11.28-py2.py3-none-any.whl", hash = "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3"}, + {file = 
"certifi-2019.11.28.tar.gz", hash = "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"}, +] +chardet = [ + {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"}, + {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"}, +] colorama = [ {file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"}, {file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"}, @@ -451,6 +514,10 @@ gitpython = [ {file = "GitPython-3.0.5-py3-none-any.whl", hash = "sha256:c155c6a2653593ccb300462f6ef533583a913e17857cfef8fc617c246b6dc245"}, {file = "GitPython-3.0.5.tar.gz", hash = "sha256:9c2398ffc3dcb3c40b27324b316f08a4f93ad646d5a6328cafbb871aa79f5e42"}, ] +idna = [ + {file = "idna-2.8-py2.py3-none-any.whl", hash = "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"}, + {file = "idna-2.8.tar.gz", hash = "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407"}, +] importlib-metadata = [ {file = "importlib_metadata-1.3.0-py2.py3-none-any.whl", hash = "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"}, {file = "importlib_metadata-1.3.0.tar.gz", hash = "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45"}, @@ -559,6 +626,10 @@ pyyaml = [ {file = "PyYAML-5.3-cp38-cp38-win_amd64.whl", hash = "sha256:cb1f2f5e426dc9f07a7681419fe39cee823bb74f723f36f70399123f439e9b20"}, {file = "PyYAML-5.3.tar.gz", hash = "sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615"}, ] +requests = [ + {file = "requests-2.22.0-py2.py3-none-any.whl", hash = "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"}, + {file = "requests-2.22.0.tar.gz", hash = "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4"}, 
+] six = [ {file = "six-1.13.0-py2.py3-none-any.whl", hash = "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd"}, {file = "six-1.13.0.tar.gz", hash = "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66"}, @@ -598,6 +669,10 @@ typing-extensions = [ {file = "typing_extensions-3.7.4.1-py3-none-any.whl", hash = "sha256:cf8b63fedea4d89bab840ecbb93e75578af28f76f66c35889bd7065f5af88575"}, {file = "typing_extensions-3.7.4.1.tar.gz", hash = "sha256:091ecc894d5e908ac75209f10d5b4f118fbdb2eb1ede6a63544054bb1edb41f2"}, ] +urllib3 = [ + {file = "urllib3-1.25.7-py2.py3-none-any.whl", hash = "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293"}, + {file = "urllib3-1.25.7.tar.gz", hash = "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"}, +] wcwidth = [ {file = "wcwidth-0.1.8-py2.py3-none-any.whl", hash = "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603"}, {file = "wcwidth-0.1.8.tar.gz", hash = "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8"}, diff --git a/pyproject.toml b/pyproject.toml index 41f0a42..b369164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ authors = ["Michael Harms "] [tool.poetry.dependencies] python = "^3.7" +requests = "^2.22.0" [tool.poetry.dev-dependencies] pytest = "^5.2" diff --git a/setup.cfg b/setup.cfg index 03e544b..45d2abd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ max-line-length = 120 [pydocstyle] -convention = numpy +convention = google add-ignore = D205 [mypy] diff --git a/tests/test_comcrawl.py b/tests/test_comcrawl.py index 6902bca..65667b7 100644 --- a/tests/test_comcrawl.py +++ b/tests/test_comcrawl.py @@ -2,4 +2,4 @@ from comcrawl import __version__ def test_version(): - assert __version__ == '0.1.0' + assert __version__ == "0.1.0"