1
0
mirror of https://github.com/michaelharms/comcrawl.git synced 2021-09-27 00:43:48 +03:00

implementing basic functionality and basic tests

This commit is contained in:
Michael Harms
2020-01-10 18:54:08 +01:00
parent dfa8d8f6b3
commit c39f4284ba
16 changed files with 271 additions and 13 deletions

3
.gitignore vendored
View File

@@ -123,3 +123,6 @@ dmypy.json
# Pyre type checker
.pyre/
# VS Code Settings
.vscode

View File

@@ -2,3 +2,6 @@
[![Actions Status](https://github.com/michaelharms/comcrawl/workflows/ci/badge.svg)](https://github.com/michaelharms/comcrawl/actions)
[![License: MIT](https://img.shields.io/pypi/l/comcrawl)](https://github.com/michaelharms/comcrawl/blob/master/LICENSE)
https://www.bellingcat.com/resources/2015/08/13/using-python-to-mine-common-crawl/
https://groups.google.com/forum/#!msg/common-crawl/3QmQjFA_3y4/vTbhGqIBBQAJ

View File

@@ -1,3 +1,4 @@
__version__ = "0.1.0"
from .search import search
from .api import search
from .api import download

2
comcrawl/api/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
from .search import search
from .download import download

11
comcrawl/api/download.py Normal file
View File

@@ -0,0 +1,11 @@
from pandas import DataFrame, Series
from ..utils import download_single_result
def download(results: DataFrame) -> Series:
    """Download the HTML for every search result in ``results``.

    Each row is converted to a dict and handed to ``download_single_result``.
    The input frame is not modified.

    Args:
        results: DataFrame with one search result per row.

    Returns:
        A Series named "html" that shares the index of ``results`` and holds
        the downloaded HTML for each row.
    """
    # Build the column explicitly instead of assigning into the rows yielded
    # by iterrows(): those rows may be copies, in which case the writes are
    # silently lost and the returned column would contain only "".
    pages = [
        download_single_result(row.to_dict())
        for _, row in results.iterrows()
    ]
    return Series(pages, index=results.index, name="html", dtype=object)

View File

@@ -1,11 +1,15 @@
from typing import List, Dict
import concurrent.futures
from .search_index import search_index
import pandas as pd
from ..utils import search_single_index
# Load the default index names once at import time. The context manager closes
# the file handle, and blank lines (for example the one produced by a trailing
# newline) are dropped so they are never used as index names.
# NOTE(review): the path is resolved against the current working directory.
with open("comcrawl/config/default_indexes.txt", "r") as index_file:
    default_indexes = [line.strip() for line in index_file if line.strip()]
def search(
url: str,
indices: List[str],
indexes: List[str] = default_indexes,
threads: int = None
) -> List[Dict[str, Dict]]:
"""Searches multiple Common Crawl indices for URL pattern.
@@ -25,15 +29,21 @@ def search(
# multi-threaded search
if threads:
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
future_to_index = {executor.submit(search_index, index, url): index for index in indices}
future_to_index = {
executor.submit(
search_single_index,
index,
url
): index for index in indexes
}
for future in concurrent.futures.as_completed(future_to_index):
results.extend(future.result())
# single-threaded search
else:
for index in indices:
index_results = search_index(index, url)
for index in indexes:
index_results = search_single_index(index, url)
results.extend(index_results)
return results
return pd.DataFrame(results)

View File

@@ -0,0 +1,68 @@
2019-51
2019-47
2019-43
2019-39
2019-35
2019-30
2019-26
2019-22
2019-18
2019-13
2019-09
2019-04
2018-51
2018-47
2018-43
2018-39
2018-34
2018-30
2018-26
2018-22
2018-17
2018-13
2018-09
2018-05
2017-51
2017-47
2017-43
2017-39
2017-34
2017-30
2017-26
2017-22
2017-17
2017-13
2017-09
2017-05
2016-50
2016-44
2016-40
2016-36
2016-30
2016-26
2016-22
2016-18
2016-07
2015-48
2015-40
2015-35
2015-32
2015-27
2015-22
2015-18
2015-14
2015-11
2015-06
2014-52
2014-49
2014-42
2014-41
2014-35
2014-23
2014-15
2014-10
2013-48
2013-20
2012
2009-2010
2008-2009

View File

@@ -0,0 +1,2 @@
from .search_single_index import search_single_index
from .download_single_result import download_single_result

View File

@@ -0,0 +1,28 @@
from typing import Dict
import requests
import json
import io
import gzip
def download_single_result(result: Dict) -> str:
    """Download the HTML belonging to a single search result.

    Only the byte range described by the result is requested; the response
    body is gzip-decompressed and the third blank-line-separated section of
    the record is returned.

    Args:
        result: Search result providing the "filename", "offset" and
            "length" keys.

    Returns:
        The HTML section of the record, or "" when the record has no such
        section.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    offset, length = int(result["offset"]), int(result["length"])
    # HTTP byte ranges are inclusive on both ends.
    offset_end = offset + length - 1

    prefix = "https://commoncrawl.s3.amazonaws.com"
    response = requests.get(
        f"{prefix}/{result['filename']}",
        headers={"Range": f"bytes={offset}-{offset_end}"},
    )
    # Fail loudly on an error response instead of trying to gunzip it.
    response.raise_for_status()

    # Pages are not guaranteed to be valid UTF-8; replace undecodable bytes
    # rather than aborting the download.
    data = gzip.decompress(response.content).decode("utf-8", errors="replace")

    # The record is split into at most three sections; presumably record
    # headers, HTTP headers and payload (TODO confirm). A record without a
    # payload yields fewer sections, which previously raised ValueError.
    sections = data.strip().split("\r\n\r\n", 2)
    return sections[2] if len(sections) == 3 else ""

View File

@@ -5,7 +5,7 @@ import requests
search_url_template = "https://index.commoncrawl.org/CC-MAIN-{index}-index?url={url}&output=json"
def search_index(index: str, url: str) -> List[Dict]:
def search_single_index(index: str, url: str) -> List[Dict]:
"""Searches single Common Crawl index for given URL pattern.
Args:

6
examples/basic.py Normal file
View File

@@ -0,0 +1,6 @@
import comcrawl as cc

# Search for the URL pattern, attach the downloaded HTML of every hit as a new
# "html" column, and write the resulting table to a CSV file.
pattern = "https://index.commoncrawl.org/*"
results = cc.search(pattern)
results["html"] = cc.download(results)
results.to_csv("results.csv")

View File

@@ -0,0 +1,9 @@
import comcrawl as cc

# Search for the URL pattern, then sort by timestamp and keep only the first
# row for each url before downloading, so every url is fetched once.
results = (
    cc.search("https://index.commoncrawl.org/*")
    .sort_values(by="timestamp")
    .drop_duplicates("url", keep="first")
)

# Attach the downloaded HTML and save the table as CSV.
results["html"] = cc.download(results)
results.to_csv("results.csv")

99
poetry.lock generated
View File

@@ -208,6 +208,14 @@ optional = false
python-versions = "*"
version = "0.4.3"
[[package]]
category = "main"
description = "NumPy is the fundamental package for array computing with Python."
name = "numpy"
optional = false
python-versions = ">=3.5"
version = "1.18.1"
[[package]]
category = "dev"
description = "Core utilities for Python packages"
@@ -220,6 +228,22 @@ version = "20.0"
pyparsing = ">=2.0.2"
six = "*"
[[package]]
category = "main"
description = "Powerful data structures for data analysis, time series, and statistics"
name = "pandas"
optional = false
python-versions = ">=3.5.3"
version = "0.25.3"
[package.dependencies]
numpy = ">=1.13.3"
python-dateutil = ">=2.6.1"
pytz = ">=2017.2"
[package.extras]
test = ["pytest (>=4.0.2)", "pytest-xdist", "hypothesis (>=3.58)"]
[[package]]
category = "dev"
description = "Python Build Reasonableness"
@@ -322,6 +346,25 @@ pytest = ">=3.6"
[package.extras]
testing = ["fields", "hunter", "process-tests (2.0.2)", "six", "virtualenv"]
[[package]]
category = "main"
description = "Extensions to the standard Python datetime module"
name = "python-dateutil"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
version = "2.8.1"
[package.dependencies]
six = ">=1.5"
[[package]]
category = "main"
description = "World timezone definitions, modern and historical"
name = "pytz"
optional = false
python-versions = "*"
version = "2019.3"
[[package]]
category = "dev"
description = "YAML parser and emitter for Python"
@@ -349,7 +392,7 @@ security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"]
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"]
[[package]]
category = "dev"
category = "main"
description = "Python 2 and 3 compatibility utilities"
name = "six"
optional = false
@@ -438,7 +481,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
testing = ["pathlib2", "contextlib2", "unittest2"]
[metadata]
content-hash = "aa92ddd8019bcaf825a3e667a9eb1b36023bb1f66f3edffc2fe2c222d69f16e1"
content-hash = "b01600ca1e8daac274043daedf3d6a55567c32b7a6eb75ddd514b594a78197e7"
python-versions = "^3.7"
[metadata.files]
@@ -577,10 +620,54 @@ mypy-extensions = [
{file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
{file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"},
]
numpy = [
{file = "numpy-1.18.1-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:20b26aaa5b3da029942cdcce719b363dbe58696ad182aff0e5dcb1687ec946dc"},
{file = "numpy-1.18.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:70a840a26f4e61defa7bdf811d7498a284ced303dfbc35acb7be12a39b2aa121"},
{file = "numpy-1.18.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:17aa7a81fe7599a10f2b7d95856dc5cf84a4eefa45bc96123cbbc3ebc568994e"},
{file = "numpy-1.18.1-cp35-cp35m-win32.whl", hash = "sha256:f3d0a94ad151870978fb93538e95411c83899c9dc63e6fb65542f769568ecfa5"},
{file = "numpy-1.18.1-cp35-cp35m-win_amd64.whl", hash = "sha256:1786a08236f2c92ae0e70423c45e1e62788ed33028f94ca99c4df03f5be6b3c6"},
{file = "numpy-1.18.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ae0975f42ab1f28364dcda3dde3cf6c1ddab3e1d4b2909da0cb0191fa9ca0480"},
{file = "numpy-1.18.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:cf7eb6b1025d3e169989416b1adcd676624c2dbed9e3bcb7137f51bfc8cc2572"},
{file = "numpy-1.18.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:b765ed3930b92812aa698a455847141869ef755a87e099fddd4ccf9d81fffb57"},
{file = "numpy-1.18.1-cp36-cp36m-win32.whl", hash = "sha256:2d75908ab3ced4223ccba595b48e538afa5ecc37405923d1fea6906d7c3a50bc"},
{file = "numpy-1.18.1-cp36-cp36m-win_amd64.whl", hash = "sha256:9acdf933c1fd263c513a2df3dceecea6f3ff4419d80bf238510976bf9bcb26cd"},
{file = "numpy-1.18.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:56bc8ded6fcd9adea90f65377438f9fea8c05fcf7c5ba766bef258d0da1554aa"},
{file = "numpy-1.18.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e422c3152921cece8b6a2fb6b0b4d73b6579bd20ae075e7d15143e711f3ca2ca"},
{file = "numpy-1.18.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:b3af02ecc999c8003e538e60c89a2b37646b39b688d4e44d7373e11c2debabec"},
{file = "numpy-1.18.1-cp37-cp37m-win32.whl", hash = "sha256:d92350c22b150c1cae7ebb0ee8b5670cc84848f6359cf6b5d8f86617098a9b73"},
{file = "numpy-1.18.1-cp37-cp37m-win_amd64.whl", hash = "sha256:77c3bfe65d8560487052ad55c6998a04b654c2fbc36d546aef2b2e511e760971"},
{file = "numpy-1.18.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c98c5ffd7d41611407a1103ae11c8b634ad6a43606eca3e2a5a269e5d6e8eb07"},
{file = "numpy-1.18.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:9537eecf179f566fd1c160a2e912ca0b8e02d773af0a7a1120ad4f7507cd0d26"},
{file = "numpy-1.18.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:e840f552a509e3380b0f0ec977e8124d0dc34dc0e68289ca28f4d7c1d0d79474"},
{file = "numpy-1.18.1-cp38-cp38-win32.whl", hash = "sha256:590355aeade1a2eaba17617c19edccb7db8d78760175256e3cf94590a1a964f3"},
{file = "numpy-1.18.1-cp38-cp38-win_amd64.whl", hash = "sha256:39d2c685af15d3ce682c99ce5925cc66efc824652e10990d2462dfe9b8918c6a"},
{file = "numpy-1.18.1.zip", hash = "sha256:b6ff59cee96b454516e47e7721098e6ceebef435e3e21ac2d6c3b8b02628eb77"},
]
packaging = [
{file = "packaging-20.0-py2.py3-none-any.whl", hash = "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb"},
{file = "packaging-20.0.tar.gz", hash = "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8"},
]
pandas = [
{file = "pandas-0.25.3-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133"},
{file = "pandas-0.25.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3"},
{file = "pandas-0.25.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0"},
{file = "pandas-0.25.3-cp35-cp35m-win32.whl", hash = "sha256:adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a"},
{file = "pandas-0.25.3-cp35-cp35m-win_amd64.whl", hash = "sha256:975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17"},
{file = "pandas-0.25.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"},
{file = "pandas-0.25.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9"},
{file = "pandas-0.25.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf"},
{file = "pandas-0.25.3-cp36-cp36m-win32.whl", hash = "sha256:8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f"},
{file = "pandas-0.25.3-cp36-cp36m-win_amd64.whl", hash = "sha256:26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7"},
{file = "pandas-0.25.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d"},
{file = "pandas-0.25.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7"},
{file = "pandas-0.25.3-cp37-cp37m-win32.whl", hash = "sha256:255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b"},
{file = "pandas-0.25.3-cp37-cp37m-win_amd64.whl", hash = "sha256:22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e"},
{file = "pandas-0.25.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d"},
{file = "pandas-0.25.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b"},
{file = "pandas-0.25.3-cp38-cp38-win32.whl", hash = "sha256:6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71"},
{file = "pandas-0.25.3-cp38-cp38-win_amd64.whl", hash = "sha256:33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2"},
{file = "pandas-0.25.3.tar.gz", hash = "sha256:52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4"},
]
pbr = [
{file = "pbr-5.4.4-py2.py3-none-any.whl", hash = "sha256:61aa52a0f18b71c5cc58232d2cf8f8d09cd67fcad60b742a60124cb8d6951488"},
{file = "pbr-5.4.4.tar.gz", hash = "sha256:139d2625547dbfa5fb0b81daebb39601c478c21956dc57e2e07b74450a8c506b"},
@@ -613,6 +700,14 @@ pytest-cov = [
{file = "pytest-cov-2.8.1.tar.gz", hash = "sha256:cc6742d8bac45070217169f5f72ceee1e0e55b0221f54bcf24845972d3a47f2b"},
{file = "pytest_cov-2.8.1-py2.py3-none-any.whl", hash = "sha256:cdbdef4f870408ebdbfeb44e63e07eb18bb4619fae852f6e760645fa36172626"},
]
python-dateutil = [
{file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"},
{file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"},
]
pytz = [
{file = "pytz-2019.3-py2.py3-none-any.whl", hash = "sha256:1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d"},
{file = "pytz-2019.3.tar.gz", hash = "sha256:b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be"},
]
pyyaml = [
{file = "PyYAML-5.3-cp27-cp27m-win32.whl", hash = "sha256:940532b111b1952befd7db542c370887a8611660d2b9becff75d39355303d82d"},
{file = "PyYAML-5.3-cp27-cp27m-win_amd64.whl", hash = "sha256:059b2ee3194d718896c0ad077dd8c043e5e909d9180f387ce42012662a4946d6"},

View File

@@ -7,6 +7,7 @@ authors = ["Michael Harms <michaelharms95@icloud.com>"]
[tool.poetry.dependencies]
python = "^3.7"
requests = "^2.22.0"
pandas = "^0.25.3"
[tool.poetry.dev-dependencies]
pytest = "^5.2"

View File

@@ -1,5 +1,24 @@
from comcrawl import __version__
import comcrawl as cc
from pprint import pprint
def test_version():
    """The package must report the expected version string."""
    expected_version = "0.1.0"
    assert __version__ == expected_version
def test_comcrawl():
    """Search one index, de-duplicate the hits and download their HTML."""

    def show(frame):
        # Print the frame's shape and contents for manual inspection.
        print("\n")
        print(f"Shape: {frame.shape}")
        print(frame)

    results = cc.search("https://index.commoncrawl.org/*", indexes=["2019-51"])
    assert results.shape == (3, 12)
    show(results)

    # Order by timestamp, then keep only the last row for each urlkey.
    ordered = results.sort_values(by="timestamp")
    results = ordered.drop_duplicates("urlkey", keep="last")
    assert results.shape == (2, 12)
    show(results)

    results["html"] = cc.download(results)
    pprint(results["html"])