1
0
mirror of https://github.com/michaelharms/comcrawl.git synced 2021-09-27 00:43:48 +03:00

doing some code style refactorings

This commit is contained in:
Michael Harms
2020-01-15 07:57:16 +01:00
parent b1003249be
commit f5dfbe4372
6 changed files with 28 additions and 24 deletions

View File

@@ -43,15 +43,15 @@ method-rgx=[a-z_][a-z0-9_]{2,70}$
[FORMAT]
# Maximum number of characters on a single line.
max-line-length = 100
max-line-length = 79
[DESIGN]
# Minimum number of public methods for a class (see R0903).
min-public-methods = 0
# Maximum number of attributes for a class (see R0902).
max-attributes = 15
max-attributes = 10
max-locals = 25
max-locals = 20
max-args = 10

View File

@@ -1,5 +1,5 @@
from pandas import DataFrame, Series
from ..utils import download_single_result
from ..utils import _download_single_result
def download(results: DataFrame) -> Series:
@@ -17,6 +17,6 @@ def download(results: DataFrame) -> Series:
new_results = results.copy()
new_results["html"] = ""
for _, row in new_results.iterrows():
row["html"] = download_single_result(row.to_dict())
row["html"] = _download_single_result(row.to_dict())
return new_results["html"]

View File

@@ -1,26 +1,28 @@
from typing import List, Dict
import concurrent.futures
from concurrent import futures
import pandas as pd
from ..utils import search_single_index
from ..utils import _search_single_index
DEFAULT_INDEXES = open("comcrawl/config/default_indexes.txt", "r").read().split("\n")
DEFAULT_INDEXES = (open("comcrawl/config/default_indexes.txt", "r")
.read()
.split("\n"))
def search(
url: str,
indexes: List[str] = DEFAULT_INDEXES,
threads: int = None
) -> List[Dict[str, Dict]]:
def search(url: str,
indexes: List[str] = DEFAULT_INDEXES,
threads: int = None) -> List[Dict[str, Dict]]:
"""Searches multiple Common Crawl indices for URL pattern.
Args:
url: The URL pattern to search for.
indexes: List of Common Crawl indexes to search in.
threads: Number of threads to use for faster search on multiple threads.
threads: Number of threads to use for faster search on
multiple threads.
Returns:
List of all results found throughout the specified Common Crawl indices.
List of all results found throughout the specified
Common Crawl indices.
"""
@@ -28,22 +30,22 @@ def search(
# multi-threaded search
if threads:
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
with futures.ThreadPoolExecutor(max_workers=threads) as executor:
future_to_index = {
executor.submit(
search_single_index,
_search_single_index,
index,
url
): index for index in indexes
}
for future in concurrent.futures.as_completed(future_to_index):
for future in futures.as_completed(future_to_index):
results.extend(future.result())
# single-threaded search
else:
for index in indexes:
index_results = search_single_index(index, url)
index_results = _search_single_index(index, url)
results.extend(index_results)
return pd.DataFrame(results)

View File

@@ -1,2 +1,2 @@
from .search_single_index import search_single_index
from .download_single_result import download_single_result
from ._search_single_index import _search_single_index
from ._download_single_result import _download_single_result

View File

@@ -4,7 +4,7 @@ import gzip
import requests
def download_single_result(result: Dict) -> str:
def _download_single_result(result: Dict) -> str:
"""Downloads HTML for single search result.
Args:

View File

@@ -2,10 +2,12 @@ from typing import List, Dict
import json
import requests
SEARCH_URL_TEMPLATE = "https://index.commoncrawl.org/CC-MAIN-{index}-index?url={url}&output=json"
SEARCH_URL_TEMPLATE = ("https://index.commoncrawl.org/CC-MAIN-"
"{index}-index?url={url}&output=json")
def search_single_index(index: str, url: str) -> List[Dict]:
def _search_single_index(index: str,
url: str) -> List[Dict]:
"""Searches single Common Crawl index for given URL pattern.
Args: