Mirror of https://github.com/michaelharms/comcrawl.git, synced 2021-09-27 00:43:48 +03:00
doing some code style refactorings
@@ -43,15 +43,15 @@ method-rgx=[a-z_][a-z0-9_]{2,70}$

 [FORMAT]
 # Maximum number of characters on a single line.
-max-line-length = 100
+max-line-length = 79

 [DESIGN]
 # Minimum number of public methods for a class (see R0903).
 min-public-methods = 0

 # Maximum number of attributes for a class (see R0902).
-max-attributes = 15
+max-attributes = 10

-max-locals = 25
+max-locals = 20

 max-args = 10
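For anyone reproducing these checks locally, the tightened limits only take effect when pylint is run against this rcfile. A minimal sketch in Python; the rcfile path and the package name `comcrawl` are assumptions about the checkout layout, not part of the commit:

import subprocess

# Run pylint with the project's rcfile so the stricter limits
# (79-char lines, max-attributes=10, max-locals=20) are enforced.
report = subprocess.run(
    ["pylint", "--rcfile=.pylintrc", "comcrawl"],
    capture_output=True,
    text=True,
)
print(report.stdout)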
@@ -1,5 +1,5 @@
 from pandas import DataFrame, Series
-from ..utils import download_single_result
+from ..utils import _download_single_result


 def download(results: DataFrame) -> Series:
@@ -17,6 +17,6 @@ def download(results: DataFrame) -> Series:
     new_results = results.copy()
     new_results["html"] = ""
     for _, row in new_results.iterrows():
-        row["html"] = download_single_result(row.to_dict())
+        row["html"] = _download_single_result(row.to_dict())

     return new_results["html"]
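For illustration, the DataFrame-in / Series-out contract of download can be exercised with a stand-in fetcher. A sketch under that assumption; fake_download and download_all are made-up names for this example, not helpers from the library:

from typing import Dict

from pandas import DataFrame, Series


def fake_download(result: Dict) -> str:
    """Stand-in for a per-result downloader."""
    return f"<html>{result['url']}</html>"


def download_all(results: DataFrame) -> Series:
    # Same shape as download() above: one HTML string per search result row.
    new_results = results.copy()
    new_results["html"] = new_results.apply(
        lambda row: fake_download(row.to_dict()), axis=1
    )
    return new_results["html"]


results = DataFrame([{"url": "example.com/a"}, {"url": "example.com/b"}])
print(download_all(results))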
@@ -1,26 +1,28 @@
 from typing import List, Dict
-import concurrent.futures
+from concurrent import futures
 import pandas as pd
-from ..utils import search_single_index
+from ..utils import _search_single_index


-DEFAULT_INDEXES = open("comcrawl/config/default_indexes.txt", "r").read().split("\n")
+DEFAULT_INDEXES = (open("comcrawl/config/default_indexes.txt", "r")
+                   .read()
+                   .split("\n"))


-def search(
-        url: str,
-        indexes: List[str] = DEFAULT_INDEXES,
-        threads: int = None
-) -> List[Dict[str, Dict]]:
+def search(url: str,
+           indexes: List[str] = DEFAULT_INDEXES,
+           threads: int = None) -> List[Dict[str, Dict]]:
     """Searches multiple Common Crawl indices for URL pattern.

     Args:
         url: The URL pattern to search for.
         indices: List of Common Crawl indices to search in.
-        threads: Number of threads to use for faster search on multiple threads.
+        threads: Number of threads to use for faster search on
+            multiple threads.

     Returns:
-        List of all results found throughout the specified Common Crawl indices.
+        List of all results found throughout the specified
+        Common Crawl indices.

     """

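The DEFAULT_INDEXES constant above reads the index list from a text file at import time. A hedged sketch of the same idea wrapped in a function with an explicit context manager; the file path is reused from the diff, while the function name and the filtering of empty lines are additions for this example:

from typing import List


def load_default_indexes(
        path: str = "comcrawl/config/default_indexes.txt") -> List[str]:
    # One Common Crawl index name per line; the context manager closes the
    # file, and the filter drops a possible trailing empty line.
    with open(path, "r") as index_file:
        return [line for line in index_file.read().split("\n") if line]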
@@ -28,22 +30,22 @@ def search(

     # multi-threaded search
     if threads:
-        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
+        with futures.ThreadPoolExecutor(max_workers=threads) as executor:
             future_to_index = {
                 executor.submit(
-                    search_single_index,
+                    _search_single_index,
                     index,
                     url
                 ): index for index in indexes
             }

-            for future in concurrent.futures.as_completed(future_to_index):
+            for future in futures.as_completed(future_to_index):
                 results.extend(future.result())

     # single-threaded search
     else:
         for index in indexes:
-            index_results = search_single_index(index, url)
+            index_results = _search_single_index(index, url)
             results.extend(index_results)

     return pd.DataFrame(results)
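The fan-out pattern above, from concurrent import futures plus as_completed, is independent of Common Crawl. A self-contained sketch with a dummy worker; the names and index strings here are illustrative only:

from concurrent import futures


def fetch(index: str, url: str) -> list:
    # Dummy worker standing in for the per-index search.
    return [{"index": index, "url": url}]


results = []
with futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Map each submitted future back to the index it is searching,
    # then collect results as the futures finish.
    future_to_index = {
        executor.submit(fetch, index, "example.com/*"): index
        for index in ["2019-47", "2019-51"]
    }
    for future in futures.as_completed(future_to_index):
        results.extend(future.result())

print(results)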
@@ -1,2 +1,2 @@
-from .search_single_index import search_single_index
-from .download_single_result import download_single_result
+from ._search_single_index import _search_single_index
+from ._download_single_result import _download_single_result
@@ -4,7 +4,7 @@ import gzip
 import requests


-def download_single_result(result: Dict) -> str:
+def _download_single_result(result: Dict) -> str:
     """Downloads HTML for single search result.

     Args:
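The helper renamed above fetches one record out of a Common Crawl WARC file. A rough sketch of that kind of fetch, assuming the search result dict carries filename, offset, and length fields as the CDX index returns them; the data host URL and the record parsing here are simplified guesses for illustration, not comcrawl's exact implementation:

import gzip
from typing import Dict

import requests


def fetch_record_html(result: Dict) -> str:
    # Request only the bytes of this record from the WARC file.
    offset = int(result["offset"])
    length = int(result["length"])
    byte_range = f"bytes={offset}-{offset + length - 1}"
    response = requests.get(
        f"https://data.commoncrawl.org/{result['filename']}",
        headers={"Range": byte_range},
    )
    # The slice is gzip-compressed; after decompression the record is
    # WARC headers, HTTP headers, then the body, separated by blank lines.
    record = gzip.decompress(response.content).decode("utf-8", errors="replace")
    parts = record.split("\r\n\r\n", 2)
    return parts[2] if len(parts) == 3 else ""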
@@ -2,10 +2,12 @@ from typing import List, Dict
 import json
 import requests

-SEARCH_URL_TEMPLATE = "https://index.commoncrawl.org/CC-MAIN-{index}-index?url={url}&output=json"
+SEARCH_URL_TEMPLATE = ("https://index.commoncrawl.org/CC-MAIN-"
+                       "{index}-index?url={url}&output=json")


-def search_single_index(index: str, url: str) -> List[Dict]:
+def _search_single_index(index: str,
+                         url: str) -> List[Dict]:
     """Searches single Common Crawl index for given URL pattern.

     Args:
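For reference, the index endpoint in the template above returns one JSON object per line. A minimal sketch of querying it and parsing that response; error handling and result paging are omitted, and the index name is only an example:

import json
from typing import Dict, List

import requests

SEARCH_URL_TEMPLATE = ("https://index.commoncrawl.org/CC-MAIN-"
                       "{index}-index?url={url}&output=json")


def search_index(index: str, url: str) -> List[Dict]:
    # Each non-empty line of the response body is a standalone JSON record.
    response = requests.get(SEARCH_URL_TEMPLATE.format(index=index, url=url))
    return [json.loads(line) for line in response.text.splitlines() if line]


print(search_index("2019-51", "example.com/*")[:3])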