Mirror of https://github.com/unclecode/crawl4ai.git (synced 2024-12-22 15:52:24 +03:00)
feat(cache): introduce CacheMode and CacheContext for enhanced caching behavior
chore(requirements): add colorama dependency
refactor(config): add SHOW_DEPRECATION_WARNINGS flag and clean up code
fix(docs): update example scripts for clarity and consistency
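The snippet below is a minimal usage sketch of the cache API this commit introduces, based on the diff that follows; the import path assumes the package is installed as crawl4ai, and example.com is a placeholder URL.

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def main():
    # cache_mode replaces the deprecated bypass_cache / disable_cache /
    # no_cache_read / no_cache_write boolean flags.
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            cache_mode=CacheMode.BYPASS,
        )
        print(result.success, len(result.markdown or ""))

asyncio.run(main())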
@@ -1,6 +1,6 @@
# __init__.py
from .async_webcrawler import AsyncWebCrawler
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .models import CrawlResult
from .__version__ import __version__
# __version__ = "0.3.73"
@@ -8,6 +8,7 @@ from .__version__ import __version__
__all__ = [
    "AsyncWebCrawler",
    "CrawlResult",
    "CacheMode",
]

def is_sync_version_installed():
@@ -669,8 +669,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        if self.accept_downloads:
            page.on("download", lambda download: asyncio.create_task(self._handle_download(download)))

        if self.verbose:
            print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
        # if self.verbose:
        #     print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")

        if self.use_cached_html:
            cache_file_path = os.path.join(
@@ -873,8 +873,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            await asyncio.sleep(screenshot_wait_for)
        screenshot_data = await self.take_screenshot(page)

        if self.verbose:
            print(f"[LOG] ✅ Crawled {url} successfully!")
        # if self.verbose:
        #     print(f"[LOG] ✅ Crawled {url} successfully!")

        if self.use_cached_html:
            cache_file_path = os.path.join(
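The page.on("download", ...) registration above wraps an async handler in a task because Playwright's event callbacks are synchronous. A standalone sketch of that pattern follows; the handler body is a hypothetical stand-in for the crawler's _handle_download, and the output folder is illustrative.

import asyncio
from playwright.async_api import async_playwright

async def handle_download(download):
    # suggested_filename and save_as() are part of Playwright's Download API.
    await download.save_as(f"./downloads/{download.suggested_filename}")

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page(accept_downloads=True)
        # Same registration style as the crawler: wrap the coroutine in a task
        # because page.on() expects a synchronous callback.
        page.on("download", lambda download: asyncio.create_task(handle_download(download)))
        await page.goto("https://example.com")
        await browser.close()

asyncio.run(main())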
crawl4ai/async_webcrawler.3.73.py (new file, 344 lines)
@@ -0,0 +1,344 @@
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import json
|
||||
import asyncio
|
||||
from .models import CrawlResult
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||
from .content_scrapping_strategy import WebScrapingStrategy
|
||||
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
sanitize_input_encode,
|
||||
InvalidCSSSelectorError,
|
||||
format_html
|
||||
)
|
||||
from .__version__ import __version__ as crawl4ai_version
|
||||
|
||||
class AsyncWebCrawler:
|
||||
def __init__(
|
||||
self,
|
||||
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
||||
always_by_pass_cache: bool = False,
|
||||
base_directory: str = str(Path.home()),
|
||||
**kwargs,
|
||||
):
|
||||
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
||||
**kwargs
|
||||
)
|
||||
self.always_by_pass_cache = always_by_pass_cache
|
||||
# self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
|
||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||
self.ready = False
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
async def __aenter__(self):
|
||||
await self.crawler_strategy.__aenter__()
|
||||
await self.awarmup()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
|
||||
|
||||
async def awarmup(self):
|
||||
# Print a message for crawl4ai and its version
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
|
||||
# await async_db_manager.ainit_db()
|
||||
# # await async_db_manager.initialize()
|
||||
# await self.arun(
|
||||
# url="https://google.com/",
|
||||
# word_count_threshold=5,
|
||||
# bypass_cache=False,
|
||||
# verbose=False,
|
||||
# )
|
||||
self.ready = True
|
||||
if self.verbose:
|
||||
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
|
||||
|
||||
async def arun(
|
||||
self,
|
||||
url: str,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
disable_cache: bool = False,
|
||||
no_cache_read: bool = False,
|
||||
no_cache_write: bool = False,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
"""
|
||||
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
|
||||
Args:
|
||||
url (str): The URL to crawl. Supported prefixes:
|
||||
- 'http://' or 'https://': Web URL to crawl.
|
||||
- 'file://': Local file path to process.
|
||||
- 'raw:': Raw HTML content to process.
|
||||
... [other existing parameters]
|
||||
|
||||
Returns:
|
||||
CrawlResult: The result of the crawling and processing.
|
||||
"""
|
||||
try:
|
||||
if disable_cache:
|
||||
bypass_cache = True
|
||||
no_cache_read = True
|
||||
no_cache_write = True
|
||||
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
|
||||
|
||||
async_response: AsyncCrawlResponse = None
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
|
||||
is_web_url = url.startswith(('http://', 'https://'))
|
||||
is_local_file = url.startswith("file://")
|
||||
is_raw_html = url.startswith("raw:")
|
||||
_url = url if not is_raw_html else "Raw HTML"
|
||||
|
||||
start_time = time.perf_counter()
|
||||
cached_result = None
|
||||
if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache:
|
||||
cached_result = await async_db_manager.aget_cached_url(url)
|
||||
|
||||
if cached_result:
|
||||
html = sanitize_input_encode(cached_result.html)
|
||||
extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
|
||||
if screenshot:
|
||||
screenshot_data = cached_result.screenshot
|
||||
if not screenshot_data:
|
||||
cached_result = None
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds"
|
||||
)
|
||||
|
||||
|
||||
if not cached or not html:
|
||||
t1 = time.perf_counter()
|
||||
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
|
||||
html = sanitize_input_encode(async_response.html)
|
||||
screenshot_data = async_response.screenshot
|
||||
t2 = time.perf_counter()
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
|
||||
)
|
||||
|
||||
t1 = time.perf_counter()
|
||||
crawl_result = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot_data,
|
||||
verbose=verbose,
|
||||
is_cached=bool(cached),
|
||||
async_response=async_response,
|
||||
bypass_cache=bypass_cache,
|
||||
is_web_url = is_web_url,
|
||||
is_local_file = is_local_file,
|
||||
is_raw_html = is_raw_html,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if async_response:
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
crawl_result.downloaded_files = async_response.downloaded_files
|
||||
else:
|
||||
crawl_result.status_code = 200
|
||||
crawl_result.response_headers = cached_result.response_headers if cached_result else {}
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = kwargs.get("session_id", None)
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds"
|
||||
)
|
||||
|
||||
if not is_raw_html and not no_cache_write:
|
||||
if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
|
||||
await async_db_manager.acache_url(crawl_result)
|
||||
|
||||
|
||||
return crawl_result
|
||||
|
||||
except Exception as e:
|
||||
if not hasattr(e, "msg"):
|
||||
e.msg = str(e)
|
||||
print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}")
|
||||
return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg)
|
||||
|
||||
async def arun_many(
|
||||
self,
|
||||
urls: List[str],
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> List[CrawlResult]:
|
||||
"""
|
||||
Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
|
||||
|
||||
Args:
|
||||
urls (List[str]): A list of URLs with supported prefixes:
|
||||
- 'http://' or 'https://': Web URL to crawl.
|
||||
- 'file://': Local file path to process.
|
||||
- 'raw:': Raw HTML content to process.
|
||||
... [other existing parameters]
|
||||
|
||||
Returns:
|
||||
List[CrawlResult]: The results of the crawling and processing.
|
||||
"""
|
||||
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
||||
semaphore = asyncio.Semaphore(semaphore_count)
|
||||
|
||||
async def crawl_with_semaphore(url):
|
||||
async with semaphore:
|
||||
return await self.arun(
|
||||
url,
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
bypass_cache=bypass_cache,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot,
|
||||
user_agent=user_agent,
|
||||
verbose=verbose,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
tasks = [crawl_with_semaphore(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
||||
|
||||
async def aprocess_html(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
extracted_content: str,
|
||||
word_count_threshold: int,
|
||||
extraction_strategy: ExtractionStrategy,
|
||||
chunking_strategy: ChunkingStrategy,
|
||||
css_selector: str,
|
||||
screenshot: str,
|
||||
verbose: bool,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
t = time.perf_counter()
|
||||
# Extract content from HTML
|
||||
try:
|
||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||
t1 = time.perf_counter()
|
||||
scrapping_strategy = WebScrapingStrategy()
|
||||
# result = await scrapping_strategy.ascrap(
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
html,
|
||||
word_count_threshold=word_count_threshold,
|
||||
css_selector=css_selector,
|
||||
only_text=kwargs.get("only_text", False),
|
||||
image_description_min_word_threshold=kwargs.get(
|
||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if result is None:
|
||||
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
except Exception as e:
|
||||
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
|
||||
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
||||
fit_html = sanitize_input_encode(result.get("fit_html", ""))
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
metadata = result.get("metadata", {})
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds"
|
||||
)
|
||||
|
||||
if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
|
||||
t1 = time.perf_counter()
|
||||
# Check if extraction strategy is type of JsonCssExtractionStrategy
|
||||
if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy):
|
||||
extraction_strategy.verbose = verbose
|
||||
extracted_content = extraction_strategy.run(url, [html])
|
||||
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
|
||||
else:
|
||||
sections = chunking_strategy.chunk(markdown)
|
||||
extracted_content = extraction_strategy.run(url, sections)
|
||||
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds"
|
||||
)
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=format_html(cleaned_html),
|
||||
markdown=markdown,
|
||||
fit_markdown=fit_markdown,
|
||||
fit_html= fit_html,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=screenshot,
|
||||
extracted_content=extracted_content,
|
||||
success=True,
|
||||
error_message="",
|
||||
)
|
||||
|
||||
async def aclear_cache(self):
|
||||
# await async_db_manager.aclear_db()
|
||||
await async_db_manager.cleanup()
|
||||
|
||||
async def aflush_cache(self):
|
||||
await async_db_manager.aflush_db()
|
||||
|
||||
async def aget_cache_size(self):
|
||||
return await async_db_manager.aget_total_count()
|
||||
|
||||
|
||||
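The arun docstring above lists three supported URL prefixes; the sketch below exercises the raw: form and shows the file:// form in a comment, assuming the same public import path.

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # 'raw:' tells the crawler to treat the rest of the string as HTML
        # instead of fetching anything over the network.
        result = await crawler.arun(url="raw:<html><body><h1>Hello</h1><p>inline content</p></body></html>")
        print(result.markdown)
        # A local file would use the file:// prefix instead, e.g.:
        # result = await crawler.arun(url="file:///tmp/page.html")

asyncio.run(main())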
crawl4ai/async_webcrawler.py
@@ -1,7 +1,10 @@
import os
import time
import warnings
from enum import Enum
from colorama import init, Fore, Back, Style
from pathlib import Path
from typing import Optional
from typing import Optional, List, Union
import json
import asyncio
from .models import CrawlResult
@@ -9,8 +12,13 @@ from .async_database import async_db_manager
from .chunking_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .content_scrapping_strategy import WebScrapingStrategy
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
from .config import (
    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    SHOW_DEPRECATION_WARNINGS  # New import
)
from .utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
@@ -18,19 +26,77 @@ from .utils import (
)
from .__version__ import __version__ as crawl4ai_version


class AsyncWebCrawler:
    """
    Asynchronous web crawler with flexible caching capabilities.

    Migration Guide (from version X.X.X):
    Old way (deprecated):
        crawler = AsyncWebCrawler(always_by_pass_cache=True)
        result = await crawler.arun(
            url="https://example.com",
            bypass_cache=True,
            no_cache_read=True,
            no_cache_write=False
        )

    New way (recommended):
        crawler = AsyncWebCrawler(always_bypass_cache=True)
        result = await crawler.arun(
            url="https://example.com",
            cache_mode=CacheMode.WRITE_ONLY
        )

    To disable deprecation warnings:
        Set SHOW_DEPRECATION_WARNINGS = False in config.py
    """

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        always_by_pass_cache: bool = False,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
        base_directory: str = str(Path.home()),
        **kwargs,
    ):
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            **kwargs
        )
        self.always_by_pass_cache = always_by_pass_cache
        # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
        """
        init()
        self.log_width = 10  # Width of "[COMPLETE]"
        self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
        self.log_icons = {
            'INIT': '→',      # Alternative: '▶' or '►'
            'READY': '✓',     # Alternative: '√'
            'FETCH': '↓',     # Alternative: '▼'
            'SCRAPE': '◆',    # Alternative: '♦'
            'EXTRACT': '■',   # Alternative: '□'
            'COMPLETE': '●',  # Alternative: '○'
            'ERROR': '×'
        }
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs)

        # Handle deprecated parameter
        if always_by_pass_cache is not None:
            if SHOW_DEPRECATION_WARNINGS:
                warnings.warn(
                    "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. "
                    "Use 'always_bypass_cache' instead. "
                    "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2
                )
            self.always_bypass_cache = always_by_pass_cache
        else:
            self.always_bypass_cache = always_bypass_cache

        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
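The constructor above keeps the old spelling working while steering callers to the new one. The snippet below is a small sketch of how such a DeprecationWarning surfaces, and how a caller could silence it with the standard warnings module instead of the SHOW_DEPRECATION_WARNINGS switch in config.py.

import warnings
from crawl4ai import AsyncWebCrawler

# Deprecated spelling: emits a DeprecationWarning but still works, because
# __init__ copies the value into always_bypass_cache.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    crawler = AsyncWebCrawler(always_by_pass_cache=True)
    print([str(w.message) for w in caught if issubclass(w.category, DeprecationWarning)])

# Callers who cannot edit config.py can also filter the warning locally:
warnings.filterwarnings("ignore", category=DeprecationWarning)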
@@ -46,21 +112,13 @@ class AsyncWebCrawler:
        await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)

    async def awarmup(self):
        # Print a message for crawl4ai and its version
        """Initialize the crawler with warm-up sequence."""
        if self.verbose:
            print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
            print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
        # await async_db_manager.ainit_db()
        # # await async_db_manager.initialize()
        # await self.arun(
        #     url="https://google.com/",
        #     word_count_threshold=5,
        #     bypass_cache=False,
        #     verbose=False,
        # )
            print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}")
            print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}")
        self.ready = True
        if self.verbose:
            print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
            print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}")

    async def arun(
        self,
@@ -68,35 +126,81 @@ class AsyncWebCrawler:
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        cache_mode: Optional[CacheMode] = None,
        # Deprecated parameters
        bypass_cache: bool = False,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
        # Other parameters
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
        **kwargs,
    ) -> CrawlResult:
        """
        Runs the crawler for a single source: URL (web, local file, or raw HTML).

        Migration from legacy cache parameters:
            Old way (deprecated):
                await crawler.arun(url, bypass_cache=True, no_cache_read=True)

            New way:
                await crawler.arun(url, cache_mode=CacheMode.BYPASS)

        Args:
            url (str): The URL to crawl. Supported prefixes:
                - 'http://' or 'https://': Web URL to crawl.
                - 'file://': Local file path to process.
                - 'raw:': Raw HTML content to process.
            ... [other existing parameters]
            url: The URL to crawl (http://, https://, file://, or raw:)
            cache_mode: Cache behavior control (recommended)
            word_count_threshold: Minimum word count threshold
            extraction_strategy: Strategy for content extraction
            chunking_strategy: Strategy for content chunking
            css_selector: CSS selector for content extraction
            screenshot: Whether to capture screenshot
            user_agent: Custom user agent
            verbose: Enable verbose logging

        Deprecated Args:
            bypass_cache: Use cache_mode=CacheMode.BYPASS instead
            disable_cache: Use cache_mode=CacheMode.DISABLED instead
            no_cache_read: Use cache_mode=CacheMode.WRITE_ONLY instead
            no_cache_write: Use cache_mode=CacheMode.READ_ONLY instead

        Returns:
            CrawlResult: The result of the crawling and processing.
            CrawlResult: The result of crawling and processing
        """
        try:
            if disable_cache:
                bypass_cache = True
                no_cache_read = True
                no_cache_write = True
            # Handle deprecated parameters
            if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
                if SHOW_DEPRECATION_WARNINGS:
                    warnings.warn(
                        "Cache control boolean flags are deprecated and will be removed in version X.X.X. "
                        "Use 'cache_mode' parameter instead. Examples:\n"
                        "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n"
                        "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n"
                        "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n"
                        "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n"
                        "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
                        DeprecationWarning,
                        stacklevel=2
                    )

                # Convert legacy parameters if cache_mode not provided
                if cache_mode is None:
                    cache_mode = _legacy_to_cache_mode(
                        disable_cache=disable_cache,
                        bypass_cache=bypass_cache,
                        no_cache_read=no_cache_read,
                        no_cache_write=no_cache_write
                    )

            # Default to ENABLED if no cache mode specified
            if cache_mode is None:
                cache_mode = CacheMode.ENABLED

            # Create cache context
            cache_context = CacheContext(url, cache_mode, self.always_bypass_cache)

            extraction_strategy = extraction_strategy or NoExtractionStrategy()
            extraction_strategy.verbose = verbose
            if not isinstance(extraction_strategy, ExtractionStrategy):
@@ -107,18 +211,14 @@ class AsyncWebCrawler:
            word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)

            async_response: AsyncCrawlResponse = None
            cached = None
            cached_result = None
            screenshot_data = None
            extracted_content = None

            is_web_url = url.startswith(('http://', 'https://'))
            is_local_file = url.startswith("file://")
            is_raw_html = url.startswith("raw:")
            _url = url if not is_raw_html else "Raw HTML"

            start_time = time.perf_counter()
            cached_result = None
            if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache:

            # Try to get cached result if appropriate
            if cache_context.should_read():
                cached_result = await async_db_manager.aget_cached_url(url)

            if cached_result:
@@ -129,26 +229,27 @@ class AsyncWebCrawler:
                if not screenshot_data:
                    cached_result = None
                if verbose:
                    print(
                        f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds"
                    )
                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")


            if not cached or not html:
            # Fetch fresh content if needed
            if not cached_result or not html:
                t1 = time.perf_counter()

                if user_agent:
                    self.crawler_strategy.update_user_agent(user_agent)
                async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
                async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
                    url,
                    screenshot=screenshot,
                    **kwargs
                )
                html = sanitize_input_encode(async_response.html)
                screenshot_data = async_response.screenshot
                t2 = time.perf_counter()
                if verbose:
                    print(
                        f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                    )
                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")

            t1 = time.perf_counter()
            # Process the HTML content
            crawl_result = await self.aprocess_html(
                url=url,
                html=html,
@@ -159,15 +260,15 @@ class AsyncWebCrawler:
                css_selector=css_selector,
                screenshot=screenshot_data,
                verbose=verbose,
                is_cached=bool(cached),
                is_cached=bool(cached_result),
                async_response=async_response,
                bypass_cache=bypass_cache,
                is_web_url = is_web_url,
                is_local_file = is_local_file,
                is_raw_html = is_raw_html,
                is_web_url=cache_context.is_web_url,
                is_local_file=cache_context.is_local_file,
                is_raw_html=cache_context.is_raw_html,
                **kwargs,
            )

            # Set response data
            if async_response:
                crawl_result.status_code = async_response.status_code
                crawl_result.response_headers = async_response.response_headers
@@ -180,22 +281,26 @@ class AsyncWebCrawler:
            crawl_result.session_id = kwargs.get("session_id", None)

            if verbose:
                print(
                    f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds"
                )
                print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url} | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")

            if not is_raw_html and not no_cache_write:
                if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
                    await async_db_manager.acache_url(crawl_result)

            # Update cache if appropriate
            if cache_context.should_write() and not bool(cached_result):
                await async_db_manager.acache_url(crawl_result)

            return crawl_result

        except Exception as e:
            if not hasattr(e, "msg"):
                e.msg = str(e)
            print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}")
            return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg)
            print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url} | {e.msg}{Style.RESET_ALL}")
            return CrawlResult(
                url=url,
                html="",
                markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}",
                success=False,
                error_message=e.msg
            )

    async def arun_many(
        self,
@@ -203,6 +308,8 @@ class AsyncWebCrawler:
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        cache_mode: Optional[CacheMode] = None,
        # Deprecated parameters
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
@@ -211,19 +318,35 @@ class AsyncWebCrawler:
        **kwargs,
    ) -> List[CrawlResult]:
        """
        Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
        Runs the crawler for multiple URLs concurrently.

        Migration from legacy parameters:
            Old way (deprecated):
                results = await crawler.arun_many(urls, bypass_cache=True)

            New way:
                results = await crawler.arun_many(urls, cache_mode=CacheMode.BYPASS)

        Args:
            urls (List[str]): A list of URLs with supported prefixes:
                - 'http://' or 'https://': Web URL to crawl.
                - 'file://': Local file path to process.
                - 'raw:': Raw HTML content to process.
            ... [other existing parameters]
            urls: List of URLs to crawl
            cache_mode: Cache behavior control (recommended)
            [other parameters same as arun()]

        Returns:
            List[CrawlResult]: The results of the crawling and processing.
            List[CrawlResult]: Results for each URL
        """
        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
        if bypass_cache and SHOW_DEPRECATION_WARNINGS:
            warnings.warn(
                "'bypass_cache' is deprecated and will be removed in version X.X.X. "
                "Use 'cache_mode=CacheMode.BYPASS' instead. "
                "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
                DeprecationWarning,
                stacklevel=2
            )
        if cache_mode is None:
            cache_mode = CacheMode.BYPASS

        semaphore_count = kwargs.get('semaphore_count', 5)
        semaphore = asyncio.Semaphore(semaphore_count)

        async def crawl_with_semaphore(url):
@@ -233,7 +356,7 @@ class AsyncWebCrawler:
                    word_count_threshold=word_count_threshold,
                    extraction_strategy=extraction_strategy,
                    chunking_strategy=chunking_strategy,
                    bypass_cache=bypass_cache,
                    cache_mode=cache_mode,
                    css_selector=css_selector,
                    screenshot=screenshot,
                    user_agent=user_agent,
@@ -245,6 +368,7 @@ class AsyncWebCrawler:
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return [result if not isinstance(result, Exception) else str(result) for result in results]

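arun_many above bounds concurrency with an asyncio.Semaphore and gathers results with return_exceptions=True. A standalone sketch of that pattern follows; crawl_one is a hypothetical stand-in for crawler.arun, and the URLs are illustrative.

import asyncio

async def crawl_one(url: str) -> str:
    # Stand-in for crawler.arun(url, ...): pretend each crawl takes a moment.
    await asyncio.sleep(0.1)
    return f"crawled {url}"

async def crawl_all(urls, semaphore_count: int = 5):
    semaphore = asyncio.Semaphore(semaphore_count)

    async def crawl_with_semaphore(url):
        # At most `semaphore_count` crawls run at any one time.
        async with semaphore:
            return await crawl_one(url)

    tasks = [crawl_with_semaphore(url) for url in urls]
    # return_exceptions=True mirrors arun_many: a failed URL yields its error
    # instead of cancelling the whole batch.
    return await asyncio.gather(*tasks, return_exceptions=True)

print(asyncio.run(crawl_all([f"https://example.com/{i}" for i in range(12)])))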
    async def aprocess_html(
        self,
        url: str,
@@ -258,7 +382,6 @@ class AsyncWebCrawler:
        verbose: bool,
        **kwargs,
    ) -> CrawlResult:
        t = time.perf_counter()
        # Extract content from HTML
        try:
            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
@@ -293,9 +416,9 @@ class AsyncWebCrawler:
            metadata = result.get("metadata", {})

            if verbose:
                print(
                    f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds"
                )
                print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url}{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")

            if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
                t1 = time.perf_counter()
@@ -309,9 +432,9 @@ class AsyncWebCrawler:
                extracted_content = extraction_strategy.run(url, sections)
                extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
            if verbose:
                print(
                    f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds"
                )
                print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url}{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")

            screenshot = None if not screenshot else screenshot

@@ -332,13 +455,15 @@ class AsyncWebCrawler:
            )

    async def aclear_cache(self):
        # await async_db_manager.aclear_db()
        """Clear the cache database."""
        await async_db_manager.cleanup()

    async def aflush_cache(self):
        """Flush the cache database."""
        await async_db_manager.aflush_db()

    async def aget_cache_size(self):
        """Get the total number of cached items."""
        return await async_db_manager.aget_total_count()
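aprocess_html above runs JsonCssExtractionStrategy directly over the raw HTML instead of chunked markdown. A hedged usage sketch follows; the schema, selectors, and URL are illustrative, and the constructor is assumed to take the schema dict as shown in crawl4ai's extraction strategies.

import asyncio
import json
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

schema = {
    "name": "Example items",
    "baseSelector": "div.item",  # one extracted entry per matching element
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"},
    ],
}

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
        )
        print(json.loads(result.extracted_content))

asyncio.run(main())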
crawl4ai/cache_context.py (new file, 79 lines)
@@ -0,0 +1,79 @@
from enum import Enum


class CacheMode(Enum):
    """
    Defines the caching behavior for web crawling operations.

    Modes:
    - ENABLED: Normal caching behavior (read and write)
    - DISABLED: No caching at all
    - READ_ONLY: Only read from cache, don't write
    - WRITE_ONLY: Only write to cache, don't read
    - BYPASS: Bypass cache for this operation
    """
    ENABLED = "enabled"
    DISABLED = "disabled"
    READ_ONLY = "read_only"
    WRITE_ONLY = "write_only"
    BYPASS = "bypass"


class CacheContext:
    """
    Encapsulates cache-related decisions and URL handling.

    This class centralizes all cache-related logic and URL type checking,
    making the caching behavior more predictable and maintainable.
    """
    def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
        self.url = url
        self.cache_mode = cache_mode
        self.always_bypass = always_bypass
        self.is_cacheable = url.startswith(('http://', 'https://', 'file://'))
        self.is_web_url = url.startswith(('http://', 'https://'))
        self.is_local_file = url.startswith("file://")
        self.is_raw_html = url.startswith("raw:")
        self._url_display = url if not self.is_raw_html else "Raw HTML"

    def should_read(self) -> bool:
        """Determines if cache should be read based on context."""
        if self.always_bypass or not self.is_cacheable:
            return False
        return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]

    def should_write(self) -> bool:
        """Determines if cache should be written based on context."""
        if self.always_bypass or not self.is_cacheable:
            return False
        return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]

    @property
    def display_url(self) -> str:
        """Returns the URL in display format."""
        return self._url_display


def _legacy_to_cache_mode(
    disable_cache: bool = False,
    bypass_cache: bool = False,
    no_cache_read: bool = False,
    no_cache_write: bool = False
) -> CacheMode:
    """
    Converts legacy cache parameters to the new CacheMode enum.

    This is an internal function to help transition from the old boolean flags
    to the new CacheMode system.
    """
    if disable_cache:
        return CacheMode.DISABLED
    if bypass_cache:
        return CacheMode.BYPASS
    if no_cache_read and no_cache_write:
        return CacheMode.DISABLED
    if no_cache_read:
        return CacheMode.WRITE_ONLY
    if no_cache_write:
        return CacheMode.READ_ONLY
    return CacheMode.ENABLED
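A small sketch of how the helpers above interact: _legacy_to_cache_mode maps the old boolean flags onto a CacheMode, and CacheContext then decides reads and writes per URL (raw: URLs are never cacheable). Expected values are noted in comments.

from crawl4ai.cache_context import CacheMode, CacheContext, _legacy_to_cache_mode

# Old boolean flags collapse onto the new enum.
print(_legacy_to_cache_mode(bypass_cache=True))                         # CacheMode.BYPASS
print(_legacy_to_cache_mode(no_cache_read=True))                        # CacheMode.WRITE_ONLY
print(_legacy_to_cache_mode(no_cache_read=True, no_cache_write=True))   # CacheMode.DISABLED

# CacheContext combines the mode with the URL type.
ctx = CacheContext("https://example.com", CacheMode.READ_ONLY)
print(ctx.should_read(), ctx.should_write())          # True False

raw_ctx = CacheContext("raw:<html></html>", CacheMode.ENABLED)
print(raw_ctx.should_read(), raw_ctx.should_write())  # False False: raw HTML is not cacheable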
crawl4ai/config.py
@@ -54,4 +54,5 @@ IMAGE_SCORE_THRESHOLD = 2

MAX_METRICS_HISTORY = 1000

NEED_MIGRATION = True
NEED_MIGRATION = True
SHOW_DEPRECATION_WARNINGS = True
@@ -48,8 +48,8 @@ class Crawl4AiTester:
def test_docker_deployment(version="basic"):
    tester = Crawl4AiTester(
        base_url="http://localhost:11235",
        # base_url="https://crawl4ai-sby74.ondigitalocean.app",
        api_token="test"
        # base_url="https://api.crawl4ai.com"  # just for example
        # api_token="test"  # just for example
    )
    print(f"Testing Crawl4AI Docker {version} version")

@@ -69,6 +69,7 @@ def test_docker_deployment(version="basic"):

    # Test cases based on version
    test_basic_crawl(tester)
    test_basic_crawl(tester)
    test_basic_crawl_sync(tester)

    # if version in ["full", "transformer"]:
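The Crawl4AiTester driven above submits jobs to the Dockerized API and polls for results. A minimal standalone sketch of the same submit-and-poll flow follows, based on the tester code later in this commit; the base URL and token value are illustrative.

import time
import requests

base_url = "http://localhost:11235"          # local Docker deployment
headers = {"Authorization": "Bearer test"}   # token value is illustrative

# Submit a crawl job and receive a task id.
task_id = requests.post(
    f"{base_url}/crawl",
    json={"urls": "https://www.nbcnews.com/business", "priority": 10},
    headers=headers,
).json()["task_id"]

# Poll until the task completes; the test helper wraps this same loop with a
# timeout and failure handling.
while True:
    status = requests.get(f"{base_url}/task/{task_id}", headers=headers).json()
    if status["status"] == "completed":
        print(len(status["result"]["markdown"]))
        break
    time.sleep(2)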
@@ -71,12 +71,12 @@ async def use_proxy():
        "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
    )
    # Uncomment and modify the following lines to use a proxy
    # async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
    #     result = await crawler.arun(
    #         url="https://www.nbcnews.com/business",
    #         bypass_cache=True
    #     )
    #     print(result.markdown[:500])  # Print first 500 characters
    async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True
        )
        print(result.markdown[:500])  # Print first 500 characters

async def capture_and_save_screenshot(url: str, output_path: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
requirements.txt
@@ -11,4 +11,5 @@ beautifulsoup4~=4.12
tf-playwright-stealth~=1.0
xxhash~=3.4
rank-bm25~=0.2
aiofiles~=24.0
aiofiles~=24.0
colorama~=0.4
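colorama is added here for the colored status lines used in awarmup and arun above. A tiny standalone sketch of that formatting follows, with the tag width and icons copied from the diff; the crawled URL is illustrative.

from colorama import init, Fore, Style

init()  # enables ANSI color handling on Windows terminals
log_width = 10  # width of "[COMPLETE]"
tag_format = lambda tag: f"[{tag}]".ljust(log_width, ".")

print(f"{Fore.CYAN}{tag_format('INIT')} → Crawl4AI warming up{Style.RESET_ALL}")
print(f"{Fore.GREEN}{tag_format('READY')} ✓ AsyncWebCrawler initialized{Style.RESET_ALL}")
print(f"{Fore.RED}{tag_format('ERROR')} × Failed to crawl https://example.com{Style.RESET_ALL}")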
tests/docker_example.py (new file, 332 lines)
@@ -0,0 +1,332 @@
import requests
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
import base64
|
||||
import os
|
||||
from typing import Dict, Any
|
||||
|
||||
class Crawl4AiTester:
|
||||
def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
|
||||
self.base_url = base_url
|
||||
self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback
|
||||
self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
|
||||
|
||||
def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
|
||||
# Submit crawl job
|
||||
response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
|
||||
if response.status_code == 403:
|
||||
raise Exception("API token is invalid or missing")
|
||||
task_id = response.json()["task_id"]
|
||||
print(f"Task ID: {task_id}")
|
||||
|
||||
# Poll for result
|
||||
start_time = time.time()
|
||||
while True:
|
||||
if time.time() - start_time > timeout:
|
||||
raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
|
||||
|
||||
result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
|
||||
status = result.json()
|
||||
|
||||
if status["status"] == "failed":
|
||||
print("Task failed:", status.get("error"))
|
||||
raise Exception(f"Task failed: {status.get('error')}")
|
||||
|
||||
if status["status"] == "completed":
|
||||
return status
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
|
||||
if response.status_code == 408:
|
||||
raise TimeoutError("Task did not complete within server timeout")
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def test_docker_deployment(version="basic"):
|
||||
tester = Crawl4AiTester(
|
||||
# base_url="http://localhost:11235" ,
|
||||
base_url="https://crawl4ai-sby74.ondigitalocean.app",
|
||||
api_token="test"
|
||||
)
|
||||
print(f"Testing Crawl4AI Docker {version} version")
|
||||
|
||||
# Health check with timeout and retry
|
||||
max_retries = 5
|
||||
for i in range(max_retries):
|
||||
try:
|
||||
health = requests.get(f"{tester.base_url}/health", timeout=10)
|
||||
print("Health check:", health.json())
|
||||
break
|
||||
except requests.exceptions.RequestException as e:
|
||||
if i == max_retries - 1:
|
||||
print(f"Failed to connect after {max_retries} attempts")
|
||||
sys.exit(1)
|
||||
print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
|
||||
time.sleep(5)
|
||||
|
||||
# Test cases based on version
|
||||
test_basic_crawl(tester)
|
||||
test_basic_crawl(tester)
|
||||
test_basic_crawl_sync(tester)
|
||||
|
||||
# if version in ["full", "transformer"]:
|
||||
# test_cosine_extraction(tester)
|
||||
|
||||
# test_js_execution(tester)
|
||||
# test_css_selector(tester)
|
||||
# test_structured_extraction(tester)
|
||||
# test_llm_extraction(tester)
|
||||
# test_llm_with_ollama(tester)
|
||||
# test_screenshot(tester)
|
||||
|
||||
|
||||
def test_basic_crawl(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10,
|
||||
"session_id": "test"
|
||||
}
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
assert result["result"]["success"]
|
||||
assert len(result["result"]["markdown"]) > 0
|
||||
|
||||
def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl (Sync) ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10,
|
||||
"session_id": "test"
|
||||
}
|
||||
|
||||
result = tester.submit_sync(request)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
assert result['status'] == 'completed'
|
||||
assert result['result']['success']
|
||||
assert len(result['result']['markdown']) > 0
|
||||
|
||||
def test_js_execution(tester: Crawl4AiTester):
|
||||
print("\n=== Testing JS Execution ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 8,
|
||||
"js_code": [
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
],
|
||||
"wait_for": "article.tease-card:nth-child(10)",
|
||||
"crawler_params": {
|
||||
"headless": True
|
||||
}
|
||||
}
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print(f"JS execution result length: {len(result['result']['markdown'])}")
|
||||
assert result["result"]["success"]
|
||||
|
||||
def test_css_selector(tester: Crawl4AiTester):
|
||||
print("\n=== Testing CSS Selector ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 7,
|
||||
"css_selector": ".wide-tease-item__description",
|
||||
"crawler_params": {
|
||||
"headless": True
|
||||
},
|
||||
"extra": {"word_count_threshold": 10}
|
||||
|
||||
}
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print(f"CSS selector result length: {len(result['result']['markdown'])}")
|
||||
assert result["result"]["success"]
|
||||
|
||||
def test_structured_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Structured Extraction ===")
|
||||
schema = {
|
||||
"name": "Coinbase Crypto Prices",
|
||||
"baseSelector": ".cds-tableRow-t45thuk",
|
||||
"fields": [
|
||||
{
|
||||
"name": "crypto",
|
||||
"selector": "td:nth-child(1) h2",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "symbol",
|
||||
"selector": "td:nth-child(1) p",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"selector": "td:nth-child(2)",
|
||||
"type": "text",
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.coinbase.com/explore",
|
||||
"priority": 9,
|
||||
"extraction_config": {
|
||||
"type": "json_css",
|
||||
"params": {
|
||||
"schema": schema
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
extracted = json.loads(result["result"]["extracted_content"])
|
||||
print(f"Extracted {len(extracted)} items")
|
||||
print("Sample item:", json.dumps(extracted[0], indent=2))
|
||||
assert result["result"]["success"]
|
||||
assert len(extracted) > 0
|
||||
|
||||
def test_llm_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing LLM Extraction ===")
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"model_name": {
|
||||
"type": "string",
|
||||
"description": "Name of the OpenAI model."
|
||||
},
|
||||
"input_fee": {
|
||||
"type": "string",
|
||||
"description": "Fee for input token for the OpenAI model."
|
||||
},
|
||||
"output_fee": {
|
||||
"type": "string",
|
||||
"description": "Fee for output token for the OpenAI model."
|
||||
}
|
||||
},
|
||||
"required": ["model_name", "input_fee", "output_fee"]
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://openai.com/api/pricing",
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
"params": {
|
||||
"provider": "openai/gpt-4o-mini",
|
||||
"api_token": os.getenv("OPENAI_API_KEY"),
|
||||
"schema": schema,
|
||||
"extraction_type": "schema",
|
||||
"instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
|
||||
}
|
||||
},
|
||||
"crawler_params": {"word_count_threshold": 1}
|
||||
}
|
||||
|
||||
try:
|
||||
result = tester.submit_and_wait(request)
|
||||
extracted = json.loads(result["result"]["extracted_content"])
|
||||
print(f"Extracted {len(extracted)} model pricing entries")
|
||||
print("Sample entry:", json.dumps(extracted[0], indent=2))
|
||||
assert result["result"]["success"]
|
||||
except Exception as e:
|
||||
print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")
|
||||
|
||||
def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
print("\n=== Testing LLM with Ollama ===")
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"article_title": {
|
||||
"type": "string",
|
||||
"description": "The main title of the news article"
|
||||
},
|
||||
"summary": {
|
||||
"type": "string",
|
||||
"description": "A brief summary of the article content"
|
||||
},
|
||||
"main_topics": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "Main topics or themes discussed in the article"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
"params": {
|
||||
"provider": "ollama/llama2",
|
||||
"schema": schema,
|
||||
"extraction_type": "schema",
|
||||
"instruction": "Extract the main article information including title, summary, and main topics."
|
||||
}
|
||||
},
|
||||
"extra": {"word_count_threshold": 1},
|
||||
"crawler_params": {"verbose": True}
|
||||
}
|
||||
|
||||
try:
|
||||
result = tester.submit_and_wait(request)
|
||||
extracted = json.loads(result["result"]["extracted_content"])
|
||||
print("Extracted content:", json.dumps(extracted, indent=2))
|
||||
assert result["result"]["success"]
|
||||
except Exception as e:
|
||||
print(f"Ollama extraction test failed: {str(e)}")
|
||||
|
||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Cosine Extraction ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "cosine",
|
||||
"params": {
|
||||
"semantic_filter": "business finance economy",
|
||||
"word_count_threshold": 10,
|
||||
"max_dist": 0.2,
|
||||
"top_k": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
result = tester.submit_and_wait(request)
|
||||
extracted = json.loads(result["result"]["extracted_content"])
|
||||
print(f"Extracted {len(extracted)} text clusters")
|
||||
print("First cluster tags:", extracted[0]["tags"])
|
||||
assert result["result"]["success"]
|
||||
except Exception as e:
|
||||
print(f"Cosine extraction test failed: {str(e)}")
|
||||
|
||||
def test_screenshot(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Screenshot ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 5,
|
||||
"screenshot": True,
|
||||
"crawler_params": {
|
||||
"headless": True
|
||||
}
|
||||
}
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print("Screenshot captured:", bool(result["result"]["screenshot"]))
|
||||
|
||||
if result["result"]["screenshot"]:
|
||||
# Save screenshot
|
||||
screenshot_data = base64.b64decode(result["result"]["screenshot"])
|
||||
with open("test_screenshot.jpg", "wb") as f:
|
||||
f.write(screenshot_data)
|
||||
print("Screenshot saved as test_screenshot.jpg")
|
||||
|
||||
assert result["result"]["success"]
|
||||
|
||||
if __name__ == "__main__":
|
||||
version = sys.argv[1] if len(sys.argv) > 1 else "basic"
|
||||
# version = "full"
|
||||
test_docker_deployment(version)
|
||||