feat(cache): introduce CacheMode and CacheContext for enhanced caching behavior

chore(requirements): add colorama dependency
refactor(config): add SHOW_DEPRECATION_WARNINGS flag and clean up code
fix(docs): update example scripts for clarity and consistency
Author: UncleCode
Date: 2024-11-17 15:30:56 +08:00
parent 4b45b28f25
commit 3a66aa8a60
10 changed files with 979 additions and 95 deletions

View File

@@ -1,6 +1,6 @@
# __init__.py
from .async_webcrawler import AsyncWebCrawler
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .models import CrawlResult
from .__version__ import __version__
# __version__ = "0.3.73"
@@ -8,6 +8,7 @@ from .__version__ import __version__
__all__ = [
"AsyncWebCrawler",
"CrawlResult",
"CacheMode",
]
def is_sync_version_installed():
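
With CacheMode now re-exported from the package root, callers no longer need to reach into crawl4ai.cache_context. A minimal sketch of the intended import path (the URL is illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Default behavior: read from and write to the cache.
        result = await crawler.arun(url="https://example.com", cache_mode=CacheMode.ENABLED)
        print(result.success)

asyncio.run(main())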

View File

@@ -669,8 +669,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if self.accept_downloads:
page.on("download", lambda download: asyncio.create_task(self._handle_download(download)))
if self.verbose:
print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
# if self.verbose:
# print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
if self.use_cached_html:
cache_file_path = os.path.join(
@@ -873,8 +873,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await asyncio.sleep(screenshot_wait_for)
screenshot_data = await self.take_screenshot(page)
if self.verbose:
print(f"[LOG] ✅ Crawled {url} successfully!")
# if self.verbose:
# print(f"[LOG] ✅ Crawled {url} successfully!")
if self.use_cached_html:
cache_file_path = os.path.join(

View File

@@ -0,0 +1,344 @@
import os
import time
from pathlib import Path
from typing import Optional
import json
import asyncio
from .models import CrawlResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .content_scrapping_strategy import WebScrapingStrategy
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
from .utils import (
sanitize_input_encode,
InvalidCSSSelectorError,
format_html
)
from .__version__ import __version__ as crawl4ai_version
class AsyncWebCrawler:
def __init__(
self,
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
always_by_pass_cache: bool = False,
base_directory: str = str(Path.home()),
**kwargs,
):
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
**kwargs
)
self.always_by_pass_cache = always_by_pass_cache
# self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
self.ready = False
self.verbose = kwargs.get("verbose", False)
async def __aenter__(self):
await self.crawler_strategy.__aenter__()
await self.awarmup()
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
async def awarmup(self):
# Print a message for crawl4ai and its version
if self.verbose:
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
# await async_db_manager.ainit_db()
# # await async_db_manager.initialize()
# await self.arun(
# url="https://google.com/",
# word_count_threshold=5,
# bypass_cache=False,
# verbose=False,
# )
self.ready = True
if self.verbose:
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
async def arun(
self,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
disable_cache: bool = False,
no_cache_read: bool = False,
no_cache_write: bool = False,
**kwargs,
) -> CrawlResult:
"""
Runs the crawler for a single source: URL (web, local file, or raw HTML).
Args:
url (str): The URL to crawl. Supported prefixes:
- 'http://' or 'https://': Web URL to crawl.
- 'file://': Local file path to process.
- 'raw:': Raw HTML content to process.
... [other existing parameters]
Returns:
CrawlResult: The result of the crawling and processing.
"""
try:
if disable_cache:
bypass_cache = True
no_cache_read = True
no_cache_write = True
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
if not isinstance(extraction_strategy, ExtractionStrategy):
raise ValueError("Unsupported extraction strategy")
if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
async_response: AsyncCrawlResponse = None
cached = None
screenshot_data = None
extracted_content = None
is_web_url = url.startswith(('http://', 'https://'))
is_local_file = url.startswith("file://")
is_raw_html = url.startswith("raw:")
_url = url if not is_raw_html else "Raw HTML"
start_time = time.perf_counter()
cached_result = None
if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache:
cached_result = await async_db_manager.aget_cached_url(url)
if cached_result:
html = sanitize_input_encode(cached_result.html)
extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
if screenshot:
screenshot_data = cached_result.screenshot
if not screenshot_data:
cached_result = None
if verbose:
print(
f"[LOG] 1⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds"
)
if not cached or not html:
t1 = time.perf_counter()
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot
t2 = time.perf_counter()
if verbose:
print(
f"[LOG] 1⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
)
t1 = time.perf_counter()
crawl_result = await self.aprocess_html(
url=url,
html=html,
extracted_content=extracted_content,
word_count_threshold=word_count_threshold,
extraction_strategy=extraction_strategy,
chunking_strategy=chunking_strategy,
css_selector=css_selector,
screenshot=screenshot_data,
verbose=verbose,
is_cached=bool(cached),
async_response=async_response,
bypass_cache=bypass_cache,
is_web_url = is_web_url,
is_local_file = is_local_file,
is_raw_html = is_raw_html,
**kwargs,
)
if async_response:
crawl_result.status_code = async_response.status_code
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
else:
crawl_result.status_code = 200
crawl_result.response_headers = cached_result.response_headers if cached_result else {}
crawl_result.success = bool(html)
crawl_result.session_id = kwargs.get("session_id", None)
if verbose:
print(
f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds"
)
if not is_raw_html and not no_cache_write:
if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
await async_db_manager.acache_url(crawl_result)
return crawl_result
except Exception as e:
if not hasattr(e, "msg"):
e.msg = str(e)
print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}")
return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg)
async def arun_many(
self,
urls: List[str],
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
) -> List[CrawlResult]:
"""
Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
Args:
urls (List[str]): A list of URLs with supported prefixes:
- 'http://' or 'https://': Web URL to crawl.
- 'file://': Local file path to process.
- 'raw:': Raw HTML content to process.
... [other existing parameters]
Returns:
List[CrawlResult]: The results of the crawling and processing.
"""
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
semaphore = asyncio.Semaphore(semaphore_count)
async def crawl_with_semaphore(url):
async with semaphore:
return await self.arun(
url,
word_count_threshold=word_count_threshold,
extraction_strategy=extraction_strategy,
chunking_strategy=chunking_strategy,
bypass_cache=bypass_cache,
css_selector=css_selector,
screenshot=screenshot,
user_agent=user_agent,
verbose=verbose,
**kwargs,
)
tasks = [crawl_with_semaphore(url) for url in urls]
results = await asyncio.gather(*tasks, return_exceptions=True)
return [result if not isinstance(result, Exception) else str(result) for result in results]
async def aprocess_html(
self,
url: str,
html: str,
extracted_content: str,
word_count_threshold: int,
extraction_strategy: ExtractionStrategy,
chunking_strategy: ChunkingStrategy,
css_selector: str,
screenshot: str,
verbose: bool,
**kwargs,
) -> CrawlResult:
t = time.perf_counter()
# Extract content from HTML
try:
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
t1 = time.perf_counter()
scrapping_strategy = WebScrapingStrategy()
# result = await scrapping_strategy.ascrap(
result = scrapping_strategy.scrap(
url,
html,
word_count_threshold=word_count_threshold,
css_selector=css_selector,
only_text=kwargs.get("only_text", False),
image_description_min_word_threshold=kwargs.get(
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
),
**kwargs,
)
if result is None:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
except Exception as e:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})
if verbose:
print(
f"[LOG] 2⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds"
)
if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
t1 = time.perf_counter()
# Check if extraction strategy is type of JsonCssExtractionStrategy
if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy):
extraction_strategy.verbose = verbose
extracted_content = extraction_strategy.run(url, [html])
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
else:
sections = chunking_strategy.chunk(markdown)
extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
if verbose:
print(
f"[LOG] 3⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds"
)
screenshot = None if not screenshot else screenshot
return CrawlResult(
url=url,
html=html,
cleaned_html=format_html(cleaned_html),
markdown=markdown,
fit_markdown=fit_markdown,
fit_html= fit_html,
media=media,
links=links,
metadata=metadata,
screenshot=screenshot,
extracted_content=extracted_content,
success=True,
error_message="",
)
async def aclear_cache(self):
# await async_db_manager.aclear_db()
await async_db_manager.cleanup()
async def aflush_cache(self):
await async_db_manager.aflush_db()
async def aget_cache_size(self):
return await async_db_manager.aget_total_count()
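
arun_many() throttles concurrency with an asyncio.Semaphore sized by the optional semaphore_count keyword (default 5), and per-URL exceptions are returned as strings rather than raised. A short sketch of tuning it (URLs are placeholders):

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]
    async with AsyncWebCrawler() as crawler:
        # semaphore_count caps how many arun() calls run at once.
        results = await crawler.arun_many(urls, semaphore_count=2)
        for r in results:
            # Exceptions come back as strings; successful crawls as CrawlResult objects.
            print(r if isinstance(r, str) else r.url)

asyncio.run(main())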

View File

@@ -1,7 +1,10 @@
import os
import time
import warnings
from enum import Enum
from colorama import init, Fore, Back, Style
from pathlib import Path
from typing import Optional
from typing import Optional, List, Union
import json
import asyncio
from .models import CrawlResult
@@ -9,8 +12,13 @@ from .async_database import async_db_manager
from .chunking_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .content_scrapping_strategy import WebScrapingStrategy
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
SHOW_DEPRECATION_WARNINGS # New import
)
from .utils import (
sanitize_input_encode,
InvalidCSSSelectorError,
@@ -18,19 +26,77 @@ from .utils import (
)
from .__version__ import __version__ as crawl4ai_version
class AsyncWebCrawler:
"""
Asynchronous web crawler with flexible caching capabilities.
Migration Guide (from version X.X.X):
Old way (deprecated):
crawler = AsyncWebCrawler(always_by_pass_cache=True)
result = await crawler.arun(
url="https://example.com",
bypass_cache=True,
no_cache_read=True,
no_cache_write=False
)
New way (recommended):
crawler = AsyncWebCrawler(always_bypass_cache=True)
result = await crawler.arun(
url="https://example.com",
cache_mode=CacheMode.WRITE_ONLY
)
To disable deprecation warnings:
Set SHOW_DEPRECATION_WARNINGS = False in config.py
"""
def __init__(
self,
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
always_by_pass_cache: bool = False,
always_bypass_cache: bool = False,
always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
base_directory: str = str(Path.home()),
**kwargs,
):
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
**kwargs
)
self.always_by_pass_cache = always_by_pass_cache
# self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
"""
Initialize the AsyncWebCrawler.
Args:
crawler_strategy: Strategy for crawling web pages
always_bypass_cache: Whether to always bypass cache (new parameter)
always_by_pass_cache: Deprecated, use always_bypass_cache instead
base_directory: Base directory for storing cache
"""
init()
self.log_width = 10 # Width of "[COMPLETE]"
self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
self.log_icons = {
'INIT': '', # Alternative: '▶' or '►'
'READY': '', # Alternative: '√'
'FETCH': '', # Alternative: '▼'
'SCRAPE': '', # Alternative: '♦'
'EXTRACT': '', # Alternative: '□'
'COMPLETE': '', # Alternative: '○'
'ERROR': '×'
}
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs)
# Handle deprecated parameter
if always_by_pass_cache is not None:
if SHOW_DEPRECATION_WARNINGS:
warnings.warn(
"'always_by_pass_cache' is deprecated and will be removed in version X.X.X. "
"Use 'always_bypass_cache' instead. "
"Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
DeprecationWarning,
stacklevel=2
)
self.always_bypass_cache = always_by_pass_cache
else:
self.always_bypass_cache = always_bypass_cache
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
@@ -46,21 +112,13 @@ class AsyncWebCrawler:
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
async def awarmup(self):
# Print a message for crawl4ai and its version
"""Initialize the crawler with warm-up sequence."""
if self.verbose:
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
# await async_db_manager.ainit_db()
# # await async_db_manager.initialize()
# await self.arun(
# url="https://google.com/",
# word_count_threshold=5,
# bypass_cache=False,
# verbose=False,
# )
print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}")
print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}")
self.ready = True
if self.verbose:
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}")
async def arun(
self,
@@ -68,35 +126,81 @@ class AsyncWebCrawler:
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
cache_mode: Optional[CacheMode] = None,
# Deprecated parameters
bypass_cache: bool = False,
disable_cache: bool = False,
no_cache_read: bool = False,
no_cache_write: bool = False,
# Other parameters
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
disable_cache: bool = False,
no_cache_read: bool = False,
no_cache_write: bool = False,
**kwargs,
) -> CrawlResult:
"""
Runs the crawler for a single source: URL (web, local file, or raw HTML).
Migration from legacy cache parameters:
Old way (deprecated):
await crawler.arun(url, bypass_cache=True, no_cache_read=True)
New way:
await crawler.arun(url, cache_mode=CacheMode.BYPASS)
Args:
url (str): The URL to crawl. Supported prefixes:
- 'http://' or 'https://': Web URL to crawl.
- 'file://': Local file path to process.
- 'raw:': Raw HTML content to process.
... [other existing parameters]
url: The URL to crawl (http://, https://, file://, or raw:)
cache_mode: Cache behavior control (recommended)
word_count_threshold: Minimum word count threshold
extraction_strategy: Strategy for content extraction
chunking_strategy: Strategy for content chunking
css_selector: CSS selector for content extraction
screenshot: Whether to capture screenshot
user_agent: Custom user agent
verbose: Enable verbose logging
Deprecated Args:
bypass_cache: Use cache_mode=CacheMode.BYPASS instead
disable_cache: Use cache_mode=CacheMode.DISABLED instead
no_cache_read: Use cache_mode=CacheMode.WRITE_ONLY instead
no_cache_write: Use cache_mode=CacheMode.READ_ONLY instead
Returns:
CrawlResult: The result of the crawling and processing.
CrawlResult: The result of crawling and processing
"""
try:
if disable_cache:
bypass_cache = True
no_cache_read = True
no_cache_write = True
# Handle deprecated parameters
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
if SHOW_DEPRECATION_WARNINGS:
warnings.warn(
"Cache control boolean flags are deprecated and will be removed in version X.X.X. "
"Use 'cache_mode' parameter instead. Examples:\n"
"- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n"
"- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n"
"- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n"
"- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n"
"Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
DeprecationWarning,
stacklevel=2
)
# Convert legacy parameters if cache_mode not provided
if cache_mode is None:
cache_mode = _legacy_to_cache_mode(
disable_cache=disable_cache,
bypass_cache=bypass_cache,
no_cache_read=no_cache_read,
no_cache_write=no_cache_write
)
# Default to ENABLED if no cache mode specified
if cache_mode is None:
cache_mode = CacheMode.ENABLED
# Create cache context
cache_context = CacheContext(url, cache_mode, self.always_bypass_cache)
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
if not isinstance(extraction_strategy, ExtractionStrategy):
@@ -107,18 +211,14 @@ class AsyncWebCrawler:
word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
async_response: AsyncCrawlResponse = None
cached = None
cached_result = None
screenshot_data = None
extracted_content = None
is_web_url = url.startswith(('http://', 'https://'))
is_local_file = url.startswith("file://")
is_raw_html = url.startswith("raw:")
_url = url if not is_raw_html else "Raw HTML"
start_time = time.perf_counter()
cached_result = None
if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache:
# Try to get cached result if appropriate
if cache_context.should_read():
cached_result = await async_db_manager.aget_cached_url(url)
if cached_result:
@@ -129,26 +229,27 @@ class AsyncWebCrawler:
if not screenshot_data:
cached_result = None
if verbose:
print(
f"[LOG] 1⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds"
)
print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
if not cached or not html:
# Fetch fresh content if needed
if not cached_result or not html:
t1 = time.perf_counter()
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
url,
screenshot=screenshot,
**kwargs
)
html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot
t2 = time.perf_counter()
if verbose:
print(
f"[LOG] 1⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
)
print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
t1 = time.perf_counter()
# Process the HTML content
crawl_result = await self.aprocess_html(
url=url,
html=html,
@@ -159,15 +260,15 @@ class AsyncWebCrawler:
css_selector=css_selector,
screenshot=screenshot_data,
verbose=verbose,
is_cached=bool(cached),
is_cached=bool(cached_result),
async_response=async_response,
bypass_cache=bypass_cache,
is_web_url = is_web_url,
is_local_file = is_local_file,
is_raw_html = is_raw_html,
is_web_url=cache_context.is_web_url,
is_local_file=cache_context.is_local_file,
is_raw_html=cache_context.is_raw_html,
**kwargs,
)
# Set response data
if async_response:
crawl_result.status_code = async_response.status_code
crawl_result.response_headers = async_response.response_headers
@@ -180,22 +281,26 @@ class AsyncWebCrawler:
crawl_result.session_id = kwargs.get("session_id", None)
if verbose:
print(
f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds"
)
print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url} | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
if not is_raw_html and not no_cache_write:
if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
await async_db_manager.acache_url(crawl_result)
# Update cache if appropriate
if cache_context.should_write() and not bool(cached_result):
await async_db_manager.acache_url(crawl_result)
return crawl_result
except Exception as e:
if not hasattr(e, "msg"):
e.msg = str(e)
print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}")
return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg)
print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url} | {e.msg}{Style.RESET_ALL}")
return CrawlResult(
url=url,
html="",
markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}",
success=False,
error_message=e.msg
)
async def arun_many(
self,
@@ -203,6 +308,8 @@ class AsyncWebCrawler:
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
cache_mode: Optional[CacheMode] = None,
# Deprecated parameters
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
@@ -211,19 +318,35 @@ class AsyncWebCrawler:
**kwargs,
) -> List[CrawlResult]:
"""
Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
Runs the crawler for multiple URLs concurrently.
Migration from legacy parameters:
Old way (deprecated):
results = await crawler.arun_many(urls, bypass_cache=True)
New way:
results = await crawler.arun_many(urls, cache_mode=CacheMode.BYPASS)
Args:
urls (List[str]): A list of URLs with supported prefixes:
- 'http://' or 'https://': Web URL to crawl.
- 'file://': Local file path to process.
- 'raw:': Raw HTML content to process.
... [other existing parameters]
urls: List of URLs to crawl
cache_mode: Cache behavior control (recommended)
[other parameters same as arun()]
Returns:
List[CrawlResult]: The results of the crawling and processing.
List[CrawlResult]: Results for each URL
"""
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
if bypass_cache and SHOW_DEPRECATION_WARNINGS:
warnings.warn(
"'bypass_cache' is deprecated and will be removed in version X.X.X. "
"Use 'cache_mode=CacheMode.BYPASS' instead. "
"Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
DeprecationWarning,
stacklevel=2
)
if cache_mode is None:
cache_mode = CacheMode.BYPASS
semaphore_count = kwargs.get('semaphore_count', 5)
semaphore = asyncio.Semaphore(semaphore_count)
async def crawl_with_semaphore(url):
@@ -233,7 +356,7 @@ class AsyncWebCrawler:
word_count_threshold=word_count_threshold,
extraction_strategy=extraction_strategy,
chunking_strategy=chunking_strategy,
bypass_cache=bypass_cache,
cache_mode=cache_mode,
css_selector=css_selector,
screenshot=screenshot,
user_agent=user_agent,
@@ -245,6 +368,7 @@ class AsyncWebCrawler:
results = await asyncio.gather(*tasks, return_exceptions=True)
return [result if not isinstance(result, Exception) else str(result) for result in results]
async def aprocess_html(
self,
url: str,
@@ -258,7 +382,6 @@ class AsyncWebCrawler:
verbose: bool,
**kwargs,
) -> CrawlResult:
t = time.perf_counter()
# Extract content from HTML
try:
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
@@ -293,9 +416,9 @@ class AsyncWebCrawler:
metadata = result.get("metadata", {})
if verbose:
print(
f"[LOG] 2⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds"
)
print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url}{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
t1 = time.perf_counter()
@@ -309,9 +432,9 @@ class AsyncWebCrawler:
extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
if verbose:
print(
f"[LOG] 3⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds"
)
print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url}{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
screenshot = None if not screenshot else screenshot
@@ -332,13 +455,15 @@ class AsyncWebCrawler:
)
async def aclear_cache(self):
# await async_db_manager.aclear_db()
"""Clear the cache database."""
await async_db_manager.cleanup()
async def aflush_cache(self):
"""Flush the cache database."""
await async_db_manager.aflush_db()
async def aget_cache_size(self):
"""Get the total number of cached items."""
return await async_db_manager.aget_total_count()
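
The refactor keeps the legacy boolean flags working (with a deprecation warning) while steering callers toward cache_mode. A before/after sketch of the same intent (URLs illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def main():
    async with AsyncWebCrawler() as crawler:
        # Deprecated: still accepted; emits a DeprecationWarning and is mapped by
        # _legacy_to_cache_mode() to CacheMode.BYPASS.
        legacy = await crawler.arun(url="https://example.com", bypass_cache=True)

        # Recommended: state the intent directly.
        fresh = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS)

        # arun_many() takes the same parameter and passes it to every arun() call.
        batch = await crawler.arun_many(
            ["https://example.com/a", "https://example.com/b"],
            cache_mode=CacheMode.READ_ONLY,  # read from cache, never write
        )
        print(legacy.success, fresh.success, len(batch))

asyncio.run(main())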

crawl4ai/cache_context.py (new file, 79 lines)
View File

@@ -0,0 +1,79 @@
from enum import Enum
class CacheMode(Enum):
"""
Defines the caching behavior for web crawling operations.
Modes:
- ENABLED: Normal caching behavior (read and write)
- DISABLED: No caching at all
- READ_ONLY: Only read from cache, don't write
- WRITE_ONLY: Only write to cache, don't read
- BYPASS: Bypass cache for this operation
"""
ENABLED = "enabled"
DISABLED = "disabled"
READ_ONLY = "read_only"
WRITE_ONLY = "write_only"
BYPASS = "bypass"
class CacheContext:
"""
Encapsulates cache-related decisions and URL handling.
This class centralizes all cache-related logic and URL type checking,
making the caching behavior more predictable and maintainable.
"""
def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
self.url = url
self.cache_mode = cache_mode
self.always_bypass = always_bypass
self.is_cacheable = url.startswith(('http://', 'https://', 'file://'))
self.is_web_url = url.startswith(('http://', 'https://'))
self.is_local_file = url.startswith("file://")
self.is_raw_html = url.startswith("raw:")
self._url_display = url if not self.is_raw_html else "Raw HTML"
def should_read(self) -> bool:
"""Determines if cache should be read based on context."""
if self.always_bypass or not self.is_cacheable:
return False
return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]
def should_write(self) -> bool:
"""Determines if cache should be written based on context."""
if self.always_bypass or not self.is_cacheable:
return False
return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]
@property
def display_url(self) -> str:
"""Returns the URL in display format."""
return self._url_display
def _legacy_to_cache_mode(
disable_cache: bool = False,
bypass_cache: bool = False,
no_cache_read: bool = False,
no_cache_write: bool = False
) -> CacheMode:
"""
Converts legacy cache parameters to the new CacheMode enum.
This is an internal function to help transition from the old boolean flags
to the new CacheMode system.
"""
if disable_cache:
return CacheMode.DISABLED
if bypass_cache:
return CacheMode.BYPASS
if no_cache_read and no_cache_write:
return CacheMode.DISABLED
if no_cache_read:
return CacheMode.WRITE_ONLY
if no_cache_write:
return CacheMode.READ_ONLY
return CacheMode.ENABLED
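
The mapping from the old boolean flags and the read/write decisions made by CacheContext can be checked in isolation; a small sketch with no crawler involved:

from crawl4ai.cache_context import CacheMode, CacheContext, _legacy_to_cache_mode

# Legacy flag combinations collapse onto the enum in a fixed priority order.
assert _legacy_to_cache_mode(disable_cache=True) is CacheMode.DISABLED
assert _legacy_to_cache_mode(bypass_cache=True) is CacheMode.BYPASS
assert _legacy_to_cache_mode(no_cache_read=True) is CacheMode.WRITE_ONLY
assert _legacy_to_cache_mode(no_cache_write=True) is CacheMode.READ_ONLY
assert _legacy_to_cache_mode() is CacheMode.ENABLED

# Raw HTML is never cacheable, whatever the mode says.
ctx = CacheContext("raw:<html><body>hi</body></html>", CacheMode.ENABLED)
print(ctx.display_url, ctx.should_read(), ctx.should_write())  # Raw HTML False False

# For a web URL, READ_ONLY reads from the cache but never writes to it.
ctx = CacheContext("https://example.com", CacheMode.READ_ONLY)
print(ctx.should_read(), ctx.should_write())  # True False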

View File

@@ -54,4 +54,5 @@ IMAGE_SCORE_THRESHOLD = 2
MAX_METRICS_HISTORY = 1000
NEED_MIGRATION = True
NEED_MIGRATION = True
SHOW_DEPRECATION_WARNINGS = True
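
SHOW_DEPRECATION_WARNINGS is the documented switch: setting it to False in config.py silences the new deprecation notices globally. If editing config.py is not an option, the standard library's warnings filter can suppress the same messages from user code; a sketch that targets just the always_by_pass_cache notice:

import warnings
from crawl4ai import AsyncWebCrawler

# Suppress only the deprecation notice for the renamed constructor argument.
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
    message=r".*always_by_pass_cache.*",
)

crawler = AsyncWebCrawler(always_by_pass_cache=True)  # no warning is shown now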

View File

@@ -48,8 +48,8 @@ class Crawl4AiTester:
def test_docker_deployment(version="basic"):
tester = Crawl4AiTester(
base_url="http://localhost:11235" ,
# base_url="https://crawl4ai-sby74.ondigitalocean.app",
api_token="test"
# base_url="https://api.crawl4ai.com" # just for example
# api_token="test" # just for example
)
print(f"Testing Crawl4AI Docker {version} version")
@@ -69,6 +69,7 @@ def test_docker_deployment(version="basic"):
# Test cases based on version
test_basic_crawl(tester)
test_basic_crawl(tester)
test_basic_crawl_sync(tester)
# if version in ["full", "transformer"]:

View File

@@ -71,12 +71,12 @@ async def use_proxy():
"Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
)
# Uncomment and modify the following lines to use a proxy
# async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
# result = await crawler.arun(
# url="https://www.nbcnews.com/business",
# bypass_cache=True
# )
# print(result.markdown[:500]) # Print first 500 characters
async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
bypass_cache=True
)
print(result.markdown[:500]) # Print first 500 characters
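
Note that the activated proxy example still passes the now-deprecated bypass_cache=True; with the API introduced in this commit the same call could be written as below (the proxy URL remains a placeholder):

from crawl4ai import AsyncWebCrawler, CacheMode

async def use_proxy_with_cache_mode():
    async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            cache_mode=CacheMode.BYPASS,  # same effect as the deprecated bypass_cache=True
        )
        print(result.markdown[:500])  # Print first 500 characters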
async def capture_and_save_screenshot(url: str, output_path: str):
async with AsyncWebCrawler(verbose=True) as crawler:

View File

@@ -11,4 +11,5 @@ beautifulsoup4~=4.12
tf-playwright-stealth~=1.0
xxhash~=3.4
rank-bm25~=0.2
aiofiles~=24.0
aiofiles~=24.0
colorama~=0.4
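
colorama backs the new colored, width-aligned log lines in AsyncWebCrawler; the pattern used there boils down to this sketch:

from colorama import init, Fore, Style

init()  # enables ANSI colors on Windows terminals; a no-op elsewhere
tag = "[INIT]".ljust(10, ".")  # log_width = 10, the width of "[COMPLETE]"
print(f"{Fore.CYAN}{tag} Crawl4AI warm-up{Style.RESET_ALL}")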

tests/docker_example.py (new file, 332 lines)
View File

@@ -0,0 +1,332 @@
import requests
import json
import time
import sys
import base64
import os
from typing import Dict, Any
class Crawl4AiTester:
def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
self.base_url = base_url
self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback
self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
# Submit crawl job
response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
if response.status_code == 403:
raise Exception("API token is invalid or missing")
task_id = response.json()["task_id"]
print(f"Task ID: {task_id}")
# Poll for result
start_time = time.time()
while True:
if time.time() - start_time > timeout:
raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
status = result.json()
if status["status"] == "failed":
print("Task failed:", status.get("error"))
raise Exception(f"Task failed: {status.get('error')}")
if status["status"] == "completed":
return status
time.sleep(2)
def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
if response.status_code == 408:
raise TimeoutError("Task did not complete within server timeout")
response.raise_for_status()
return response.json()
def test_docker_deployment(version="basic"):
tester = Crawl4AiTester(
# base_url="http://localhost:11235" ,
base_url="https://crawl4ai-sby74.ondigitalocean.app",
api_token="test"
)
print(f"Testing Crawl4AI Docker {version} version")
# Health check with timeout and retry
max_retries = 5
for i in range(max_retries):
try:
health = requests.get(f"{tester.base_url}/health", timeout=10)
print("Health check:", health.json())
break
except requests.exceptions.RequestException as e:
if i == max_retries - 1:
print(f"Failed to connect after {max_retries} attempts")
sys.exit(1)
print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
time.sleep(5)
# Test cases based on version
test_basic_crawl(tester)
test_basic_crawl(tester)
test_basic_crawl_sync(tester)
# if version in ["full", "transformer"]:
# test_cosine_extraction(tester)
# test_js_execution(tester)
# test_css_selector(tester)
# test_structured_extraction(tester)
# test_llm_extraction(tester)
# test_llm_with_ollama(tester)
# test_screenshot(tester)
def test_basic_crawl(tester: Crawl4AiTester):
print("\n=== Testing Basic Crawl ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 10,
"session_id": "test"
}
result = tester.submit_and_wait(request)
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
assert result["result"]["success"]
assert len(result["result"]["markdown"]) > 0
def test_basic_crawl_sync(tester: Crawl4AiTester):
print("\n=== Testing Basic Crawl (Sync) ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 10,
"session_id": "test"
}
result = tester.submit_sync(request)
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
assert result['status'] == 'completed'
assert result['result']['success']
assert len(result['result']['markdown']) > 0
def test_js_execution(tester: Crawl4AiTester):
print("\n=== Testing JS Execution ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 8,
"js_code": [
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
],
"wait_for": "article.tease-card:nth-child(10)",
"crawler_params": {
"headless": True
}
}
result = tester.submit_and_wait(request)
print(f"JS execution result length: {len(result['result']['markdown'])}")
assert result["result"]["success"]
def test_css_selector(tester: Crawl4AiTester):
print("\n=== Testing CSS Selector ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 7,
"css_selector": ".wide-tease-item__description",
"crawler_params": {
"headless": True
},
"extra": {"word_count_threshold": 10}
}
result = tester.submit_and_wait(request)
print(f"CSS selector result length: {len(result['result']['markdown'])}")
assert result["result"]["success"]
def test_structured_extraction(tester: Crawl4AiTester):
print("\n=== Testing Structured Extraction ===")
schema = {
"name": "Coinbase Crypto Prices",
"baseSelector": ".cds-tableRow-t45thuk",
"fields": [
{
"name": "crypto",
"selector": "td:nth-child(1) h2",
"type": "text",
},
{
"name": "symbol",
"selector": "td:nth-child(1) p",
"type": "text",
},
{
"name": "price",
"selector": "td:nth-child(2)",
"type": "text",
}
],
}
request = {
"urls": "https://www.coinbase.com/explore",
"priority": 9,
"extraction_config": {
"type": "json_css",
"params": {
"schema": schema
}
}
}
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
print(f"Extracted {len(extracted)} items")
print("Sample item:", json.dumps(extracted[0], indent=2))
assert result["result"]["success"]
assert len(extracted) > 0
def test_llm_extraction(tester: Crawl4AiTester):
print("\n=== Testing LLM Extraction ===")
schema = {
"type": "object",
"properties": {
"model_name": {
"type": "string",
"description": "Name of the OpenAI model."
},
"input_fee": {
"type": "string",
"description": "Fee for input token for the OpenAI model."
},
"output_fee": {
"type": "string",
"description": "Fee for output token for the OpenAI model."
}
},
"required": ["model_name", "input_fee", "output_fee"]
}
request = {
"urls": "https://openai.com/api/pricing",
"priority": 8,
"extraction_config": {
"type": "llm",
"params": {
"provider": "openai/gpt-4o-mini",
"api_token": os.getenv("OPENAI_API_KEY"),
"schema": schema,
"extraction_type": "schema",
"instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
}
},
"crawler_params": {"word_count_threshold": 1}
}
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
print(f"Extracted {len(extracted)} model pricing entries")
print("Sample entry:", json.dumps(extracted[0], indent=2))
assert result["result"]["success"]
except Exception as e:
print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")
def test_llm_with_ollama(tester: Crawl4AiTester):
print("\n=== Testing LLM with Ollama ===")
schema = {
"type": "object",
"properties": {
"article_title": {
"type": "string",
"description": "The main title of the news article"
},
"summary": {
"type": "string",
"description": "A brief summary of the article content"
},
"main_topics": {
"type": "array",
"items": {"type": "string"},
"description": "Main topics or themes discussed in the article"
}
}
}
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 8,
"extraction_config": {
"type": "llm",
"params": {
"provider": "ollama/llama2",
"schema": schema,
"extraction_type": "schema",
"instruction": "Extract the main article information including title, summary, and main topics."
}
},
"extra": {"word_count_threshold": 1},
"crawler_params": {"verbose": True}
}
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
print("Extracted content:", json.dumps(extracted, indent=2))
assert result["result"]["success"]
except Exception as e:
print(f"Ollama extraction test failed: {str(e)}")
def test_cosine_extraction(tester: Crawl4AiTester):
print("\n=== Testing Cosine Extraction ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 8,
"extraction_config": {
"type": "cosine",
"params": {
"semantic_filter": "business finance economy",
"word_count_threshold": 10,
"max_dist": 0.2,
"top_k": 3
}
}
}
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
print(f"Extracted {len(extracted)} text clusters")
print("First cluster tags:", extracted[0]["tags"])
assert result["result"]["success"]
except Exception as e:
print(f"Cosine extraction test failed: {str(e)}")
def test_screenshot(tester: Crawl4AiTester):
print("\n=== Testing Screenshot ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 5,
"screenshot": True,
"crawler_params": {
"headless": True
}
}
result = tester.submit_and_wait(request)
print("Screenshot captured:", bool(result["result"]["screenshot"]))
if result["result"]["screenshot"]:
# Save screenshot
screenshot_data = base64.b64decode(result["result"]["screenshot"])
with open("test_screenshot.jpg", "wb") as f:
f.write(screenshot_data)
print("Screenshot saved as test_screenshot.jpg")
assert result["result"]["success"]
if __name__ == "__main__":
version = sys.argv[1] if len(sys.argv) > 1 else "basic"
# version = "full"
test_docker_deployment(version)
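
If only the basic checks are of interest, the helpers above can also be driven directly; a sketch assuming the service is already listening on the default local port:

import os

tester = Crawl4AiTester(
    base_url="http://localhost:11235",          # default local deployment used throughout this file
    api_token=os.getenv("CRAWL4AI_API_TOKEN"),  # optional; None simply omits the Authorization header
)
test_basic_crawl(tester)
test_basic_crawl_sync(tester)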