Commit Message:

Enhance Async Crawler with storage state handling
  - Updated Async Crawler to support storage state management.
  - Added error handling for URL validation in Async Web Crawler.
  - Modified README logo and improved .gitignore entries.
  - Fixed issues in multiple files for better code robustness.
Author: UncleCode
Date: 2024-12-09 20:04:59 +08:00
Parent: c51e901f68
Commit: 2d31915f0a
7 changed files with 58 additions and 15 deletions
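
Before the file-by-file diff, a minimal sketch of how the new storage_state option can be exercised end to end. This assumes AsyncWebCrawler forwards extra keyword arguments to AsyncPlaywrightCrawlerStrategy (the diff below reads it via kwargs.get("storage_state")); "my_session.json" is a hypothetical file exported from an earlier authenticated session.

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # storage_state may be a dict or a path to a Playwright storage-state
    # JSON file; here we assume a previously exported session file.
    async with AsyncWebCrawler(storage_state="my_session.json") as crawler:
        result = await crawler.arun(url="https://example.com/dashboard")
        print(result.markdown[:300])

asyncio.run(main())
```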

.gitignore

@@ -214,4 +214,7 @@ git_issues.md
 todo_executor.md
 protect-all-except-feature.sh
 manage-collab.sh
-publish.sh
+publish.sh
+combine.sh
+combined_output.txt

README.md

@@ -1,4 +1,4 @@
-# 🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
+# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
 <a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

crawl4ai/async_crawler_strategy.py

@@ -238,8 +238,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         self.user_agent = kwargs.get(
             "user_agent",
-            # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
-            "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
+            # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
         )
         user_agenr_generator = UserAgentGenerator()
         if kwargs.get("user_agent_mode") == "random":
@@ -254,6 +254,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
         self.headers.setdefault("sec-ch-ua", self.browser_hint)
         self.cookies = kwargs.get("cookies", [])
+        self.storage_state = kwargs.get("storage_state", None)
         self.sessions = {}
         self.session_ttl = 1800
         self.js_code = js_code
@@ -315,7 +316,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             # If no default context exists, create one
             self.default_context = await self.browser.new_context(
                 # viewport={"width": 1920, "height": 1080}
-                viewport={"width": self.viewport_width, "height": self.viewport_height}
+                viewport={"width": self.viewport_width, "height": self.viewport_height},
+                storage_state=self.storage_state,
             )
             # Set up the default context
@@ -323,6 +325,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             await self.default_context.set_extra_http_headers(self.headers)
             if self.cookies:
                 await self.default_context.add_cookies(self.cookies)
+            if self.storage_state:
+                # If storage_state is a dictionary or file path, Playwright will handle it.
+                await self.default_context.storage_state(path=None)  # Just ensuring default_context is ready
             if self.accept_downloads:
                 await self.default_context.set_default_timeout(60000)
                 await self.default_context.set_default_navigation_timeout(60000)
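
For reference, the value threaded through storage_state above follows Playwright's storage-state shape, the same structure that browser_context.storage_state() returns. The concrete values below are made up for illustration.

```python
# Hypothetical example of the dict form Playwright accepts for storage_state.
storage_state = {
    "cookies": [
        {
            "name": "session",
            "value": "abc123",
            "domain": "example.com",
            "path": "/",
            "expires": -1,
            "httpOnly": True,
            "secure": True,
            "sameSite": "Lax",
        }
    ],
    "origins": [
        {
            "origin": "https://example.com",
            "localStorage": [{"name": "token", "value": "xyz"}],
        }
    ],
}
```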
@@ -426,6 +431,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 self.default_context = self.browser
             else:
                 self.browser = await self.playwright.chromium.launch(**browser_args)
+                self.default_context = self.browser
         except Exception as e:
             # Fallback to chromium if Chrome channel fails
@@ -643,6 +649,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 viewport={"width": self.viewport_width, "height": self.viewport_height},
                 proxy={"server": self.proxy} if self.proxy else None,
                 accept_downloads=self.accept_downloads,
+                storage_state=self.storage_state,
                 ignore_https_errors=True
             )
@@ -771,6 +778,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 proxy={"server": self.proxy} if self.proxy else None,
                 java_script_enabled=True,
                 accept_downloads=self.accept_downloads,
+                storage_state=self.storage_state,
                 # downloads_path=self.downloads_path if self.accept_downloads else None
             )
             await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
@@ -792,6 +800,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 viewport={"width": self.viewport_width, "height": self.viewport_height},
                 proxy={"server": self.proxy} if self.proxy else None,
                 accept_downloads=self.accept_downloads,
+                storage_state=self.storage_state,
                 ignore_https_errors=True  # Add this line
             )
             if self.cookies:
@@ -862,7 +871,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 return response
             if not kwargs.get("js_only", False):
-                await self.execute_hook('before_goto', page, context = context)
+                await self.execute_hook('before_goto', page, context = context, **kwargs)
                 try:
                     response = await page.goto(
@@ -874,7 +883,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 except Error as e:
                     raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}")
-                await self.execute_hook('after_goto', page, context = context)
+                await self.execute_hook('after_goto', page, context = context, **kwargs)
                 # Get status code and headers
                 status_code = response.status
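
The hook calls above (and the ones further down) now forward **kwargs, so a user-defined hook can inspect the per-crawl options. A short sketch, assuming the strategy's existing set_hook registration and the 'before_goto' hook name from the diff:

```python
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def log_navigation(page, context=None, **kwargs):
    # kwargs now carries the options passed to arun(), e.g. "magic" or "js_only".
    print("navigating, magic =", kwargs.get("magic", False))

strategy = AsyncPlaywrightCrawlerStrategy()
strategy.set_hook("before_goto", log_navigation)
```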
@@ -929,9 +938,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             # CONTENT LOADING ASSURANCE
             if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)):
                 # Wait for network idle after initial load and images to load
-                await page.wait_for_load_state("networkidle")
+                # await page.wait_for_load_state("networkidle")
                 await page.wait_for_load_state("domcontentloaded")
                 await asyncio.sleep(0.1)
-                await page.wait_for_function("Array.from(document.images).every(img => img.complete)")
+                from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+                try:
+                    await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000)
+                # Check for TimeoutError and ignore it
+                except PlaywrightTimeoutError:
+                    pass

             # After initial load, adjust viewport to content size
             if not self.text_only and kwargs.get("adjust_viewport_to_content", False):
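
Capping the image-completeness wait at one second and swallowing Playwright's TimeoutError keeps pages with lazily loaded or broken images from stalling the crawl, at the cost of occasionally capturing a page before every image has finished loading.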
@@ -1015,7 +1030,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             # await page.wait_for_timeout(100)
             # Check for on execution event
-            await self.execute_hook('on_execution_started', page, context = context)
+            await self.execute_hook('on_execution_started', page, context = context, **kwargs)
             if kwargs.get("simulate_user", False) or kwargs.get("magic", False):
                 # Simulate user interactions
@@ -1119,7 +1134,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             if kwargs.get("process_iframes", False):
                 page = await self.process_iframes(page)
-            await self.execute_hook('before_retrieve_html', page, context = context)
+            await self.execute_hook('before_retrieve_html', page, context = context, **kwargs)
             # Check if delay_before_return_html is set then wait for that time
             delay_before_return_html = kwargs.get("delay_before_return_html", 0.1)
             if delay_before_return_html:
@@ -1130,7 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 await self.remove_overlay_elements(page)
             html = await page.content()
-            await self.execute_hook('before_return_html', page, html, context = context)
+            await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
             # Check if kwargs has screenshot=True then take screenshot
             screenshot_data = None
@@ -1394,6 +1409,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             return base64.b64encode(buffered.getvalue()).decode('utf-8')
         finally:
             await page.close()

+    async def export_storage_state(self, path: str = None) -> dict:
+        """
+        Exports the current storage state (cookies, localStorage, sessionStorage)
+        to a JSON file at the specified path.
+        """
+        if self.default_context:
+            state = await self.default_context.storage_state(path=path)
+            self.logger.info(
+                message="Exported storage state to {path}",
+                tag="INFO",
+                params={"path": path}
+            )
+            return state
+        else:
+            self.logger.warning(
+                message="No default_context available to export storage state.",
+                tag="WARNING"
+            )
+
     async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
         """

crawl4ai/async_webcrawler.py

@@ -182,6 +182,10 @@ class AsyncWebCrawler:
         Returns:
             CrawlResult: The result of crawling and processing
         """
+        # Check that url is a non-empty string
+        if not isinstance(url, str) or not url:
+            raise ValueError("Invalid URL, make sure the URL is a non-empty string")
         async with self._lock or nullcontext():
             try:
                 # Handle deprecated parameters
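
The new guard fails fast with a ValueError instead of surfacing a confusing Playwright error deep inside the crawl; for instance:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def demo():
    async with AsyncWebCrawler() as crawler:
        try:
            await crawler.arun(url="")  # empty string is rejected up front
        except ValueError as e:
            print(e)  # Invalid URL, make sure the URL is a non-empty string

asyncio.run(demo())
```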
@@ -335,7 +339,8 @@ class AsyncWebCrawler:
                 # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
                 self.logger.error_status(
-                    url=cache_context.display_url,
+                    # url=cache_context.display_url,
+                    url=url,
                     error=create_box_message(e.msg, type = "error"),
                     tag="ERROR"
                 )

crawl4ai/extraction_strategy.py

@@ -632,7 +632,7 @@ class ContentSummarizationStrategy(ExtractionStrategy):
         # Sort summaries by the original section index to maintain order
         summaries.sort(key=lambda x: x[0])
         return [summary for _, summary in summaries]

 class JsonCssExtractionStrategy(ExtractionStrategy):
     def __init__(self, schema: Dict[str, Any], **kwargs):
         super().__init__(**kwargs)
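
For readers landing on this hunk, JsonCssExtractionStrategy extracts structured data with CSS selectors rather than an LLM. A small illustrative schema (the field names and selectors are made up):

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Each element matching baseSelector yields one JSON object with these fields.
schema = {
    "name": "articles",
    "baseSelector": "article.post",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"},
    ],
}
strategy = JsonCssExtractionStrategy(schema)
```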

crawl4ai/utils.py

@@ -147,6 +147,7 @@ class CustomHTML2Text(HTML2Text):
     #         self.preserved_content.append(data)
     #         return
     #     super().handle_data(data, entity_char)

 class InvalidCSSSelectorError(Exception):
     pass

main.py

@@ -342,7 +342,7 @@ app.add_middleware(
 # API token security
 security = HTTPBearer()
-CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")

 async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
     if not CRAWL4AI_API_TOKEN:
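
Dropping the hard-coded "test_api_code" fallback makes authentication opt-in: with CRAWL4AI_API_TOKEN unset, verify_token can let requests through, and once it is set every request must present the token. A sketch of the enforcement this diff implies, reusing security, CRAWL4AI_API_TOKEN, and the FastAPI imports from the file above; the exact error handling is an assumption.

```python
from fastapi import HTTPException

async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
    if not CRAWL4AI_API_TOKEN:
        return None  # no token configured: auth effectively disabled
    if credentials.credentials != CRAWL4AI_API_TOKEN:
        # Assumed behavior: reject mismatched tokens with 401.
        raise HTTPException(status_code=401, detail="Invalid or missing API token")
    return credentials.credentials
```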