Mirror of https://github.com/unclecode/crawl4ai.git, synced 2024-12-22 15:52:24 +03:00
Add full-page screenshot and PDF export features
- Introduced a new approach for capturing full-page screenshots by exporting them as PDFs first, enhancing reliability and performance.
- Added documentation for the feature in `docs/examples/full_page_screenshot_and_pdf_export.md`.
- Refactored `perform_completion_with_backoff` in `crawl4ai/utils.py` to include the necessary extra parameters.
- Updated `quickstart_async.py` to utilize LLM extraction with refined arguments.
crawl4ai/utils.py

```diff
@@ -839,7 +839,11 @@ def perform_completion_with_backoff(
     max_attempts = 3
     base_delay = 2  # Base delay in seconds, you can adjust this based on your needs
 
-    extra_args = {}
+    extra_args = {
+        "temperature": 0.01,
+        'api_key': api_token,
+        'base_url': base_url
+    }
     if json_response:
         extra_args["response_format"] = { "type": "json_object" }
 
```
```diff
@@ -848,14 +852,12 @@ def perform_completion_with_backoff(
 
     for attempt in range(max_attempts):
         try:
 
             response =completion(
                 model=provider,
                 messages=[
                     {"role": "user", "content": prompt_with_variables}
                 ],
-                temperature=0.01,
-                api_key=api_token,
-                base_url=base_url,
+                **extra_args
             )
             return response  # Return the successful response
```
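Taken together, the two hunks move the provider credentials and sampling temperature into `extra_args`, so the retry loop only forwards a single dict to litellm. The snippet below is a minimal, self-contained sketch of that backoff pattern, assuming the surrounding `except` block (not shown in the diff) sleeps `base_delay * 2 ** attempt` between attempts; it is illustrative, not the exact body of `perform_completion_with_backoff`.

```python
# Illustrative sketch of an exponential-backoff wrapper around litellm's
# completion call; the exact retry logic in crawl4ai/utils.py may differ.
import time
from litellm import completion

def completion_with_backoff(provider, prompt, api_token=None, base_url=None,
                            json_response=False, max_attempts=3, base_delay=2):
    extra_args = {
        "temperature": 0.01,
        "api_key": api_token,
        "base_url": base_url,
    }
    if json_response:
        extra_args["response_format"] = {"type": "json_object"}

    for attempt in range(max_attempts):
        try:
            return completion(
                model=provider,
                messages=[{"role": "user", "content": prompt}],
                **extra_args,
            )
        except Exception:  # assumption: retry on any provider error
            if attempt == max_attempts - 1:
                raise
            time.sleep(base_delay * 2 ** attempt)  # exponential backoff
```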
docs/examples/full_page_screenshot_and_pdf_export.md (new file, 58 lines)
@@ -0,0 +1,58 @@
# Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI

When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or style differences.

**The New Approach:**
We’ve introduced a new feature that handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. You can also save the PDF directly for your own use, with no need for multiple passes or complex stitching logic.

**Key Benefits:**
- **Reliability:** The PDF export never times out and works regardless of page length.
- **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing.
- **Performance:** Skips manual scrolling and image stitching, reducing complexity and runtime.

**Simple Example:**
```python
import os, sys
import asyncio
from base64 import b64decode
from crawl4ai import AsyncWebCrawler, CacheMode

# Adjust paths as needed
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

async def main():
    async with AsyncWebCrawler() as crawler:
        # Request both PDF and screenshot
        result = await crawler.arun(
            url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
            cache_mode=CacheMode.BYPASS,
            pdf=True,
            screenshot=True
        )

        if result.success:
            # Save screenshot
            if result.screenshot:
                with open(os.path.join(__location__, "screenshot.png"), "wb") as f:
                    f.write(b64decode(result.screenshot))

            # Save PDF
            if result.pdf_data:
                pdf_bytes = b64decode(result.pdf_data)
                with open(os.path.join(__location__, "page.pdf"), "wb") as f:
                    f.write(pdf_bytes)

if __name__ == "__main__":
    asyncio.run(main())
```

**What Happens Under the Hood:**
- Crawl4AI navigates to the target page.
- If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length.
- If `screenshot=True` and a PDF is already available, it converts the first page of that PDF directly to an image for you (see the sketch below), with no repeated loading or scrolling.
- Finally, you get your PDF and/or screenshot ready to use.
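To make the PDF-to-image step concrete, here is a minimal sketch of how the first page of an exported PDF can be rasterized. It assumes PyMuPDF (`fitz`) purely for illustration; Crawl4AI's internal conversion may use a different renderer.

```python
# Illustrative only: rasterize page 1 of an exported PDF to a PNG.
# Assumes PyMuPDF (pip install pymupdf); not Crawl4AI's internal code.
import fitz  # PyMuPDF

def pdf_first_page_to_png(pdf_path: str, png_path: str, zoom: float = 2.0) -> None:
    doc = fitz.open(pdf_path)
    try:
        page = doc[0]                         # first page only
        matrix = fitz.Matrix(zoom, zoom)      # scale factor for higher resolution
        pixmap = page.get_pixmap(matrix=matrix)
        pixmap.save(png_path)
    finally:
        doc.close()

pdf_first_page_to_png("page.pdf", "screenshot.png")
```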

**Conclusion:**
With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages.
quickstart_async.py

```diff
@@ -117,7 +117,13 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
         print(f"API token is required for {provider}. Skipping this example.")
         return
 
-    extra_args = {}
+    # extra_args = {}
+    extra_args={
+        "temperature": 0,
+        "top_p": 0.9,
+        "max_tokens": 2000,
+        # any other supported parameters for litellm
+    }
     if extra_headers:
         extra_args["extra_headers"] = extra_headers
 
```
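These refined arguments are what `quickstart_async.py` forwards to the LLM extraction step. The sketch below shows that usage in isolation; the `LLMExtractionStrategy` parameter names (`provider`, `api_token`, `instruction`, `extra_args`) follow the quickstart example and should be treated as assumptions, so check the current Crawl4AI API if it has changed.

```python
# Minimal sketch (assumed parameter names; verify against the Crawl4AI docs).
import asyncio
import os

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def run_llm_extraction():
    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="Extract all model names with their input and output token fees.",
        extra_args=extra_args,  # forwarded to the LLM call (litellm) under the hood
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=strategy,
        )
        print(result.extracted_content)

# asyncio.run(run_llm_extraction())
```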
```diff
@@ -598,6 +604,8 @@ async def fit_markdown_remove_overlay():
 
 
 async def main():
+    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+
     await simple_crawl()
     await simple_example_with_running_js_code()
     await simple_example_with_css_selector()
```
```diff
@@ -609,7 +617,6 @@ async def main():
     # await extract_structured_data_using_llm()
     # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
     # await extract_structured_data_using_llm("ollama/llama3.2")
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
 
     # You always can pass custom headers to the extraction strategy
     # custom_headers = {
```