mirror of https://github.com/HKUDS/RAG-Anything.git

fix lint

.gitignore (vendored)
@@ -76,4 +76,4 @@ memory-bank/
# AI
.claude/
.cursor/
-CLAUDE.md
+CLAUDE.md

@@ -67,14 +67,14 @@ async def async_batch_processing():
        max_workers=4,
        show_progress=True
    )


    # Process files asynchronously
    result = await batch_parser.process_batch_async(
        file_paths=["doc1.pdf", "doc2.docx"],
        output_dir="./output",
        parse_method="auto"
    )


    return result

# Run async processing
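The coroutine above can be driven from a synchronous entry point with `asyncio.run`; a minimal sketch, assuming the `async_batch_processing` definition from this snippet:

```python
import asyncio

# Sketch: run the async batch job from a synchronous entry point.
# Assumes async_batch_processing is defined as in the snippet above.
if __name__ == "__main__":
    result = asyncio.run(async_batch_processing())
    print(result.summary())
```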
@@ -170,7 +170,7 @@ class BatchProcessingResult:
    processing_time: float  # Total processing time in seconds
    errors: Dict[str, str]  # Error messages for failed files
    output_dir: str  # Output directory used


    def summary(self) -> str:  # Human-readable summary
    def success_rate(self) -> float:  # Success rate as percentage
```
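For orientation, a minimal sketch of consuming this result object, assuming a configured `batch_parser` and a `file_paths` list; only the fields listed above are used (note the listing shows `success_rate` as a method while the example scripts read it as an attribute, so check the installed version):

```python
result = batch_parser.process_batch(file_paths, "./output")

print(result.summary())  # human-readable overview
print(f"{len(result.successful_files)} succeeded, "
      f"{len(result.failed_files)} failed in {result.processing_time:.1f}s")

# errors maps each failed path to its error message
for path, message in result.errors.items():
    print(f"{path}: {message}")
```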
@@ -181,16 +181,16 @@ class BatchProcessingResult:
class BatchParser:
    def __init__(self, parser_type: str = "mineru", max_workers: int = 4, ...):
        """Initialize batch parser"""


    def get_supported_extensions(self) -> List[str]:
        """Get list of supported file extensions"""


    def filter_supported_files(self, file_paths: List[str], recursive: bool = True) -> List[str]:
        """Filter files to only supported types"""


    def process_batch(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
        """Process files in batch"""


    async def process_batch_async(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
        """Process files in batch asynchronously"""
```
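Putting the interface together, a typical call sequence might look like this sketch (the import path is an assumption; adjust it to wherever `BatchParser` lives in your installation):

```python
# Sketch of the typical call sequence against the interface above.
from raganything.batch_parser import BatchParser  # import path assumed

parser = BatchParser(parser_type="mineru", max_workers=4)

# Discover what the parser can handle, then narrow the input list.
print(parser.get_supported_extensions())
files = parser.filter_supported_files(["doc1.pdf", "notes.md", "data.bin"])

# Synchronous batch run; process_batch_async is the awaitable variant.
result = parser.process_batch(files, output_dir="./output", parse_method="auto")
print(result.summary())
```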
@@ -312,16 +312,16 @@ result = batch_parser.process_batch(
```python
def process_with_retry(file_paths, max_retries=3):
    """Process files with retry logic"""


    for attempt in range(max_retries):
        result = batch_parser.process_batch(file_paths, "./output")


        if not result.failed_files:
            break  # All files processed successfully


        print(f"Attempt {attempt + 1}: {len(result.failed_files)} files failed")
        file_paths = result.failed_files  # Retry failed files


    return result
```

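If failures are transient (per-file timeouts, for instance), pausing between rounds can help; a variant sketch of the same loop with a simple exponential backoff (`time.sleep` is standard library, `batch_parser` is the instance configured earlier):

```python
import time

def process_with_backoff(file_paths, max_retries=3, base_delay=2.0):
    """Retry failed files, doubling the pause after each attempt."""
    result = None
    for attempt in range(max_retries):
        result = batch_parser.process_batch(file_paths, "./output")
        if not result.failed_files:
            break  # everything succeeded
        time.sleep(base_delay * (2 ** attempt))  # 2s, 4s, 8s, ...
        file_paths = result.failed_files  # retry only the failures
    return result
```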
@@ -338,4 +338,4 @@ def process_with_retry(file_paths, max_retries=3):

## Conclusion

-The batch processing feature significantly improves RAG-Anything's throughput for large document collections. It provides flexible configuration options, comprehensive error handling, and seamless integration with the existing RAG-Anything pipeline.
+The batch processing feature significantly improves RAG-Anything's throughput for large document collections. It provides flexible configuration options, comprehensive error handling, and seamless integration with the existing RAG-Anything pipeline.

@@ -96,21 +96,21 @@ config = MarkdownConfig(
    line_height="1.5",  # Line spacing
    include_toc=True,  # Generate table of contents
    syntax_highlighting=True,  # Enable code syntax highlighting


    # Custom CSS styling
    custom_css="""
-    body {
-        font-family: 'Georgia', serif;
+    body {
+        font-family: 'Georgia', serif;
        color: #333;
    }
-    h1 {
-        color: #2c3e50;
-        border-bottom: 2px solid #3498db;
+    h1 {
+        color: #2c3e50;
+        border-bottom: 2px solid #3498db;
        padding-bottom: 0.3em;
    }
-    code {
-        background-color: #f8f9fa;
-        padding: 2px 4px;
+    code {
+        background-color: #f8f9fa;
+        padding: 2px 4px;
        border-radius: 3px;
    }
    pre {
||||
@@ -232,22 +232,22 @@ class MarkdownConfig:
|
||||
margin: str = "1in" # CSS margin format
|
||||
font_size: str = "12pt" # Base font size
|
||||
line_height: str = "1.5" # Line spacing multiplier
|
||||
|
||||
|
||||
# Content options
|
||||
include_toc: bool = True # Generate table of contents
|
||||
syntax_highlighting: bool = True # Enable code highlighting
|
||||
image_max_width: str = "100%" # Maximum image width
|
||||
table_style: str = "..." # Default table CSS
|
||||
|
||||
|
||||
# Styling
|
||||
css_file: Optional[str] = None # External CSS file path
|
||||
custom_css: Optional[str] = None # Inline CSS content
|
||||
template_file: Optional[str] = None # Custom HTML template
|
||||
|
||||
|
||||
# Output options
|
||||
output_format: str = "pdf" # Currently only PDF supported
|
||||
output_dir: Optional[str] = None # Output directory
|
||||
|
||||
|
||||
# Metadata
|
||||
metadata: Optional[Dict[str, str]] = None # Document metadata
|
||||
```
|
||||
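For contrast with the fully styled configuration shown earlier, a minimal sketch that overrides only a few of the fields above (the stylesheet path and metadata values are placeholders):

```python
config = MarkdownConfig(
    page_size="A4",
    include_toc=False,  # skip the table of contents
    css_file="styles/report.css",  # placeholder external stylesheet
    metadata={"title": "Report", "author": "RAG-Anything"},
)
converter = EnhancedMarkdownConverter(config)
ok = converter.convert_file_to_pdf("report.md", "report.pdf")
```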
@@ -347,7 +347,7 @@ This document provides comprehensive technical specifications.

### System Components
1. **Parser Engine**: Handles document processing
-2. **Storage Layer**: Manages data persistence
+2. **Storage Layer**: Manages data persistence
3. **Query Interface**: Provides search capabilities

### Code Implementation
@@ -437,8 +437,8 @@ sudo apt-get install pandoc wkhtmltopdf
# Use web-safe fonts
config = MarkdownConfig(
    custom_css="""
-    body {
-        font-family: 'Arial', 'Helvetica', sans-serif;
+    body {
+        font-family: 'Arial', 'Helvetica', sans-serif;
    }
    """
)
@@ -468,10 +468,10 @@ result = converter.convert_file_to_pdf("test.md", "test.pdf")
def robust_conversion(input_path, output_path):
    """Convert with fallback backends"""
    converter = EnhancedMarkdownConverter()


    # Try backends in order of preference
    backends = ["weasyprint", "pandoc", "auto"]


    for backend in backends:
        try:
            success = converter.convert_file_to_pdf(
@@ -485,7 +485,7 @@ def robust_conversion(input_path, output_path):
        except Exception as e:
            print(f"❌ {backend} failed: {str(e)}")
            continue


    print("❌ All backends failed")
    return False
```
@@ -498,19 +498,19 @@ def robust_conversion(input_path, output_path):
class EnhancedMarkdownConverter:
    def __init__(self, config: Optional[MarkdownConfig] = None):
        """Initialize converter with optional configuration"""


    def convert_file_to_pdf(self, input_path: str, output_path: str, method: str = "auto") -> bool:
        """Convert markdown file to PDF"""


    def convert_markdown_to_pdf(self, markdown_content: str, output_path: str, method: str = "auto") -> bool:
        """Convert markdown content to PDF"""


    def get_backend_info(self) -> Dict[str, Any]:
        """Get information about available backends"""


    def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
        """Convert using WeasyPrint backend"""


    def convert_with_pandoc(self, markdown_content: str, output_path: str) -> bool:
        """Convert using Pandoc backend"""
```
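Taken together, a typical session against this interface might look like the following sketch (backend names follow the `convert_with_*` methods above; file names are placeholders):

```python
converter = EnhancedMarkdownConverter()

# Inspect which backends are installed before picking a method.
print(converter.get_backend_info())

# "auto" lets the converter pick an available backend.
ok = converter.convert_file_to_pdf("README.md", "README.pdf", method="auto")

if not ok:
    # Direct content conversion, bypassing the input file.
    ok = converter.convert_markdown_to_pdf("# Fallback\n", "README.pdf")
```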
@@ -549,4 +549,4 @@ class EnhancedMarkdownConverter:

## Conclusion

-The enhanced markdown conversion feature provides professional-quality PDF generation with flexible styling options and multiple backend support. It seamlessly integrates with RAG-Anything's document processing pipeline while offering standalone functionality for markdown-to-PDF conversion needs.
+The enhanced markdown conversion feature provides professional-quality PDF generation with flexible styling options and multiple backend support. It seamlessly integrates with RAG-Anything's document processing pipeline while offering standalone functionality for markdown-to-PDF conversion needs.

@@ -21,6 +21,7 @@ import time

# Add project root directory to Python path
import sys

sys.path.append(str(Path(__file__).parent.parent))

from raganything import RAGAnything, RAGAnythingConfig
@@ -31,13 +32,13 @@ def create_sample_documents():
    """Create sample documents for batch processing testing"""
    temp_dir = Path(tempfile.mkdtemp())
    sample_files = []


    # Create various document types
    documents = {
        "document1.txt": "This is a simple text document for testing batch processing.",
        "document2.txt": "Another text document with different content.",
        "document3.md": """# Markdown Document


## Introduction
This is a markdown document for testing.

@@ -92,16 +93,16 @@ Batch processing is essential for large-scale document processing.

### Next Steps
Continue development and testing of batch processing features.
-"""
+""",
    }


    # Create files
    for filename, content in documents.items():
        file_path = temp_dir / filename
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        sample_files.append(str(file_path))


    return sample_files, temp_dir


@@ -110,54 +111,54 @@ def demonstrate_basic_batch_processing():
    print("\n" + "=" * 60)
    print("BASIC BATCH PROCESSING DEMONSTRATION")
    print("=" * 60)


    # Create sample documents
    sample_files, temp_dir = create_sample_documents()


    try:
        print(f"Created {len(sample_files)} sample documents in: {temp_dir}")
        for file_path in sample_files:
            print(f" - {Path(file_path).name}")


        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=3,
            show_progress=True,
            timeout_per_file=60,
-            skip_installation_check=True  # Skip installation check for demo
+            skip_installation_check=True,  # Skip installation check for demo
        )

-        print(f"\nBatch parser configured:")
-        print(f" - Parser type: mineru")
-        print(f" - Max workers: 3")
-        print(f" - Progress tracking: enabled")
-        print(f" - Timeout per file: 60 seconds")
+        print("\nBatch parser configured:")
+        print(" - Parser type: mineru")
+        print(" - Max workers: 3")
+        print(" - Progress tracking: enabled")
+        print(" - Timeout per file: 60 seconds")

        # Check supported extensions
        supported_extensions = batch_parser.get_supported_extensions()
        print(f" - Supported extensions: {supported_extensions}")


        # Filter files to supported types
        supported_files = batch_parser.filter_supported_files(sample_files)
-        print(f"\nFile filtering results:")
+        print("\nFile filtering results:")
        print(f" - Total files: {len(sample_files)}")
        print(f" - Supported files: {len(supported_files)}")


        # Process batch
        output_dir = temp_dir / "batch_output"
-        print(f"\nStarting batch processing...")
+        print("\nStarting batch processing...")
        print(f"Output directory: {output_dir}")


        start_time = time.time()
        result = batch_parser.process_batch(
            file_paths=supported_files,
            output_dir=str(output_dir),
            parse_method="auto",
-            recursive=False
+            recursive=False,
        )
        processing_time = time.time() - start_time


        # Display results
        print("\n" + "-" * 40)
        print("BATCH PROCESSING RESULTS")
@@ -165,20 +166,20 @@ def demonstrate_basic_batch_processing():
        print(result.summary())
        print(f"Total processing time: {processing_time:.2f} seconds")
        print(f"Success rate: {result.success_rate:.1f}%")


        if result.successful_files:
-            print(f"\nSuccessfully processed files:")
+            print("\nSuccessfully processed files:")
            for file_path in result.successful_files:
                print(f" ✅ {Path(file_path).name}")


        if result.failed_files:
-            print(f"\nFailed files:")
+            print("\nFailed files:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f" ❌ {Path(file_path).name}: {error}")


        return result


    except Exception as e:
        print(f"❌ Batch processing demonstration failed: {str(e)}")
        return None
@@ -189,33 +190,33 @@ async def demonstrate_async_batch_processing():
    print("\n" + "=" * 60)
    print("ASYNCHRONOUS BATCH PROCESSING DEMONSTRATION")
    print("=" * 60)


    # Create sample documents
    sample_files, temp_dir = create_sample_documents()


    try:
        print(f"Processing {len(sample_files)} documents asynchronously...")


        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
-            skip_installation_check=True
+            skip_installation_check=True,
        )


        # Process batch asynchronously
        output_dir = temp_dir / "async_output"


        start_time = time.time()
        result = await batch_parser.process_batch_async(
            file_paths=sample_files,
            output_dir=str(output_dir),
            parse_method="auto",
-            recursive=False
+            recursive=False,
        )
        processing_time = time.time() - start_time


        # Display results
        print("\n" + "-" * 40)
        print("ASYNC BATCH PROCESSING RESULTS")
@@ -223,9 +224,9 @@ async def demonstrate_async_batch_processing():
        print(result.summary())
        print(f"Async processing time: {processing_time:.2f} seconds")
        print(f"Success rate: {result.success_rate:.1f}%")


        return result


    except Exception as e:
        print(f"❌ Async batch processing demonstration failed: {str(e)}")
        return None
@@ -236,10 +237,10 @@ async def demonstrate_rag_integration():
    print("\n" + "=" * 60)
    print("RAG-ANYTHING BATCH INTEGRATION DEMONSTRATION")
    print("=" * 60)


    # Create sample documents
    sample_files, temp_dir = create_sample_documents()


    try:
        # Initialize RAG-Anything with temporary storage
        config = RAGAnythingConfig(
@@ -247,20 +248,20 @@ async def demonstrate_rag_integration():
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
-            max_concurrent_files=2
+            max_concurrent_files=2,
        )


        rag = RAGAnything(config=config)


        print("RAG-Anything initialized with batch processing capabilities")


        # Show available batch methods
-        batch_methods = [method for method in dir(rag) if 'batch' in method.lower()]
+        batch_methods = [method for method in dir(rag) if "batch" in method.lower()]
        print(f"Available batch methods: {batch_methods}")


        # Demonstrate batch processing with RAG integration
        print(f"\nProcessing {len(sample_files)} documents with RAG integration...")


        # Use the RAG-integrated batch processing
        try:
            # Process documents in batch
@@ -268,40 +269,46 @@ async def demonstrate_rag_integration():
                file_paths=sample_files,
                output_dir=str(temp_dir / "rag_batch_output"),
                max_workers=2,
-                show_progress=True
+                show_progress=True,
            )


            print("\n" + "-" * 40)
            print("RAG BATCH PROCESSING RESULTS")
            print("-" * 40)
            print(result.summary())
            print(f"Success rate: {result.success_rate:.1f}%")


            # Demonstrate batch processing with full RAG integration
-            print(f"\nProcessing documents with full RAG integration...")
+            print("\nProcessing documents with full RAG integration...")

            rag_result = await rag.process_documents_with_rag_batch(
                file_paths=sample_files[:2],  # Process subset for demo
                output_dir=str(temp_dir / "rag_full_output"),
                max_workers=1,
-                show_progress=True
+                show_progress=True,
            )


            print("\n" + "-" * 40)
            print("FULL RAG INTEGRATION RESULTS")
            print("-" * 40)
            print(f"Parse result: {rag_result['parse_result'].summary()}")
-            print(f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds")
-            print(f"Successfully processed with RAG: {rag_result['successful_rag_files']}")
+            print(
+                f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds"
+            )
+            print(
+                f"Successfully processed with RAG: {rag_result['successful_rag_files']}"
+            )
            print(f"Failed RAG processing: {rag_result['failed_rag_files']}")


            return rag_result


        except Exception as e:
            print(f"⚠️ RAG integration demo completed with limitations: {str(e)}")
-            print("Note: This is expected in environments without full API configuration")
+            print(
+                "Note: This is expected in environments without full API configuration"
+            )
            return None


    except Exception as e:
        print(f"❌ RAG integration demonstration failed: {str(e)}")
        return None
@@ -312,79 +319,79 @@ def demonstrate_directory_processing():
    print("\n" + "=" * 60)
    print("DIRECTORY PROCESSING DEMONSTRATION")
    print("=" * 60)


    # Create a directory structure with nested files
    temp_dir = Path(tempfile.mkdtemp())


    # Create main directory files
    main_files = {
        "overview.txt": "Main directory overview document",
-        "readme.md": "# Project README\n\nThis is the main project documentation."
+        "readme.md": "# Project README\n\nThis is the main project documentation.",
    }


    # Create subdirectory
    sub_dir = temp_dir / "subdirectory"
    sub_dir.mkdir()


    sub_files = {
        "details.txt": "Detailed information in subdirectory",
-        "notes.md": "# Notes\n\nAdditional notes and information."
+        "notes.md": "# Notes\n\nAdditional notes and information.",
    }


    # Write all files
    all_files = []
    for filename, content in main_files.items():
        file_path = temp_dir / filename
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        all_files.append(str(file_path))


    for filename, content in sub_files.items():
        file_path = sub_dir / filename
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        all_files.append(str(file_path))


    try:
-        print(f"Created directory structure:")
+        print("Created directory structure:")
        print(f" Main directory: {temp_dir}")
        print(f" Files in main: {list(main_files.keys())}")
        print(f" Subdirectory: {sub_dir}")
        print(f" Files in sub: {list(sub_files.keys())}")


        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
-            skip_installation_check=True
+            skip_installation_check=True,
        )


        # Process entire directory recursively
-        print(f"\nProcessing entire directory recursively...")
+        print("\nProcessing entire directory recursively...")

        result = batch_parser.process_batch(
            file_paths=[str(temp_dir)],  # Pass directory path
            output_dir=str(temp_dir / "directory_output"),
            parse_method="auto",
-            recursive=True  # Include subdirectories
+            recursive=True,  # Include subdirectories
        )


        print("\n" + "-" * 40)
        print("DIRECTORY PROCESSING RESULTS")
        print("-" * 40)
        print(result.summary())
        print(f"Total files found and processed: {result.total_files}")
        print(f"Success rate: {result.success_rate:.1f}%")


        if result.successful_files:
-            print(f"\nSuccessfully processed:")
+            print("\nSuccessfully processed:")
            for file_path in result.successful_files:
                relative_path = Path(file_path).relative_to(temp_dir)
                print(f" ✅ {relative_path}")


        return result


    except Exception as e:
        print(f"❌ Directory processing demonstration failed: {str(e)}")
        return None
@@ -395,26 +402,26 @@ def demonstrate_error_handling():
    print("\n" + "=" * 60)
    print("ERROR HANDLING DEMONSTRATION")
    print("=" * 60)


    temp_dir = Path(tempfile.mkdtemp())


    # Create files with various issues
    files_with_issues = {
        "valid_file.txt": "This is a valid file that should process successfully.",
        "empty_file.txt": "",  # Empty file
        "large_file.txt": "x" * 1000000,  # Large file (1MB of 'x')
    }


    created_files = []
    for filename, content in files_with_issues.items():
        file_path = temp_dir / filename
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        created_files.append(str(file_path))


    # Add a non-existent file to the list
    created_files.append(str(temp_dir / "non_existent_file.txt"))


    try:
        print(f"Testing error handling with {len(created_files)} files:")
        for file_path in created_files:
@@ -422,54 +429,56 @@ def demonstrate_error_handling():
            exists = Path(file_path).exists()
            size = Path(file_path).stat().st_size if exists else 0
            print(f" - {name}: {'exists' if exists else 'missing'}, {size} bytes")


        # Create batch parser with short timeout for demonstration
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            timeout_per_file=30,  # Short timeout for demo
-            skip_installation_check=True
+            skip_installation_check=True,
        )


        # Process files and handle errors
        result = batch_parser.process_batch(
            file_paths=created_files,
            output_dir=str(temp_dir / "error_test_output"),
-            parse_method="auto"
+            parse_method="auto",
        )


        print("\n" + "-" * 40)
        print("ERROR HANDLING RESULTS")
        print("-" * 40)
        print(result.summary())


        if result.successful_files:
-            print(f"\nSuccessful files:")
+            print("\nSuccessful files:")
            for file_path in result.successful_files:
                print(f" ✅ {Path(file_path).name}")


        if result.failed_files:
-            print(f"\nFailed files with error details:")
+            print("\nFailed files with error details:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f" ❌ {Path(file_path).name}: {error}")


        # Demonstrate retry logic
        if result.failed_files:
-            print(f"\nDemonstrating retry logic for {len(result.failed_files)} failed files...")
+            print(
+                f"\nDemonstrating retry logic for {len(result.failed_files)} failed files..."
+            )

            # Retry only the failed files
            retry_result = batch_parser.process_batch(
                file_paths=result.failed_files,
                output_dir=str(temp_dir / "retry_output"),
-                parse_method="auto"
+                parse_method="auto",
            )


            print(f"Retry results: {retry_result.summary()}")


        return result


    except Exception as e:
        print(f"❌ Error handling demonstration failed: {str(e)}")
        return None
@@ -480,9 +489,9 @@ async def main():
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )


    print("RAG-Anything Batch Processing Demonstration")
    print("=" * 70)
    print("This example demonstrates various batch processing capabilities:")
@@ -491,41 +500,43 @@ async def main():
    print(" - Integration with RAG-Anything pipeline")
    print(" - Directory processing with recursive file discovery")
    print(" - Comprehensive error handling and recovery")


    results = {}


    # Run demonstrations
    print("\n🚀 Starting demonstrations...")


    # Basic batch processing
-    results['basic'] = demonstrate_basic_batch_processing()
+    results["basic"] = demonstrate_basic_batch_processing()

    # Asynchronous processing
-    results['async'] = await demonstrate_async_batch_processing()
+    results["async"] = await demonstrate_async_batch_processing()

    # RAG integration
-    results['rag'] = await demonstrate_rag_integration()
+    results["rag"] = await demonstrate_rag_integration()

    # Directory processing
-    results['directory'] = demonstrate_directory_processing()
+    results["directory"] = demonstrate_directory_processing()

    # Error handling
-    results['error_handling'] = demonstrate_error_handling()
+    results["error_handling"] = demonstrate_error_handling()

    # Summary
    print("\n" + "=" * 70)
    print("DEMONSTRATION SUMMARY")
    print("=" * 70)


    for demo_name, result in results.items():
        if result:
-            if hasattr(result, 'success_rate'):
-                print(f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate")
+            if hasattr(result, "success_rate"):
+                print(
+                    f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate"
+                )
            else:
                print(f"✅ {demo_name.upper()}: Completed successfully")
        else:
            print(f"❌ {demo_name.upper()}: Failed or had limitations")


    print("\n📊 Key Features Demonstrated:")
    print(" - Parallel document processing with configurable worker counts")
    print(" - Real-time progress tracking with tqdm progress bars")
@@ -536,7 +547,7 @@ async def main():
    print(" - Integration with RAG-Anything document pipeline")
    print(" - Retry logic for failed documents")
    print(" - Detailed processing statistics and timing")


    print("\n💡 Best Practices Highlighted:")
    print(" - Use appropriate worker counts for your system")
    print(" - Enable progress tracking for long-running operations")
@@ -547,4 +558,4 @@ async def main():


if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())

@@ -20,6 +20,7 @@ import tempfile

# Add project root directory to Python path
import sys

sys.path.append(str(Path(__file__).parent.parent))

from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
@@ -27,7 +28,7 @@ from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownCon

def create_sample_markdown_content():
    """Create comprehensive sample markdown content for testing"""


    # Basic sample
    basic_content = """# Basic Markdown Sample

@@ -136,7 +137,7 @@ converter:

### Processing Times
- **Small documents** (< 10 pages): 1-3 seconds
-- **Medium documents** (10-50 pages): 3-10 seconds
+- **Medium documents** (10-50 pages): 3-10 seconds
- **Large documents** (> 50 pages): 10-30 seconds

## Advanced Features
@@ -200,17 +201,17 @@ The enhanced markdown conversion system provides professional-quality PDF genera

---

-*Generated on: 2024-01-15*
+*Generated on: 2024-01-15*
*Version: 1.0.0*
"""

    # Academic paper sample
    academic_content = """# Research Paper: Advanced Document Processing

-**Authors:** Alice Johnson¹, Bob Smith², Carol Williams¹
-**Affiliations:**
-¹ University of Technology
-² Research Institute
+**Authors:** Alice Johnson¹, Bob Smith², Carol Williams¹
+**Affiliations:**
+¹ University of Technology
+² Research Institute

## Abstract

@@ -294,18 +295,18 @@ Both backends support syntax highlighting through Pygments:
def analyze_performance(backend, documents):
    '''Analyze conversion performance for given backend'''
    results = []


    for doc in documents:
        start_time = time.time()
        success = backend.convert(doc)
        end_time = time.time()


        results.append({
            'document': doc,
            'time': end_time - start_time,
            'success': success
        })


    return results
```

@@ -356,15 +357,15 @@ This research demonstrates that enhanced markdown conversion provides significan

---

-**Manuscript received:** January 10, 2024
-**Accepted for publication:** January 15, 2024
+**Manuscript received:** January 10, 2024
+**Accepted for publication:** January 15, 2024
**Published online:** January 20, 2024
"""

    return {
        "basic": basic_content,
        "technical": technical_content,
-        "academic": academic_content
+        "academic": academic_content,
    }


@@ -373,11 +374,11 @@ def demonstrate_basic_conversion():
    print("\n" + "=" * 60)
    print("BASIC MARKDOWN CONVERSION DEMONSTRATION")
    print("=" * 60)


    try:
        # Create converter with default settings
        converter = EnhancedMarkdownConverter()


        # Show backend information
        backend_info = converter.get_backend_info()
        print("Available conversion backends:")
@@ -385,32 +386,32 @@ def demonstrate_basic_conversion():
            status = "✅" if available else "❌"
            print(f" {status} {backend}")
        print(f"Recommended backend: {backend_info['recommended_backend']}")


        # Get sample content
        samples = create_sample_markdown_content()
        temp_dir = Path(tempfile.mkdtemp())


        # Convert basic sample
        basic_md_path = temp_dir / "basic_sample.md"
-        with open(basic_md_path, 'w', encoding='utf-8') as f:
-            f.write(samples['basic'])
+        with open(basic_md_path, "w", encoding="utf-8") as f:
+            f.write(samples["basic"])

        print(f"\nConverting basic sample: {basic_md_path}")


        success = converter.convert_file_to_pdf(
            input_path=str(basic_md_path),
            output_path=str(temp_dir / "basic_sample.pdf"),
-            method="auto"  # Let the system choose the best backend
+            method="auto",  # Let the system choose the best backend
        )


        if success:
            print("✅ Basic conversion successful!")
            print(f" Output: {temp_dir / 'basic_sample.pdf'}")
        else:
            print("❌ Basic conversion failed")


        return success, temp_dir


    except Exception as e:
        print(f"❌ Basic conversion demonstration failed: {str(e)}")
        return False, None
@@ -421,71 +422,76 @@ def demonstrate_backend_comparison():
    print("\n" + "=" * 60)
    print("BACKEND COMPARISON DEMONSTRATION")
    print("=" * 60)


    try:
        samples = create_sample_markdown_content()
        temp_dir = Path(tempfile.mkdtemp())


        # Create technical document
        tech_md_path = temp_dir / "technical.md"
-        with open(tech_md_path, 'w', encoding='utf-8') as f:
-            f.write(samples['technical'])
-
-        print(f"Testing different backends with technical document...")
+        with open(tech_md_path, "w", encoding="utf-8") as f:
+            f.write(samples["technical"])
+
+        print("Testing different backends with technical document...")

        # Test different backends
        backends = ["auto", "weasyprint", "pandoc"]
        results = {}


        for backend in backends:
            try:
                print(f"\nTesting {backend} backend...")


                converter = EnhancedMarkdownConverter()
                output_path = temp_dir / f"technical_{backend}.pdf"


                import time

                start_time = time.time()


                success = converter.convert_file_to_pdf(
                    input_path=str(tech_md_path),
                    output_path=str(output_path),
-                    method=backend
+                    method=backend,
                )


                end_time = time.time()
                conversion_time = end_time - start_time


                if success:
-                    file_size = output_path.stat().st_size if output_path.exists() else 0
-                    print(f" ✅ {backend}: Success in {conversion_time:.2f}s, {file_size} bytes")
+                    file_size = (
+                        output_path.stat().st_size if output_path.exists() else 0
+                    )
+                    print(
+                        f" ✅ {backend}: Success in {conversion_time:.2f}s, {file_size} bytes"
+                    )
                    results[backend] = {
-                        'success': True,
-                        'time': conversion_time,
-                        'size': file_size,
-                        'output': str(output_path)
+                        "success": True,
+                        "time": conversion_time,
+                        "size": file_size,
+                        "output": str(output_path),
                    }
                else:
                    print(f" ❌ {backend}: Failed")
-                    results[backend] = {'success': False, 'time': conversion_time}
+                    results[backend] = {"success": False, "time": conversion_time}

            except Exception as e:
                print(f" ❌ {backend}: Error - {str(e)}")
-                results[backend] = {'success': False, 'error': str(e)}
+                results[backend] = {"success": False, "error": str(e)}

        # Summary
        print("\n" + "-" * 40)
        print("BACKEND COMPARISON SUMMARY")
        print("-" * 40)
-        successful_backends = [b for b, r in results.items() if r.get('success', False)]
+        successful_backends = [b for b, r in results.items() if r.get("success", False)]
        print(f"Successful backends: {successful_backends}")


        if successful_backends:
-            fastest = min(successful_backends, key=lambda b: results[b]['time'])
+            fastest = min(successful_backends, key=lambda b: results[b]["time"])
            print(f"Fastest backend: {fastest} ({results[fastest]['time']:.2f}s)")


        return results, temp_dir


    except Exception as e:
        print(f"❌ Backend comparison demonstration failed: {str(e)}")
        return None, None
@@ -496,11 +502,11 @@ def demonstrate_custom_styling():
    print("\n" + "=" * 60)
    print("CUSTOM STYLING DEMONSTRATION")
    print("=" * 60)


    try:
        samples = create_sample_markdown_content()
        temp_dir = Path(tempfile.mkdtemp())


        # Create custom CSS
        custom_css = """
        body {
@@ -512,7 +518,7 @@ def demonstrate_custom_styling():
            margin: 0 auto;
            padding: 20px;
        }


        h1 {
            color: #c0392b;
            font-size: 2.2em;
@@ -520,7 +526,7 @@ def demonstrate_custom_styling():
            padding-bottom: 0.5em;
            margin-top: 2em;
        }


        h2 {
            color: #8e44ad;
            font-size: 1.6em;
@@ -528,13 +534,13 @@ def demonstrate_custom_styling():
            padding-bottom: 0.3em;
            margin-top: 1.5em;
        }


        h3 {
            color: #2980b9;
            font-size: 1.3em;
            margin-top: 1.2em;
        }


        code {
            background-color: #ecf0f1;
            color: #e74c3c;
@@ -543,7 +549,7 @@ def demonstrate_custom_styling():
            font-family: 'Courier New', monospace;
            font-size: 0.9em;
        }


        pre {
            background-color: #2c3e50;
            color: #ecf0f1;
@@ -553,13 +559,13 @@ def demonstrate_custom_styling():
            overflow-x: auto;
            font-size: 0.9em;
        }


        pre code {
            background-color: transparent;
            color: inherit;
            padding: 0;
        }


        blockquote {
            background-color: #f8f9fa;
            border-left: 5px solid #3498db;
@@ -568,7 +574,7 @@ def demonstrate_custom_styling():
            font-style: italic;
            color: #555;
        }


        table {
            border-collapse: collapse;
            width: 100%;
@@ -578,7 +584,7 @@ def demonstrate_custom_styling():
            overflow: hidden;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }


        th {
            background-color: #3498db;
            color: white;
@@ -586,41 +592,41 @@ def demonstrate_custom_styling():
            text-align: left;
            font-weight: bold;
        }


        td {
            padding: 10px 15px;
            border-bottom: 1px solid #ecf0f1;
        }


        tr:nth-child(even) {
            background-color: #f8f9fa;
        }


        tr:hover {
            background-color: #e8f4fd;
        }


        ul, ol {
            margin-bottom: 1em;
            padding-left: 2em;
        }


        li {
            margin-bottom: 0.5em;
            line-height: 1.6;
        }


        a {
            color: #3498db;
            text-decoration: none;
            border-bottom: 1px dotted #3498db;
        }


        a:hover {
            color: #2980b9;
            border-bottom: 1px solid #2980b9;
        }


        .toc {
            background-color: #f8f9fa;
            border: 2px solid #e9ecef;
@@ -628,29 +634,29 @@ def demonstrate_custom_styling():
            padding: 20px;
            margin: 2em 0;
        }


        .toc h2 {
            color: #2c3e50;
            margin-top: 0;
            border-bottom: none;
        }


        .toc ul {
            list-style-type: none;
            padding-left: 0;
        }


        .toc li {
            margin-bottom: 0.8em;
        }


        .toc a {
            color: #2c3e50;
            font-weight: 500;
            border-bottom: none;
        }
        """


        # Create custom configuration
        config = MarkdownConfig(
            page_size="A4",
@@ -659,16 +665,16 @@ def demonstrate_custom_styling():
            line_height="1.4",
            include_toc=True,
            syntax_highlighting=True,
-            custom_css=custom_css
+            custom_css=custom_css,
        )


        converter = EnhancedMarkdownConverter(config)


        # Convert academic sample with custom styling
        academic_md_path = temp_dir / "academic_styled.md"
-        with open(academic_md_path, 'w', encoding='utf-8') as f:
-            f.write(samples['academic'])
+        with open(academic_md_path, "w", encoding="utf-8") as f:
+            f.write(samples["academic"])

        print("Converting academic paper with custom styling...")
        print("Custom styling features:")
        print(" - Custom color scheme (reds, purples, blues)")
@@ -677,32 +683,32 @@ def demonstrate_custom_styling():
        print(" - Styled code blocks with dark theme")
        print(" - Custom blockquote styling")
        print(" - Professional header styling")


        success = converter.convert_file_to_pdf(
            input_path=str(academic_md_path),
            output_path=str(temp_dir / "academic_styled.pdf"),
-            method="weasyprint"  # WeasyPrint is best for custom CSS
+            method="weasyprint",  # WeasyPrint is best for custom CSS
        )


        if success:
            print("✅ Custom styling conversion successful!")
            print(f" Output: {temp_dir / 'academic_styled.pdf'}")


            # Also create a default version for comparison
            default_converter = EnhancedMarkdownConverter()
            default_success = default_converter.convert_file_to_pdf(
                input_path=str(academic_md_path),
                output_path=str(temp_dir / "academic_default.pdf"),
-                method="weasyprint"
+                method="weasyprint",
            )


            if default_success:
                print(f" Comparison (default): {temp_dir / 'academic_default.pdf'}")
        else:
            print("❌ Custom styling conversion failed")


        return success, temp_dir


    except Exception as e:
        print(f"❌ Custom styling demonstration failed: {str(e)}")
        return False, None
@@ -713,7 +719,7 @@ def demonstrate_content_conversion():
    print("\n" + "=" * 60)
    print("CONTENT CONVERSION DEMONSTRATION")
    print("=" * 60)


    try:
        # Create markdown content programmatically
        dynamic_content = f"""# Dynamic Content Example
@@ -762,38 +768,38 @@ Direct content conversion is useful for:
- API-based document services
- Real-time content processing
"""


        temp_dir = Path(tempfile.mkdtemp())
        converter = EnhancedMarkdownConverter()


        print("Converting dynamically generated markdown content...")
        print("Content includes:")
        print(" - System information")
        print(" - Dynamic tables with current values")
        print(" - Generated timestamps")
        print(" - Programmatic examples")


        # Convert content directly to PDF
        output_path = temp_dir / "dynamic_content.pdf"


        success = converter.convert_markdown_to_pdf(
            markdown_content=dynamic_content,
            output_path=str(output_path),
-            method="auto"
+            method="auto",
        )


        if success:
            print("✅ Content conversion successful!")
            print(f" Output: {output_path}")


            # Show file size
            file_size = output_path.stat().st_size
            print(f" Generated PDF size: {file_size} bytes")
        else:
            print("❌ Content conversion failed")


        return success, temp_dir


    except Exception as e:
        print(f"❌ Content conversion demonstration failed: {str(e)}")
        return False, None
@@ -804,14 +810,14 @@ def demonstrate_error_handling():
    print("\n" + "=" * 60)
    print("ERROR HANDLING DEMONSTRATION")
    print("=" * 60)


    try:
        temp_dir = Path(tempfile.mkdtemp())


        # Test cases with various issues
        test_cases = {
            "invalid_markdown": """# Invalid Markdown


This markdown has some {{invalid}} syntax and [broken links](http://nonexistent.invalid).

```unknown_language
@@ -841,98 +847,106 @@ Symbols: ♠ ♣ ♥ ♦ ☀ ☁ ☂ ☃ ☄ ★ ☆ ☉ ☊ ☋ ☌ ☍ ☎ ☏
Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
""",
            "empty_content": "",
-            "minimal_content": "# Just a title"
+            "minimal_content": "# Just a title",
        }


        print("Testing error handling with various content types...")


        results = {}


        for test_name, content in test_cases.items():
            print(f"\nTesting: {test_name}")


            try:
                # Try multiple backends for each test case
                for backend in ["auto", "weasyprint", "pandoc"]:
                    try:
                        converter = EnhancedMarkdownConverter()
                        output_path = temp_dir / f"{test_name}_{backend}.pdf"


                        success = converter.convert_markdown_to_pdf(
                            markdown_content=content,
                            output_path=str(output_path),
-                            method=backend
+                            method=backend,
                        )


                        if success:
-                            file_size = output_path.stat().st_size if output_path.exists() else 0
+                            file_size = (
+                                output_path.stat().st_size
+                                if output_path.exists()
+                                else 0
+                            )
                            print(f" ✅ {backend}: Success ({file_size} bytes)")
                            results[f"{test_name}_{backend}"] = {
-                                'success': True,
-                                'size': file_size
+                                "success": True,
+                                "size": file_size,
                            }
                        else:
                            print(f" ❌ {backend}: Failed")
-                            results[f"{test_name}_{backend}"] = {'success': False}
+                            results[f"{test_name}_{backend}"] = {"success": False}

                    except Exception as e:
                        print(f" ❌ {backend}: Error - {str(e)[:60]}...")
                        results[f"{test_name}_{backend}"] = {
-                            'success': False,
-                            'error': str(e)
+                            "success": False,
+                            "error": str(e),
                        }


            except Exception as e:
                print(f" ❌ Test case failed: {str(e)}")


        # Demonstrate robust conversion with fallbacks
-        print(f"\nDemonstrating robust conversion with fallback logic...")
+        print("\nDemonstrating robust conversion with fallback logic...")

        def robust_convert(content, output_path):
            """Convert with multiple backend fallbacks"""
            backends = ["weasyprint", "pandoc", "auto"]


            for backend in backends:
                try:
                    converter = EnhancedMarkdownConverter()
                    success = converter.convert_markdown_to_pdf(
                        markdown_content=content,
                        output_path=output_path,
-                        method=backend
+                        method=backend,
                    )
                    if success:
                        return backend, True
                except Exception:
                    continue


            return None, False


        # Test robust conversion
        test_content = test_cases["complex_content"]
        robust_output = temp_dir / "robust_conversion.pdf"


        successful_backend, success = robust_convert(test_content, str(robust_output))


        if success:
            print(f"✅ Robust conversion successful using {successful_backend}")
            print(f" Output: {robust_output}")
        else:
            print("❌ All backends failed for robust conversion")


        # Summary
        print("\n" + "-" * 40)
        print("ERROR HANDLING SUMMARY")
        print("-" * 40)
-        successful_conversions = sum(1 for r in results.values() if r.get('success', False))
+        successful_conversions = sum(
+            1 for r in results.values() if r.get("success", False)
+        )
        total_attempts = len(results)
-        success_rate = (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0
+        success_rate = (
+            (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0
+        )

        print(f"Total conversion attempts: {total_attempts}")
        print(f"Successful conversions: {successful_conversions}")
        print(f"Success rate: {success_rate:.1f}%")


        return results, temp_dir


    except Exception as e:
        print(f"❌ Error handling demonstration failed: {str(e)}")
        return None, None
@@ -943,62 +957,72 @@ def main():
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )


    print("RAG-Anything Enhanced Markdown Conversion Demonstration")
    print("=" * 70)
-    print("This example demonstrates various enhanced markdown conversion capabilities:")
+    print(
+        "This example demonstrates various enhanced markdown conversion capabilities:"
+    )
    print(" - Basic markdown to PDF conversion")
    print(" - Multiple backend comparison (WeasyPrint vs Pandoc)")
    print(" - Custom CSS styling and professional formatting")
    print(" - Direct content conversion without file I/O")
    print(" - Comprehensive error handling and fallback mechanisms")


    results = {}


    # Run demonstrations
    print("\n🚀 Starting demonstrations...")


    # Basic conversion
    success, temp_dir = demonstrate_basic_conversion()
-    results['basic'] = success
+    results["basic"] = success

    # Backend comparison
    backend_results, _ = demonstrate_backend_comparison()
-    results['backends'] = backend_results
+    results["backends"] = backend_results

    # Custom styling
    styling_success, _ = demonstrate_custom_styling()
-    results['styling'] = styling_success
+    results["styling"] = styling_success

    # Content conversion
    content_success, _ = demonstrate_content_conversion()
-    results['content'] = content_success
+    results["content"] = content_success

    # Error handling
    error_results, _ = demonstrate_error_handling()
-    results['error_handling'] = error_results
+    results["error_handling"] = error_results

    # Summary
    print("\n" + "=" * 70)
    print("DEMONSTRATION SUMMARY")
    print("=" * 70)


    print("✅ Features Successfully Demonstrated:")
-    if results['basic']:
+    if results["basic"]:
        print(" - Basic markdown to PDF conversion")
-    if results['backends']:
-        successful_backends = [b for b, r in results['backends'].items() if r.get('success', False)]
+    if results["backends"]:
+        successful_backends = [
+            b for b, r in results["backends"].items() if r.get("success", False)
+        ]
        print(f" - Multiple backends: {successful_backends}")
-    if results['styling']:
+    if results["styling"]:
        print(" - Custom CSS styling and professional formatting")
-    if results['content']:
+    if results["content"]:
        print(" - Direct content conversion without file I/O")
-    if results['error_handling']:
-        success_rate = sum(1 for r in results['error_handling'].values() if r.get('success', False)) / len(results['error_handling']) * 100
+    if results["error_handling"]:
+        success_rate = (
+            sum(
+                1 for r in results["error_handling"].values() if r.get("success", False)
+            )
+            / len(results["error_handling"])
+            * 100
+        )
        print(f" - Error handling with {success_rate:.1f}% overall success rate")


    print("\n📊 Key Capabilities Highlighted:")
    print(" - Professional PDF generation with high-quality typography")
    print(" - Multiple conversion backends with automatic selection")
@@ -1008,7 +1032,7 @@ def main():
    print(" - Image embedding with proper scaling")
    print(" - Table of contents generation with navigation")
    print(" - Comprehensive error handling and fallback mechanisms")


    print("\n💡 Best Practices Demonstrated:")
    print(" - Choose WeasyPrint for web-style documents and custom CSS")
    print(" - Choose Pandoc for academic papers and complex formatting")
@@ -1018,7 +1042,7 @@ def main():
    print(" - Test custom CSS with simple content first")
    print(" - Handle errors gracefully with multiple backend attempts")
    print(" - Use appropriate page sizes and margins for target use case")


    print("\n🎯 Integration Patterns:")
    print(" - Standalone conversion for document generation")
    print(" - Integration with RAG-Anything document pipeline")
@@ -1028,4 +1052,4 @@ def main():


if __name__ == "__main__":
-    main()
+    main()