This commit is contained in:
zrguo
2025-07-29 17:07:15 +08:00
parent 4f900db761
commit 5e56140300
5 changed files with 365 additions and 330 deletions

View File

@@ -21,6 +21,7 @@ import time
# Add project root directory to Python path
import sys
sys.path.append(str(Path(__file__).parent.parent))
from raganything import RAGAnything, RAGAnythingConfig
@@ -92,13 +93,13 @@ Batch processing is essential for large-scale document processing.
### Next Steps
Continue development and testing of batch processing features.
"""
""",
}
# Create files
for filename, content in documents.items():
file_path = temp_dir / filename
with open(file_path, 'w', encoding='utf-8') as f:
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
sample_files.append(str(file_path))
@@ -125,14 +126,14 @@ def demonstrate_basic_batch_processing():
max_workers=3,
show_progress=True,
timeout_per_file=60,
skip_installation_check=True # Skip installation check for demo
skip_installation_check=True, # Skip installation check for demo
)
print(f"\nBatch parser configured:")
print(f" - Parser type: mineru")
print(f" - Max workers: 3")
print(f" - Progress tracking: enabled")
print(f" - Timeout per file: 60 seconds")
print("\nBatch parser configured:")
print(" - Parser type: mineru")
print(" - Max workers: 3")
print(" - Progress tracking: enabled")
print(" - Timeout per file: 60 seconds")
# Check supported extensions
supported_extensions = batch_parser.get_supported_extensions()
@@ -140,13 +141,13 @@ def demonstrate_basic_batch_processing():
# Filter files to supported types
supported_files = batch_parser.filter_supported_files(sample_files)
print(f"\nFile filtering results:")
print("\nFile filtering results:")
print(f" - Total files: {len(sample_files)}")
print(f" - Supported files: {len(supported_files)}")
# Process batch
output_dir = temp_dir / "batch_output"
print(f"\nStarting batch processing...")
print("\nStarting batch processing...")
print(f"Output directory: {output_dir}")
start_time = time.time()
@@ -154,7 +155,7 @@ def demonstrate_basic_batch_processing():
file_paths=supported_files,
output_dir=str(output_dir),
parse_method="auto",
recursive=False
recursive=False,
)
processing_time = time.time() - start_time
@@ -167,12 +168,12 @@ def demonstrate_basic_batch_processing():
print(f"Success rate: {result.success_rate:.1f}%")
if result.successful_files:
print(f"\nSuccessfully processed files:")
print("\nSuccessfully processed files:")
for file_path in result.successful_files:
print(f"{Path(file_path).name}")
if result.failed_files:
print(f"\nFailed files:")
print("\nFailed files:")
for file_path in result.failed_files:
error = result.errors.get(file_path, "Unknown error")
print(f"{Path(file_path).name}: {error}")
@@ -201,7 +202,7 @@ async def demonstrate_async_batch_processing():
parser_type="mineru",
max_workers=2,
show_progress=True,
skip_installation_check=True
skip_installation_check=True,
)
# Process batch asynchronously
@@ -212,7 +213,7 @@ async def demonstrate_async_batch_processing():
file_paths=sample_files,
output_dir=str(output_dir),
parse_method="auto",
recursive=False
recursive=False,
)
processing_time = time.time() - start_time
@@ -247,7 +248,7 @@ async def demonstrate_rag_integration():
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
max_concurrent_files=2
max_concurrent_files=2,
)
rag = RAGAnything(config=config)
@@ -255,7 +256,7 @@ async def demonstrate_rag_integration():
print("RAG-Anything initialized with batch processing capabilities")
# Show available batch methods
batch_methods = [method for method in dir(rag) if 'batch' in method.lower()]
batch_methods = [method for method in dir(rag) if "batch" in method.lower()]
print(f"Available batch methods: {batch_methods}")
# Demonstrate batch processing with RAG integration
@@ -268,7 +269,7 @@ async def demonstrate_rag_integration():
file_paths=sample_files,
output_dir=str(temp_dir / "rag_batch_output"),
max_workers=2,
show_progress=True
show_progress=True,
)
print("\n" + "-" * 40)
@@ -278,28 +279,34 @@ async def demonstrate_rag_integration():
print(f"Success rate: {result.success_rate:.1f}%")
# Demonstrate batch processing with full RAG integration
print(f"\nProcessing documents with full RAG integration...")
print("\nProcessing documents with full RAG integration...")
rag_result = await rag.process_documents_with_rag_batch(
file_paths=sample_files[:2], # Process subset for demo
output_dir=str(temp_dir / "rag_full_output"),
max_workers=1,
show_progress=True
show_progress=True,
)
print("\n" + "-" * 40)
print("FULL RAG INTEGRATION RESULTS")
print("-" * 40)
print(f"Parse result: {rag_result['parse_result'].summary()}")
print(f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds")
print(f"Successfully processed with RAG: {rag_result['successful_rag_files']}")
print(
f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds"
)
print(
f"Successfully processed with RAG: {rag_result['successful_rag_files']}"
)
print(f"Failed RAG processing: {rag_result['failed_rag_files']}")
return rag_result
except Exception as e:
print(f"⚠️ RAG integration demo completed with limitations: {str(e)}")
print("Note: This is expected in environments without full API configuration")
print(
"Note: This is expected in environments without full API configuration"
)
return None
except Exception as e:
@@ -319,7 +326,7 @@ def demonstrate_directory_processing():
# Create main directory files
main_files = {
"overview.txt": "Main directory overview document",
"readme.md": "# Project README\n\nThis is the main project documentation."
"readme.md": "# Project README\n\nThis is the main project documentation.",
}
# Create subdirectory
@@ -328,25 +335,25 @@ def demonstrate_directory_processing():
sub_files = {
"details.txt": "Detailed information in subdirectory",
"notes.md": "# Notes\n\nAdditional notes and information."
"notes.md": "# Notes\n\nAdditional notes and information.",
}
# Write all files
all_files = []
for filename, content in main_files.items():
file_path = temp_dir / filename
with open(file_path, 'w', encoding='utf-8') as f:
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
all_files.append(str(file_path))
for filename, content in sub_files.items():
file_path = sub_dir / filename
with open(file_path, 'w', encoding='utf-8') as f:
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
all_files.append(str(file_path))
try:
print(f"Created directory structure:")
print("Created directory structure:")
print(f" Main directory: {temp_dir}")
print(f" Files in main: {list(main_files.keys())}")
print(f" Subdirectory: {sub_dir}")
@@ -357,17 +364,17 @@ def demonstrate_directory_processing():
parser_type="mineru",
max_workers=2,
show_progress=True,
skip_installation_check=True
skip_installation_check=True,
)
# Process entire directory recursively
print(f"\nProcessing entire directory recursively...")
print("\nProcessing entire directory recursively...")
result = batch_parser.process_batch(
file_paths=[str(temp_dir)], # Pass directory path
output_dir=str(temp_dir / "directory_output"),
parse_method="auto",
recursive=True # Include subdirectories
recursive=True, # Include subdirectories
)
print("\n" + "-" * 40)
@@ -378,7 +385,7 @@ def demonstrate_directory_processing():
print(f"Success rate: {result.success_rate:.1f}%")
if result.successful_files:
print(f"\nSuccessfully processed:")
print("\nSuccessfully processed:")
for file_path in result.successful_files:
relative_path = Path(file_path).relative_to(temp_dir)
print(f"{relative_path}")
@@ -408,7 +415,7 @@ def demonstrate_error_handling():
created_files = []
for filename, content in files_with_issues.items():
file_path = temp_dir / filename
with open(file_path, 'w', encoding='utf-8') as f:
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
created_files.append(str(file_path))
@@ -429,14 +436,14 @@ def demonstrate_error_handling():
max_workers=2,
show_progress=True,
timeout_per_file=30, # Short timeout for demo
skip_installation_check=True
skip_installation_check=True,
)
# Process files and handle errors
result = batch_parser.process_batch(
file_paths=created_files,
output_dir=str(temp_dir / "error_test_output"),
parse_method="auto"
parse_method="auto",
)
print("\n" + "-" * 40)
@@ -445,25 +452,27 @@ def demonstrate_error_handling():
print(result.summary())
if result.successful_files:
print(f"\nSuccessful files:")
print("\nSuccessful files:")
for file_path in result.successful_files:
print(f"{Path(file_path).name}")
if result.failed_files:
print(f"\nFailed files with error details:")
print("\nFailed files with error details:")
for file_path in result.failed_files:
error = result.errors.get(file_path, "Unknown error")
print(f"{Path(file_path).name}: {error}")
# Demonstrate retry logic
if result.failed_files:
print(f"\nDemonstrating retry logic for {len(result.failed_files)} failed files...")
print(
f"\nDemonstrating retry logic for {len(result.failed_files)} failed files..."
)
# Retry only the failed files
retry_result = batch_parser.process_batch(
file_paths=result.failed_files,
output_dir=str(temp_dir / "retry_output"),
parse_method="auto"
parse_method="auto",
)
print(f"Retry results: {retry_result.summary()}")
@@ -480,7 +489,7 @@ async def main():
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
print("RAG-Anything Batch Processing Demonstration")
@@ -498,19 +507,19 @@ async def main():
print("\n🚀 Starting demonstrations...")
# Basic batch processing
results['basic'] = demonstrate_basic_batch_processing()
results["basic"] = demonstrate_basic_batch_processing()
# Asynchronous processing
results['async'] = await demonstrate_async_batch_processing()
results["async"] = await demonstrate_async_batch_processing()
# RAG integration
results['rag'] = await demonstrate_rag_integration()
results["rag"] = await demonstrate_rag_integration()
# Directory processing
results['directory'] = demonstrate_directory_processing()
results["directory"] = demonstrate_directory_processing()
# Error handling
results['error_handling'] = demonstrate_error_handling()
results["error_handling"] = demonstrate_error_handling()
# Summary
print("\n" + "=" * 70)
@@ -519,8 +528,10 @@ async def main():
for demo_name, result in results.items():
if result:
if hasattr(result, 'success_rate'):
print(f"{demo_name.upper()}: {result.success_rate:.1f}% success rate")
if hasattr(result, "success_rate"):
print(
f"{demo_name.upper()}: {result.success_rate:.1f}% success rate"
)
else:
print(f"{demo_name.upper()}: Completed successfully")
else:

View File

@@ -20,6 +20,7 @@ import tempfile
# Add project root directory to Python path
import sys
sys.path.append(str(Path(__file__).parent.parent))
from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
@@ -364,7 +365,7 @@ This research demonstrates that enhanced markdown conversion provides significan
return {
"basic": basic_content,
"technical": technical_content,
"academic": academic_content
"academic": academic_content,
}
@@ -392,15 +393,15 @@ def demonstrate_basic_conversion():
# Convert basic sample
basic_md_path = temp_dir / "basic_sample.md"
with open(basic_md_path, 'w', encoding='utf-8') as f:
f.write(samples['basic'])
with open(basic_md_path, "w", encoding="utf-8") as f:
f.write(samples["basic"])
print(f"\nConverting basic sample: {basic_md_path}")
success = converter.convert_file_to_pdf(
input_path=str(basic_md_path),
output_path=str(temp_dir / "basic_sample.pdf"),
method="auto" # Let the system choose the best backend
method="auto", # Let the system choose the best backend
)
if success:
@@ -428,10 +429,10 @@ def demonstrate_backend_comparison():
# Create technical document
tech_md_path = temp_dir / "technical.md"
with open(tech_md_path, 'w', encoding='utf-8') as f:
f.write(samples['technical'])
with open(tech_md_path, "w", encoding="utf-8") as f:
f.write(samples["technical"])
print(f"Testing different backends with technical document...")
print("Testing different backends with technical document...")
# Test different backends
backends = ["auto", "weasyprint", "pandoc"]
@@ -445,43 +446,48 @@ def demonstrate_backend_comparison():
output_path = temp_dir / f"technical_{backend}.pdf"
import time
start_time = time.time()
success = converter.convert_file_to_pdf(
input_path=str(tech_md_path),
output_path=str(output_path),
method=backend
method=backend,
)
end_time = time.time()
conversion_time = end_time - start_time
if success:
file_size = output_path.stat().st_size if output_path.exists() else 0
print(f"{backend}: Success in {conversion_time:.2f}s, {file_size} bytes")
file_size = (
output_path.stat().st_size if output_path.exists() else 0
)
print(
f"{backend}: Success in {conversion_time:.2f}s, {file_size} bytes"
)
results[backend] = {
'success': True,
'time': conversion_time,
'size': file_size,
'output': str(output_path)
"success": True,
"time": conversion_time,
"size": file_size,
"output": str(output_path),
}
else:
print(f"{backend}: Failed")
results[backend] = {'success': False, 'time': conversion_time}
results[backend] = {"success": False, "time": conversion_time}
except Exception as e:
print(f"{backend}: Error - {str(e)}")
results[backend] = {'success': False, 'error': str(e)}
results[backend] = {"success": False, "error": str(e)}
# Summary
print("\n" + "-" * 40)
print("BACKEND COMPARISON SUMMARY")
print("-" * 40)
successful_backends = [b for b, r in results.items() if r.get('success', False)]
successful_backends = [b for b, r in results.items() if r.get("success", False)]
print(f"Successful backends: {successful_backends}")
if successful_backends:
fastest = min(successful_backends, key=lambda b: results[b]['time'])
fastest = min(successful_backends, key=lambda b: results[b]["time"])
print(f"Fastest backend: {fastest} ({results[fastest]['time']:.2f}s)")
return results, temp_dir
@@ -659,15 +665,15 @@ def demonstrate_custom_styling():
line_height="1.4",
include_toc=True,
syntax_highlighting=True,
custom_css=custom_css
custom_css=custom_css,
)
converter = EnhancedMarkdownConverter(config)
# Convert academic sample with custom styling
academic_md_path = temp_dir / "academic_styled.md"
with open(academic_md_path, 'w', encoding='utf-8') as f:
f.write(samples['academic'])
with open(academic_md_path, "w", encoding="utf-8") as f:
f.write(samples["academic"])
print("Converting academic paper with custom styling...")
print("Custom styling features:")
@@ -681,7 +687,7 @@ def demonstrate_custom_styling():
success = converter.convert_file_to_pdf(
input_path=str(academic_md_path),
output_path=str(temp_dir / "academic_styled.pdf"),
method="weasyprint" # WeasyPrint is best for custom CSS
method="weasyprint", # WeasyPrint is best for custom CSS
)
if success:
@@ -693,7 +699,7 @@ def demonstrate_custom_styling():
default_success = default_converter.convert_file_to_pdf(
input_path=str(academic_md_path),
output_path=str(temp_dir / "academic_default.pdf"),
method="weasyprint"
method="weasyprint",
)
if default_success:
@@ -779,7 +785,7 @@ Direct content conversion is useful for:
success = converter.convert_markdown_to_pdf(
markdown_content=dynamic_content,
output_path=str(output_path),
method="auto"
method="auto",
)
if success:
@@ -841,7 +847,7 @@ Symbols: ♠ ♣ ♥ ♦ ☀ ☁ ☂ ☃ ☄ ★ ☆ ☉ ☊ ☋ ☌ ☍ ☎ ☏
Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
""",
"empty_content": "",
"minimal_content": "# Just a title"
"minimal_content": "# Just a title",
}
print("Testing error handling with various content types...")
@@ -861,32 +867,36 @@ Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
success = converter.convert_markdown_to_pdf(
markdown_content=content,
output_path=str(output_path),
method=backend
method=backend,
)
if success:
file_size = output_path.stat().st_size if output_path.exists() else 0
file_size = (
output_path.stat().st_size
if output_path.exists()
else 0
)
print(f"{backend}: Success ({file_size} bytes)")
results[f"{test_name}_{backend}"] = {
'success': True,
'size': file_size
"success": True,
"size": file_size,
}
else:
print(f"{backend}: Failed")
results[f"{test_name}_{backend}"] = {'success': False}
results[f"{test_name}_{backend}"] = {"success": False}
except Exception as e:
print(f"{backend}: Error - {str(e)[:60]}...")
results[f"{test_name}_{backend}"] = {
'success': False,
'error': str(e)
"success": False,
"error": str(e),
}
except Exception as e:
print(f" ❌ Test case failed: {str(e)}")
# Demonstrate robust conversion with fallbacks
print(f"\nDemonstrating robust conversion with fallback logic...")
print("\nDemonstrating robust conversion with fallback logic...")
def robust_convert(content, output_path):
"""Convert with multiple backend fallbacks"""
@@ -898,7 +908,7 @@ Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
success = converter.convert_markdown_to_pdf(
markdown_content=content,
output_path=output_path,
method=backend
method=backend,
)
if success:
return backend, True
@@ -923,9 +933,13 @@ Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
print("\n" + "-" * 40)
print("ERROR HANDLING SUMMARY")
print("-" * 40)
successful_conversions = sum(1 for r in results.values() if r.get('success', False))
successful_conversions = sum(
1 for r in results.values() if r.get("success", False)
)
total_attempts = len(results)
success_rate = (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0
success_rate = (
(successful_conversions / total_attempts * 100) if total_attempts > 0 else 0
)
print(f"Total conversion attempts: {total_attempts}")
print(f"Successful conversions: {successful_conversions}")
@@ -943,12 +957,14 @@ def main():
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
print("RAG-Anything Enhanced Markdown Conversion Demonstration")
print("=" * 70)
print("This example demonstrates various enhanced markdown conversion capabilities:")
print(
"This example demonstrates various enhanced markdown conversion capabilities:"
)
print(" - Basic markdown to PDF conversion")
print(" - Multiple backend comparison (WeasyPrint vs Pandoc)")
print(" - Custom CSS styling and professional formatting")
@@ -962,23 +978,23 @@ def main():
# Basic conversion
success, temp_dir = demonstrate_basic_conversion()
results['basic'] = success
results["basic"] = success
# Backend comparison
backend_results, _ = demonstrate_backend_comparison()
results['backends'] = backend_results
results["backends"] = backend_results
# Custom styling
styling_success, _ = demonstrate_custom_styling()
results['styling'] = styling_success
results["styling"] = styling_success
# Content conversion
content_success, _ = demonstrate_content_conversion()
results['content'] = content_success
results["content"] = content_success
# Error handling
error_results, _ = demonstrate_error_handling()
results['error_handling'] = error_results
results["error_handling"] = error_results
# Summary
print("\n" + "=" * 70)
@@ -986,17 +1002,25 @@ def main():
print("=" * 70)
print("✅ Features Successfully Demonstrated:")
if results['basic']:
if results["basic"]:
print(" - Basic markdown to PDF conversion")
if results['backends']:
successful_backends = [b for b, r in results['backends'].items() if r.get('success', False)]
if results["backends"]:
successful_backends = [
b for b, r in results["backends"].items() if r.get("success", False)
]
print(f" - Multiple backends: {successful_backends}")
if results['styling']:
if results["styling"]:
print(" - Custom CSS styling and professional formatting")
if results['content']:
if results["content"]:
print(" - Direct content conversion without file I/O")
if results['error_handling']:
success_rate = sum(1 for r in results['error_handling'].values() if r.get('success', False)) / len(results['error_handling']) * 100
if results["error_handling"]:
success_rate = (
sum(
1 for r in results["error_handling"].values() if r.get("success", False)
)
/ len(results["error_handling"])
* 100
)
print(f" - Error handling with {success_rate:.1f}% overall success rate")
print("\n📊 Key Capabilities Highlighted:")