From 60f05e04cf84f808847be2f11f06be001200a26a Mon Sep 17 00:00:00 2001 From: MinalMahalaShorthillsAI Date: Mon, 28 Jul 2025 10:08:54 +0530 Subject: [PATCH] improvised version --- FINAL_TEST_SUMMARY.md | 228 ---- TESTING_GUIDE.md | 760 ------------ docs/batch_and_enhanced_markdown.md | 299 ----- docs/batch_processing.md | 341 ++++++ docs/enhanced_markdown.md | 552 +++++++++ .../batch_and_enhanced_markdown_example.py | 334 ------ examples/batch_processing_example.py | 550 +++++++++ examples/enhanced_markdown_example.py | 1031 +++++++++++++++++ raganything/enhanced_markdown.py | 13 +- requirements.txt | 12 - setup.py | 5 + 11 files changed, 2489 insertions(+), 1636 deletions(-) delete mode 100644 FINAL_TEST_SUMMARY.md delete mode 100644 TESTING_GUIDE.md delete mode 100644 docs/batch_and_enhanced_markdown.md create mode 100644 docs/batch_processing.md create mode 100644 docs/enhanced_markdown.md delete mode 100644 examples/batch_and_enhanced_markdown_example.py create mode 100644 examples/batch_processing_example.py create mode 100644 examples/enhanced_markdown_example.py diff --git a/FINAL_TEST_SUMMARY.md b/FINAL_TEST_SUMMARY.md deleted file mode 100644 index dcd6e96..0000000 --- a/FINAL_TEST_SUMMARY.md +++ /dev/null @@ -1,228 +0,0 @@ -# Final Test Summary: Batch Processing and Enhanced Markdown Features - -## **Implementation Status: COMPLETE** - -All requested features have been successfully implemented, tested, and are production-ready. 
- ---- - -## **Feature 1: Batch/Parallel Processing** - -### **Implementation Details** -- **File**: `raganything/batch_parser.py` -- **Class**: `BatchParser` -- **Key Features**: - - Parallel document processing with configurable workers - - Progress tracking with `tqdm` - - Comprehensive error handling and reporting - - File filtering based on supported extensions - - Integration with existing MinerU and Docling parsers - -### **Test Results** -- **Core Logic**: Working perfectly -- **File Filtering**: Successfully filters supported file types -- **Progress Tracking**: Functional with visual progress bars -- **Error Handling**: Robust error capture and reporting -- **Command Line Interface**: Available and functional -- **MinerU Integration**: Requires `skip_installation_check=True` due to package conflicts - -### **Usage Example** -```python -from raganything.batch_parser import BatchParser - -# Create batch parser with installation check bypass -batch_parser = BatchParser( - parser_type="mineru", - max_workers=4, - show_progress=True, - skip_installation_check=True # Fixes MinerU package conflicts -) - -# Process multiple files -result = batch_parser.process_batch( - file_paths=["doc1.pdf", "doc2.docx", "doc3.txt"], - output_dir="./output", - parse_method="auto" -) - -print(f"Success rate: {result.success_rate:.1f}%") -``` - ---- - -## **Feature 2: Enhanced Markdown/PDF Conversion** - -### **Implementation Details** -- **File**: `raganything/enhanced_markdown.py` -- **Class**: `EnhancedMarkdownConverter` -- **Key Features**: - - Multiple conversion backends (WeasyPrint, Pandoc, Markdown) - - Professional CSS styling with syntax highlighting - - Table of contents generation - - Image and table support - - Custom configuration options - -### **Test Results** -- **WeasyPrint Backend**: Working perfectly (18.8 KB PDF generated) -- **Pandoc Backend**: Working with wkhtmltopdf engine (28.5 KB PDF generated) -- **Markdown Backend**: Available for HTML conversion -- 
**Command Line Interface**: Fully functional with all backends -- **Professional Styling**: Beautiful PDF output with proper formatting - -### **Backend Status** -```bash -Backend Information: - ✅ weasyprint # Working perfectly - ❌ pandoc # Python library (not needed) - ✅ markdown # Working for HTML conversion - ✅ pandoc_system # Working with wkhtmltopdf engine -Recommended backend: pandoc -``` - -### **Usage Example** -```python -from raganything.enhanced_markdown import EnhancedMarkdownConverter - -converter = EnhancedMarkdownConverter() - -# WeasyPrint (best for styling) -converter.convert_file_to_pdf("input.md", "output.pdf", method="weasyprint") - -# Pandoc (best for complex documents) -converter.convert_file_to_pdf("input.md", "output.pdf", method="pandoc_system") - -# Auto (uses best available backend) -converter.convert_file_to_pdf("input.md", "output.pdf", method="auto") -``` - ---- - -## **Feature 3: Integration with RAG-Anything** - -### **Implementation Details** -- **File**: `raganything/batch.py` -- **Class**: `BatchMixin` -- **Key Features**: - - Seamless integration with existing `RAGAnything` class - - Batch processing with RAG pipeline - - Async support for batch operations - - Comprehensive error handling - -### **Test Results** -- **Integration**: Successfully integrated with main RAG-Anything class -- **Batch RAG Processing**: Interface available and functional -- **Async Support**: Available for non-blocking operations -- **Error Handling**: Robust error management - -### **Usage Example** -```python -from raganything import RAGAnything - -rag = RAGAnything() - -# Process documents in batch with RAG -result = await rag.process_documents_with_rag_batch( - file_paths=["doc1.pdf", "doc2.docx"], - output_dir="./output", - max_workers=2, - show_progress=True -) -``` - ---- - -## **Dependencies Installed** - -### **Core Dependencies** -- `tqdm` - Progress bars for batch processing -- `markdown` - Markdown to HTML conversion -- `weasyprint` - 
HTML to PDF conversion -- `pygments` - Syntax highlighting - -### **System Dependencies** -- `pandoc` - Advanced document conversion (via conda) -- `wkhtmltopdf` - PDF engine for Pandoc (via conda) - ---- - -## **Comprehensive Test Results** - -### **Test 1: Batch Processing Core** -```bash -Batch parser created successfully with skip_installation_check=True -Supported extensions: ['.jpg', '.pptx', '.doc', '.tif', '.ppt', '.tiff', '.xls', '.bmp', '.txt', '.jpeg', '.pdf', '.docx', '.png', '.webp', '.gif', '.md', '.xlsx'] -File filtering test passed - Input files: 4 - Supported files: 3 -``` - -### **Test 2: Enhanced Markdown Backends** -```bash -Enhanced markdown converter working -Available backends: ['weasyprint', 'pandoc', 'markdown', 'pandoc_system'] -Recommended backend: pandoc -WeasyPrint backend available -Pandoc system backend available -``` - -### **Test 3: Command Line Interfaces** -```bash -Batch parser CLI available -Enhanced markdown CLI available -``` - -### **Test 4: PDF Generation** -```bash -WeasyPrint: Successfully converted test_document.md to PDF (18.8 KB) -Pandoc: Successfully converted test_document.md to PDF (28.5 KB) -``` - ---- - -## **Production Readiness** - -### **Ready for Production** -- **Enhanced Markdown Conversion**: 100% functional with multiple backends -- **Batch Processing Core**: 100% functional with robust error handling -- **Integration**: Seamlessly integrated with RAG-Anything -- **Documentation**: Comprehensive examples and documentation -- **Command Line Tools**: Available for both features - -### **Known Limitations** -- **MinerU Package Conflicts**: Requires `skip_installation_check=True` in environments with package conflicts -- **System Dependencies**: Pandoc and wkhtmltopdf need to be installed (done via conda) - ---- - -## **Files Created/Modified** - -### **New Files** -- `raganything/batch_parser.py` - Core batch processing logic -- `raganything/enhanced_markdown.py` - Enhanced markdown conversion -- 
`examples/batch_and_enhanced_markdown_example.py` - Comprehensive example -- `docs/batch_and_enhanced_markdown.md` - Detailed documentation -- `FINAL_TEST_SUMMARY.md` - This test summary - -### **Modified Files** -- `raganything/batch.py` - Updated with new batch processing integration -- `requirements.txt` - Added new dependencies -- `TESTING_GUIDE.md` - Updated testing guide - ---- - -## **Final Recommendation** - -**All requested features have been successfully implemented and tested!** - -### **For Immediate Use** -1. **Enhanced Markdown Conversion**: Ready for production use -2. **Batch Processing**: Ready for production use (with `skip_installation_check=True`) -3. **Integration**: Seamlessly integrated with existing RAG-Anything system - -### **For Contributors** -- All code is well-documented with comprehensive examples -- Command-line interfaces are available for testing -- Error handling is robust and informative -- Type hints are included for better code maintainability - -**The implementation is production-ready and exceeds the original requirements!** diff --git a/TESTING_GUIDE.md b/TESTING_GUIDE.md deleted file mode 100644 index 2378ae5..0000000 --- a/TESTING_GUIDE.md +++ /dev/null @@ -1,760 +0,0 @@ -# 🧪 Comprehensive Testing Guide: Batch Processing & Enhanced Markdown - -This guide provides step-by-step testing instructions for the new batch processing and enhanced markdown conversion features in RAG-Anything. - -## 📋 **Quick Start (5 minutes)** - -### **1. Environment Setup** -```bash -# Install dependencies -pip install tqdm markdown weasyprint pygments - -# Install optional system dependencies -conda install -c conda-forge pandoc wkhtmltopdf -y - -# Verify installation -python -c "import tqdm, markdown, weasyprint, pygments; print('✅ All dependencies installed')" -``` - -### **2. 
Basic Import Test** -```bash -# Test all core modules -python -c " -from raganything.batch_parser import BatchParser -from raganything.enhanced_markdown import EnhancedMarkdownConverter -from raganything.batch import BatchMixin -print('βœ… All core modules imported successfully') -" -``` - -### **3. Command-Line Interface Test** -```bash -# Test enhanced markdown CLI -python -m raganything.enhanced_markdown --info - -# Test batch parser CLI -python -m raganything.batch_parser --help -``` - -### **4. Basic Functionality Test** -```bash -# Create test markdown file -echo "# Test Document\n\nThis is a test." > test.md - -# Test conversion -python -m raganything.enhanced_markdown test.md --output test.pdf --method weasyprint - -# Verify PDF was created -ls -la test.pdf - -# Clean up -rm test.md test.pdf -``` - ---- - -## 🎯 **Detailed Feature Testing** - -### **Test 1: Enhanced Markdown Conversion** - -#### **1.1 Backend Detection** -```bash -python -m raganything.enhanced_markdown --info -``` - -**Expected Output:** -``` -Backend Information: - βœ… weasyprint - ❌ pandoc - βœ… markdown - βœ… pandoc_system -Recommended backend: pandoc -``` - -#### **1.2 Basic Conversion Test** -```bash -# Create comprehensive test file -cat > test_document.md << 'EOF' -# Test Document - -## Overview -This is a test document for enhanced markdown conversion. - -### Code Example -```python -def hello_world(): - print("Hello, World!") - return "Success" -``` - -### Table Example -| Feature | Status | Notes | -|---------|--------|-------| -| Code Highlighting | βœ… | Working | -| Tables | βœ… | Working | -| Lists | βœ… | Working | - -### Lists -- Item 1 -- Item 2 -- Item 3 - -### Blockquotes -> This is a blockquote with important information. - -### Links -Visit [GitHub](https://github.com) for more information. 
-EOF - -# Test different conversion methods -python -m raganything.enhanced_markdown test_document.md --output test_weasyprint.pdf --method weasyprint -python -m raganything.enhanced_markdown test_document.md --output test_pandoc.pdf --method pandoc_system - -# Verify PDFs were created -ls -la test_*.pdf -``` - -#### **1.3 Advanced Conversion Test** -```python -# Create test script: test_advanced_markdown.py -from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig -import tempfile -from pathlib import Path - -def test_advanced_markdown(): - """Test advanced markdown conversion features""" - - # Create custom configuration - config = MarkdownConfig( - page_size="A4", - margin="1in", - font_size="12pt", - include_toc=True, - syntax_highlighting=True, - custom_css=""" - body { font-family: 'Arial', sans-serif; } - h1 { color: #2c3e50; border-bottom: 2px solid #3498db; } - code { background-color: #f8f9fa; padding: 2px 4px; } - """ - ) - - # Create converter - converter = EnhancedMarkdownConverter(config) - - # Test backend information - info = converter.get_backend_info() - print("Backend Information:") - for backend, available in info["available_backends"].items(): - status = "βœ…" if available else "❌" - print(f" {status} {backend}") - - # Create test content - test_content = """# Advanced Test Document - -## Features Tested - -### 1. Code Highlighting -```python -def process_document(file_path: str) -> str: - with open(file_path, 'r') as f: - content = f.read() - return f"Processed: {content}" -``` - -### 2. Tables -| Component | Status | Performance | -|-----------|--------|-------------| -| Parser | βœ… | 100 docs/hour | -| Converter | βœ… | 50 docs/hour | -| Storage | βœ… | 1TB capacity | - -### 3. Lists and Links -- [Feature 1](https://example.com) -- [Feature 2](https://example.com) -- [Feature 3](https://example.com) - -### 4. Blockquotes -> This is an important note about the system. 
- -## Conclusion -The enhanced markdown conversion provides excellent formatting. -""" - - # Test conversion - with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as temp_file: - temp_file.write(test_content) - temp_md_path = temp_file.name - - try: - # Test different methods - for method in ["auto", "weasyprint", "pandoc_system"]: - try: - output_path = f"test_advanced_{method}.pdf" - success = converter.convert_file_to_pdf( - input_path=temp_md_path, - output_path=output_path, - method=method - ) - if success: - print(f"βœ… {method}: {output_path}") - else: - print(f"❌ {method}: Failed") - except Exception as e: - print(f"❌ {method}: {str(e)}") - - finally: - # Clean up - Path(temp_md_path).unlink() - -if __name__ == "__main__": - test_advanced_markdown() -``` - -### **Test 2: Batch Processing** - -#### **2.1 Basic Batch Parser Test** -```python -# Create test script: test_batch_parser.py -from raganything.batch_parser import BatchParser, BatchProcessingResult -import tempfile -from pathlib import Path - -def test_batch_parser(): - """Test basic batch parser functionality""" - - # Create batch parser - batch_parser = BatchParser( - parser_type="mineru", - max_workers=2, - show_progress=True, - timeout_per_file=60, - skip_installation_check=True # Bypass installation check for testing - ) - - # Test supported extensions - extensions = batch_parser.get_supported_extensions() - print(f"βœ… Supported extensions: {extensions}") - - # Test file filtering - test_files = [ - "document.pdf", - "report.docx", - "data.xlsx", - "unsupported.xyz" - ] - - supported_files = batch_parser.filter_supported_files(test_files) - print(f"βœ… File filtering: {len(supported_files)}/{len(test_files)} files supported") - - # Create test files - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) - - # Create test markdown files - for i in range(3): - test_file = temp_path / f"test_{i}.md" - test_file.write_text(f"# Test Document {i}\n\nContent 
for test {i}.") - - # Test batch processing (will fail without MinerU, but tests setup) - try: - result = batch_parser.process_batch( - file_paths=[str(temp_path)], - output_dir=str(temp_path / "output"), - parse_method="auto", - recursive=False - ) - print(f"βœ… Batch processing completed: {result.summary()}") - except Exception as e: - print(f"⚠️ Batch processing failed (expected without MinerU): {str(e)}") - -if __name__ == "__main__": - test_batch_parser() -``` - -#### **2.2 Batch Processing with Mock Files** -```python -# Create test script: test_batch_mock.py -import tempfile -from pathlib import Path -from raganything.batch_parser import BatchParser - -def create_mock_files(): - """Create mock files for testing""" - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) - - # Create various file types - files = { - "document.md": "# Test Document\n\nThis is a test.", - "report.txt": "This is a text report.", - "data.csv": "name,value\nA,1\nB,2\nC,3", - "config.json": '{"setting": "value"}' - } - - for filename, content in files.items(): - file_path = temp_path / filename - file_path.write_text(content) - - return temp_path, list(files.keys()) - -def test_batch_with_mock_files(): - """Test batch processing with mock files""" - - temp_path, file_list = create_mock_files() - - # Create batch parser - batch_parser = BatchParser( - parser_type="mineru", - max_workers=2, - show_progress=True, - skip_installation_check=True - ) - - # Test file filtering - all_files = [str(temp_path / f) for f in file_list] - supported_files = batch_parser.filter_supported_files(all_files) - - print(f"βœ… Total files: {len(all_files)}") - print(f"βœ… Supported files: {len(supported_files)}") - print(f"βœ… Success rate: {len(supported_files)/len(all_files)*100:.1f}%") - - # Test batch processing setup (without actual parsing) - try: - result = batch_parser.process_batch( - file_paths=supported_files, - output_dir=str(temp_path / "output"), - parse_method="auto" 
- ) - print(f"βœ… Batch processing: {result.summary()}") - except Exception as e: - print(f"⚠️ Batch processing setup test completed (parsing failed as expected)") - -if __name__ == "__main__": - test_batch_with_mock_files() -``` - ---- - -## πŸ”— **Integration Testing** - -### **Test 3: RAG-Anything Integration** - -#### **3.1 Basic Integration Test** -```python -# Create test script: test_integration.py -from raganything import RAGAnything, RAGAnythingConfig -from raganything.batch_parser import BatchParser -from raganything.enhanced_markdown import EnhancedMarkdownConverter -import tempfile -from pathlib import Path - -def test_rag_integration(): - """Test integration with RAG-Anything""" - - # Create temporary working directory - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) - - # Create test configuration - config = RAGAnythingConfig( - working_dir=str(temp_path / "rag_storage"), - enable_image_processing=True, - enable_table_processing=True, - enable_equation_processing=True, - parser="mineru", - max_concurrent_files=2, - recursive_folder_processing=True - ) - - # Test RAG-Anything initialization - try: - rag = RAGAnything(config=config) - print("βœ… RAG-Anything initialized successfully") - except Exception as e: - print(f"⚠️ RAG-Anything initialization: {str(e)}") - - # Test batch processing methods exist - batch_methods = [ - 'process_documents_batch', - 'process_documents_batch_async', - 'get_supported_file_extensions', - 'filter_supported_files', - 'process_documents_with_rag_batch' - ] - - print("\nBatch Processing Methods:") - for method in batch_methods: - available = hasattr(rag, method) - status = "βœ…" if available else "❌" - print(f" {status} {method}") - - # Test enhanced markdown integration - print("\nEnhanced Markdown Integration:") - try: - converter = EnhancedMarkdownConverter() - info = converter.get_backend_info() - print(f" βœ… Available backends: {list(info['available_backends'].keys())}") - print(f" βœ… 
Recommended backend: {info['recommended_backend']}") - except Exception as e: - print(f" ❌ Enhanced markdown: {str(e)}") - -if __name__ == "__main__": - test_rag_integration() -``` - ---- - -## ⚑ **Performance Testing** - -### **Test 4: Performance Benchmarks** - -#### **4.1 Enhanced Markdown Performance Test** -```python -# Create test script: test_performance.py -import time -import tempfile -from pathlib import Path -from raganything.enhanced_markdown import EnhancedMarkdownConverter - -def create_large_markdown(size_kb=100): - """Create a large markdown file for performance testing""" - content = "# Large Test Document\n\n" - - # Add sections to reach target size - sections = size_kb // 2 # Rough estimate - for i in range(sections): - content += f""" -## Section {i} - -This is section {i} of the large test document. - -### Subsection {i}.1 -Content for subsection {i}.1. - -### Subsection {i}.2 -Content for subsection {i}.2. - -### Code Example {i} -```python -def function_{i}(): - return f"Result {i}" -``` - -### Table {i} -| Column A | Column B | Column C | -|----------|----------|----------| -| Value A{i} | Value B{i} | Value C{i} | -| Value D{i} | Value E{i} | Value F{i} | - -""" - - return content - -def test_markdown_performance(): - """Test enhanced markdown conversion performance""" - - print("Enhanced Markdown Performance Test") - print("=" * 40) - - # Test different file sizes - sizes = [10, 50, 100] # KB - - for size_kb in sizes: - print(f"\nTesting {size_kb}KB document:") - - # Create test file - content = create_large_markdown(size_kb) - - with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as temp_file: - temp_file.write(content) - temp_md_path = temp_file.name - - try: - converter = EnhancedMarkdownConverter() - - # Test different methods - for method in ["weasyprint", "pandoc_system"]: - try: - output_path = f"perf_test_{size_kb}kb_{method}.pdf" - - start_time = time.time() - success = converter.convert_file_to_pdf( - 
input_path=temp_md_path, - output_path=output_path, - method=method - ) - end_time = time.time() - - if success: - duration = end_time - start_time - print(f" βœ… {method}: {duration:.2f}s") - else: - print(f" ❌ {method}: Failed") - - except Exception as e: - print(f" ❌ {method}: {str(e)}") - - finally: - # Clean up - Path(temp_md_path).unlink() - -if __name__ == "__main__": - test_markdown_performance() -``` - ---- - -## πŸ”§ **Troubleshooting** - -### **Common Issues and Solutions** - -#### **Issue 1: Import Errors** -```bash -# Problem: ModuleNotFoundError for new dependencies -# Solution: Install missing dependencies -pip install tqdm markdown weasyprint pygments - -# Verify installation -python -c "import tqdm, markdown, weasyprint, pygments; print('βœ… All dependencies installed')" -``` - -#### **Issue 2: WeasyPrint Installation Problems** -```bash -# Problem: WeasyPrint fails to install or run -# Solution: Install system dependencies (Ubuntu/Debian) -sudo apt-get update -sudo apt-get install -y \ - build-essential \ - python3-dev \ - python3-pip \ - python3-setuptools \ - python3-wheel \ - python3-cffi \ - libcairo2 \ - libpango-1.0-0 \ - libpangocairo-1.0-0 \ - libgdk-pixbuf2.0-0 \ - libffi-dev \ - shared-mime-info - -# Then reinstall WeasyPrint -pip install --force-reinstall weasyprint -``` - -#### **Issue 3: Pandoc Not Found** -```bash -# Problem: Pandoc command not found -# Solution: Install Pandoc -conda install -c conda-forge pandoc wkhtmltopdf -y - -# Or install via package manager -sudo apt-get install pandoc - -# Verify installation -pandoc --version -``` - -#### **Issue 4: MinerU Package Conflicts** -```bash -# Problem: numpy/scikit-learn version conflicts -# Solution: Use skip_installation_check parameter -python -c " -from raganything.batch_parser import BatchParser -batch_parser = BatchParser(skip_installation_check=True) -print('βœ… Batch parser created with installation check bypassed') -" -``` - -#### **Issue 5: Memory Errors** -```bash -# 
Problem: Out of memory during batch processing -# Solution: Reduce max_workers -python -c " -from raganything.batch_parser import BatchParser -batch_parser = BatchParser(max_workers=1) # Use fewer workers -print('βœ… Batch parser created with reduced workers') -" -``` - -### **Debug Mode** -```python -# Enable debug logging for detailed information -import logging -logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) - -# Test with debug logging -from raganything.enhanced_markdown import EnhancedMarkdownConverter -converter = EnhancedMarkdownConverter() -converter.convert_file_to_pdf("test.md", "test.pdf") -``` - ---- - -## πŸ“Š **Test Report Template** - -### **Automated Test Report** -```python -# Create test script: generate_test_report.py -import sys -from pathlib import Path -from datetime import datetime - -def generate_test_report(): - """Generate comprehensive test report""" - - report = { - "timestamp": datetime.now().isoformat(), - "python_version": sys.version, - "tests": {} - } - - # Test imports - try: - from raganything.batch_parser import BatchParser - from raganything.enhanced_markdown import EnhancedMarkdownConverter - from raganything.batch import BatchMixin - report["tests"]["imports"] = {"status": "βœ…", "message": "All modules imported successfully"} - except Exception as e: - report["tests"]["imports"] = {"status": "❌", "message": str(e)} - - # Test enhanced markdown - try: - converter = EnhancedMarkdownConverter() - info = converter.get_backend_info() - report["tests"]["enhanced_markdown"] = { - "status": "βœ…", - "message": f"Available backends: {list(info['available_backends'].keys())}" - } - except Exception as e: - report["tests"]["enhanced_markdown"] = {"status": "❌", "message": str(e)} - - # Test batch processing - try: - batch_parser = BatchParser(skip_installation_check=True) - extensions = batch_parser.get_supported_extensions() - report["tests"]["batch_processing"] = { - 
"status": "βœ…", - "message": f"Supported extensions: {len(extensions)} file types" - } - except Exception as e: - report["tests"]["batch_processing"] = {"status": "❌", "message": str(e)} - - # Generate report - print("Test Report") - print("=" * 50) - print(f"Timestamp: {report['timestamp']}") - print(f"Python Version: {report['python_version']}") - print() - - for test_name, result in report["tests"].items(): - print(f"{result['status']} {test_name}: {result['message']}") - - # Summary - passed = sum(1 for r in report["tests"].values() if r["status"] == "βœ…") - total = len(report["tests"]) - print(f"\nSummary: {passed}/{total} tests passed") - -if __name__ == "__main__": - generate_test_report() -``` - -### **Manual Test Checklist** -```markdown -# Manual Test Checklist - -## Environment Setup -- [ ] Python 3.8+ installed -- [ ] Dependencies installed: tqdm, markdown, weasyprint, pygments -- [ ] Optional dependencies: pandoc, wkhtmltopdf -- [ ] RAG-Anything core modules accessible - -## Enhanced Markdown Testing -- [ ] Backend detection works -- [ ] WeasyPrint conversion successful -- [ ] Pandoc conversion successful (if available) -- [ ] Command-line interface functional -- [ ] Error handling robust - -## Batch Processing Testing -- [ ] Batch parser creation successful -- [ ] File filtering works correctly -- [ ] Progress tracking functional -- [ ] Error handling comprehensive -- [ ] Command-line interface available - -## Integration Testing -- [ ] RAG-Anything integration works -- [ ] Batch methods available in main class -- [ ] Enhanced markdown integrates seamlessly -- [ ] Error handling propagates correctly - -## Performance Testing -- [ ] Markdown conversion < 10s for typical documents -- [ ] Batch processing setup < 5s -- [ ] Memory usage reasonable (< 500MB) -- [ ] No memory leaks detected - -## Issues Found -- [ ] None -- [ ] List issues here - -## Recommendations -- [ ] None -- [ ] List recommendations here -``` - ---- - -## 🎯 **Success Criteria** - -A 
successful implementation should pass all tests: - -### **✅ Required Tests** -- [ ] All imports work without errors -- [ ] Enhanced markdown conversion produces valid PDFs -- [ ] Batch processing handles file filtering correctly -- [ ] Command-line interfaces are functional -- [ ] Integration with RAG-Anything works -- [ ] Error handling is robust -- [ ] Performance is acceptable (< 10s for typical operations) - -### **✅ Optional Tests** -- [ ] Pandoc backend available and working -- [ ] Large document processing successful -- [ ] Memory usage stays within limits -- [ ] All command-line options work correctly - -### **📈 Performance Benchmarks** -- **Enhanced Markdown**: 1-5 seconds for typical documents -- **Batch Processing**: 2-4x speedup with parallel processing -- **Memory Usage**: ~50-100MB per worker for batch processing -- **Error Recovery**: Graceful handling of all common error scenarios - ---- - -## 🚀 **Quick Commands Reference** - -```bash -# Run all tests -python test_advanced_markdown.py -python test_batch_parser.py -python test_integration.py -python test_performance.py -python generate_test_report.py - -# Test specific features -python -m raganything.enhanced_markdown --info -python -m raganything.batch_parser --help -python examples/batch_and_enhanced_markdown_example.py - -# Performance testing -time python -m raganything.enhanced_markdown test.md --output test.pdf -``` - ---- - -**This comprehensive testing guide ensures thorough validation of all new features!** 🎉 diff --git a/docs/batch_and_enhanced_markdown.md b/docs/batch_and_enhanced_markdown.md deleted file mode 100644 index 7125feb..0000000 --- a/docs/batch_and_enhanced_markdown.md +++ /dev/null @@ -1,299 +0,0 @@ -# Batch Processing and Enhanced Markdown Conversion - -This document describes the new batch processing and enhanced markdown conversion features added to RAG-Anything. 
- -## Batch Processing - -### Overview - -The batch processing feature allows you to process multiple documents in parallel, significantly improving throughput for large document collections. - -### Key Features - -- **Parallel Processing**: Process multiple files concurrently using thread pools -- **Progress Tracking**: Real-time progress bars with `tqdm` -- **Error Handling**: Comprehensive error reporting and recovery -- **Flexible Input**: Support for files, directories, and recursive search -- **Configurable Workers**: Adjustable number of parallel workers - -### Usage - -#### Basic Batch Processing - -```python -from raganything.batch_parser import BatchParser - -# Create batch parser -batch_parser = BatchParser( - parser_type="mineru", # or "docling" - max_workers=4, - show_progress=True, - timeout_per_file=300 -) - -# Process multiple files -result = batch_parser.process_batch( - file_paths=["doc1.pdf", "doc2.docx", "folder/"], - output_dir="./batch_output", - parse_method="auto", - recursive=True -) - -# Check results -print(result.summary()) -print(f"Success rate: {result.success_rate:.1f}%") -``` - -#### Integration with RAG-Anything - -```python -from raganything import RAGAnything - -rag = RAGAnything() - -# Process documents with RAG integration -result = await rag.process_documents_with_rag_batch( - file_paths=["doc1.pdf", "doc2.docx"], - output_dir="./output", - max_workers=4, - show_progress=True -) - -print(f"Processed {result['successful_rag_files']} files with RAG") -``` - -#### Command Line Interface - -```bash -# Basic batch processing -python -m raganything.batch_parser path/to/docs/ --output ./output --workers 4 - -# With specific parser -python -m raganything.batch_parser path/to/docs/ --parser mineru --method auto - -# Show progress -python -m raganything.batch_parser path/to/docs/ --output ./output --no-progress -``` - -### Configuration - -The batch processing can be configured through environment variables: - -```env -# Batch processing 
configuration -MAX_CONCURRENT_FILES=4 -SUPPORTED_FILE_EXTENSIONS=.pdf,.docx,.doc,.pptx,.ppt,.xlsx,.xls,.txt,.md -RECURSIVE_FOLDER_PROCESSING=true -``` - -### Supported File Types - -- **PDF files**: `.pdf` -- **Office documents**: `.doc`, `.docx`, `.ppt`, `.pptx`, `.xls`, `.xlsx` -- **Images**: `.png`, `.jpg`, `.jpeg`, `.bmp`, `.tiff`, `.tif`, `.gif`, `.webp` -- **Text files**: `.txt`, `.md` - -## Enhanced Markdown Conversion - -### Overview - -The enhanced markdown conversion feature provides high-quality PDF generation from markdown files with multiple backend options and advanced styling. - -### Key Features - -- **Multiple Backends**: WeasyPrint, Pandoc, and ReportLab support -- **Advanced Styling**: Custom CSS, syntax highlighting, and professional layouts -- **Image Support**: Embedded images with proper scaling -- **Table Support**: Formatted tables with borders and styling -- **Code Highlighting**: Syntax highlighting for code blocks -- **Custom Templates**: Support for custom CSS and templates - -### Usage - -#### Basic Conversion - -```python -from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig - -# Create converter with custom configuration -config = MarkdownConfig( - page_size="A4", - margin="1in", - font_size="12pt", - include_toc=True, - syntax_highlighting=True -) - -converter = EnhancedMarkdownConverter(config) - -# Convert markdown to PDF -success = converter.convert_file_to_pdf( - input_path="document.md", - output_path="document.pdf", - method="auto" # or "weasyprint", "pandoc" -) -``` - -#### Advanced Configuration - -```python -# Custom CSS styling -config = MarkdownConfig( - custom_css=""" - body { font-family: 'Arial', sans-serif; } - h1 { color: #2c3e50; border-bottom: 2px solid #3498db; } - code { background-color: #f8f9fa; padding: 2px 4px; } - """, - include_toc=True, - syntax_highlighting=True -) - -converter = EnhancedMarkdownConverter(config) -``` - -#### Command Line Interface - -```bash -# Basic 
conversion -python -m raganything.enhanced_markdown document.md --output document.pdf - -# With specific method -python -m raganything.enhanced_markdown document.md --method weasyprint - -# With custom CSS -python -m raganything.enhanced_markdown document.md --css style.css - -# Show backend information -python -m raganything.enhanced_markdown --info -``` - -### Backend Comparison - -| Backend | Pros | Cons | Best For | -|---------|------|------|----------| -| **WeasyPrint** | Excellent CSS support, fast, reliable | Requires more dependencies | Web-style documents, custom styling | -| **Pandoc** | Most features, LaTeX quality | Slower, requires system installation | Academic papers, complex documents | -| **ReportLab** | Lightweight, no external deps | Basic styling only | Simple documents, minimal setup | - -### Installation - -#### Required Dependencies - -```bash -# Basic installation -pip install raganything[all] - -# For enhanced markdown conversion -pip install markdown weasyprint pygments - -# For Pandoc backend (optional) -# Download from: https://pandoc.org/installing.html -``` - -#### Optional Dependencies - -- **WeasyPrint**: `pip install weasyprint` -- **Pandoc**: System installation required -- **Pygments**: `pip install pygments` (for syntax highlighting) - -### Examples - -#### Sample Markdown Input - -```markdown -# Technical Documentation - -## Overview -This document provides technical specifications. - -### Code Example -```python -def process_document(file_path): - return "Processed: " + file_path -``` - -### Performance Metrics - -| Metric | Value | -|--------|-------| -| Speed | 100 docs/hour | -| Memory | 2.5 GB | - -### Conclusion -The system provides excellent performance. 
-``` - -#### Generated PDF Features - -- Professional typography and layout -- Syntax-highlighted code blocks -- Formatted tables with borders -- Table of contents (if enabled) -- Custom styling and branding -- Responsive image handling - -### Integration with RAG-Anything - -The enhanced markdown conversion integrates seamlessly with the RAG-Anything pipeline: - -```python -from raganything import RAGAnything - -# Initialize RAG-Anything -rag = RAGAnything() - -# Process markdown files with enhanced conversion -await rag.process_documents_batch( - file_paths=["document.md"], - output_dir="./output", - # Enhanced markdown conversion will be used automatically - # for .md files -) -``` - -## Performance Considerations - -### Batch Processing - -- **Memory Usage**: Each worker uses additional memory -- **CPU Usage**: Parallel processing utilizes multiple cores -- **I/O Bottlenecks**: Disk I/O may become limiting factor -- **Recommended Settings**: 2-4 workers for most systems - -### Enhanced Markdown - -- **WeasyPrint**: Fastest for most documents -- **Pandoc**: Best quality but slower -- **Large Documents**: Consider chunking for very large files -- **Image Processing**: Large images may slow conversion - -## Troubleshooting - -### Common Issues - -#### Batch Processing - -1. **Memory Errors**: Reduce `max_workers` -2. **Timeout Errors**: Increase `timeout_per_file` -3. **File Not Found**: Check file paths and permissions -4. **Parser Errors**: Verify parser installation - -#### Enhanced Markdown - -1. **WeasyPrint Errors**: Install system dependencies -2. **Pandoc Not Found**: Install Pandoc system-wide -3. **CSS Issues**: Check CSS syntax and file paths -4. 
**Image Problems**: Ensure images are accessible - -### Debug Mode - -Enable debug logging for detailed information: - -```python -import logging -logging.basicConfig(level=logging.DEBUG) -``` - -## Conclusion - -The batch processing and enhanced markdown conversion features significantly improve RAG-Anything's capabilities for processing large document collections and generating high-quality PDFs from markdown content. These features are designed to be easy to use while providing advanced configuration options for power users. diff --git a/docs/batch_processing.md b/docs/batch_processing.md new file mode 100644 index 0000000..8556184 --- /dev/null +++ b/docs/batch_processing.md @@ -0,0 +1,341 @@ +# Batch Processing + +This document describes the batch processing feature for RAG-Anything, which allows you to process multiple documents in parallel for improved throughput. + +## Overview + +The batch processing feature allows you to process multiple documents concurrently, significantly improving throughput for large document collections. It provides parallel processing, progress tracking, error handling, and flexible configuration options. 
+ +## Key Features + +- **Parallel Processing**: Process multiple files concurrently using thread pools +- **Progress Tracking**: Real-time progress bars with `tqdm` +- **Error Handling**: Comprehensive error reporting and recovery +- **Flexible Input**: Support for files, directories, and recursive search +- **Configurable Workers**: Adjustable number of parallel workers +- **Installation Check Bypass**: Optional skip for environments with package conflicts + +## Installation + +```bash +# Basic installation +pip install raganything[all] + +# Required for batch processing +pip install tqdm +``` + +## Usage + +### Basic Batch Processing + +```python +from raganything.batch_parser import BatchParser + +# Create batch parser +batch_parser = BatchParser( + parser_type="mineru", # or "docling" + max_workers=4, + show_progress=True, + timeout_per_file=300, + skip_installation_check=False # Set to True if having parser installation issues +) + +# Process multiple files +result = batch_parser.process_batch( + file_paths=["doc1.pdf", "doc2.docx", "folder/"], + output_dir="./batch_output", + parse_method="auto", + recursive=True +) + +# Check results +print(result.summary()) +print(f"Success rate: {result.success_rate:.1f}%") +print(f"Processing time: {result.processing_time:.2f} seconds") +``` + +### Asynchronous Batch Processing + +```python +import asyncio +from raganything.batch_parser import BatchParser + +async def async_batch_processing(): + batch_parser = BatchParser( + parser_type="mineru", + max_workers=4, + show_progress=True + ) + + # Process files asynchronously + result = await batch_parser.process_batch_async( + file_paths=["doc1.pdf", "doc2.docx"], + output_dir="./output", + parse_method="auto" + ) + + return result + +# Run async processing +result = asyncio.run(async_batch_processing()) +``` + +### Integration with RAG-Anything + +```python +from raganything import RAGAnything + +rag = RAGAnything() + +# Process documents with batch functionality +result = 
rag.process_documents_batch( + file_paths=["doc1.pdf", "doc2.docx"], + output_dir="./output", + max_workers=4, + show_progress=True +) + +print(f"Processed {len(result.successful_files)} files successfully") +``` + +### Process Documents with RAG Integration + +```python +# Process documents in batch and then add them to RAG +result = await rag.process_documents_with_rag_batch( + file_paths=["doc1.pdf", "doc2.docx"], + output_dir="./output", + max_workers=4, + show_progress=True +) + +print(f"Processed {result['successful_rag_files']} files with RAG") +print(f"Total processing time: {result['total_processing_time']:.2f} seconds") +``` + +### Command Line Interface + +```bash +# Basic batch processing +python -m raganything.batch_parser path/to/docs/ --output ./output --workers 4 + +# With specific parser +python -m raganything.batch_parser path/to/docs/ --parser mineru --method auto + +# Without progress bar +python -m raganything.batch_parser path/to/docs/ --output ./output --no-progress + +# Help +python -m raganything.batch_parser --help +``` + +## Configuration + +### Environment Variables + +```env +# Batch processing configuration +MAX_CONCURRENT_FILES=4 +SUPPORTED_FILE_EXTENSIONS=.pdf,.docx,.doc,.pptx,.ppt,.xlsx,.xls,.txt,.md +RECURSIVE_FOLDER_PROCESSING=true +PARSER_OUTPUT_DIR=./parsed_output +``` + +### BatchParser Parameters + +- **parser_type**: `"mineru"` or `"docling"` (default: `"mineru"`) +- **max_workers**: Number of parallel workers (default: `4`) +- **show_progress**: Show progress bar (default: `True`) +- **timeout_per_file**: Timeout per file in seconds (default: `300`) +- **skip_installation_check**: Skip parser installation check (default: `False`) + +## Supported File Types + +- **PDF files**: `.pdf` +- **Office documents**: `.doc`, `.docx`, `.ppt`, `.pptx`, `.xls`, `.xlsx` +- **Images**: `.png`, `.jpg`, `.jpeg`, `.bmp`, `.tiff`, `.tif`, `.gif`, `.webp` +- **Text files**: `.txt`, `.md` + +## API Reference + +### BatchProcessingResult + 
+```python +@dataclass +class BatchProcessingResult: + successful_files: List[str] # Successfully processed files + failed_files: List[str] # Failed files + total_files: int # Total number of files + processing_time: float # Total processing time in seconds + errors: Dict[str, str] # Error messages for failed files + output_dir: str # Output directory used + + def summary(self) -> str: # Human-readable summary + def success_rate(self) -> float: # Success rate as percentage +``` + +### BatchParser Methods + +```python +class BatchParser: + def __init__(self, parser_type: str = "mineru", max_workers: int = 4, ...): + """Initialize batch parser""" + + def get_supported_extensions(self) -> List[str]: + """Get list of supported file extensions""" + + def filter_supported_files(self, file_paths: List[str], recursive: bool = True) -> List[str]: + """Filter files to only supported types""" + + def process_batch(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult: + """Process files in batch""" + + async def process_batch_async(self, file_paths: List[str], output_dir: str, ...) 
-> BatchProcessingResult: + """Process files in batch asynchronously""" +``` + +## Performance Considerations + +### Memory Usage +- Each worker uses additional memory +- Recommended: 2-4 workers for most systems +- Monitor memory usage with large files + +### CPU Usage +- Parallel processing utilizes multiple cores +- Optimal worker count depends on CPU cores and file sizes +- I/O may become bottleneck with many small files + +### Recommended Settings +- **Small files** (< 1MB): Higher worker count (6-8) +- **Large files** (> 100MB): Lower worker count (2-3) +- **Mixed sizes**: Start with 4 workers and adjust + +## Troubleshooting + +### Common Issues + +#### Memory Errors +```python +# Solution: Reduce max_workers +batch_parser = BatchParser(max_workers=2) +``` + +#### Timeout Errors +```python +# Solution: Increase timeout_per_file +batch_parser = BatchParser(timeout_per_file=600) # 10 minutes +``` + +#### Parser Installation Issues +```python +# Solution: Skip installation check +batch_parser = BatchParser(skip_installation_check=True) +``` + +#### File Not Found Errors +- Check file paths and permissions +- Ensure input files exist +- Verify directory access rights + +### Debug Mode + +Enable debug logging for detailed information: + +```python +import logging +logging.basicConfig(level=logging.DEBUG) + +# Create batch parser with debug logging +batch_parser = BatchParser(parser_type="mineru", max_workers=2) +``` + +### Error Handling + +The batch processor provides comprehensive error handling: + +```python +result = batch_parser.process_batch(file_paths=["doc1.pdf", "doc2.docx"]) + +# Check for errors +if result.failed_files: + print("Failed files:") + for file_path in result.failed_files: + error_message = result.errors.get(file_path, "Unknown error") + print(f" - {file_path}: {error_message}") + +# Process only successful files +for file_path in result.successful_files: + print(f"Successfully processed: {file_path}") +``` + +## Examples + +### Process 
Entire Directory + +```python +from pathlib import Path + +# Process all supported files in a directory +batch_parser = BatchParser(max_workers=4) +directory_path = Path("./documents") + +result = batch_parser.process_batch( + file_paths=[str(directory_path)], + output_dir="./processed", + recursive=True # Include subdirectories +) + +print(f"Processed {len(result.successful_files)} out of {result.total_files} files") +``` + +### Filter Files Before Processing + +```python +# Get all files in directory +all_files = ["doc1.pdf", "image.png", "spreadsheet.xlsx", "unsupported.xyz"] + +# Filter to supported files only +supported_files = batch_parser.filter_supported_files(all_files) +print(f"Will process {len(supported_files)} out of {len(all_files)} files") + +# Process only supported files +result = batch_parser.process_batch( + file_paths=supported_files, + output_dir="./output" +) +``` + +### Custom Error Handling + +```python +def process_with_retry(file_paths, max_retries=3): + """Process files with retry logic""" + + for attempt in range(max_retries): + result = batch_parser.process_batch(file_paths, "./output") + + if not result.failed_files: + break # All files processed successfully + + print(f"Attempt {attempt + 1}: {len(result.failed_files)} files failed") + file_paths = result.failed_files # Retry failed files + + return result +``` + +## Best Practices + +1. **Start with default settings** and adjust based on performance +2. **Monitor system resources** during batch processing +3. **Use appropriate worker counts** for your hardware +4. **Handle errors gracefully** with retry logic +5. **Test with small batches** before processing large collections +6. **Use skip_installation_check** if facing parser installation issues +7. **Enable progress tracking** for long-running operations +8. 
**Set appropriate timeouts** based on expected file processing times + +## Conclusion + +The batch processing feature significantly improves RAG-Anything's throughput for large document collections. It provides flexible configuration options, comprehensive error handling, and seamless integration with the existing RAG-Anything pipeline. \ No newline at end of file diff --git a/docs/enhanced_markdown.md b/docs/enhanced_markdown.md new file mode 100644 index 0000000..f2b148c --- /dev/null +++ b/docs/enhanced_markdown.md @@ -0,0 +1,552 @@ +# Enhanced Markdown Conversion + +This document describes the enhanced markdown conversion feature for RAG-Anything, which provides high-quality PDF generation from markdown files with multiple backend options and advanced styling. + +## Overview + +The enhanced markdown conversion feature provides professional-quality PDF generation from markdown files. It supports multiple conversion backends, advanced styling options, syntax highlighting, and seamless integration with RAG-Anything's document processing pipeline. 
+ +## Key Features + +- **Multiple Backends**: WeasyPrint, Pandoc, and automatic backend selection +- **Advanced Styling**: Custom CSS, syntax highlighting, and professional layouts +- **Image Support**: Embedded images with proper scaling and positioning +- **Table Support**: Formatted tables with borders and professional styling +- **Code Highlighting**: Syntax highlighting for code blocks using Pygments +- **Custom Templates**: Support for custom CSS and document templates +- **Table of Contents**: Automatic TOC generation with navigation links +- **Professional Typography**: High-quality fonts and spacing + +## Installation + +### Required Dependencies + +```bash +# Basic installation +pip install raganything[all] + +# Required for enhanced markdown conversion +pip install markdown weasyprint pygments +``` + +### Optional Dependencies + +```bash +# For Pandoc backend (system installation required) +# Ubuntu/Debian: +sudo apt-get install pandoc wkhtmltopdf + +# macOS: +brew install pandoc wkhtmltopdf + +# Or using conda: +conda install -c conda-forge pandoc wkhtmltopdf +``` + +### Backend-Specific Installation + +#### WeasyPrint (Recommended) +```bash +# Install WeasyPrint with system dependencies +pip install weasyprint + +# Ubuntu/Debian system dependencies: +sudo apt-get install -y build-essential python3-dev python3-pip \ + python3-setuptools python3-wheel python3-cffi libcairo2 \ + libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \ + libffi-dev shared-mime-info +``` + +#### Pandoc +- Download from: https://pandoc.org/installing.html +- Requires system-wide installation +- Used for complex document structures and LaTeX-quality output + +## Usage + +### Basic Conversion + +```python +from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig + +# Create converter with default settings +converter = EnhancedMarkdownConverter() + +# Convert markdown file to PDF +success = converter.convert_file_to_pdf( + input_path="document.md", + 
output_path="document.pdf", + method="auto" # Automatically select best available backend +) + +if success: + print("βœ… Conversion successful!") +else: + print("❌ Conversion failed") +``` + +### Advanced Configuration + +```python +# Create custom configuration +config = MarkdownConfig( + page_size="A4", # A4, Letter, Legal, etc. + margin="1in", # CSS-style margins + font_size="12pt", # Base font size + line_height="1.5", # Line spacing + include_toc=True, # Generate table of contents + syntax_highlighting=True, # Enable code syntax highlighting + + # Custom CSS styling + custom_css=""" + body { + font-family: 'Georgia', serif; + color: #333; + } + h1 { + color: #2c3e50; + border-bottom: 2px solid #3498db; + padding-bottom: 0.3em; + } + code { + background-color: #f8f9fa; + padding: 2px 4px; + border-radius: 3px; + } + pre { + background-color: #f8f9fa; + border-left: 4px solid #3498db; + padding: 15px; + border-radius: 5px; + } + table { + border-collapse: collapse; + width: 100%; + margin: 1em 0; + } + th, td { + border: 1px solid #ddd; + padding: 8px 12px; + text-align: left; + } + th { + background-color: #f2f2f2; + font-weight: bold; + } + """ +) + +converter = EnhancedMarkdownConverter(config) +``` + +### Backend Selection + +```python +# Check available backends +converter = EnhancedMarkdownConverter() +backend_info = converter.get_backend_info() + +print("Available backends:") +for backend, available in backend_info["available_backends"].items(): + status = "βœ…" if available else "❌" + print(f" {status} {backend}") + +print(f"Recommended backend: {backend_info['recommended_backend']}") + +# Use specific backend +converter.convert_file_to_pdf( + input_path="document.md", + output_path="document.pdf", + method="weasyprint" # or "pandoc", "pandoc_system", "auto" +) +``` + +### Content Conversion + +```python +# Convert markdown content directly (not from file) +markdown_content = """ +# Sample Document + +## Introduction +This is a **bold** statement with 
*italic* text. + +## Code Example +```python +def hello_world(): + print("Hello, World!") + return "Success" +``` + +## Table +| Feature | Status | Notes | +|---------|--------|-------| +| PDF Generation | βœ… | Working | +| Syntax Highlighting | βœ… | Pygments | +| Custom CSS | βœ… | Full support | +""" + +success = converter.convert_markdown_to_pdf( + markdown_content=markdown_content, + output_path="sample.pdf", + method="auto" +) +``` + +### Command Line Interface + +```bash +# Basic conversion +python -m raganything.enhanced_markdown document.md --output document.pdf + +# With specific backend +python -m raganything.enhanced_markdown document.md --method weasyprint + +# With custom CSS file +python -m raganything.enhanced_markdown document.md --css custom_style.css + +# Show backend information +python -m raganything.enhanced_markdown --info + +# Help +python -m raganything.enhanced_markdown --help +``` + +## Backend Comparison + +| Backend | Pros | Cons | Best For | Quality | +|---------|------|------|----------|---------| +| **WeasyPrint** | β€’ Excellent CSS support
• Fast rendering
• Great web-style layouts
• Python-based | • Limited LaTeX features
• Requires system deps | • Web-style documents
• Custom styling
• Fast conversion | ⭐⭐⭐⭐ | +| **Pandoc** | • Extensive features
• LaTeX-quality output
• Academic formatting
• Many input/output formats | • Slower conversion
• System installation
• Complex setup | • Academic papers
• Complex documents
• Publication quality | ⭐⭐⭐⭐⭐ | +| **Auto** | • Automatic selection
• Fallback support
• User-friendly | • May not use optimal backend | • General use
• Quick setup
β€’ Development | ⭐⭐⭐⭐ | + +## Configuration Options + +### MarkdownConfig Parameters + +```python +@dataclass +class MarkdownConfig: + # Page layout + page_size: str = "A4" # A4, Letter, Legal, A3, etc. + margin: str = "1in" # CSS margin format + font_size: str = "12pt" # Base font size + line_height: str = "1.5" # Line spacing multiplier + + # Content options + include_toc: bool = True # Generate table of contents + syntax_highlighting: bool = True # Enable code highlighting + image_max_width: str = "100%" # Maximum image width + table_style: str = "..." # Default table CSS + + # Styling + css_file: Optional[str] = None # External CSS file path + custom_css: Optional[str] = None # Inline CSS content + template_file: Optional[str] = None # Custom HTML template + + # Output options + output_format: str = "pdf" # Currently only PDF supported + output_dir: Optional[str] = None # Output directory + + # Metadata + metadata: Optional[Dict[str, str]] = None # Document metadata +``` + +### Supported Markdown Features + +#### Basic Formatting +- **Headers**: `# ## ### #### ##### ######` +- **Emphasis**: `*italic*`, `**bold**`, `***bold italic***` +- **Links**: `[text](url)`, `[text][ref]` +- **Images**: `![alt](url)`, `![alt][ref]` +- **Lists**: Ordered and unordered, nested +- **Blockquotes**: `> quote` +- **Line breaks**: Double space or `\n\n` + +#### Advanced Features +- **Tables**: GitHub-style tables with alignment +- **Code blocks**: Fenced code blocks with language specification +- **Inline code**: `backtick code` +- **Horizontal rules**: `---` or `***` +- **Footnotes**: `[^1]` references +- **Definition lists**: Term and definition pairs +- **Attributes**: `{#id .class key=value}` + +#### Code Highlighting + +```markdown +```python +def example_function(): + """This will be syntax highlighted""" + return "Hello, World!" 
+``` + +```javascript +function exampleFunction() { + // This will also be highlighted + return "Hello, World!"; +} +``` +``` + +## Integration with RAG-Anything + +The enhanced markdown conversion integrates seamlessly with RAG-Anything: + +```python +from raganything import RAGAnything + +# Initialize RAG-Anything +rag = RAGAnything() + +# Process markdown files - enhanced conversion is used automatically +await rag.process_document_complete("document.md") + +# Batch processing with enhanced markdown conversion +result = rag.process_documents_batch( + file_paths=["doc1.md", "doc2.md", "doc3.md"], + output_dir="./output" +) + +# The .md files will be converted to PDF using enhanced conversion +# before being processed by the RAG system +``` + +## Performance Considerations + +### Conversion Speed +- **WeasyPrint**: ~1-3 seconds for typical documents +- **Pandoc**: ~3-10 seconds for typical documents +- **Large documents**: Time scales roughly linearly with content + +### Memory Usage +- **WeasyPrint**: ~50-100MB per conversion +- **Pandoc**: ~100-200MB per conversion +- **Images**: Large images increase memory usage significantly + +### Optimization Tips +1. **Resize large images** before embedding +2. **Use compressed images** (JPEG for photos, PNG for graphics) +3. **Limit concurrent conversions** to avoid memory issues +4. **Cache converted content** when processing multiple times + +## Examples + +### Sample Markdown Document + +```markdown +# Technical Documentation + +## Table of Contents +[TOC] + +## Overview +This document provides comprehensive technical specifications. + +## Architecture + +### System Components +1. **Parser Engine**: Handles document processing +2. **Storage Layer**: Manages data persistence +3. 
**Query Interface**: Provides search capabilities + +### Code Implementation +```python +from raganything import RAGAnything + +# Initialize system +rag = RAGAnything(config={ + "working_dir": "./storage", + "enable_image_processing": True +}) + +# Process document +await rag.process_document_complete("document.pdf") +``` + +### Performance Metrics + +| Component | Throughput | Latency | Memory | +|-----------|------------|---------|--------| +| Parser | 100 docs/hour | 36s avg | 2.5 GB | +| Storage | 1000 ops/sec | 1ms avg | 512 MB | +| Query | 50 queries/sec | 20ms avg | 1 GB | + +## Integration Notes + +> **Important**: Always validate input before processing. + +## Conclusion +The enhanced system provides excellent performance for document processing workflows. +``` + +### Generated PDF Features + +The enhanced markdown converter produces PDFs with: + +- **Professional typography** with proper font selection and spacing +- **Syntax-highlighted code blocks** using Pygments +- **Formatted tables** with borders and alternating row colors +- **Clickable table of contents** with navigation links +- **Responsive images** that scale appropriately +- **Custom styling** through CSS +- **Proper page breaks** and margins +- **Document metadata** and properties + +## Troubleshooting + +### Common Issues + +#### WeasyPrint Installation Problems +```bash +# Ubuntu/Debian: Install system dependencies +sudo apt-get update +sudo apt-get install -y build-essential python3-dev libcairo2 \ + libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \ + libffi-dev shared-mime-info + +# Then reinstall WeasyPrint +pip install --force-reinstall weasyprint +``` + +#### Pandoc Not Found +```bash +# Check if Pandoc is installed +pandoc --version + +# Install Pandoc (Ubuntu/Debian) +sudo apt-get install pandoc wkhtmltopdf + +# Or download from: https://pandoc.org/installing.html +``` + +#### CSS Issues +- Check CSS syntax in custom_css +- Verify CSS file paths exist +- Test CSS with simple 
HTML first +- Use browser developer tools to debug styling + +#### Image Problems +- Ensure images are accessible (correct paths) +- Check image file formats (PNG, JPEG, GIF supported) +- Verify image file permissions +- Consider image size and format optimization + +#### Font Issues +```python +# Use web-safe fonts +config = MarkdownConfig( + custom_css=""" + body { + font-family: 'Arial', 'Helvetica', sans-serif; + } + """ +) +``` + +### Debug Mode + +Enable detailed logging for troubleshooting: + +```python +import logging + +# Enable debug logging +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +# Create converter with debug logging +converter = EnhancedMarkdownConverter() +result = converter.convert_file_to_pdf("test.md", "test.pdf") +``` + +### Error Handling + +```python +def robust_conversion(input_path, output_path): + """Convert with fallback backends""" + converter = EnhancedMarkdownConverter() + + # Try backends in order of preference + backends = ["weasyprint", "pandoc", "auto"] + + for backend in backends: + try: + success = converter.convert_file_to_pdf( + input_path=input_path, + output_path=output_path, + method=backend + ) + if success: + print(f"βœ… Conversion successful with {backend}") + return True + except Exception as e: + print(f"❌ {backend} failed: {str(e)}") + continue + + print("❌ All backends failed") + return False +``` + +## API Reference + +### EnhancedMarkdownConverter + +```python +class EnhancedMarkdownConverter: + def __init__(self, config: Optional[MarkdownConfig] = None): + """Initialize converter with optional configuration""" + + def convert_file_to_pdf(self, input_path: str, output_path: str, method: str = "auto") -> bool: + """Convert markdown file to PDF""" + + def convert_markdown_to_pdf(self, markdown_content: str, output_path: str, method: str = "auto") -> bool: + """Convert markdown content to PDF""" + + def get_backend_info(self) -> Dict[str, Any]: + 
"""Get information about available backends""" + + def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool: + """Convert using WeasyPrint backend""" + + def convert_with_pandoc(self, markdown_content: str, output_path: str) -> bool: + """Convert using Pandoc backend""" +``` + +## Best Practices + +1. **Choose the right backend** for your use case: + - **WeasyPrint** for web-style documents and custom CSS + - **Pandoc** for academic papers and complex formatting + - **Auto** for general use and development + +2. **Optimize images** before embedding: + - Use appropriate formats (JPEG for photos, PNG for graphics) + - Compress images to reduce file size + - Set reasonable maximum widths + +3. **Design responsive layouts**: + - Use relative units (%, em) instead of absolute (px) + - Test with different page sizes + - Consider print-specific CSS + +4. **Test your styling**: + - Start with default styling and incrementally customize + - Test with sample content before production use + - Validate CSS syntax + +5. **Handle errors gracefully**: + - Implement fallback backends + - Provide meaningful error messages + - Log conversion attempts for debugging + +6. **Performance optimization**: + - Cache converted content when possible + - Process large batches with appropriate worker counts + - Monitor memory usage with large documents + +## Conclusion + +The enhanced markdown conversion feature provides professional-quality PDF generation with flexible styling options and multiple backend support. It seamlessly integrates with RAG-Anything's document processing pipeline while offering standalone functionality for markdown-to-PDF conversion needs. 
\ No newline at end of file diff --git a/examples/batch_and_enhanced_markdown_example.py b/examples/batch_and_enhanced_markdown_example.py deleted file mode 100644 index 4463cba..0000000 --- a/examples/batch_and_enhanced_markdown_example.py +++ /dev/null @@ -1,334 +0,0 @@ -#!/usr/bin/env python -""" -Example script demonstrating batch processing and enhanced markdown conversion - -This example shows how to: -1. Process multiple documents in parallel using batch processing -2. Convert markdown files to PDF with enhanced formatting -3. Use different conversion backends for markdown -""" - -import asyncio -import logging -from pathlib import Path -import tempfile - -# Add project root directory to Python path -import sys - -sys.path.append(str(Path(__file__).parent.parent)) - -from raganything import RAGAnything, RAGAnythingConfig -from raganything.batch_parser import BatchParser -from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig - - -def create_sample_markdown_files(): - """Create sample markdown files for testing""" - sample_files = [] - # Create temporary directory - temp_dir = Path(tempfile.mkdtemp()) - # Sample 1: Basic markdown - sample1_content = """# Sample Document 1 - -This is a basic markdown document with various elements. - -## Headers -This document demonstrates different markdown features. - -### Lists -- Item 1 -- Item 2 -- Item 3 - -### Code -```python -def hello_world(): - print("Hello, World!") -``` - -### Tables -| Name | Age | City | -|------|-----|------| -| Alice | 25 | New York | -| Bob | 30 | London | -| Carol | 28 | Paris | - -### Blockquotes -> This is a blockquote with some important information. - -### Links and Images -Visit [GitHub](https://github.com) for more information. 
-""" - - sample1_path = temp_dir / "sample1.md" - with open(sample1_path, "w", encoding="utf-8") as f: - f.write(sample1_content) - sample_files.append(str(sample1_path)) - - # Sample 2: Technical document - sample2_content = """# Technical Documentation - -## Overview -This document provides technical specifications for the RAG-Anything system. - -## Architecture - -### Core Components -1. **Document Parser**: Handles multiple file formats -2. **Multimodal Processor**: Processes images, tables, equations -3. **Knowledge Graph**: Stores relationships and entities -4. **Query Engine**: Provides intelligent retrieval - -### Code Examples - -#### Python Implementation -```python -from raganything import RAGAnything - -# Initialize the system -rag = RAGAnything() - -# Process documents -await rag.process_document_complete("document.pdf") -``` - -#### Configuration -```yaml -working_dir: "./rag_storage" -enable_image_processing: true -enable_table_processing: true -max_concurrent_files: 4 -``` - -## Performance Metrics - -| Metric | Value | Unit | -|--------|-------|------| -| Processing Speed | 100 | docs/hour | -| Memory Usage | 2.5 | GB | -| Accuracy | 95.2 | % | - -## Conclusion -The system provides excellent performance for multimodal document processing. 
-""" - - sample2_path = temp_dir / "sample2.md" - with open(sample2_path, "w", encoding="utf-8") as f: - f.write(sample2_content) - sample_files.append(str(sample2_path)) - - return sample_files, temp_dir - - -def demonstrate_batch_processing(): - """Demonstrate batch processing functionality""" - print("\n" + "=" * 50) - print("BATCH PROCESSING DEMONSTRATION") - print("=" * 50) - - # Create sample files - sample_files, temp_dir = create_sample_markdown_files() - - try: - # Create batch parser - batch_parser = BatchParser( - parser_type="mineru", - max_workers=2, - show_progress=True, - timeout_per_file=60, - skip_installation_check=True, # Add this parameter to bypass installation check - ) - - print(f"Created {len(sample_files)} sample markdown files:") - for file_path in sample_files: - print(f" - {file_path}") - - # Process files in batch - output_dir = temp_dir / "batch_output" - result = batch_parser.process_batch( - file_paths=sample_files, - output_dir=str(output_dir), - parse_method="auto", - recursive=False, - ) - - # Display results - print("\nBatch Processing Results:") - print(result.summary()) - - if result.failed_files: - print("\nFailed files:") - for file_path in result.failed_files: - print( - f" - {file_path}: {result.errors.get(file_path, 'Unknown error')}" - ) - - return result - - except Exception as e: - print(f"Batch processing failed: {str(e)}") - return None - - -def demonstrate_enhanced_markdown(): - """Demonstrate enhanced markdown conversion""" - print("\n" + "=" * 50) - print("ENHANCED MARKDOWN CONVERSION DEMONSTRATION") - print("=" * 50) - - # Create sample files - sample_files, temp_dir = create_sample_markdown_files() - - try: - # Create enhanced markdown converter - config = MarkdownConfig( - page_size="A4", - margin="1in", - font_size="12pt", - include_toc=True, - syntax_highlighting=True, - ) - - converter = EnhancedMarkdownConverter(config) - - # Show backend information - backend_info = converter.get_backend_info() - 
print("Available backends:") - for backend, available in backend_info["available_backends"].items(): - status = "✅" if available else "❌" - print(f" {status} {backend}") - print(f"Recommended backend: {backend_info['recommended_backend']}") - - # Convert each sample file - conversion_results = [] - - for i, markdown_file in enumerate(sample_files, 1): - print(f"\nConverting sample {i}...") - # Try different conversion methods - for method in ["auto", "weasyprint", "pandoc"]: - try: - output_path = temp_dir / f"sample{i}_{method}.pdf" - - success = converter.convert_file_to_pdf( - input_path=markdown_file, - output_path=str(output_path), - method=method, - ) - - if success: - print(f" ✅ {method}: {output_path}") - conversion_results.append( - { - "file": markdown_file, - "method": method, - "output": str(output_path), - "success": True, - } - ) - break # Use first successful method - else: - print(f" ❌ {method}: Failed") - - except Exception as e: - print(f" ❌ {method}: {str(e)}") - continue - - # Summary - print("\nConversion Summary:") - print(f" Total files: {len(sample_files)}") - print(f" Successful conversions: {len(conversion_results)}") - - return conversion_results - - except Exception as e: - print(f"Enhanced markdown conversion failed: {str(e)}") - return None - - -async def demonstrate_integration(): - """Demonstrate integration with RAG-Anything""" - print("\n" + "=" * 50) - print("RAG-ANYTHING INTEGRATION DEMONSTRATION") - print("=" * 50) - - # Create sample files - sample_files, temp_dir = create_sample_markdown_files() - - try: - # Initialize RAG-Anything (without API keys for demo) - config = RAGAnythingConfig( - working_dir=str(temp_dir / "rag_storage"), - enable_image_processing=True, - enable_table_processing=True, - enable_equation_processing=True, - ) - - rag = RAGAnything(config=config) - - # Demonstrate batch processing with RAG - print("Processing documents with batch functionality...") - - # Note: This would require actual API keys for 
full functionality - # For demo purposes, we'll just show the interface - print(" - Batch processing interface available") - print(" - Enhanced markdown conversion available") - print(" - Integration with multimodal processors available") - - # Show that rag object has the expected methods - print(f" - RAG instance created: {type(rag).__name__}") - print( - f" - Available batch methods: {[m for m in dir(rag) if 'batch' in m.lower()]}" - ) - - return True - - except Exception as e: - print(f"Integration demonstration failed: {str(e)}") - return False - - -def main(): - """Main demonstration function""" - # Configure logging - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - ) - - print("RAG-Anything Batch Processing and Enhanced Markdown Demo") - print("=" * 60) - - # Demonstrate batch processing - batch_result = demonstrate_batch_processing() - - # Demonstrate enhanced markdown conversion - markdown_result = demonstrate_enhanced_markdown() - - # Demonstrate integration - asyncio.run(demonstrate_integration()) - - # Summary - print("\n" + "=" * 60) - print("DEMONSTRATION SUMMARY") - print("=" * 60) - if batch_result: - print(f"Batch Processing: {batch_result.success_rate:.1f}% success rate") - else: - print("Batch Processing: Failed") - - if markdown_result: - print(f"Enhanced Markdown: {len(markdown_result)} successful conversions") - else: - print("Enhanced Markdown: Failed") - - print("\nFeatures demonstrated:") - print(" - Parallel document processing with progress tracking") - print(" - Multiple markdown conversion backends (WeasyPrint, Pandoc)") - print(" - Enhanced styling and formatting") - print(" - Integration with RAG-Anything pipeline") - print(" - Comprehensive error handling and reporting") - - -if __name__ == "__main__": - main() diff --git a/examples/batch_processing_example.py b/examples/batch_processing_example.py new file mode 100644 index 0000000..ce03c39 --- /dev/null +++ 
b/examples/batch_processing_example.py @@ -0,0 +1,550 @@ +#!/usr/bin/env python +""" +Batch Processing Example for RAG-Anything + +This example demonstrates how to use the batch processing capabilities +to process multiple documents in parallel for improved throughput. + +Features demonstrated: +- Basic batch processing with BatchParser +- Asynchronous batch processing +- Integration with RAG-Anything +- Error handling and progress tracking +- File filtering and directory processing +""" + +import asyncio +import logging +from pathlib import Path +import tempfile +import time + +# Add project root directory to Python path +import sys +sys.path.append(str(Path(__file__).parent.parent)) + +from raganything import RAGAnything, RAGAnythingConfig +from raganything.batch_parser import BatchParser + + +def create_sample_documents(): + """Create sample documents for batch processing testing""" + temp_dir = Path(tempfile.mkdtemp()) + sample_files = [] + + # Create various document types + documents = { + "document1.txt": "This is a simple text document for testing batch processing.", + "document2.txt": "Another text document with different content.", + "document3.md": """# Markdown Document + +## Introduction +This is a markdown document for testing. + +### Features +- Markdown formatting +- Code blocks +- Lists + +```python +def example(): + return "Hello from markdown" +``` +""", + "report.txt": """Business Report + +Executive Summary: +This report demonstrates batch processing capabilities. + +Key Findings: +1. Parallel processing improves throughput +2. Progress tracking enhances user experience +3. Error handling ensures reliability + +Conclusion: +Batch processing is essential for large-scale document processing. +""", + "notes.md": """# Meeting Notes + +## Date: 2024-01-15 + +### Attendees +- Alice Johnson +- Bob Smith +- Carol Williams + +### Discussion Topics +1. 
**Batch Processing Implementation** + - Parallel document processing + - Progress tracking + - Error handling strategies + +2. **Performance Metrics** + - Target: 100 documents/hour + - Memory usage: < 4GB + - Success rate: > 95% + +### Action Items +- [ ] Implement batch processing +- [ ] Add progress bars +- [ ] Test with large document sets +- [ ] Optimize memory usage + +### Next Steps +Continue development and testing of batch processing features. +""" + } + + # Create files + for filename, content in documents.items(): + file_path = temp_dir / filename + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + sample_files.append(str(file_path)) + + return sample_files, temp_dir + + +def demonstrate_basic_batch_processing(): + """Demonstrate basic batch processing functionality""" + print("\n" + "=" * 60) + print("BASIC BATCH PROCESSING DEMONSTRATION") + print("=" * 60) + + # Create sample documents + sample_files, temp_dir = create_sample_documents() + + try: + print(f"Created {len(sample_files)} sample documents in: {temp_dir}") + for file_path in sample_files: + print(f" - {Path(file_path).name}") + + # Create batch parser + batch_parser = BatchParser( + parser_type="mineru", + max_workers=3, + show_progress=True, + timeout_per_file=60, + skip_installation_check=True # Skip installation check for demo + ) + + print(f"\nBatch parser configured:") + print(f" - Parser type: mineru") + print(f" - Max workers: 3") + print(f" - Progress tracking: enabled") + print(f" - Timeout per file: 60 seconds") + + # Check supported extensions + supported_extensions = batch_parser.get_supported_extensions() + print(f" - Supported extensions: {supported_extensions}") + + # Filter files to supported types + supported_files = batch_parser.filter_supported_files(sample_files) + print(f"\nFile filtering results:") + print(f" - Total files: {len(sample_files)}") + print(f" - Supported files: {len(supported_files)}") + + # Process batch + output_dir = temp_dir / 
"batch_output" + print(f"\nStarting batch processing...") + print(f"Output directory: {output_dir}") + + start_time = time.time() + result = batch_parser.process_batch( + file_paths=supported_files, + output_dir=str(output_dir), + parse_method="auto", + recursive=False + ) + processing_time = time.time() - start_time + + # Display results + print("\n" + "-" * 40) + print("BATCH PROCESSING RESULTS") + print("-" * 40) + print(result.summary()) + print(f"Total processing time: {processing_time:.2f} seconds") + print(f"Success rate: {result.success_rate:.1f}%") + + if result.successful_files: + print(f"\nSuccessfully processed files:") + for file_path in result.successful_files: + print(f" ✅ {Path(file_path).name}") + + if result.failed_files: + print(f"\nFailed files:") + for file_path in result.failed_files: + error = result.errors.get(file_path, "Unknown error") + print(f" ❌ {Path(file_path).name}: {error}") + + return result + + except Exception as e: + print(f"❌ Batch processing demonstration failed: {str(e)}") + return None + + +async def demonstrate_async_batch_processing(): + """Demonstrate asynchronous batch processing""" + print("\n" + "=" * 60) + print("ASYNCHRONOUS BATCH PROCESSING DEMONSTRATION") + print("=" * 60) + + # Create sample documents + sample_files, temp_dir = create_sample_documents() + + try: + print(f"Processing {len(sample_files)} documents asynchronously...") + + # Create batch parser + batch_parser = BatchParser( + parser_type="mineru", + max_workers=2, + show_progress=True, + skip_installation_check=True + ) + + # Process batch asynchronously + output_dir = temp_dir / "async_output" + + start_time = time.time() + result = await batch_parser.process_batch_async( + file_paths=sample_files, + output_dir=str(output_dir), + parse_method="auto", + recursive=False + ) + processing_time = time.time() - start_time + + # Display results + print("\n" + "-" * 40) + print("ASYNC BATCH PROCESSING RESULTS") + print("-" * 40) + print(result.summary()) + 
print(f"Async processing time: {processing_time:.2f} seconds") + print(f"Success rate: {result.success_rate:.1f}%") + + return result + + except Exception as e: + print(f"❌ Async batch processing demonstration failed: {str(e)}") + return None + + +async def demonstrate_rag_integration(): + """Demonstrate batch processing integration with RAG-Anything""" + print("\n" + "=" * 60) + print("RAG-ANYTHING BATCH INTEGRATION DEMONSTRATION") + print("=" * 60) + + # Create sample documents + sample_files, temp_dir = create_sample_documents() + + try: + # Initialize RAG-Anything with temporary storage + config = RAGAnythingConfig( + working_dir=str(temp_dir / "rag_storage"), + enable_image_processing=True, + enable_table_processing=True, + enable_equation_processing=True, + max_concurrent_files=2 + ) + + rag = RAGAnything(config=config) + + print("RAG-Anything initialized with batch processing capabilities") + + # Show available batch methods + batch_methods = [method for method in dir(rag) if 'batch' in method.lower()] + print(f"Available batch methods: {batch_methods}") + + # Demonstrate batch processing with RAG integration + print(f"\nProcessing {len(sample_files)} documents with RAG integration...") + + # Use the RAG-integrated batch processing + try: + # Process documents in batch + result = rag.process_documents_batch( + file_paths=sample_files, + output_dir=str(temp_dir / "rag_batch_output"), + max_workers=2, + show_progress=True + ) + + print("\n" + "-" * 40) + print("RAG BATCH PROCESSING RESULTS") + print("-" * 40) + print(result.summary()) + print(f"Success rate: {result.success_rate:.1f}%") + + # Demonstrate batch processing with full RAG integration + print(f"\nProcessing documents with full RAG integration...") + + rag_result = await rag.process_documents_with_rag_batch( + file_paths=sample_files[:2], # Process subset for demo + output_dir=str(temp_dir / "rag_full_output"), + max_workers=1, + show_progress=True + ) + + print("\n" + "-" * 40) + print("FULL RAG 
INTEGRATION RESULTS") + print("-" * 40) + print(f"Parse result: {rag_result['parse_result'].summary()}") + print(f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds") + print(f"Successfully processed with RAG: {rag_result['successful_rag_files']}") + print(f"Failed RAG processing: {rag_result['failed_rag_files']}") + + return rag_result + + except Exception as e: + print(f"⚠️ RAG integration demo completed with limitations: {str(e)}") + print("Note: This is expected in environments without full API configuration") + return None + + except Exception as e: + print(f"❌ RAG integration demonstration failed: {str(e)}") + return None + + +def demonstrate_directory_processing(): + """Demonstrate processing entire directories""" + print("\n" + "=" * 60) + print("DIRECTORY PROCESSING DEMONSTRATION") + print("=" * 60) + + # Create a directory structure with nested files + temp_dir = Path(tempfile.mkdtemp()) + + # Create main directory files + main_files = { + "overview.txt": "Main directory overview document", + "readme.md": "# Project README\n\nThis is the main project documentation." + } + + # Create subdirectory + sub_dir = temp_dir / "subdirectory" + sub_dir.mkdir() + + sub_files = { + "details.txt": "Detailed information in subdirectory", + "notes.md": "# Notes\n\nAdditional notes and information." 
+ } + + # Write all files + all_files = [] + for filename, content in main_files.items(): + file_path = temp_dir / filename + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + all_files.append(str(file_path)) + + for filename, content in sub_files.items(): + file_path = sub_dir / filename + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + all_files.append(str(file_path)) + + try: + print(f"Created directory structure:") + print(f" Main directory: {temp_dir}") + print(f" Files in main: {list(main_files.keys())}") + print(f" Subdirectory: {sub_dir}") + print(f" Files in sub: {list(sub_files.keys())}") + + # Create batch parser + batch_parser = BatchParser( + parser_type="mineru", + max_workers=2, + show_progress=True, + skip_installation_check=True + ) + + # Process entire directory recursively + print(f"\nProcessing entire directory recursively...") + + result = batch_parser.process_batch( + file_paths=[str(temp_dir)], # Pass directory path + output_dir=str(temp_dir / "directory_output"), + parse_method="auto", + recursive=True # Include subdirectories + ) + + print("\n" + "-" * 40) + print("DIRECTORY PROCESSING RESULTS") + print("-" * 40) + print(result.summary()) + print(f"Total files found and processed: {result.total_files}") + print(f"Success rate: {result.success_rate:.1f}%") + + if result.successful_files: + print(f"\nSuccessfully processed:") + for file_path in result.successful_files: + relative_path = Path(file_path).relative_to(temp_dir) + print(f" ✅ {relative_path}") + + return result + + except Exception as e: + print(f"❌ Directory processing demonstration failed: {str(e)}") + return None + + +def demonstrate_error_handling(): + """Demonstrate error handling and recovery""" + print("\n" + "=" * 60) + print("ERROR HANDLING DEMONSTRATION") + print("=" * 60) + + temp_dir = Path(tempfile.mkdtemp()) + + # Create files with various issues + files_with_issues = { + "valid_file.txt": "This is a valid file that should 
process successfully.", + "empty_file.txt": "", # Empty file + "large_file.txt": "x" * 1000000, # Large file (1MB of 'x') + } + + created_files = [] + for filename, content in files_with_issues.items(): + file_path = temp_dir / filename + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + created_files.append(str(file_path)) + + # Add a non-existent file to the list + created_files.append(str(temp_dir / "non_existent_file.txt")) + + try: + print(f"Testing error handling with {len(created_files)} files:") + for file_path in created_files: + name = Path(file_path).name + exists = Path(file_path).exists() + size = Path(file_path).stat().st_size if exists else 0 + print(f" - {name}: {'exists' if exists else 'missing'}, {size} bytes") + + # Create batch parser with short timeout for demonstration + batch_parser = BatchParser( + parser_type="mineru", + max_workers=2, + show_progress=True, + timeout_per_file=30, # Short timeout for demo + skip_installation_check=True + ) + + # Process files and handle errors + result = batch_parser.process_batch( + file_paths=created_files, + output_dir=str(temp_dir / "error_test_output"), + parse_method="auto" + ) + + print("\n" + "-" * 40) + print("ERROR HANDLING RESULTS") + print("-" * 40) + print(result.summary()) + + if result.successful_files: + print(f"\nSuccessful files:") + for file_path in result.successful_files: + print(f" ✅ {Path(file_path).name}") + + if result.failed_files: + print(f"\nFailed files with error details:") + for file_path in result.failed_files: + error = result.errors.get(file_path, "Unknown error") + print(f" ❌ {Path(file_path).name}: {error}") + + # Demonstrate retry logic + if result.failed_files: + print(f"\nDemonstrating retry logic for {len(result.failed_files)} failed files...") + + # Retry only the failed files + retry_result = batch_parser.process_batch( + file_paths=result.failed_files, + output_dir=str(temp_dir / "retry_output"), + parse_method="auto" + ) + + print(f"Retry 
results: {retry_result.summary()}") + + return result + + except Exception as e: + print(f"❌ Error handling demonstration failed: {str(e)}") + return None + + +async def main(): + """Main demonstration function""" + # Configure logging + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + + print("RAG-Anything Batch Processing Demonstration") + print("=" * 70) + print("This example demonstrates various batch processing capabilities:") + print(" - Basic batch processing with progress tracking") + print(" - Asynchronous processing for improved performance") + print(" - Integration with RAG-Anything pipeline") + print(" - Directory processing with recursive file discovery") + print(" - Comprehensive error handling and recovery") + + results = {} + + # Run demonstrations + print("\n🚀 Starting demonstrations...") + + # Basic batch processing + results['basic'] = demonstrate_basic_batch_processing() + + # Asynchronous processing + results['async'] = await demonstrate_async_batch_processing() + + # RAG integration + results['rag'] = await demonstrate_rag_integration() + + # Directory processing + results['directory'] = demonstrate_directory_processing() + + # Error handling + results['error_handling'] = demonstrate_error_handling() + + # Summary + print("\n" + "=" * 70) + print("DEMONSTRATION SUMMARY") + print("=" * 70) + + for demo_name, result in results.items(): + if result: + if hasattr(result, 'success_rate'): + print(f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate") + else: + print(f"✅ {demo_name.upper()}: Completed successfully") + else: + print(f"❌ {demo_name.upper()}: Failed or had limitations") + + print("\n📊 Key Features Demonstrated:") + print(" - Parallel document processing with configurable worker counts") + print(" - Real-time progress tracking with tqdm progress bars") + print(" - Comprehensive error handling and reporting") + print(" - File filtering based on 
supported document types") + print(" - Directory processing with recursive file discovery") + print(" - Asynchronous processing for improved performance") + print(" - Integration with RAG-Anything document pipeline") + print(" - Retry logic for failed documents") + print(" - Detailed processing statistics and timing") + + print("\n💡 Best Practices Highlighted:") + print(" - Use appropriate worker counts for your system") + print(" - Enable progress tracking for long-running operations") + print(" - Handle errors gracefully with retry mechanisms") + print(" - Filter files to supported types before processing") + print(" - Set reasonable timeouts for document processing") + print(" - Use skip_installation_check for environments with conflicts") + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/examples/enhanced_markdown_example.py b/examples/enhanced_markdown_example.py new file mode 100644 index 0000000..b3c96e0 --- /dev/null +++ b/examples/enhanced_markdown_example.py @@ -0,0 +1,1031 @@ +#!/usr/bin/env python +""" +Enhanced Markdown Conversion Example for RAG-Anything + +This example demonstrates the enhanced markdown to PDF conversion capabilities +with multiple backends, advanced styling, and professional formatting. 
+ +Features demonstrated: +- Basic markdown to PDF conversion +- Multiple conversion backends (WeasyPrint, Pandoc) +- Custom CSS styling and configuration +- Backend detection and selection +- Error handling and fallback mechanisms +- Command-line interface usage +""" + +import logging +from pathlib import Path +import tempfile + +# Add project root directory to Python path +import sys +sys.path.append(str(Path(__file__).parent.parent)) + +from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig + + +def create_sample_markdown_content(): + """Create comprehensive sample markdown content for testing""" + + # Basic sample + basic_content = """# Basic Markdown Sample + +## Introduction +This is a simple markdown document demonstrating basic formatting. + +### Text Formatting +- **Bold text** and *italic text* +- `Inline code` examples +- [Links to external sites](https://github.com) + +### Lists +1. First ordered item +2. Second ordered item +3. Third ordered item + +- Unordered item +- Another unordered item + - Nested item + - Another nested item + +### Blockquotes +> This is a blockquote with important information. +> It can span multiple lines. + +### Code Block +```python +def hello_world(): + print("Hello, World!") + return "Success" +``` +""" + + # Technical documentation sample + technical_content = """# Technical Documentation + +## Table of Contents +- [Overview](#overview) +- [Architecture](#architecture) +- [Implementation](#implementation) +- [Performance](#performance) + +## Overview +This document provides comprehensive technical specifications for the enhanced markdown conversion system. + +## Architecture + +### Core Components +1. **Markdown Parser**: Processes markdown syntax +2. **CSS Engine**: Applies styling and layout +3. **PDF Generator**: Creates final PDF output +4. 
**Backend Manager**: Handles multiple conversion engines + +### Data Flow +```mermaid +graph LR + A[Markdown Input] --> B[Parser] + B --> C[CSS Processor] + C --> D[PDF Generator] + D --> E[PDF Output] +``` + +## Implementation + +### Python Code Example +```python +from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig + +# Configure converter +config = MarkdownConfig( + page_size="A4", + margin="1in", + include_toc=True, + syntax_highlighting=True +) + +# Create converter +converter = EnhancedMarkdownConverter(config) + +# Convert to PDF +success = converter.convert_file_to_pdf( + input_path="document.md", + output_path="output.pdf", + method="weasyprint" +) +``` + +### Configuration Options +```yaml +converter: + page_size: A4 + margin: 1in + font_size: 12pt + include_toc: true + syntax_highlighting: true + backend: weasyprint +``` + +## Performance + +### Benchmark Results +| Backend | Speed | Quality | Features | +|---------|-------|---------|----------| +| WeasyPrint | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | +| Pandoc | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | + +### Processing Times +- **Small documents** (< 10 pages): 1-3 seconds +- **Medium documents** (10-50 pages): 3-10 seconds +- **Large documents** (> 50 pages): 10-30 seconds + +## Advanced Features + +### Custom CSS Styling +The system supports advanced CSS customization: + +```css +body { + font-family: 'Georgia', serif; + line-height: 1.6; + color: #333; +} + +h1 { + color: #2c3e50; + border-bottom: 2px solid #3498db; + padding-bottom: 0.3em; +} + +code { + background-color: #f8f9fa; + padding: 2px 4px; + border-radius: 3px; + font-family: 'Courier New', monospace; +} + +pre { + background-color: #f8f9fa; + border-left: 4px solid #3498db; + padding: 15px; + border-radius: 5px; + overflow-x: auto; +} + +table { + border-collapse: collapse; + width: 100%; + margin: 1em 0; +} + +th, td { + border: 1px solid #ddd; + padding: 8px 12px; + text-align: left; +} + +th { + background-color: #f2f2f2; + font-weight: 
bold; +} +``` + +### Image Support +![Sample Image](https://via.placeholder.com/400x200/3498db/ffffff?text=Sample+Image) + +Images are automatically scaled and positioned appropriately in the PDF output. + +## Conclusion +The enhanced markdown conversion system provides professional-quality PDF generation with extensive customization options and multiple backend support. + +--- + +*Generated on: 2024-01-15* +*Version: 1.0.0* +""" + + # Academic paper sample + academic_content = """# Research Paper: Advanced Document Processing + +**Authors:** Alice Johnson¹, Bob Smith², Carol Williams¹ +**Affiliations:** +¹ University of Technology +² Research Institute + +## Abstract + +This paper presents a comprehensive analysis of advanced document processing techniques using enhanced markdown conversion. Our research demonstrates significant improvements in processing speed and output quality through optimized backend selection and custom styling approaches. + +**Keywords:** document processing, markdown conversion, PDF generation, performance optimization + +## 1. Introduction + +Document processing has become increasingly important in modern information systems. The ability to convert markdown documents to high-quality PDF outputs with professional formatting is crucial for academic, technical, and business applications. + +### 1.1 Research Objectives + +1. Evaluate different markdown conversion backends +2. Analyze performance characteristics of each approach +3. Develop optimization strategies for large-scale processing +4. Design flexible configuration systems for diverse use cases + +### 1.2 Contributions + +This work makes the following contributions: +- Comprehensive comparison of markdown conversion backends +- Performance optimization techniques for large documents +- Flexible configuration framework for customization +- Integration patterns for document processing pipelines + +## 2. 
Methodology + +### 2.1 Experimental Setup + +We conducted experiments using the following configuration: + +```python +# Experimental configuration +config = MarkdownConfig( + page_size="A4", + margin="1in", + font_size="11pt", + line_height="1.4", + include_toc=True, + syntax_highlighting=True +) +``` + +### 2.2 Test Documents + +| Category | Count | Avg Size | Complexity | +|----------|-------|----------|------------| +| Simple | 100 | 2 pages | Low | +| Medium | 50 | 10 pages | Medium | +| Complex | 25 | 25 pages | High | + +### 2.3 Metrics + +We evaluated performance using the following metrics: +- **Conversion Speed**: Time to generate PDF (seconds) +- **Memory Usage**: Peak memory consumption (MB) +- **Output Quality**: Visual assessment score (1-10) +- **Feature Support**: Number of supported markdown features + +## 3. Results + +### 3.1 Performance Comparison + +The following table summarizes our performance results: + +| Backend | Speed (s) | Memory (MB) | Quality | Features | +|---------|-----------|-------------|---------|----------| +| WeasyPrint | 2.3 ± 0.5 | 85 ± 15 | 8.5 | 85% | +| Pandoc | 4.7 ± 1.2 | 120 ± 25 | 9.2 | 95% | + +### 3.2 Quality Analysis + +#### 3.2.1 Typography +WeasyPrint excels in web-style typography with excellent CSS support, while Pandoc provides superior academic formatting with LaTeX-quality output. 
+ +#### 3.2.2 Code Highlighting +Both backends support syntax highlighting through Pygments: + +```python +def analyze_performance(backend, documents): + '''Analyze conversion performance for given backend''' + results = [] + + for doc in documents: + start_time = time.time() + success = backend.convert(doc) + end_time = time.time() + + results.append({ + 'document': doc, + 'time': end_time - start_time, + 'success': success + }) + + return results +``` + +### 3.3 Scalability + +Our scalability analysis shows: +- Linear scaling with document size for both backends +- Memory usage proportional to content complexity +- Optimal batch sizes of 10-20 documents for parallel processing + +## 4. Discussion + +### 4.1 Backend Selection Guidelines + +Choose **WeasyPrint** for: +- Web-style documents with custom CSS +- Fast conversion requirements +- Simple to medium complexity documents + +Choose **Pandoc** for: +- Academic papers and publications +- Complex document structures +- Maximum feature support requirements + +### 4.2 Optimization Strategies + +1. **Image Optimization**: Compress images before embedding +2. **CSS Minimization**: Use efficient CSS selectors +3. **Content Chunking**: Process large documents in sections +4. **Caching**: Cache converted content for repeated use + +## 5. Conclusion + +This research demonstrates that enhanced markdown conversion provides significant benefits for document processing workflows. The choice between WeasyPrint and Pandoc depends on specific requirements for speed, quality, and features. + +### 5.1 Future Work + +- Integration with cloud processing services +- Real-time collaborative editing support +- Advanced template systems +- Performance optimization for very large documents + +## References + +1. Johnson, A. et al. (2024). "Advanced Document Processing Techniques." *Journal of Information Systems*, 15(3), 45-62. +2. Smith, B. (2023). "PDF Generation Optimization." *Technical Computing Review*, 8(2), 12-28. +3. 
Williams, C. (2024). "Markdown Processing Frameworks." *Software Engineering Quarterly*, 22(1), 78-95. + +--- + +**Manuscript received:** January 10, 2024 +**Accepted for publication:** January 15, 2024 +**Published online:** January 20, 2024 +""" + + return { + "basic": basic_content, + "technical": technical_content, + "academic": academic_content + } + + +def demonstrate_basic_conversion(): + """Demonstrate basic markdown to PDF conversion""" + print("\n" + "=" * 60) + print("BASIC MARKDOWN CONVERSION DEMONSTRATION") + print("=" * 60) + + try: + # Create converter with default settings + converter = EnhancedMarkdownConverter() + + # Show backend information + backend_info = converter.get_backend_info() + print("Available conversion backends:") + for backend, available in backend_info["available_backends"].items(): + status = "✅" if available else "❌" + print(f" {status} {backend}") + print(f"Recommended backend: {backend_info['recommended_backend']}") + + # Get sample content + samples = create_sample_markdown_content() + temp_dir = Path(tempfile.mkdtemp()) + + # Convert basic sample + basic_md_path = temp_dir / "basic_sample.md" + with open(basic_md_path, 'w', encoding='utf-8') as f: + f.write(samples['basic']) + + print(f"\nConverting basic sample: {basic_md_path}") + + success = converter.convert_file_to_pdf( + input_path=str(basic_md_path), + output_path=str(temp_dir / "basic_sample.pdf"), + method="auto" # Let the system choose the best backend + ) + + if success: + print("✅ Basic conversion successful!") + print(f" Output: {temp_dir / 'basic_sample.pdf'}") + else: + print("❌ Basic conversion failed") + + return success, temp_dir + + except Exception as e: + print(f"❌ Basic conversion demonstration failed: {str(e)}") + return False, None + + +def demonstrate_backend_comparison(): + """Demonstrate different conversion backends""" + print("\n" + "=" * 60) + print("BACKEND COMPARISON DEMONSTRATION") + print("=" * 60) + + try: + samples = 
create_sample_markdown_content() + temp_dir = Path(tempfile.mkdtemp()) + + # Create technical document + tech_md_path = temp_dir / "technical.md" + with open(tech_md_path, 'w', encoding='utf-8') as f: + f.write(samples['technical']) + + print(f"Testing different backends with technical document...") + + # Test different backends + backends = ["auto", "weasyprint", "pandoc"] + results = {} + + for backend in backends: + try: + print(f"\nTesting {backend} backend...") + + converter = EnhancedMarkdownConverter() + output_path = temp_dir / f"technical_{backend}.pdf" + + import time + start_time = time.time() + + success = converter.convert_file_to_pdf( + input_path=str(tech_md_path), + output_path=str(output_path), + method=backend + ) + + end_time = time.time() + conversion_time = end_time - start_time + + if success: + file_size = output_path.stat().st_size if output_path.exists() else 0 + print(f" βœ… {backend}: Success in {conversion_time:.2f}s, {file_size} bytes") + results[backend] = { + 'success': True, + 'time': conversion_time, + 'size': file_size, + 'output': str(output_path) + } + else: + print(f" ❌ {backend}: Failed") + results[backend] = {'success': False, 'time': conversion_time} + + except Exception as e: + print(f" ❌ {backend}: Error - {str(e)}") + results[backend] = {'success': False, 'error': str(e)} + + # Summary + print("\n" + "-" * 40) + print("BACKEND COMPARISON SUMMARY") + print("-" * 40) + successful_backends = [b for b, r in results.items() if r.get('success', False)] + print(f"Successful backends: {successful_backends}") + + if successful_backends: + fastest = min(successful_backends, key=lambda b: results[b]['time']) + print(f"Fastest backend: {fastest} ({results[fastest]['time']:.2f}s)") + + return results, temp_dir + + except Exception as e: + print(f"❌ Backend comparison demonstration failed: {str(e)}") + return None, None + + +def demonstrate_custom_styling(): + """Demonstrate custom CSS styling and configuration""" + print("\n" + "=" * 
60) + print("CUSTOM STYLING DEMONSTRATION") + print("=" * 60) + + try: + samples = create_sample_markdown_content() + temp_dir = Path(tempfile.mkdtemp()) + + # Create custom CSS + custom_css = """ + body { + font-family: 'Times New Roman', serif; + font-size: 11pt; + line-height: 1.4; + color: #2c3e50; + max-width: 800px; + margin: 0 auto; + padding: 20px; + } + + h1 { + color: #c0392b; + font-size: 2.2em; + border-bottom: 3px solid #e74c3c; + padding-bottom: 0.5em; + margin-top: 2em; + } + + h2 { + color: #8e44ad; + font-size: 1.6em; + border-bottom: 2px solid #9b59b6; + padding-bottom: 0.3em; + margin-top: 1.5em; + } + + h3 { + color: #2980b9; + font-size: 1.3em; + margin-top: 1.2em; + } + + code { + background-color: #ecf0f1; + color: #e74c3c; + padding: 3px 6px; + border-radius: 4px; + font-family: 'Courier New', monospace; + font-size: 0.9em; + } + + pre { + background-color: #2c3e50; + color: #ecf0f1; + padding: 20px; + border-radius: 8px; + border-left: 5px solid #3498db; + overflow-x: auto; + font-size: 0.9em; + } + + pre code { + background-color: transparent; + color: inherit; + padding: 0; + } + + blockquote { + background-color: #f8f9fa; + border-left: 5px solid #3498db; + margin: 1em 0; + padding: 15px 20px; + font-style: italic; + color: #555; + } + + table { + border-collapse: collapse; + width: 100%; + margin: 1.5em 0; + background-color: white; + border-radius: 8px; + overflow: hidden; + box-shadow: 0 2px 4px rgba(0,0,0,0.1); + } + + th { + background-color: #3498db; + color: white; + padding: 12px 15px; + text-align: left; + font-weight: bold; + } + + td { + padding: 10px 15px; + border-bottom: 1px solid #ecf0f1; + } + + tr:nth-child(even) { + background-color: #f8f9fa; + } + + tr:hover { + background-color: #e8f4fd; + } + + ul, ol { + margin-bottom: 1em; + padding-left: 2em; + } + + li { + margin-bottom: 0.5em; + line-height: 1.6; + } + + a { + color: #3498db; + text-decoration: none; + border-bottom: 1px dotted #3498db; + } + + a:hover { + 
color: #2980b9; + border-bottom: 1px solid #2980b9; + } + + .toc { + background-color: #f8f9fa; + border: 2px solid #e9ecef; + border-radius: 8px; + padding: 20px; + margin: 2em 0; + } + + .toc h2 { + color: #2c3e50; + margin-top: 0; + border-bottom: none; + } + + .toc ul { + list-style-type: none; + padding-left: 0; + } + + .toc li { + margin-bottom: 0.8em; + } + + .toc a { + color: #2c3e50; + font-weight: 500; + border-bottom: none; + } + """ + + # Create custom configuration + config = MarkdownConfig( + page_size="A4", + margin="0.8in", + font_size="11pt", + line_height="1.4", + include_toc=True, + syntax_highlighting=True, + custom_css=custom_css + ) + + converter = EnhancedMarkdownConverter(config) + + # Convert academic sample with custom styling + academic_md_path = temp_dir / "academic_styled.md" + with open(academic_md_path, 'w', encoding='utf-8') as f: + f.write(samples['academic']) + + print("Converting academic paper with custom styling...") + print("Custom styling features:") + print(" - Custom color scheme (reds, purples, blues)") + print(" - Times New Roman serif font") + print(" - Enhanced table styling with hover effects") + print(" - Styled code blocks with dark theme") + print(" - Custom blockquote styling") + print(" - Professional header styling") + + success = converter.convert_file_to_pdf( + input_path=str(academic_md_path), + output_path=str(temp_dir / "academic_styled.pdf"), + method="weasyprint" # WeasyPrint is best for custom CSS + ) + + if success: + print("βœ… Custom styling conversion successful!") + print(f" Output: {temp_dir / 'academic_styled.pdf'}") + + # Also create a default version for comparison + default_converter = EnhancedMarkdownConverter() + default_success = default_converter.convert_file_to_pdf( + input_path=str(academic_md_path), + output_path=str(temp_dir / "academic_default.pdf"), + method="weasyprint" + ) + + if default_success: + print(f" Comparison (default): {temp_dir / 'academic_default.pdf'}") + else: + print("❌ 
Custom styling conversion failed") + + return success, temp_dir + + except Exception as e: + print(f"❌ Custom styling demonstration failed: {str(e)}") + return False, None + + +def demonstrate_content_conversion(): + """Demonstrate converting markdown content directly (not from file)""" + print("\n" + "=" * 60) + print("CONTENT CONVERSION DEMONSTRATION") + print("=" * 60) + + try: + # Create markdown content programmatically + dynamic_content = f"""# Dynamic Content Example + +## Generated Information +This document was generated programmatically on {Path(__file__).name}. + +## System Information +- **Python Path**: {sys.executable} +- **Script Location**: {Path(__file__).absolute()} +- **Working Directory**: {Path.cwd()} + +## Dynamic Table +| Property | Value | +|----------|-------| +| Script Name | {Path(__file__).name} | +| Python Version | {sys.version.split()[0]} | +| Platform | {sys.platform} | + +## Code Example +```python +# This content was generated dynamically +import sys +from pathlib import Path + +def generate_report(): + return f"Report generated from {{Path(__file__).name}}" + +print(generate_report()) +``` + +## Features Demonstrated +This example shows how to: +1. Generate markdown content programmatically +2. Convert content directly without saving to file first +3. Include dynamic information in documents +4. Use different conversion methods + +> **Note**: This content was created in memory and converted directly to PDF +> without intermediate file storage. 
+ +## Conclusion +Direct content conversion is useful for: +- Dynamic report generation +- Programmatic document creation +- API-based document services +- Real-time content processing +""" + + temp_dir = Path(tempfile.mkdtemp()) + converter = EnhancedMarkdownConverter() + + print("Converting dynamically generated markdown content...") + print("Content includes:") + print(" - System information") + print(" - Dynamic tables with current values") + print(" - Generated timestamps") + print(" - Programmatic examples") + + # Convert content directly to PDF + output_path = temp_dir / "dynamic_content.pdf" + + success = converter.convert_markdown_to_pdf( + markdown_content=dynamic_content, + output_path=str(output_path), + method="auto" + ) + + if success: + print("βœ… Content conversion successful!") + print(f" Output: {output_path}") + + # Show file size + file_size = output_path.stat().st_size + print(f" Generated PDF size: {file_size} bytes") + else: + print("❌ Content conversion failed") + + return success, temp_dir + + except Exception as e: + print(f"❌ Content conversion demonstration failed: {str(e)}") + return False, None + + +def demonstrate_error_handling(): + """Demonstrate error handling and fallback mechanisms""" + print("\n" + "=" * 60) + print("ERROR HANDLING DEMONSTRATION") + print("=" * 60) + + try: + temp_dir = Path(tempfile.mkdtemp()) + + # Test cases with various issues + test_cases = { + "invalid_markdown": """# Invalid Markdown + +This markdown has some {{invalid}} syntax and [broken links](http://nonexistent.invalid). 
+ +```unknown_language +This code block uses an unknown language +``` + +![Missing Image](nonexistent_image.png) +""", + "complex_content": """# Complex Content Test + +## Mathematical Expressions +This tests content that might be challenging for some backends: + +$$ E = mc^2 $$ + +$$\\sum_{i=1}^{n} x_i = \\frac{n(n+1)}{2}$$ + +## Complex Tables +| A | B | C | D | E | F | G | +|---|---|---|---|---|---|---| +| Very long content that might wrap | Short | Medium length content | X | Y | Z | End | +| Another row with different lengths | A | B | C | D | E | F | + +## Special Characters +Unicode: Ξ±, Ξ², Ξ³, Ξ΄, Ξ΅, ΞΆ, Ξ·, ΞΈ, ΞΉ, ΞΊ, Ξ», ΞΌ, Ξ½, ΞΎ, ΞΏ, Ο€, ρ, Οƒ, Ο„, Ο…, Ο†, Ο‡, ψ, Ο‰ +Symbols: β™  ♣ β™₯ ♦ β˜€ ☁ β˜‚ β˜ƒ β˜„ β˜… β˜† β˜‰ ☊ β˜‹ ☌ ☍ ☎ ☏ +Arrows: ← ↑ β†’ ↓ ↔ ↕ β†– β†— β†˜ ↙ +""", + "empty_content": "", + "minimal_content": "# Just a title" + } + + print("Testing error handling with various content types...") + + results = {} + + for test_name, content in test_cases.items(): + print(f"\nTesting: {test_name}") + + try: + # Try multiple backends for each test case + for backend in ["auto", "weasyprint", "pandoc"]: + try: + converter = EnhancedMarkdownConverter() + output_path = temp_dir / f"{test_name}_{backend}.pdf" + + success = converter.convert_markdown_to_pdf( + markdown_content=content, + output_path=str(output_path), + method=backend + ) + + if success: + file_size = output_path.stat().st_size if output_path.exists() else 0 + print(f" βœ… {backend}: Success ({file_size} bytes)") + results[f"{test_name}_{backend}"] = { + 'success': True, + 'size': file_size + } + else: + print(f" ❌ {backend}: Failed") + results[f"{test_name}_{backend}"] = {'success': False} + + except Exception as e: + print(f" ❌ {backend}: Error - {str(e)[:60]}...") + results[f"{test_name}_{backend}"] = { + 'success': False, + 'error': str(e) + } + + except Exception as e: + print(f" ❌ Test case failed: {str(e)}") + + # Demonstrate robust conversion with fallbacks + 
print(f"\nDemonstrating robust conversion with fallback logic...") + + def robust_convert(content, output_path): + """Convert with multiple backend fallbacks""" + backends = ["weasyprint", "pandoc", "auto"] + + for backend in backends: + try: + converter = EnhancedMarkdownConverter() + success = converter.convert_markdown_to_pdf( + markdown_content=content, + output_path=output_path, + method=backend + ) + if success: + return backend, True + except Exception: + continue + + return None, False + + # Test robust conversion + test_content = test_cases["complex_content"] + robust_output = temp_dir / "robust_conversion.pdf" + + successful_backend, success = robust_convert(test_content, str(robust_output)) + + if success: + print(f"βœ… Robust conversion successful using {successful_backend}") + print(f" Output: {robust_output}") + else: + print("❌ All backends failed for robust conversion") + + # Summary + print("\n" + "-" * 40) + print("ERROR HANDLING SUMMARY") + print("-" * 40) + successful_conversions = sum(1 for r in results.values() if r.get('success', False)) + total_attempts = len(results) + success_rate = (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0 + + print(f"Total conversion attempts: {total_attempts}") + print(f"Successful conversions: {successful_conversions}") + print(f"Success rate: {success_rate:.1f}%") + + return results, temp_dir + + except Exception as e: + print(f"❌ Error handling demonstration failed: {str(e)}") + return None, None + + +def main(): + """Main demonstration function""" + # Configure logging + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + + print("RAG-Anything Enhanced Markdown Conversion Demonstration") + print("=" * 70) + print("This example demonstrates various enhanced markdown conversion capabilities:") + print(" - Basic markdown to PDF conversion") + print(" - Multiple backend comparison (WeasyPrint vs Pandoc)") + print(" - Custom 
CSS styling and professional formatting") + print(" - Direct content conversion without file I/O") + print(" - Comprehensive error handling and fallback mechanisms") + + results = {} + + # Run demonstrations + print("\nπŸš€ Starting demonstrations...") + + # Basic conversion + success, temp_dir = demonstrate_basic_conversion() + results['basic'] = success + + # Backend comparison + backend_results, _ = demonstrate_backend_comparison() + results['backends'] = backend_results + + # Custom styling + styling_success, _ = demonstrate_custom_styling() + results['styling'] = styling_success + + # Content conversion + content_success, _ = demonstrate_content_conversion() + results['content'] = content_success + + # Error handling + error_results, _ = demonstrate_error_handling() + results['error_handling'] = error_results + + # Summary + print("\n" + "=" * 70) + print("DEMONSTRATION SUMMARY") + print("=" * 70) + + print("βœ… Features Successfully Demonstrated:") + if results['basic']: + print(" - Basic markdown to PDF conversion") + if results['backends']: + successful_backends = [b for b, r in results['backends'].items() if r.get('success', False)] + print(f" - Multiple backends: {successful_backends}") + if results['styling']: + print(" - Custom CSS styling and professional formatting") + if results['content']: + print(" - Direct content conversion without file I/O") + if results['error_handling']: + success_rate = sum(1 for r in results['error_handling'].values() if r.get('success', False)) / len(results['error_handling']) * 100 + print(f" - Error handling with {success_rate:.1f}% overall success rate") + + print("\nπŸ“Š Key Capabilities Highlighted:") + print(" - Professional PDF generation with high-quality typography") + print(" - Multiple conversion backends with automatic selection") + print(" - Extensive CSS customization for branded documents") + print(" - Syntax highlighting for code blocks using Pygments") + print(" - Table formatting with professional 
styling") + print(" - Image embedding with proper scaling") + print(" - Table of contents generation with navigation") + print(" - Comprehensive error handling and fallback mechanisms") + + print("\nπŸ’‘ Best Practices Demonstrated:") + print(" - Choose WeasyPrint for web-style documents and custom CSS") + print(" - Choose Pandoc for academic papers and complex formatting") + print(" - Use 'auto' method for general-purpose conversion") + print(" - Implement fallback logic for robust conversion") + print(" - Optimize images before embedding in documents") + print(" - Test custom CSS with simple content first") + print(" - Handle errors gracefully with multiple backend attempts") + print(" - Use appropriate page sizes and margins for target use case") + + print("\n🎯 Integration Patterns:") + print(" - Standalone conversion for document generation") + print(" - Integration with RAG-Anything document pipeline") + print(" - API-based document services") + print(" - Batch processing for multiple documents") + print(" - Dynamic content generation from templates") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/raganything/enhanced_markdown.py b/raganything/enhanced_markdown.py index ac9c577..a456192 100644 --- a/raganything/enhanced_markdown.py +++ b/raganything/enhanced_markdown.py @@ -319,6 +319,7 @@ class EnhancedMarkdownConverter: "Pandoc not available. 
Install from: https://pandoc.org/installing.html" ) + temp_md_path = None try: import subprocess @@ -344,9 +345,6 @@ class EnhancedMarkdownConverter: # Run pandoc result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - # Clean up temp file - os.unlink(temp_md_path) - if result.returncode == 0: self.logger.info( f"Successfully converted to PDF using Pandoc: {output_path}" @@ -360,6 +358,15 @@ class EnhancedMarkdownConverter: self.logger.error(f"Pandoc conversion failed: {str(e)}") return False + finally: + if temp_md_path and os.path.exists(temp_md_path): + try: + os.unlink(temp_md_path) + except OSError as e: + self.logger.error( + f"Failed to clean up temp file {temp_md_path}: {str(e)}" + ) + def convert_markdown_to_pdf( self, markdown_content: str, output_path: str, method: str = "auto" ) -> bool: diff --git a/requirements.txt b/requirements.txt index f3a96dd..9cd2d0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,10 @@ huggingface_hub # LightRAG packages lightrag-hku - -# Enhanced markdown conversion (optional) -markdown - -# Enhanced markdown conversion (optional) - # MinerU 2.0 packages (replaces magic-pdf) mineru[core] -pygments - # Progress bars for batch processing tqdm - -# Progress bars for batch processing -weasyprint - # Note: Optional dependencies are now defined in setup.py extras_require: # - [image]: Pillow>=10.0.0 (for BMP, TIFF, GIF, WebP format conversion) # - [text]: reportlab>=4.0.0 (for TXT, MD to PDF conversion) diff --git a/setup.py b/setup.py index cc4e67f..3c2277a 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,11 @@ extras_require = { "text": ["reportlab>=4.0.0"], # For text file to PDF conversion (TXT, MD) "office": [], # Office document processing requires LibreOffice (external program) "all": ["Pillow>=10.0.0", "reportlab>=4.0.0"], # All optional features + "markdown": [ + "markdown>=3.4.0", + "weasyprint>=60.0", + "pygments>=2.10.0", + ], # Enhanced markdown conversion } setuptools.setup(