Mirror of https://github.com/HKUDS/RAG-Anything.git

Commit message: improvised version
examples/batch_processing_example.py | 550 (new file)

@@ -0,0 +1,550 @@
#!/usr/bin/env python
"""
Batch Processing Example for RAG-Anything

This example demonstrates how to use the batch processing capabilities
to process multiple documents in parallel for improved throughput.

Features demonstrated:
- Basic batch processing with BatchParser
- Asynchronous batch processing
- Integration with RAG-Anything
- Error handling and progress tracking
- File filtering and directory processing
"""

import asyncio
import logging
import sys
import tempfile
import time
from pathlib import Path

# Add the project root directory to the Python path so the example can be
# run directly from a repository checkout
sys.path.append(str(Path(__file__).parent.parent))

from raganything import RAGAnything, RAGAnythingConfig
from raganything.batch_parser import BatchParser


def create_sample_documents():
    """Create sample documents for batch processing testing"""
    temp_dir = Path(tempfile.mkdtemp())
    sample_files = []

    # Create various document types
    documents = {
        "document1.txt": "This is a simple text document for testing batch processing.",
        "document2.txt": "Another text document with different content.",
        "document3.md": """# Markdown Document

## Introduction
This is a markdown document for testing.

### Features
- Markdown formatting
- Code blocks
- Lists

```python
def example():
    return "Hello from markdown"
```
""",
        "report.txt": """Business Report

Executive Summary:
This report demonstrates batch processing capabilities.

Key Findings:
1. Parallel processing improves throughput
2. Progress tracking enhances user experience
3. Error handling ensures reliability

Conclusion:
Batch processing is essential for large-scale document processing.
""",
        "notes.md": """# Meeting Notes

## Date: 2024-01-15

### Attendees
- Alice Johnson
- Bob Smith
- Carol Williams

### Discussion Topics
1. **Batch Processing Implementation**
   - Parallel document processing
   - Progress tracking
   - Error handling strategies

2. **Performance Metrics**
   - Target: 100 documents/hour
   - Memory usage: < 4GB
   - Success rate: > 95%

### Action Items
- [ ] Implement batch processing
- [ ] Add progress bars
- [ ] Test with large document sets
- [ ] Optimize memory usage

### Next Steps
Continue development and testing of batch processing features.
""",
    }

    # Create files
    for filename, content in documents.items():
        file_path = temp_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        sample_files.append(str(file_path))

    return sample_files, temp_dir


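# Illustrative helper, not part of the original example: the demos create
# temporary directories via tempfile.mkdtemp() and never delete them, so a
# cleanup step along these lines could be called once a demo finishes.
def cleanup_temp_dir(temp_dir: Path) -> None:
    """Remove a temporary directory created by create_sample_documents()"""
    import shutil  # local import, keeps the example's import block unchanged

    shutil.rmtree(temp_dir, ignore_errors=True)

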
def demonstrate_basic_batch_processing():
    """Demonstrate basic batch processing functionality"""
    print("\n" + "=" * 60)
    print("BASIC BATCH PROCESSING DEMONSTRATION")
    print("=" * 60)

    # Create sample documents
    sample_files, temp_dir = create_sample_documents()

    try:
        print(f"Created {len(sample_files)} sample documents in: {temp_dir}")
        for file_path in sample_files:
            print(f" - {Path(file_path).name}")

        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=3,
            show_progress=True,
            timeout_per_file=60,
            skip_installation_check=True,  # Skip installation check for demo
        )

        print("\nBatch parser configured:")
        print(" - Parser type: mineru")
        print(" - Max workers: 3")
        print(" - Progress tracking: enabled")
        print(" - Timeout per file: 60 seconds")

        # Check supported extensions
        supported_extensions = batch_parser.get_supported_extensions()
        print(f" - Supported extensions: {supported_extensions}")

        # Filter files to supported types
        supported_files = batch_parser.filter_supported_files(sample_files)
        print("\nFile filtering results:")
        print(f" - Total files: {len(sample_files)}")
        print(f" - Supported files: {len(supported_files)}")

        # Process batch
        output_dir = temp_dir / "batch_output"
        print("\nStarting batch processing...")
        print(f"Output directory: {output_dir}")

        start_time = time.time()
        result = batch_parser.process_batch(
            file_paths=supported_files,
            output_dir=str(output_dir),
            parse_method="auto",
            recursive=False,
        )
        processing_time = time.time() - start_time

        # Display results
        print("\n" + "-" * 40)
        print("BATCH PROCESSING RESULTS")
        print("-" * 40)
        print(result.summary())
        print(f"Total processing time: {processing_time:.2f} seconds")
        print(f"Success rate: {result.success_rate:.1f}%")

        if result.successful_files:
            print("\nSuccessfully processed files:")
            for file_path in result.successful_files:
                print(f" ✅ {Path(file_path).name}")

        if result.failed_files:
            print("\nFailed files:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f" ❌ {Path(file_path).name}: {error}")

        return result

    except Exception as e:
        print(f"❌ Batch processing demonstration failed: {str(e)}")
        return None


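# Illustrative helper, not part of the BatchParser API: derives a simple
# throughput figure from the result attributes and timing already used above.
def report_throughput(result, elapsed_seconds: float) -> None:
    """Print documents-per-second based on successful files and wall time"""
    if elapsed_seconds > 0:
        rate = len(result.successful_files) / elapsed_seconds
        print(f"Throughput: {rate:.2f} documents/second")

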
async def demonstrate_async_batch_processing():
    """Demonstrate asynchronous batch processing"""
    print("\n" + "=" * 60)
    print("ASYNCHRONOUS BATCH PROCESSING DEMONSTRATION")
    print("=" * 60)

    # Create sample documents
    sample_files, temp_dir = create_sample_documents()

    try:
        print(f"Processing {len(sample_files)} documents asynchronously...")

        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            skip_installation_check=True,
        )

        # Process batch asynchronously
        output_dir = temp_dir / "async_output"

        start_time = time.time()
        result = await batch_parser.process_batch_async(
            file_paths=sample_files,
            output_dir=str(output_dir),
            parse_method="auto",
            recursive=False,
        )
        processing_time = time.time() - start_time

        # Display results
        print("\n" + "-" * 40)
        print("ASYNC BATCH PROCESSING RESULTS")
        print("-" * 40)
        print(result.summary())
        print(f"Async processing time: {processing_time:.2f} seconds")
        print(f"Success rate: {result.success_rate:.1f}%")

        return result

    except Exception as e:
        print(f"❌ Async batch processing demonstration failed: {str(e)}")
        return None


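# Illustrative sketch, assuming only the process_batch_async method used
# above: independent batches can run concurrently with asyncio.gather, which
# is one way to push throughput beyond a single batch's max_workers setting.
async def process_batches_concurrently(batch_parser, batches, output_dir):
    """Run one process_batch_async call per batch and collect all results"""
    tasks = [
        batch_parser.process_batch_async(
            file_paths=batch,
            output_dir=output_dir,
            parse_method="auto",
        )
        for batch in batches
    ]
    return await asyncio.gather(*tasks)

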
async def demonstrate_rag_integration():
    """Demonstrate batch processing integration with RAG-Anything"""
    print("\n" + "=" * 60)
    print("RAG-ANYTHING BATCH INTEGRATION DEMONSTRATION")
    print("=" * 60)

    # Create sample documents
    sample_files, temp_dir = create_sample_documents()

    try:
        # Initialize RAG-Anything with temporary storage
        config = RAGAnythingConfig(
            working_dir=str(temp_dir / "rag_storage"),
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
            max_concurrent_files=2,
        )

        rag = RAGAnything(config=config)

        print("RAG-Anything initialized with batch processing capabilities")

        # Show available batch methods
        batch_methods = [method for method in dir(rag) if "batch" in method.lower()]
        print(f"Available batch methods: {batch_methods}")

        # Demonstrate batch processing with RAG integration
        print(f"\nProcessing {len(sample_files)} documents with RAG integration...")

        # Use the RAG-integrated batch processing
        try:
            # Process documents in batch
            result = rag.process_documents_batch(
                file_paths=sample_files,
                output_dir=str(temp_dir / "rag_batch_output"),
                max_workers=2,
                show_progress=True,
            )

            print("\n" + "-" * 40)
            print("RAG BATCH PROCESSING RESULTS")
            print("-" * 40)
            print(result.summary())
            print(f"Success rate: {result.success_rate:.1f}%")

            # Demonstrate batch processing with full RAG integration
            print("\nProcessing documents with full RAG integration...")

            rag_result = await rag.process_documents_with_rag_batch(
                file_paths=sample_files[:2],  # Process a subset for the demo
                output_dir=str(temp_dir / "rag_full_output"),
                max_workers=1,
                show_progress=True,
            )

            print("\n" + "-" * 40)
            print("FULL RAG INTEGRATION RESULTS")
            print("-" * 40)
            print(f"Parse result: {rag_result['parse_result'].summary()}")
            print(f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds")
            print(f"Successfully processed with RAG: {rag_result['successful_rag_files']}")
            print(f"Failed RAG processing: {rag_result['failed_rag_files']}")

            return rag_result

        except Exception as e:
            print(f"⚠️ RAG integration demo completed with limitations: {str(e)}")
            print("Note: This is expected in environments without full API configuration")
            return None

    except Exception as e:
        print(f"❌ RAG integration demonstration failed: {str(e)}")
        return None


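# Illustrative sketch: after documents have been ingested through the RAG
# pipeline, they can be queried. This assumes RAG-Anything's aquery method
# and its mode parameter; the exact signature may differ between versions.
async def query_after_batch(rag: RAGAnything, question: str):
    """Ask a question over the documents ingested by the batch run above"""
    return await rag.aquery(question, mode="hybrid")

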
def demonstrate_directory_processing():
    """Demonstrate processing entire directories"""
    print("\n" + "=" * 60)
    print("DIRECTORY PROCESSING DEMONSTRATION")
    print("=" * 60)

    # Create a directory structure with nested files
    temp_dir = Path(tempfile.mkdtemp())

    # Create main directory files
    main_files = {
        "overview.txt": "Main directory overview document",
        "readme.md": "# Project README\n\nThis is the main project documentation.",
    }

    # Create subdirectory
    sub_dir = temp_dir / "subdirectory"
    sub_dir.mkdir()

    sub_files = {
        "details.txt": "Detailed information in subdirectory",
        "notes.md": "# Notes\n\nAdditional notes and information.",
    }

    # Write all files
    all_files = []
    for filename, content in main_files.items():
        file_path = temp_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        all_files.append(str(file_path))

    for filename, content in sub_files.items():
        file_path = sub_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        all_files.append(str(file_path))

    try:
        print("Created directory structure:")
        print(f" Main directory: {temp_dir}")
        print(f" Files in main: {list(main_files.keys())}")
        print(f" Subdirectory: {sub_dir}")
        print(f" Files in sub: {list(sub_files.keys())}")

        # Create batch parser
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            skip_installation_check=True,
        )

        # Process the entire directory recursively
        print("\nProcessing entire directory recursively...")

        result = batch_parser.process_batch(
            file_paths=[str(temp_dir)],  # Pass the directory path
            output_dir=str(temp_dir / "directory_output"),
            parse_method="auto",
            recursive=True,  # Include subdirectories
        )

        print("\n" + "-" * 40)
        print("DIRECTORY PROCESSING RESULTS")
        print("-" * 40)
        print(result.summary())
        print(f"Total files found and processed: {result.total_files}")
        print(f"Success rate: {result.success_rate:.1f}%")

        if result.successful_files:
            print("\nSuccessfully processed:")
            for file_path in result.successful_files:
                relative_path = Path(file_path).relative_to(temp_dir)
                print(f" ✅ {relative_path}")

        return result

    except Exception as e:
        print(f"❌ Directory processing demonstration failed: {str(e)}")
        return None


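# Illustrative sketch, not part of BatchParser: one plausible way to discover
# files yourself before calling process_batch, approximating recursive=True.
# Assumes the supported extensions are dot-prefixed suffixes such as ".txt".
def discover_files(directory: Path, extensions: set) -> list:
    """Recursively collect files whose suffix is in the supported set"""
    return [
        str(path)
        for path in directory.rglob("*")
        if path.is_file() and path.suffix.lower() in extensions
    ]

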
def demonstrate_error_handling():
    """Demonstrate error handling and recovery"""
    print("\n" + "=" * 60)
    print("ERROR HANDLING DEMONSTRATION")
    print("=" * 60)

    temp_dir = Path(tempfile.mkdtemp())

    # Create files with various issues
    files_with_issues = {
        "valid_file.txt": "This is a valid file that should process successfully.",
        "empty_file.txt": "",  # Empty file
        "large_file.txt": "x" * 1000000,  # Large file (1 MB of "x")
    }

    created_files = []
    for filename, content in files_with_issues.items():
        file_path = temp_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        created_files.append(str(file_path))

    # Add a non-existent file to the list
    created_files.append(str(temp_dir / "non_existent_file.txt"))

    try:
        print(f"Testing error handling with {len(created_files)} files:")
        for file_path in created_files:
            name = Path(file_path).name
            exists = Path(file_path).exists()
            size = Path(file_path).stat().st_size if exists else 0
            print(f" - {name}: {'exists' if exists else 'missing'}, {size} bytes")

        # Create batch parser with a short timeout for demonstration
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            timeout_per_file=30,  # Short timeout for the demo
            skip_installation_check=True,
        )

        # Process files and handle errors
        result = batch_parser.process_batch(
            file_paths=created_files,
            output_dir=str(temp_dir / "error_test_output"),
            parse_method="auto",
        )

        print("\n" + "-" * 40)
        print("ERROR HANDLING RESULTS")
        print("-" * 40)
        print(result.summary())

        if result.successful_files:
            print("\nSuccessful files:")
            for file_path in result.successful_files:
                print(f" ✅ {Path(file_path).name}")

        if result.failed_files:
            print("\nFailed files with error details:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f" ❌ {Path(file_path).name}: {error}")

        # Demonstrate retry logic
        if result.failed_files:
            print(f"\nDemonstrating retry logic for {len(result.failed_files)} failed files...")

            # Retry only the failed files
            retry_result = batch_parser.process_batch(
                file_paths=result.failed_files,
                output_dir=str(temp_dir / "retry_output"),
                parse_method="auto",
            )

            print(f"Retry results: {retry_result.summary()}")

        return result

    except Exception as e:
        print(f"❌ Error handling demonstration failed: {str(e)}")
        return None


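# Illustrative sketch, reusing only the BatchParser calls shown above: the
# retry pattern from demonstrate_error_handling() wrapped in a bounded loop.
def process_with_retries(batch_parser, file_paths, output_dir, max_retries=2):
    """Retry failed files up to max_retries times; return remaining failures"""
    remaining = list(file_paths)
    for _attempt in range(max_retries + 1):
        if not remaining:
            break
        result = batch_parser.process_batch(
            file_paths=remaining,
            output_dir=output_dir,
            parse_method="auto",
        )
        remaining = list(result.failed_files)
    return remaining

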
async def main():
    """Main demonstration function"""
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    print("RAG-Anything Batch Processing Demonstration")
    print("=" * 70)
    print("This example demonstrates various batch processing capabilities:")
    print(" - Basic batch processing with progress tracking")
    print(" - Asynchronous processing for improved performance")
    print(" - Integration with the RAG-Anything pipeline")
    print(" - Directory processing with recursive file discovery")
    print(" - Comprehensive error handling and recovery")

    results = {}

    # Run demonstrations
    print("\n🚀 Starting demonstrations...")

    # Basic batch processing
    results["basic"] = demonstrate_basic_batch_processing()

    # Asynchronous processing
    results["async"] = await demonstrate_async_batch_processing()

    # RAG integration
    results["rag"] = await demonstrate_rag_integration()

    # Directory processing
    results["directory"] = demonstrate_directory_processing()

    # Error handling
    results["error_handling"] = demonstrate_error_handling()

    # Summary
    print("\n" + "=" * 70)
    print("DEMONSTRATION SUMMARY")
    print("=" * 70)

    for demo_name, result in results.items():
        if result:
            if hasattr(result, "success_rate"):
                print(f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate")
            else:
                print(f"✅ {demo_name.upper()}: Completed successfully")
        else:
            print(f"❌ {demo_name.upper()}: Failed or had limitations")

    print("\n📊 Key Features Demonstrated:")
    print(" - Parallel document processing with configurable worker counts")
    print(" - Real-time progress tracking with tqdm progress bars")
    print(" - Comprehensive error handling and reporting")
    print(" - File filtering based on supported document types")
    print(" - Directory processing with recursive file discovery")
    print(" - Asynchronous processing for improved performance")
    print(" - Integration with the RAG-Anything document pipeline")
    print(" - Retry logic for failed documents")
    print(" - Detailed processing statistics and timing")

    print("\n💡 Best Practices Highlighted:")
    print(" - Use worker counts appropriate for your system")
    print(" - Enable progress tracking for long-running operations")
    print(" - Handle errors gracefully with retry mechanisms")
    print(" - Filter files to supported types before processing")
    print(" - Set reasonable timeouts for document processing")
    print(" - Use skip_installation_check in environments with dependency conflicts")


if __name__ == "__main__":
    asyncio.run(main())