fix lint

2025-08-20 19:01:34 +03:00 · 2025-07-29 17:07:15 +08:00
parent 4f900db761
commit 5e56140300
5 changed files with 365 additions and 330 deletions
--- a/examples/batch_processing_example.py
+++ b/examples/batch_processing_example.py
@@ -21,6 +21,7 @@ import time

 # Add project root directory to Python path
 import sys
+
 sys.path.append(str(Path(__file__).parent.parent))

 from raganything import RAGAnything, RAGAnythingConfig
@@ -92,13 +93,13 @@ Batch processing is essential for large-scale document processing.

 ### Next Steps
 Continue development and testing of batch processing features.
-"""
+""",
    }

    # Create files
    for filename, content in documents.items():
        file_path = temp_dir / filename
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        sample_files.append(str(file_path))

@@ -125,14 +126,14 @@ def demonstrate_basic_batch_processing():
            max_workers=3,
            show_progress=True,
            timeout_per_file=60,
-            skip_installation_check=True  # Skip installation check for demo
+            skip_installation_check=True,  # Skip installation check for demo
        )

-        print(f"\nBatch parser configured:")
-        print(f"  - Parser type: mineru")
-        print(f"  - Max workers: 3")
-        print(f"  - Progress tracking: enabled")
-        print(f"  - Timeout per file: 60 seconds")
+        print("\nBatch parser configured:")
+        print("  - Parser type: mineru")
+        print("  - Max workers: 3")
+        print("  - Progress tracking: enabled")
+        print("  - Timeout per file: 60 seconds")

        # Check supported extensions
        supported_extensions = batch_parser.get_supported_extensions()
@@ -140,13 +141,13 @@ def demonstrate_basic_batch_processing():

        # Filter files to supported types
        supported_files = batch_parser.filter_supported_files(sample_files)
-        print(f"\nFile filtering results:")
+        print("\nFile filtering results:")
        print(f"  - Total files: {len(sample_files)}")
        print(f"  - Supported files: {len(supported_files)}")

        # Process batch
        output_dir = temp_dir / "batch_output"
-        print(f"\nStarting batch processing...")
+        print("\nStarting batch processing...")
        print(f"Output directory: {output_dir}")

        start_time = time.time()
@@ -154,7 +155,7 @@ def demonstrate_basic_batch_processing():
            file_paths=supported_files,
            output_dir=str(output_dir),
            parse_method="auto",
-            recursive=False
+            recursive=False,
        )
        processing_time = time.time() - start_time

@@ -167,12 +168,12 @@ def demonstrate_basic_batch_processing():
        print(f"Success rate: {result.success_rate:.1f}%")

        if result.successful_files:
-            print(f"\nSuccessfully processed files:")
+            print("\nSuccessfully processed files:")
            for file_path in result.successful_files:
                print(f"  ✅ {Path(file_path).name}")

        if result.failed_files:
-            print(f"\nFailed files:")
+            print("\nFailed files:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f"  ❌ {Path(file_path).name}: {error}")
@@ -201,7 +202,7 @@ async def demonstrate_async_batch_processing():
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
-            skip_installation_check=True
+            skip_installation_check=True,
        )

        # Process batch asynchronously
@@ -212,7 +213,7 @@ async def demonstrate_async_batch_processing():
            file_paths=sample_files,
            output_dir=str(output_dir),
            parse_method="auto",
-            recursive=False
+            recursive=False,
        )
        processing_time = time.time() - start_time

@@ -247,7 +248,7 @@ async def demonstrate_rag_integration():
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
-            max_concurrent_files=2
+            max_concurrent_files=2,
        )

        rag = RAGAnything(config=config)
@@ -255,7 +256,7 @@ async def demonstrate_rag_integration():
        print("RAG-Anything initialized with batch processing capabilities")

        # Show available batch methods
-        batch_methods = [method for method in dir(rag) if 'batch' in method.lower()]
+        batch_methods = [method for method in dir(rag) if "batch" in method.lower()]
        print(f"Available batch methods: {batch_methods}")

        # Demonstrate batch processing with RAG integration
@@ -268,7 +269,7 @@ async def demonstrate_rag_integration():
                file_paths=sample_files,
                output_dir=str(temp_dir / "rag_batch_output"),
                max_workers=2,
-                show_progress=True
+                show_progress=True,
            )

            print("\n" + "-" * 40)
@@ -278,28 +279,34 @@ async def demonstrate_rag_integration():
            print(f"Success rate: {result.success_rate:.1f}%")

            # Demonstrate batch processing with full RAG integration
-            print(f"\nProcessing documents with full RAG integration...")
+            print("\nProcessing documents with full RAG integration...")

            rag_result = await rag.process_documents_with_rag_batch(
                file_paths=sample_files[:2],  # Process subset for demo
                output_dir=str(temp_dir / "rag_full_output"),
                max_workers=1,
-                show_progress=True
+                show_progress=True,
            )

            print("\n" + "-" * 40)
            print("FULL RAG INTEGRATION RESULTS")
            print("-" * 40)
            print(f"Parse result: {rag_result['parse_result'].summary()}")
-            print(f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds")
-            print(f"Successfully processed with RAG: {rag_result['successful_rag_files']}")
+            print(
+                f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds"
+            )
+            print(
+                f"Successfully processed with RAG: {rag_result['successful_rag_files']}"
+            )
            print(f"Failed RAG processing: {rag_result['failed_rag_files']}")

            return rag_result

        except Exception as e:
            print(f"⚠️ RAG integration demo completed with limitations: {str(e)}")
-            print("Note: This is expected in environments without full API configuration")
+            print(
+                "Note: This is expected in environments without full API configuration"
+            )
            return None

    except Exception as e:
@@ -319,7 +326,7 @@ def demonstrate_directory_processing():
    # Create main directory files
    main_files = {
        "overview.txt": "Main directory overview document",
-        "readme.md": "# Project README\n\nThis is the main project documentation."
+        "readme.md": "# Project README\n\nThis is the main project documentation.",
    }

    # Create subdirectory
@@ -328,25 +335,25 @@ def demonstrate_directory_processing():

    sub_files = {
        "details.txt": "Detailed information in subdirectory",
-        "notes.md": "# Notes\n\nAdditional notes and information."
+        "notes.md": "# Notes\n\nAdditional notes and information.",
    }

    # Write all files
    all_files = []
    for filename, content in main_files.items():
        file_path = temp_dir / filename
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        all_files.append(str(file_path))

    for filename, content in sub_files.items():
        file_path = sub_dir / filename
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        all_files.append(str(file_path))

    try:
-        print(f"Created directory structure:")
+        print("Created directory structure:")
        print(f"  Main directory: {temp_dir}")
        print(f"  Files in main: {list(main_files.keys())}")
        print(f"  Subdirectory: {sub_dir}")
@@ -357,17 +364,17 @@ def demonstrate_directory_processing():
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
-            skip_installation_check=True
+            skip_installation_check=True,
        )

        # Process entire directory recursively
-        print(f"\nProcessing entire directory recursively...")
+        print("\nProcessing entire directory recursively...")

        result = batch_parser.process_batch(
            file_paths=[str(temp_dir)],  # Pass directory path
            output_dir=str(temp_dir / "directory_output"),
            parse_method="auto",
-            recursive=True  # Include subdirectories
+            recursive=True,  # Include subdirectories
        )

        print("\n" + "-" * 40)
@@ -378,7 +385,7 @@ def demonstrate_directory_processing():
        print(f"Success rate: {result.success_rate:.1f}%")

        if result.successful_files:
-            print(f"\nSuccessfully processed:")
+            print("\nSuccessfully processed:")
            for file_path in result.successful_files:
                relative_path = Path(file_path).relative_to(temp_dir)
                print(f"  ✅ {relative_path}")
@@ -408,7 +415,7 @@ def demonstrate_error_handling():
    created_files = []
    for filename, content in files_with_issues.items():
        file_path = temp_dir / filename
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        created_files.append(str(file_path))

@@ -429,14 +436,14 @@ def demonstrate_error_handling():
            max_workers=2,
            show_progress=True,
            timeout_per_file=30,  # Short timeout for demo
-            skip_installation_check=True
+            skip_installation_check=True,
        )

        # Process files and handle errors
        result = batch_parser.process_batch(
            file_paths=created_files,
            output_dir=str(temp_dir / "error_test_output"),
-            parse_method="auto"
+            parse_method="auto",
        )

        print("\n" + "-" * 40)
@@ -445,25 +452,27 @@ def demonstrate_error_handling():
        print(result.summary())

        if result.successful_files:
-            print(f"\nSuccessful files:")
+            print("\nSuccessful files:")
            for file_path in result.successful_files:
                print(f"  ✅ {Path(file_path).name}")

        if result.failed_files:
-            print(f"\nFailed files with error details:")
+            print("\nFailed files with error details:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f"  ❌ {Path(file_path).name}: {error}")

        # Demonstrate retry logic
        if result.failed_files:
-            print(f"\nDemonstrating retry logic for {len(result.failed_files)} failed files...")
+            print(
+                f"\nDemonstrating retry logic for {len(result.failed_files)} failed files..."
+            )

            # Retry only the failed files
            retry_result = batch_parser.process_batch(
                file_paths=result.failed_files,
                output_dir=str(temp_dir / "retry_output"),
-                parse_method="auto"
+                parse_method="auto",
            )

            print(f"Retry results: {retry_result.summary()}")
@@ -480,7 +489,7 @@ async def main():
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    print("RAG-Anything Batch Processing Demonstration")
@@ -498,19 +507,19 @@ async def main():
    print("\n🚀 Starting demonstrations...")

    # Basic batch processing
-    results['basic'] = demonstrate_basic_batch_processing()
+    results["basic"] = demonstrate_basic_batch_processing()

    # Asynchronous processing
-    results['async'] = await demonstrate_async_batch_processing()
+    results["async"] = await demonstrate_async_batch_processing()

    # RAG integration
-    results['rag'] = await demonstrate_rag_integration()
+    results["rag"] = await demonstrate_rag_integration()

    # Directory processing
-    results['directory'] = demonstrate_directory_processing()
+    results["directory"] = demonstrate_directory_processing()

    # Error handling
-    results['error_handling'] = demonstrate_error_handling()
+    results["error_handling"] = demonstrate_error_handling()

    # Summary
    print("\n" + "=" * 70)
@@ -519,8 +528,10 @@ async def main():

    for demo_name, result in results.items():
        if result:
-            if hasattr(result, 'success_rate'):
-                print(f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate")
+            if hasattr(result, "success_rate"):
+                print(
+                    f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate"
+                )
            else:
                print(f"✅ {demo_name.upper()}: Completed successfully")
        else:
--- a/examples/enhanced_markdown_example.py
+++ b/examples/enhanced_markdown_example.py
@@ -20,6 +20,7 @@ import tempfile

 # Add project root directory to Python path
 import sys
+
 sys.path.append(str(Path(__file__).parent.parent))

 from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
@@ -364,7 +365,7 @@ This research demonstrates that enhanced markdown conversion provides significan
    return {
        "basic": basic_content,
        "technical": technical_content,
-        "academic": academic_content
+        "academic": academic_content,
    }


@@ -392,15 +393,15 @@ def demonstrate_basic_conversion():

        # Convert basic sample
        basic_md_path = temp_dir / "basic_sample.md"
-        with open(basic_md_path, 'w', encoding='utf-8') as f:
-            f.write(samples['basic'])
+        with open(basic_md_path, "w", encoding="utf-8") as f:
+            f.write(samples["basic"])

        print(f"\nConverting basic sample: {basic_md_path}")

        success = converter.convert_file_to_pdf(
            input_path=str(basic_md_path),
            output_path=str(temp_dir / "basic_sample.pdf"),
-            method="auto"  # Let the system choose the best backend
+            method="auto",  # Let the system choose the best backend
        )

        if success:
@@ -428,10 +429,10 @@ def demonstrate_backend_comparison():

        # Create technical document
        tech_md_path = temp_dir / "technical.md"
-        with open(tech_md_path, 'w', encoding='utf-8') as f:
-            f.write(samples['technical'])
+        with open(tech_md_path, "w", encoding="utf-8") as f:
+            f.write(samples["technical"])

-        print(f"Testing different backends with technical document...")
+        print("Testing different backends with technical document...")

        # Test different backends
        backends = ["auto", "weasyprint", "pandoc"]
@@ -445,43 +446,48 @@ def demonstrate_backend_comparison():
                output_path = temp_dir / f"technical_{backend}.pdf"

                import time
+
                start_time = time.time()

                success = converter.convert_file_to_pdf(
                    input_path=str(tech_md_path),
                    output_path=str(output_path),
-                    method=backend
+                    method=backend,
                )

                end_time = time.time()
                conversion_time = end_time - start_time

                if success:
-                    file_size = output_path.stat().st_size if output_path.exists() else 0
-                    print(f"  ✅ {backend}: Success in {conversion_time:.2f}s, {file_size} bytes")
+                    file_size = (
+                        output_path.stat().st_size if output_path.exists() else 0
+                    )
+                    print(
+                        f"  ✅ {backend}: Success in {conversion_time:.2f}s, {file_size} bytes"
+                    )
                    results[backend] = {
-                        'success': True,
-                        'time': conversion_time,
-                        'size': file_size,
-                        'output': str(output_path)
+                        "success": True,
+                        "time": conversion_time,
+                        "size": file_size,
+                        "output": str(output_path),
                    }
                else:
                    print(f"  ❌ {backend}: Failed")
-                    results[backend] = {'success': False, 'time': conversion_time}
+                    results[backend] = {"success": False, "time": conversion_time}

            except Exception as e:
                print(f"  ❌ {backend}: Error - {str(e)}")
-                results[backend] = {'success': False, 'error': str(e)}
+                results[backend] = {"success": False, "error": str(e)}

        # Summary
        print("\n" + "-" * 40)
        print("BACKEND COMPARISON SUMMARY")
        print("-" * 40)
-        successful_backends = [b for b, r in results.items() if r.get('success', False)]
+        successful_backends = [b for b, r in results.items() if r.get("success", False)]
        print(f"Successful backends: {successful_backends}")

        if successful_backends:
-            fastest = min(successful_backends, key=lambda b: results[b]['time'])
+            fastest = min(successful_backends, key=lambda b: results[b]["time"])
            print(f"Fastest backend: {fastest} ({results[fastest]['time']:.2f}s)")

        return results, temp_dir
@@ -659,15 +665,15 @@ def demonstrate_custom_styling():
            line_height="1.4",
            include_toc=True,
            syntax_highlighting=True,
-            custom_css=custom_css
+            custom_css=custom_css,
        )

        converter = EnhancedMarkdownConverter(config)

        # Convert academic sample with custom styling
        academic_md_path = temp_dir / "academic_styled.md"
-        with open(academic_md_path, 'w', encoding='utf-8') as f:
-            f.write(samples['academic'])
+        with open(academic_md_path, "w", encoding="utf-8") as f:
+            f.write(samples["academic"])

        print("Converting academic paper with custom styling...")
        print("Custom styling features:")
@@ -681,7 +687,7 @@ def demonstrate_custom_styling():
        success = converter.convert_file_to_pdf(
            input_path=str(academic_md_path),
            output_path=str(temp_dir / "academic_styled.pdf"),
-            method="weasyprint"  # WeasyPrint is best for custom CSS
+            method="weasyprint",  # WeasyPrint is best for custom CSS
        )

        if success:
@@ -693,7 +699,7 @@ def demonstrate_custom_styling():
            default_success = default_converter.convert_file_to_pdf(
                input_path=str(academic_md_path),
                output_path=str(temp_dir / "academic_default.pdf"),
-                method="weasyprint"
+                method="weasyprint",
            )

            if default_success:
@@ -779,7 +785,7 @@ Direct content conversion is useful for:
        success = converter.convert_markdown_to_pdf(
            markdown_content=dynamic_content,
            output_path=str(output_path),
-            method="auto"
+            method="auto",
        )

        if success:
@@ -841,7 +847,7 @@ Symbols: ♠ ♣ ♥ ♦ ☀ ☁ ☂ ☃ ☄ ★ ☆ ☉ ☊ ☋ ☌ ☍ ☎ ☏
 Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
 """,
            "empty_content": "",
-            "minimal_content": "# Just a title"
+            "minimal_content": "# Just a title",
        }

        print("Testing error handling with various content types...")
@@ -861,32 +867,36 @@ Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
                        success = converter.convert_markdown_to_pdf(
                            markdown_content=content,
                            output_path=str(output_path),
-                            method=backend
+                            method=backend,
                        )

                        if success:
-                            file_size = output_path.stat().st_size if output_path.exists() else 0
+                            file_size = (
+                                output_path.stat().st_size
+                                if output_path.exists()
+                                else 0
+                            )
                            print(f"  ✅ {backend}: Success ({file_size} bytes)")
                            results[f"{test_name}_{backend}"] = {
-                                'success': True,
-                                'size': file_size
+                                "success": True,
+                                "size": file_size,
                            }
                        else:
                            print(f"  ❌ {backend}: Failed")
-                            results[f"{test_name}_{backend}"] = {'success': False}
+                            results[f"{test_name}_{backend}"] = {"success": False}

                    except Exception as e:
                        print(f"  ❌ {backend}: Error - {str(e)[:60]}...")
                        results[f"{test_name}_{backend}"] = {
-                            'success': False,
-                            'error': str(e)
+                            "success": False,
+                            "error": str(e),
                        }

            except Exception as e:
                print(f"  ❌ Test case failed: {str(e)}")

        # Demonstrate robust conversion with fallbacks
-        print(f"\nDemonstrating robust conversion with fallback logic...")
+        print("\nDemonstrating robust conversion with fallback logic...")

        def robust_convert(content, output_path):
            """Convert with multiple backend fallbacks"""
@@ -898,7 +908,7 @@ Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
                    success = converter.convert_markdown_to_pdf(
                        markdown_content=content,
                        output_path=output_path,
-                        method=backend
+                        method=backend,
                    )
                    if success:
                        return backend, True
@@ -923,9 +933,13 @@ Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
        print("\n" + "-" * 40)
        print("ERROR HANDLING SUMMARY")
        print("-" * 40)
-        successful_conversions = sum(1 for r in results.values() if r.get('success', False))
+        successful_conversions = sum(
+            1 for r in results.values() if r.get("success", False)
+        )
        total_attempts = len(results)
-        success_rate = (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0
+        success_rate = (
+            (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0
+        )

        print(f"Total conversion attempts: {total_attempts}")
        print(f"Successful conversions: {successful_conversions}")
@@ -943,12 +957,14 @@ def main():
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    print("RAG-Anything Enhanced Markdown Conversion Demonstration")
    print("=" * 70)
-    print("This example demonstrates various enhanced markdown conversion capabilities:")
+    print(
+        "This example demonstrates various enhanced markdown conversion capabilities:"
+    )
    print("  - Basic markdown to PDF conversion")
    print("  - Multiple backend comparison (WeasyPrint vs Pandoc)")
    print("  - Custom CSS styling and professional formatting")
@@ -962,23 +978,23 @@ def main():

    # Basic conversion
    success, temp_dir = demonstrate_basic_conversion()
-    results['basic'] = success
+    results["basic"] = success

    # Backend comparison
    backend_results, _ = demonstrate_backend_comparison()
-    results['backends'] = backend_results
+    results["backends"] = backend_results

    # Custom styling
    styling_success, _ = demonstrate_custom_styling()
-    results['styling'] = styling_success
+    results["styling"] = styling_success

    # Content conversion
    content_success, _ = demonstrate_content_conversion()
-    results['content'] = content_success
+    results["content"] = content_success

    # Error handling
    error_results, _ = demonstrate_error_handling()
-    results['error_handling'] = error_results
+    results["error_handling"] = error_results

    # Summary
    print("\n" + "=" * 70)
@@ -986,17 +1002,25 @@ def main():
    print("=" * 70)

    print("✅ Features Successfully Demonstrated:")
-    if results['basic']:
+    if results["basic"]:
        print("  - Basic markdown to PDF conversion")
-    if results['backends']:
-        successful_backends = [b for b, r in results['backends'].items() if r.get('success', False)]
+    if results["backends"]:
+        successful_backends = [
+            b for b, r in results["backends"].items() if r.get("success", False)
+        ]
        print(f"  - Multiple backends: {successful_backends}")
-    if results['styling']:
+    if results["styling"]:
        print("  - Custom CSS styling and professional formatting")
-    if results['content']:
+    if results["content"]:
        print("  - Direct content conversion without file I/O")
-    if results['error_handling']:
-        success_rate = sum(1 for r in results['error_handling'].values() if r.get('success', False)) / len(results['error_handling']) * 100
+    if results["error_handling"]:
+        success_rate = (
+            sum(
+                1 for r in results["error_handling"].values() if r.get("success", False)
+            )
+            / len(results["error_handling"])
+            * 100
+        )
        print(f"  - Error handling with {success_rate:.1f}% overall success rate")

    print("\n📊 Key Capabilities Highlighted:")