Update parser param

2025-08-20 19:01:34 +03:00 · 2025-07-21 23:48:27 +08:00
parent 7aafb58e41
commit d8302d0cf8
9 changed files with 142 additions and 50 deletions
--- a/README.md
+++ b/README.md
@@ -297,7 +297,8 @@ async def main():
    # Create RAGAnything configuration
    config = RAGAnythingConfig(
        working_dir="./rag_storage",
-        mineru_parse_method="auto",
+        parser="mineru",  # Parser selection: mineru or docling
+        parse_method="auto",  # Parse method: auto, ocr, or txt
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
@@ -762,14 +763,30 @@ OPENAI_API_KEY=your_openai_api_key
 OPENAI_BASE_URL=your_base_url  # Optional
 OUTPUT_DIR=./output             # Default output directory for parsed documents
 PARSER=mineru                   # Parser selection: mineru or docling
+PARSE_METHOD=auto              # Parse method: auto, ocr, or txt
 ```

-> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys.
+**Note:** For backward compatibility, legacy environment variable names are still supported:
+- `MINERU_PARSE_METHOD` is deprecated; use `PARSE_METHOD` instead. The legacy variable is only read when `PARSE_METHOD` is not set.
+
+> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
+
+### Parser Configuration
+
+RAGAnything now supports multiple parsers, each with specific advantages:
+
+#### MinerU Parser
+- Supports PDF, images, Office documents, and other formats
+- Powerful OCR and table extraction capabilities
+- GPU acceleration support
+
+#### Docling Parser
+- Optimized for Office documents and HTML files
+- Better document structure preservation
+- Native support for multiple Office formats

 ### MinerU Configuration

-RAG-Anything now supports multiple parsers:
-
 ```bash
 # MinerU 2.0 uses command-line parameters instead of config files
 # Check available options:
--- a/README_zh.md
+++ b/README_zh.md
@@ -293,7 +293,8 @@ async def main():
    # 创建 RAGAnything 配置
    config = RAGAnythingConfig(
        working_dir="./rag_storage",
-        mineru_parse_method="auto",
+        parser="mineru",  # 选择解析器：mineru 或 docling
+        parse_method="auto",  # 解析方法：auto, ocr 或 txt
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
@@ -761,11 +762,27 @@ OPENAI_API_KEY=your_openai_api_key
 OPENAI_BASE_URL=your_base_url  # 可选
 OUTPUT_DIR=./output             # 解析文档的默认输出目录
 PARSER=mineru                   # 解析器选择：mineru 或 docling
+PARSE_METHOD=auto              # 解析方法：auto, ocr 或 txt
 ```

-### MinerU配置
+**注意：** 为了向后兼容，旧的环境变量名称仍然有效：
+- `MINERU_PARSE_METHOD` 已弃用，请改用 `PARSE_METHOD`；仅在未设置 `PARSE_METHOD` 时才会读取该旧变量。

-RAG-Anything现在支持多种解析器：
+### 解析器配置
+
+RAGAnything 现在支持多种解析器，每种解析器都有其特定的优势：
+
+#### MinerU 解析器
+- 支持PDF、图像、Office文档等多种格式
+- 强大的OCR和表格提取能力
+- 支持GPU加速
+
+#### Docling 解析器
+- 专门优化Office文档和HTML文件的解析
+- 更好的文档结构保持
+- 原生支持多种Office格式
+
+### MinerU配置

 ```bash
 # MinerU 2.0使用命令行参数而不是配置文件
--- a/env.example
+++ b/env.example
@@ -34,7 +34,7 @@ OLLAMA_EMULATING_MODEL_TAG=latest
 ### RAGAnything Configuration (Multimodal Document Processing)
 ### ---
 ### Parser Configuration
-# MINERU_PARSE_METHOD=auto
+# PARSE_METHOD=auto
 # OUTPUT_DIR=./output
 # PARSER=mineru
 # DISPLAY_CONTENT_STATS=true
--- a/examples/raganything_example.py
+++ b/examples/raganything_example.py
@@ -104,11 +104,11 @@ async def process_with_rag(
        # Create RAGAnything configuration
        config = RAGAnythingConfig(
            working_dir=working_dir or "./rag_storage",
-            mineru_parse_method="auto",
+            parser=parser,  # Parser selection: mineru or docling
+            parse_method="auto",  # Parse method: auto, ocr, or txt
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
-            parser=parser,
        )

        # Define LLM model function
--- a/raganything/batch.py
+++ b/raganything/batch.py
@@ -29,8 +29,8 @@ class BatchMixin:

        Args:
            folder_path: Path to the folder to process
-            output_dir: MinerU output directory (defaults to config.mineru_output_dir)
-            parse_method: Parse method (defaults to config.mineru_parse_method)
+            output_dir: Parser output directory (defaults to config.parser_output_dir)
+            parse_method: Parse method (defaults to config.parse_method)
            display_stats: Whether to display content statistics for each file (defaults to False for batch processing)
            split_by_character: Optional character to split text by
            split_by_character_only: If True, split only by the specified character
@@ -43,9 +43,9 @@ class BatchMixin:

        # Use config defaults if not provided
        if output_dir is None:
-            output_dir = self.config.mineru_output_dir
+            output_dir = self.config.parser_output_dir
        if parse_method is None:
-            parse_method = self.config.mineru_parse_method
+            parse_method = self.config.parse_method
        if display_stats is None:
            display_stats = False  # Default to False for batch processing
        if recursive is None:
--- a/raganything/config.py
+++ b/raganything/config.py
@@ -20,13 +20,11 @@ class RAGAnythingConfig:

    # Parser Configuration
    # ---
-    mineru_parse_method: str = field(
-        default=get_env_value("MINERU_PARSE_METHOD", "auto", str)
-    )
-    """Default parsing method for MinerU: 'auto', 'ocr', or 'txt'."""
+    parse_method: str = field(default=get_env_value("PARSE_METHOD", "auto", str))
+    """Default parsing method for document parsing: 'auto', 'ocr', or 'txt'."""

    parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
-    """Default output directory for MinerU parsed content."""
+    """Default output directory for parsed content."""

    parser: str = field(default=get_env_value("PARSER", "mineru", str))
    """Parser selection: 'mineru' or 'docling'."""
@@ -104,3 +102,46 @@ class RAGAnythingConfig:

    content_format: str = field(default=get_env_value("CONTENT_FORMAT", "minerU", str))
    """Default content format for context extraction when processing documents."""
+
+    def __post_init__(self):
+        """Apply legacy environment variable names for backward compatibility.
+
+        ``MINERU_PARSE_METHOD`` is consulted only when ``PARSE_METHOD`` is not
+        set. Its value replaces ``parse_method`` only while ``parse_method``
+        still holds the field default, so a value passed explicitly to the
+        constructor is not overwritten by the deprecated variable. An explicit
+        value that happens to equal the default cannot be told apart from
+        "not passed" and is still replaced.
+
+        A ``DeprecationWarning`` is emitted whenever the legacy variable is
+        consulted.
+        """
+        legacy_parse_method = get_env_value("MINERU_PARSE_METHOD", None, str)
+        if legacy_parse_method and not get_env_value("PARSE_METHOD", None, str):
+            import warnings
+            from dataclasses import fields
+
+            field_default = next(
+                f.default for f in fields(self) if f.name == "parse_method"
+            )
+            # Only fill in the legacy value when the caller left parse_method
+            # at its default; explicit constructor arguments take precedence.
+            if self.parse_method == field_default:
+                self.parse_method = legacy_parse_method
+
+            warnings.warn(
+                "MINERU_PARSE_METHOD is deprecated. Use PARSE_METHOD instead.",
+                DeprecationWarning,
+                # Frame 1 is __post_init__ and frame 2 is the generated
+                # __init__ that calls it, so 3 points at the code that
+                # constructed the config.
+                stacklevel=3,
+            )
+
+    @property
+    def mineru_parse_method(self) -> str:
+        """Return ``parse_method`` under its legacy attribute name.
+
+        Every read emits a ``DeprecationWarning`` attributed to the code that
+        accessed the attribute, then hands back the current ``parse_method``.
+
+        .. deprecated::
+           Use `parse_method` instead. This property will be removed in a future version.
+        """
+        from warnings import warn
+
+        notice = "mineru_parse_method is deprecated. Use parse_method instead."
+        warn(notice, category=DeprecationWarning, stacklevel=2)
+        return self.parse_method
+
+    @mineru_parse_method.setter
+    def mineru_parse_method(self, value: str):
+        """Store ``value`` in ``parse_method`` via the legacy attribute name.
+
+        Each assignment first emits a ``DeprecationWarning`` attributed to the
+        assigning code.
+        """
+        from warnings import warn
+
+        notice = "mineru_parse_method is deprecated. Use parse_method instead."
+        warn(notice, category=DeprecationWarning, stacklevel=2)
+        self.parse_method = value
--- a/raganything/parser.py
+++ b/raganything/parser.py
@@ -626,7 +626,7 @@ class MineruParser(Parser):
            result = subprocess.run(cmd, **subprocess_kwargs)
            logging.info("MinerU command executed successfully")
            if result.stdout:
-                logging.debug(f"Output: {result.stdout}")
+                logging.debug(f"MinerU output: {result.stdout}")
        except subprocess.CalledProcessError as e:
            logging.error(f"Error running mineru command: {e}")
            if e.stderr:
--- a/raganything/processor.py
+++ b/raganything/processor.py
@@ -31,8 +31,8 @@ class ProcessorMixin:

        Args:
            file_path: Path to the file to parse
-            output_dir: Output directory (defaults to config.output_dir)
-            parse_method: Parse method (defaults to config.mineru_parse_method)
+            output_dir: Output directory (defaults to config.parser_output_dir)
+            parse_method: Parse method (defaults to config.parse_method)
            display_stats: Whether to display content statistics (defaults to config.display_content_stats)
            **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)

@@ -43,7 +43,7 @@ class ProcessorMixin:
        if output_dir is None:
            output_dir = self.config.parser_output_dir
        if parse_method is None:
-            parse_method = self.config.mineru_parse_method
+            parse_method = self.config.parse_method
        if display_stats is None:
            display_stats = self.config.display_content_stats

@@ -60,10 +60,14 @@ class ProcessorMixin:
            doc_parser = (
                DoclingParser() if self.config.parser == "docling" else MineruParser()
            )
+
+            # Log parser and method information
+            self.logger.info(
+                f"Using {self.config.parser} parser with method: {parse_method}"
+            )
+
            if ext in [".pdf"]:
-                self.logger.info(
-                    f"Detected PDF file, using PDF parser (method={parse_method})..."
-                )
+                self.logger.info("Detected PDF file, using parser for PDF...")
                content_list = doc_parser.parse_pdf(
                    pdf_path=file_path,
                    output_dir=output_dir,
@@ -80,10 +84,20 @@ class ProcessorMixin:
                ".gif",
                ".webp",
            ]:
-                self.logger.info("Detected image file, using image parser...")
-                content_list = MineruParser.parse_image(
-                    image_path=file_path, output_dir=output_dir, **kwargs
-                )
+                self.logger.info("Detected image file, using parser for images...")
+                # Use the selected parser's image parsing capability
+                if hasattr(doc_parser, "parse_image"):
+                    content_list = doc_parser.parse_image(
+                        image_path=file_path, output_dir=output_dir, **kwargs
+                    )
+                else:
+                    # Fallback to MinerU for image parsing if current parser doesn't support it
+                    self.logger.warning(
+                        f"{self.config.parser} parser doesn't support image parsing, falling back to MinerU"
+                    )
+                    content_list = MineruParser().parse_image(
+                        image_path=file_path, output_dir=output_dir, **kwargs
+                    )
            elif ext in [
                ".doc",
                ".docx",
@@ -96,7 +110,7 @@ class ProcessorMixin:
                ".xhtml",
            ]:
                self.logger.info(
-                    "Detected Office or HTML document, using Office parser..."
+                    "Detected Office or HTML document, using parser for Office/HTML..."
                )
                content_list = doc_parser.parse_office_doc(
                    doc_path=file_path, output_dir=output_dir, **kwargs
@@ -114,11 +128,12 @@ class ProcessorMixin:
                )

        except Exception as e:
-            self.logger.error(f"Error during parsing with specific parser: {str(e)}")
-            self.logger.warning("Falling back to generic parser...")
-            # If specific parser fails, fall back to generic parser
-            content_list = MineruParser.parse_document(
-                MineruParser(),
+            self.logger.error(
+                f"Error during parsing with {self.config.parser} parser: {str(e)}"
+            )
+            self.logger.warning("Falling back to MinerU parser...")
+            # If specific parser fails, fall back to MinerU parser
+            content_list = MineruParser().parse_document(
                file_path=file_path,
                method=parse_method,
                output_dir=output_dir,
@@ -262,22 +277,22 @@ class ProcessorMixin:

        Args:
            file_path: Path to the file to process
-            output_dir: output directory (defaults to config.output_dir)
-            parse_method: Parse method (defaults to config.mineru_parse_method)
+            output_dir: output directory (defaults to config.parser_output_dir)
+            parse_method: Parse method (defaults to config.parse_method)
            display_stats: Whether to display content statistics (defaults to config.display_content_stats)
            split_by_character: Optional character to split the text by
            split_by_character_only: If True, split only by the specified character
            doc_id: Optional document ID, if not provided MD5 hash will be generated
-            **kwargs: Additional parameters for MinerU parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
+            **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
        """
        # Ensure LightRAG is initialized
        await self._ensure_lightrag_initialized()

        # Use config defaults if not provided
        if output_dir is None:
-            output_dir = self.config.mineru_output_dir
+            output_dir = self.config.parser_output_dir
        if parse_method is None:
-            parse_method = self.config.mineru_parse_method
+            parse_method = self.config.parse_method
        if display_stats is None:
            display_stats = self.config.display_content_stats

--- a/raganything/raganything.py
+++ b/raganything/raganything.py
@@ -1,8 +1,8 @@
 """
-Complete MinerU parsing + multimodal content insertion Pipeline
+Complete document parsing + multimodal content insertion Pipeline

 This script integrates:
-1. MinerU document parsing
+1. Document parsing (using configurable parsers)
 2. Pure text content LightRAG insertion
 3. Specialized processing for multimodal content (using different processors)
 """
@@ -102,6 +102,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
        self.logger.info("RAGAnything initialized with config:")
        self.logger.info(f"  Working directory: {self.config.working_dir}")
        self.logger.info(f"  Parser: {self.config.parser}")
+        self.logger.info(f"  Parse method: {self.config.parse_method}")
        self.logger.info(
            f"  Multimodal processing - Image: {self.config.enable_image_processing}, "
            f"Table: {self.config.enable_table_processing}, "
@@ -194,7 +195,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
        # Check parser installation
        if not self.doc_parser.check_installation():
            raise RuntimeError(
-                "Parser is not properly installed. "
+                f"Parser '{self.config.parser}' is not properly installed. "
                "Please install it using pip install or uv pip install."
            )

@@ -225,24 +226,25 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):

        self.logger.info("LightRAG and multimodal processors initialized")

-    def check_mineru_installation(self) -> bool:
+    def check_parser_installation(self) -> bool:
        """
-        Check if MinerU 2.0 is properly installed
+        Check if the configured parser is properly installed
 
        Returns:
-            bool: True if MinerU 2.0 is properly installed
+            bool: True if the configured parser is properly installed
        """
-        return MineruParser.check_installation(MineruParser())
+        return self.doc_parser.check_installation()
+
+    def check_mineru_installation(self) -> bool:
+        """
+        Deprecated alias kept so callers of the old method name keep working
+
+        Unlike check_parser_installation, this keeps the original behaviour of
+        checking MinerU specifically, regardless of the configured parser.
+
+        .. deprecated::
+           Use `check_parser_installation` instead.
+
+        Returns:
+            bool: True if MinerU is properly installed
+        """
+        import warnings
+
+        warnings.warn(
+            "check_mineru_installation is deprecated. "
+            "Use check_parser_installation instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return MineruParser().check_installation()

    def get_config_info(self) -> Dict[str, Any]:
        """Get current configuration information"""
        return {
            "directory": {
                "working_dir": self.config.working_dir,
-                "mineru_output_dir": self.config.output_dir,
+                "parser_output_dir": self.config.parser_output_dir,
            },
            "parsing": {
-                "mineru_parse_method": self.config.mineru_parse_method,
+                "parser": self.config.parser,
+                "parse_method": self.config.parse_method,
                "display_content_stats": self.config.display_content_stats,
            },
            "multimodal_processing": {