diff --git a/README.md b/README.md index 323e584..713f7b6 100644 --- a/README.md +++ b/README.md @@ -297,7 +297,8 @@ async def main(): # Create RAGAnything configuration config = RAGAnythingConfig( working_dir="./rag_storage", - mineru_parse_method="auto", + parser="mineru", # Parser selection: mineru or docling + parse_method="auto", # Parse method: auto, ocr, or txt enable_image_processing=True, enable_table_processing=True, enable_equation_processing=True, @@ -762,14 +763,30 @@ OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=your_base_url # Optional OUTPUT_DIR=./output # Default output directory for parsed documents PARSER=mineru # Parser selection: mineru or docling +PARSE_METHOD=auto # Parse method: auto, ocr, or txt ``` -> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys. +**Note:** For backward compatibility, legacy environment variable names are still supported: +- `MINERU_PARSE_METHOD` is deprecated; use `PARSE_METHOD` instead (the legacy name is only honored when `PARSE_METHOD` is unset) + +> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
+ +### Parser Configuration + +RAG-Anything now supports multiple parsers, each with specific advantages: + +#### MinerU Parser +- Supports PDF, images, Office documents, and other formats +- Powerful OCR and table extraction capabilities +- GPU acceleration support + +#### Docling Parser +- Optimized for Office documents and HTML files +- Better document structure preservation +- Native support for multiple Office formats ### MinerU Configuration -RAG-Anything now supports multiple parsers: - ```bash # MinerU 2.0 uses command-line parameters instead of config files # Check available options: diff --git a/README_zh.md b/README_zh.md index 6907ecf..61f4320 100644 --- a/README_zh.md +++ b/README_zh.md @@ -293,7 +293,8 @@ async def main(): # 创建 RAGAnything 配置 config = RAGAnythingConfig( working_dir="./rag_storage", - mineru_parse_method="auto", + parser="mineru", # 选择解析器:mineru 或 docling + parse_method="auto", # 解析方法:auto, ocr 或 txt enable_image_processing=True, enable_table_processing=True, enable_equation_processing=True, @@ -761,11 +762,27 @@ OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=your_base_url # 可选 OUTPUT_DIR=./output # 解析文档的默认输出目录 PARSER=mineru # 解析器选择:mineru 或 docling +PARSE_METHOD=auto # 解析方法:auto, ocr 或 txt ``` -### MinerU配置 +**注意:** 为了向后兼容,旧的环境变量名称仍然有效: +- `MINERU_PARSE_METHOD` 已弃用,请使用 `PARSE_METHOD`(仅当未设置 `PARSE_METHOD` 时旧变量才会生效) -RAG-Anything现在支持多种解析器: +### 解析器配置 + +RAG-Anything 现在支持多种解析器,每种解析器都有其特定的优势: + +#### MinerU 解析器 +- 支持PDF、图像、Office文档等多种格式 +- 强大的OCR和表格提取能力 +- 支持GPU加速 + +#### Docling 解析器 +- 专门优化Office文档和HTML文件的解析 +- 更好的文档结构保持 +- 原生支持多种Office格式 + +### MinerU配置 ```bash # MinerU 2.0使用命令行参数而不是配置文件 diff --git a/env.example b/env.example index 64683d4..859089f 100644 --- a/env.example +++ b/env.example @@ -34,7 +34,7 @@ OLLAMA_EMULATING_MODEL_TAG=latest ### RAGAnything Configuration (Multimodal Document Processing) ### --- ### Parser Configuration -# MINERU_PARSE_METHOD=auto +# PARSE_METHOD=auto # OUTPUT_DIR=./output # PARSER=mineru # DISPLAY_CONTENT_STATS=true diff --git 
a/examples/raganything_example.py b/examples/raganything_example.py index 59ebf41..572d8fa 100644 --- a/examples/raganything_example.py +++ b/examples/raganything_example.py @@ -104,11 +104,11 @@ async def process_with_rag( # Create RAGAnything configuration config = RAGAnythingConfig( working_dir=working_dir or "./rag_storage", - mineru_parse_method="auto", + parser=parser, # Parser selection: mineru or docling + parse_method="auto", # Parse method: auto, ocr, or txt enable_image_processing=True, enable_table_processing=True, enable_equation_processing=True, - parser=parser, ) # Define LLM model function diff --git a/raganything/batch.py b/raganything/batch.py index 6214689..d0907ff 100644 --- a/raganything/batch.py +++ b/raganything/batch.py @@ -29,8 +29,8 @@ class BatchMixin: Args: folder_path: Path to the folder to process - output_dir: MinerU output directory (defaults to config.mineru_output_dir) - parse_method: Parse method (defaults to config.mineru_parse_method) + output_dir: Parser output directory (defaults to config.parser_output_dir) + parse_method: Parse method (defaults to config.parse_method) display_stats: Whether to display content statistics for each file (defaults to False for batch processing) split_by_character: Optional character to split text by split_by_character_only: If True, split only by the specified character @@ -43,9 +43,9 @@ class BatchMixin: # Use config defaults if not provided if output_dir is None: - output_dir = self.config.mineru_output_dir + output_dir = self.config.parser_output_dir if parse_method is None: - parse_method = self.config.mineru_parse_method + parse_method = self.config.parse_method if display_stats is None: display_stats = False # Default to False for batch processing if recursive is None: diff --git a/raganything/config.py b/raganything/config.py index e16d3f1..6daaa7e 100644 --- a/raganything/config.py +++ b/raganything/config.py @@ -20,13 +20,11 @@ class RAGAnythingConfig: # Parser Configuration # --- - 
mineru_parse_method: str = field( - default=get_env_value("MINERU_PARSE_METHOD", "auto", str) - ) - """Default parsing method for MinerU: 'auto', 'ocr', or 'txt'.""" + parse_method: str = field(default=get_env_value("PARSE_METHOD", "auto", str)) + """Default parsing method for document parsing: 'auto', 'ocr', or 'txt'.""" parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str)) - """Default output directory for MinerU parsed content.""" + """Default output directory for parsed content.""" parser: str = field(default=get_env_value("PARSER", "mineru", str)) """Parser selection: 'mineru' or 'docling'.""" @@ -104,3 +102,46 @@ class RAGAnythingConfig: content_format: str = field(default=get_env_value("CONTENT_FORMAT", "minerU", str)) """Default content format for context extraction when processing documents.""" + + def __post_init__(self): + """Post-initialization setup for backward compatibility""" + # Support legacy environment variable names for backward compatibility + legacy_parse_method = get_env_value("MINERU_PARSE_METHOD", None, str) + if legacy_parse_method and not get_env_value("PARSE_METHOD", None, str): + self.parse_method = legacy_parse_method + import warnings + + warnings.warn( + "MINERU_PARSE_METHOD is deprecated. Use PARSE_METHOD instead.", + DeprecationWarning, + stacklevel=2, + ) + + @property + def mineru_parse_method(self) -> str: + """ + Backward compatibility property for old code. + + .. deprecated:: + Use `parse_method` instead. This property will be removed in a future version. + """ + import warnings + + warnings.warn( + "mineru_parse_method is deprecated. Use parse_method instead.", + DeprecationWarning, + stacklevel=2, + ) + return self.parse_method + + @mineru_parse_method.setter + def mineru_parse_method(self, value: str): + """Setter for backward compatibility""" + import warnings + + warnings.warn( + "mineru_parse_method is deprecated. 
Use parse_method instead.", + DeprecationWarning, + stacklevel=2, + ) + self.parse_method = value diff --git a/raganything/parser.py b/raganything/parser.py index 36d4973..fd7774c 100644 --- a/raganything/parser.py +++ b/raganything/parser.py @@ -626,7 +626,7 @@ class MineruParser(Parser): result = subprocess.run(cmd, **subprocess_kwargs) logging.info("MinerU command executed successfully") if result.stdout: - logging.debug(f"Output: {result.stdout}") + logging.debug(f"MinerU output: {result.stdout}") except subprocess.CalledProcessError as e: logging.error(f"Error running mineru command: {e}") if e.stderr: diff --git a/raganything/processor.py b/raganything/processor.py index b98d7d9..65db8e9 100644 --- a/raganything/processor.py +++ b/raganything/processor.py @@ -31,8 +31,8 @@ class ProcessorMixin: Args: file_path: Path to the file to parse - output_dir: Output directory (defaults to config.output_dir) - parse_method: Parse method (defaults to config.mineru_parse_method) + output_dir: Output directory (defaults to config.parser_output_dir) + parse_method: Parse method (defaults to config.parse_method) display_stats: Whether to display content statistics (defaults to config.display_content_stats) **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source) @@ -43,7 +43,7 @@ class ProcessorMixin: if output_dir is None: output_dir = self.config.parser_output_dir if parse_method is None: - parse_method = self.config.mineru_parse_method + parse_method = self.config.parse_method if display_stats is None: display_stats = self.config.display_content_stats @@ -60,10 +60,14 @@ class ProcessorMixin: doc_parser = ( DoclingParser() if self.config.parser == "docling" else MineruParser() ) + + # Log parser and method information + self.logger.info( + f"Using {self.config.parser} parser with method: {parse_method}" + ) + if ext in [".pdf"]: - self.logger.info( - f"Detected PDF file, using PDF parser 
(method={parse_method})..." - ) + self.logger.info("Detected PDF file, using parser for PDF...") content_list = doc_parser.parse_pdf( pdf_path=file_path, output_dir=output_dir, @@ -80,10 +84,20 @@ class ProcessorMixin: ".gif", ".webp", ]: - self.logger.info("Detected image file, using image parser...") - content_list = MineruParser.parse_image( - image_path=file_path, output_dir=output_dir, **kwargs - ) + self.logger.info("Detected image file, using parser for images...") + # Use the selected parser's image parsing capability + if hasattr(doc_parser, "parse_image"): + content_list = doc_parser.parse_image( + image_path=file_path, output_dir=output_dir, **kwargs + ) + else: + # Fallback to MinerU for image parsing if current parser doesn't support it + self.logger.warning( + f"{self.config.parser} parser doesn't support image parsing, falling back to MinerU" + ) + content_list = MineruParser().parse_image( + image_path=file_path, output_dir=output_dir, **kwargs + ) elif ext in [ ".doc", ".docx", @@ -96,7 +110,7 @@ class ProcessorMixin: ".xhtml", ]: self.logger.info( - "Detected Office or HTML document, using Office parser..." + "Detected Office or HTML document, using parser for Office/HTML..." 
) content_list = doc_parser.parse_office_doc( doc_path=file_path, output_dir=output_dir, **kwargs @@ -114,11 +128,12 @@ class ProcessorMixin: ) except Exception as e: - self.logger.error(f"Error during parsing with specific parser: {str(e)}") - self.logger.warning("Falling back to generic parser...") - # If specific parser fails, fall back to generic parser - content_list = MineruParser.parse_document( - MineruParser(), + self.logger.error( + f"Error during parsing with {self.config.parser} parser: {str(e)}" + ) + self.logger.warning("Falling back to MinerU parser...") + # If specific parser fails, fall back to MinerU parser + content_list = MineruParser().parse_document( file_path=file_path, method=parse_method, output_dir=output_dir, @@ -262,22 +277,22 @@ class ProcessorMixin: Args: file_path: Path to the file to process - output_dir: output directory (defaults to config.output_dir) - parse_method: Parse method (defaults to config.mineru_parse_method) + output_dir: output directory (defaults to config.parser_output_dir) + parse_method: Parse method (defaults to config.parse_method) display_stats: Whether to display content statistics (defaults to config.display_content_stats) split_by_character: Optional character to split the text by split_by_character_only: If True, split only by the specified character doc_id: Optional document ID, if not provided MD5 hash will be generated - **kwargs: Additional parameters for MinerU parser (e.g., lang, device, start_page, end_page, formula, table, backend, source) + **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source) """ # Ensure LightRAG is initialized await self._ensure_lightrag_initialized() # Use config defaults if not provided if output_dir is None: - output_dir = self.config.mineru_output_dir + output_dir = self.config.parser_output_dir if parse_method is None: - parse_method = self.config.mineru_parse_method + parse_method = self.config.parse_method if 
display_stats is None: display_stats = self.config.display_content_stats diff --git a/raganything/raganything.py b/raganything/raganything.py index 6f25647..38ffead 100644 --- a/raganything/raganything.py +++ b/raganything/raganything.py @@ -1,8 +1,8 @@ """ -Complete MinerU parsing + multimodal content insertion Pipeline +Complete document parsing + multimodal content insertion Pipeline This script integrates: -1. MinerU document parsing +1. Document parsing (using configurable parsers) 2. Pure text content LightRAG insertion 3. Specialized processing for multimodal content (using different processors) """ @@ -102,6 +102,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin): self.logger.info("RAGAnything initialized with config:") self.logger.info(f" Working directory: {self.config.working_dir}") self.logger.info(f" Parser: {self.config.parser}") + self.logger.info(f" Parse method: {self.config.parse_method}") self.logger.info( f" Multimodal processing - Image: {self.config.enable_image_processing}, " f"Table: {self.config.enable_table_processing}, " @@ -194,7 +195,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin): # Check parser installation if not self.doc_parser.check_installation(): raise RuntimeError( - "Parser is not properly installed. " + f"Parser '{self.config.parser}' is not properly installed. " "Please install it using pip install or uv pip install." 
) @@ -225,24 +226,25 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin): self.logger.info("LightRAG and multimodal processors initialized") - def check_mineru_installation(self) -> bool: + def check_parser_installation(self) -> bool: """ - Check if MinerU 2.0 is properly installed + Check if the configured parser is properly installed Returns: - bool: True if MinerU 2.0 is properly installed + bool: True if the configured parser is properly installed """ - return MineruParser.check_installation(MineruParser()) + return self.doc_parser.check_installation() def get_config_info(self) -> Dict[str, Any]: """Get current configuration information""" return { "directory": { "working_dir": self.config.working_dir, - "mineru_output_dir": self.config.output_dir, + "parser_output_dir": self.config.parser_output_dir, }, "parsing": { - "mineru_parse_method": self.config.mineru_parse_method, + "parser": self.config.parser, + "parse_method": self.config.parse_method, "display_content_stats": self.config.display_content_stats, }, "multimodal_processing": {