Update parser param

This commit is contained in:
zrguo
2025-07-21 23:48:27 +08:00
parent 7aafb58e41
commit d8302d0cf8
9 changed files with 142 additions and 50 deletions

View File

@@ -297,7 +297,8 @@ async def main():
# Create RAGAnything configuration # Create RAGAnything configuration
config = RAGAnythingConfig( config = RAGAnythingConfig(
working_dir="./rag_storage", working_dir="./rag_storage",
mineru_parse_method="auto", parser="mineru", # Parser selection: mineru or docling
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True, enable_image_processing=True,
enable_table_processing=True, enable_table_processing=True,
enable_equation_processing=True, enable_equation_processing=True,
@@ -762,14 +763,30 @@ OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url # Optional OPENAI_BASE_URL=your_base_url # Optional
OUTPUT_DIR=./output # Default output directory for parsed documents OUTPUT_DIR=./output # Default output directory for parsed documents
PARSER=mineru # Parser selection: mineru or docling PARSER=mineru # Parser selection: mineru or docling
PARSE_METHOD=auto # Parse method: auto, ocr, or txt
``` ```
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys. **Note:** For backward compatibility, legacy environment variable names are still supported:
- `MINERU_PARSE_METHOD` is deprecated, please use `PARSE_METHOD`
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
### Parser Configuration
RAG-Anything now supports multiple parsers, each with specific advantages:
#### MinerU Parser
- Supports PDF, images, Office documents, and more formats
- Powerful OCR and table extraction capabilities
- GPU acceleration support
#### Docling Parser
- Optimized for Office documents and HTML files
- Better document structure preservation
- Native support for multiple Office formats
### MinerU Configuration ### MinerU Configuration
RAG-Anything now supports multiple parsers:
```bash ```bash
# MinerU 2.0 uses command-line parameters instead of config files # MinerU 2.0 uses command-line parameters instead of config files
# Check available options: # Check available options:

View File

@@ -293,7 +293,8 @@ async def main():
# 创建 RAGAnything 配置 # 创建 RAGAnything 配置
config = RAGAnythingConfig( config = RAGAnythingConfig(
working_dir="./rag_storage", working_dir="./rag_storage",
mineru_parse_method="auto", parser="mineru", # 选择解析器：mineru 或 docling
parse_method="auto", # 解析方法：auto, ocr 或 txt
enable_image_processing=True, enable_image_processing=True,
enable_table_processing=True, enable_table_processing=True,
enable_equation_processing=True, enable_equation_processing=True,
@@ -761,11 +762,27 @@ OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url # 可选 OPENAI_BASE_URL=your_base_url # 可选
OUTPUT_DIR=./output # 解析文档的默认输出目录 OUTPUT_DIR=./output # 解析文档的默认输出目录
PARSER=mineru # 解析器选择：mineru 或 docling PARSER=mineru # 解析器选择：mineru 或 docling
PARSE_METHOD=auto # 解析方法：auto, ocr 或 txt
``` ```
### MinerU配置 **注意:** 为了向后兼容,旧的环境变量名称仍然有效:
- `MINERU_PARSE_METHOD` 已弃用,请使用 `PARSE_METHOD`
RAG-Anything现在支持多种解析器 ### 解析器配置
RAG-Anything 现在支持多种解析器，每种解析器都有其特定的优势：
#### MinerU 解析器
- 支持PDF、图像、Office文档等多种格式
- 强大的OCR和表格提取能力
- 支持GPU加速
#### Docling 解析器
- 专门优化Office文档和HTML文件的解析
- 更好的文档结构保持
- 原生支持多种Office格式
### MinerU配置
```bash ```bash
# MinerU 2.0使用命令行参数而不是配置文件 # MinerU 2.0使用命令行参数而不是配置文件

View File

@@ -34,7 +34,7 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### RAGAnything Configuration (Multimodal Document Processing) ### RAGAnything Configuration (Multimodal Document Processing)
### --- ### ---
### Parser Configuration ### Parser Configuration
# MINERU_PARSE_METHOD=auto # PARSE_METHOD=auto
# OUTPUT_DIR=./output # OUTPUT_DIR=./output
# PARSER=mineru # PARSER=mineru
# DISPLAY_CONTENT_STATS=true # DISPLAY_CONTENT_STATS=true

View File

@@ -104,11 +104,11 @@ async def process_with_rag(
# Create RAGAnything configuration # Create RAGAnything configuration
config = RAGAnythingConfig( config = RAGAnythingConfig(
working_dir=working_dir or "./rag_storage", working_dir=working_dir or "./rag_storage",
mineru_parse_method="auto", parser=parser, # Parser selection: mineru or docling
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True, enable_image_processing=True,
enable_table_processing=True, enable_table_processing=True,
enable_equation_processing=True, enable_equation_processing=True,
parser=parser,
) )
# Define LLM model function # Define LLM model function

View File

@@ -29,8 +29,8 @@ class BatchMixin:
Args: Args:
folder_path: Path to the folder to process folder_path: Path to the folder to process
output_dir: MinerU output directory (defaults to config.mineru_output_dir) output_dir: Parser output directory (defaults to config.parser_output_dir)
parse_method: Parse method (defaults to config.mineru_parse_method) parse_method: Parse method (defaults to config.parse_method)
display_stats: Whether to display content statistics for each file (defaults to False for batch processing) display_stats: Whether to display content statistics for each file (defaults to False for batch processing)
split_by_character: Optional character to split text by split_by_character: Optional character to split text by
split_by_character_only: If True, split only by the specified character split_by_character_only: If True, split only by the specified character
@@ -43,9 +43,9 @@ class BatchMixin:
# Use config defaults if not provided # Use config defaults if not provided
if output_dir is None: if output_dir is None:
output_dir = self.config.mineru_output_dir output_dir = self.config.parser_output_dir
if parse_method is None: if parse_method is None:
parse_method = self.config.mineru_parse_method parse_method = self.config.parse_method
if display_stats is None: if display_stats is None:
display_stats = False # Default to False for batch processing display_stats = False # Default to False for batch processing
if recursive is None: if recursive is None:

View File

@@ -20,13 +20,11 @@ class RAGAnythingConfig:
# Parser Configuration # Parser Configuration
# --- # ---
mineru_parse_method: str = field( parse_method: str = field(default=get_env_value("PARSE_METHOD", "auto", str))
default=get_env_value("MINERU_PARSE_METHOD", "auto", str) """Default parsing method for document parsing: 'auto', 'ocr', or 'txt'."""
)
"""Default parsing method for MinerU: 'auto', 'ocr', or 'txt'."""
parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str)) parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
"""Default output directory for MinerU parsed content.""" """Default output directory for parsed content."""
parser: str = field(default=get_env_value("PARSER", "mineru", str)) parser: str = field(default=get_env_value("PARSER", "mineru", str))
"""Parser selection: 'mineru' or 'docling'.""" """Parser selection: 'mineru' or 'docling'."""
@@ -104,3 +102,46 @@ class RAGAnythingConfig:
content_format: str = field(default=get_env_value("CONTENT_FORMAT", "minerU", str)) content_format: str = field(default=get_env_value("CONTENT_FORMAT", "minerU", str))
"""Default content format for context extraction when processing documents.""" """Default content format for context extraction when processing documents."""
def __post_init__(self):
    """Honor the deprecated ``MINERU_PARSE_METHOD`` environment variable.

    The legacy variable is only consulted when the current ``PARSE_METHOD``
    variable is unset or empty. When it is consulted, a
    ``DeprecationWarning`` is always emitted, but its value is copied into
    ``self.parse_method`` only if that field still holds the built-in
    default (``"auto"``). A non-default value passed explicitly to the
    constructor is therefore never overwritten by the legacy variable.

    Note: an explicit ``parse_method="auto"`` cannot be told apart from the
    default and will still be replaced by the legacy value.
    """
    import warnings

    legacy_parse_method = get_env_value("MINERU_PARSE_METHOD", None, str)
    if not legacy_parse_method:
        return
    if get_env_value("PARSE_METHOD", None, str):
        # The current variable takes precedence over the legacy one.
        return

    # Built-in default of the ``parse_method`` field when PARSE_METHOD is unset.
    builtin_default = "auto"
    if self.parse_method == builtin_default:
        self.parse_method = legacy_parse_method

    # stacklevel=3 skips this method and the dataclass-generated __init__ so
    # the warning is attributed to the code that constructs the config.
    warnings.warn(
        "MINERU_PARSE_METHOD is deprecated. Use PARSE_METHOD instead.",
        DeprecationWarning,
        stacklevel=3,
    )
@property
def mineru_parse_method(self) -> str:
    """
    Deprecated read alias for ``parse_method``.

    Emits a ``DeprecationWarning`` and returns ``self.parse_method``.

    .. deprecated::
    Use `parse_method` instead. This property will be removed in a future version.
    """
    import warnings

    message = "mineru_parse_method is deprecated. Use parse_method instead."
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return self.parse_method


@mineru_parse_method.setter
def mineru_parse_method(self, value: str):
    """
    Deprecated write alias for ``parse_method``.

    Emits a ``DeprecationWarning`` and stores ``value`` in ``self.parse_method``.
    """
    import warnings

    message = "mineru_parse_method is deprecated. Use parse_method instead."
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    self.parse_method = value

View File

@@ -626,7 +626,7 @@ class MineruParser(Parser):
result = subprocess.run(cmd, **subprocess_kwargs) result = subprocess.run(cmd, **subprocess_kwargs)
logging.info("MinerU command executed successfully") logging.info("MinerU command executed successfully")
if result.stdout: if result.stdout:
logging.debug(f"Output: {result.stdout}") logging.debug(f"MinerU output: {result.stdout}")
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
logging.error(f"Error running mineru command: {e}") logging.error(f"Error running mineru command: {e}")
if e.stderr: if e.stderr:

View File

@@ -31,8 +31,8 @@ class ProcessorMixin:
Args: Args:
file_path: Path to the file to parse file_path: Path to the file to parse
output_dir: Output directory (defaults to config.output_dir) output_dir: Output directory (defaults to config.parser_output_dir)
parse_method: Parse method (defaults to config.mineru_parse_method) parse_method: Parse method (defaults to config.parse_method)
display_stats: Whether to display content statistics (defaults to config.display_content_stats) display_stats: Whether to display content statistics (defaults to config.display_content_stats)
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source) **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
@@ -43,7 +43,7 @@ class ProcessorMixin:
if output_dir is None: if output_dir is None:
output_dir = self.config.parser_output_dir output_dir = self.config.parser_output_dir
if parse_method is None: if parse_method is None:
parse_method = self.config.mineru_parse_method parse_method = self.config.parse_method
if display_stats is None: if display_stats is None:
display_stats = self.config.display_content_stats display_stats = self.config.display_content_stats
@@ -60,10 +60,14 @@ class ProcessorMixin:
doc_parser = ( doc_parser = (
DoclingParser() if self.config.parser == "docling" else MineruParser() DoclingParser() if self.config.parser == "docling" else MineruParser()
) )
# Log parser and method information
self.logger.info(
f"Using {self.config.parser} parser with method: {parse_method}"
)
if ext in [".pdf"]: if ext in [".pdf"]:
self.logger.info( self.logger.info("Detected PDF file, using parser for PDF...")
f"Detected PDF file, using PDF parser (method={parse_method})..."
)
content_list = doc_parser.parse_pdf( content_list = doc_parser.parse_pdf(
pdf_path=file_path, pdf_path=file_path,
output_dir=output_dir, output_dir=output_dir,
@@ -80,10 +84,20 @@ class ProcessorMixin:
".gif", ".gif",
".webp", ".webp",
]: ]:
self.logger.info("Detected image file, using image parser...") self.logger.info("Detected image file, using parser for images...")
content_list = MineruParser.parse_image( # Use the selected parser's image parsing capability
image_path=file_path, output_dir=output_dir, **kwargs if hasattr(doc_parser, "parse_image"):
) content_list = doc_parser.parse_image(
image_path=file_path, output_dir=output_dir, **kwargs
)
else:
# Fallback to MinerU for image parsing if current parser doesn't support it
self.logger.warning(
f"{self.config.parser} parser doesn't support image parsing, falling back to MinerU"
)
content_list = MineruParser().parse_image(
image_path=file_path, output_dir=output_dir, **kwargs
)
elif ext in [ elif ext in [
".doc", ".doc",
".docx", ".docx",
@@ -96,7 +110,7 @@ class ProcessorMixin:
".xhtml", ".xhtml",
]: ]:
self.logger.info( self.logger.info(
"Detected Office or HTML document, using Office parser..." "Detected Office or HTML document, using parser for Office/HTML..."
) )
content_list = doc_parser.parse_office_doc( content_list = doc_parser.parse_office_doc(
doc_path=file_path, output_dir=output_dir, **kwargs doc_path=file_path, output_dir=output_dir, **kwargs
@@ -114,11 +128,12 @@ class ProcessorMixin:
) )
except Exception as e: except Exception as e:
self.logger.error(f"Error during parsing with specific parser: {str(e)}") self.logger.error(
self.logger.warning("Falling back to generic parser...") f"Error during parsing with {self.config.parser} parser: {str(e)}"
# If specific parser fails, fall back to generic parser )
content_list = MineruParser.parse_document( self.logger.warning("Falling back to MinerU parser...")
MineruParser(), # If specific parser fails, fall back to MinerU parser
content_list = MineruParser().parse_document(
file_path=file_path, file_path=file_path,
method=parse_method, method=parse_method,
output_dir=output_dir, output_dir=output_dir,
@@ -262,22 +277,22 @@ class ProcessorMixin:
Args: Args:
file_path: Path to the file to process file_path: Path to the file to process
output_dir: output directory (defaults to config.output_dir) output_dir: output directory (defaults to config.parser_output_dir)
parse_method: Parse method (defaults to config.mineru_parse_method) parse_method: Parse method (defaults to config.parse_method)
display_stats: Whether to display content statistics (defaults to config.display_content_stats) display_stats: Whether to display content statistics (defaults to config.display_content_stats)
split_by_character: Optional character to split the text by split_by_character: Optional character to split the text by
split_by_character_only: If True, split only by the specified character split_by_character_only: If True, split only by the specified character
doc_id: Optional document ID, if not provided MD5 hash will be generated doc_id: Optional document ID, if not provided MD5 hash will be generated
**kwargs: Additional parameters for MinerU parser (e.g., lang, device, start_page, end_page, formula, table, backend, source) **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
""" """
# Ensure LightRAG is initialized # Ensure LightRAG is initialized
await self._ensure_lightrag_initialized() await self._ensure_lightrag_initialized()
# Use config defaults if not provided # Use config defaults if not provided
if output_dir is None: if output_dir is None:
output_dir = self.config.mineru_output_dir output_dir = self.config.parser_output_dir
if parse_method is None: if parse_method is None:
parse_method = self.config.mineru_parse_method parse_method = self.config.parse_method
if display_stats is None: if display_stats is None:
display_stats = self.config.display_content_stats display_stats = self.config.display_content_stats

View File

@@ -1,8 +1,8 @@
""" """
Complete MinerU parsing + multimodal content insertion Pipeline Complete document parsing + multimodal content insertion Pipeline
This script integrates: This script integrates:
1. MinerU document parsing 1. Document parsing (using configurable parsers)
2. Pure text content LightRAG insertion 2. Pure text content LightRAG insertion
3. Specialized processing for multimodal content (using different processors) 3. Specialized processing for multimodal content (using different processors)
""" """
@@ -102,6 +102,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
self.logger.info("RAGAnything initialized with config:") self.logger.info("RAGAnything initialized with config:")
self.logger.info(f" Working directory: {self.config.working_dir}") self.logger.info(f" Working directory: {self.config.working_dir}")
self.logger.info(f" Parser: {self.config.parser}") self.logger.info(f" Parser: {self.config.parser}")
self.logger.info(f" Parse method: {self.config.parse_method}")
self.logger.info( self.logger.info(
f" Multimodal processing - Image: {self.config.enable_image_processing}, " f" Multimodal processing - Image: {self.config.enable_image_processing}, "
f"Table: {self.config.enable_table_processing}, " f"Table: {self.config.enable_table_processing}, "
@@ -194,7 +195,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
# Check parser installation # Check parser installation
if not self.doc_parser.check_installation(): if not self.doc_parser.check_installation():
raise RuntimeError( raise RuntimeError(
"Parser is not properly installed. " f"Parser '{self.config.parser}' is not properly installed. "
"Please install it using pip install or uv pip install." "Please install it using pip install or uv pip install."
) )
@@ -225,24 +226,25 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
self.logger.info("LightRAG and multimodal processors initialized") self.logger.info("LightRAG and multimodal processors initialized")
def check_mineru_installation(self) -> bool: def check_parser_installation(self) -> bool:
""" """
Check if MinerU 2.0 is properly installed Check if the configured parser is properly installed
Returns: Returns:
bool: True if MinerU 2.0 is properly installed bool: True if the configured parser is properly installed
""" """
return MineruParser.check_installation(MineruParser()) return self.doc_parser.check_installation()
def get_config_info(self) -> Dict[str, Any]: def get_config_info(self) -> Dict[str, Any]:
"""Get current configuration information""" """Get current configuration information"""
return { return {
"directory": { "directory": {
"working_dir": self.config.working_dir, "working_dir": self.config.working_dir,
"mineru_output_dir": self.config.output_dir, "parser_output_dir": self.config.parser_output_dir,
}, },
"parsing": { "parsing": {
"mineru_parse_method": self.config.mineru_parse_method, "parser": self.config.parser,
"parse_method": self.config.parse_method,
"display_content_stats": self.config.display_content_stats, "display_content_stats": self.config.display_content_stats,
}, },
"multimodal_processing": { "multimodal_processing": {