Update parser param

This commit is contained in:
zrguo
2025-07-21 23:48:27 +08:00
parent 7aafb58e41
commit d8302d0cf8
9 changed files with 142 additions and 50 deletions

View File

@@ -297,7 +297,8 @@ async def main():
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir="./rag_storage",
mineru_parse_method="auto",
parser="mineru", # Parser selection: mineru or docling
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
@@ -762,14 +763,30 @@ OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url # Optional
OUTPUT_DIR=./output # Default output directory for parsed documents
PARSER=mineru # Parser selection: mineru or docling
PARSE_METHOD=auto # Parse method: auto, ocr, or txt
```
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys.
**Note:** For backward compatibility, legacy environment variable names are still supported:
- `MINERU_PARSE_METHOD` is deprecated; use `PARSE_METHOD` instead.
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
### Parser Configuration
RAGAnything now supports multiple parsers, each with specific advantages:
#### MinerU Parser
- Supports PDF, images, Office documents, and more formats
- Powerful OCR and table extraction capabilities
- GPU acceleration support
#### Docling Parser
- Optimized for Office documents and HTML files
- Better document structure preservation
- Native support for multiple Office formats
### MinerU Configuration
RAG-Anything now supports multiple parsers:
```bash
# MinerU 2.0 uses command-line parameters instead of config files
# Check available options:

View File

@@ -293,7 +293,8 @@ async def main():
# 创建 RAGAnything 配置
config = RAGAnythingConfig(
working_dir="./rag_storage",
mineru_parse_method="auto",
parser="mineru", # 选择解析器：mineru 或 docling
parse_method="auto", # 解析方法：auto, ocr 或 txt
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
@@ -761,11 +762,27 @@ OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url # 可选
OUTPUT_DIR=./output # 解析文档的默认输出目录
PARSER=mineru # 解析器选择：mineru 或 docling
PARSE_METHOD=auto # 解析方法：auto, ocr 或 txt
```
### MinerU配置
**注意:** 为了向后兼容,旧的环境变量名称仍然有效:
- `MINERU_PARSE_METHOD` 已弃用,请使用 `PARSE_METHOD`
RAG-Anything现在支持多种解析器
### 解析器配置
RAGAnything 现在支持多种解析器,每种解析器都有其特定的优势:
#### MinerU 解析器
- 支持PDF、图像、Office文档等多种格式
- 强大的OCR和表格提取能力
- 支持GPU加速
#### Docling 解析器
- 专门优化Office文档和HTML文件的解析
- 更好的文档结构保持
- 原生支持多种Office格式
### MinerU配置
```bash
# MinerU 2.0使用命令行参数而不是配置文件

View File

@@ -34,7 +34,7 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### RAGAnything Configuration (Multimodal Document Processing)
### ---
### Parser Configuration
# MINERU_PARSE_METHOD=auto
# PARSE_METHOD=auto
# OUTPUT_DIR=./output
# PARSER=mineru
# DISPLAY_CONTENT_STATS=true

View File

@@ -104,11 +104,11 @@ async def process_with_rag(
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir=working_dir or "./rag_storage",
mineru_parse_method="auto",
parser=parser, # Parser selection: mineru or docling
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
parser=parser,
)
# Define LLM model function

View File

@@ -29,8 +29,8 @@ class BatchMixin:
Args:
folder_path: Path to the folder to process
output_dir: MinerU output directory (defaults to config.mineru_output_dir)
parse_method: Parse method (defaults to config.mineru_parse_method)
output_dir: Parser output directory (defaults to config.parser_output_dir)
parse_method: Parse method (defaults to config.parse_method)
display_stats: Whether to display content statistics for each file (defaults to False for batch processing)
split_by_character: Optional character to split text by
split_by_character_only: If True, split only by the specified character
@@ -43,9 +43,9 @@ class BatchMixin:
# Use config defaults if not provided
if output_dir is None:
output_dir = self.config.mineru_output_dir
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.mineru_parse_method
parse_method = self.config.parse_method
if display_stats is None:
display_stats = False # Default to False for batch processing
if recursive is None:

View File

@@ -20,13 +20,11 @@ class RAGAnythingConfig:
# Parser Configuration
# ---
mineru_parse_method: str = field(
default=get_env_value("MINERU_PARSE_METHOD", "auto", str)
)
"""Default parsing method for MinerU: 'auto', 'ocr', or 'txt'."""
parse_method: str = field(default=get_env_value("PARSE_METHOD", "auto", str))
"""Default parsing method for document parsing: 'auto', 'ocr', or 'txt'."""
parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
"""Default output directory for MinerU parsed content."""
"""Default output directory for parsed content."""
parser: str = field(default=get_env_value("PARSER", "mineru", str))
"""Parser selection: 'mineru' or 'docling'."""
@@ -104,3 +102,46 @@ class RAGAnythingConfig:
content_format: str = field(default=get_env_value("CONTENT_FORMAT", "minerU", str))
"""Default content format for context extraction when processing documents."""
def __post_init__(self):
    """Honor the deprecated ``MINERU_PARSE_METHOD`` environment variable.

    When ``MINERU_PARSE_METHOD`` is set and ``PARSE_METHOD`` is not, a
    ``DeprecationWarning`` is emitted and the legacy value is copied into
    ``parse_method`` -- but only if ``parse_method`` still holds the built-in
    default ``"auto"``. A parse method the caller selected explicitly is
    never replaced by the deprecated variable.
    """
    import warnings

    legacy_parse_method = get_env_value("MINERU_PARSE_METHOD", None, str)
    # Nothing to migrate when the legacy variable is absent, or when the
    # new variable is set (the new name always wins).
    if not legacy_parse_method or get_env_value("PARSE_METHOD", None, str):
        return

    # "auto" is the fallback default of the `parse_method` field; any other
    # value was chosen deliberately and must not be clobbered by a
    # deprecated environment variable.
    if self.parse_method == "auto":
        self.parse_method = legacy_parse_method

    warnings.warn(
        "MINERU_PARSE_METHOD is deprecated. Use PARSE_METHOD instead.",
        DeprecationWarning,
        # Frame 1 is this method and frame 2 is presumably the generated
        # __init__ that invokes it; frame 3 is the code constructing the
        # config, which is where the warning should be attributed so that
        # default warning filters can surface it.
        stacklevel=3,
    )
@property
def mineru_parse_method(self) -> str:
    """Deprecated read alias for ``parse_method``.

    Emits a ``DeprecationWarning`` attributed to the accessing code, then
    returns the current ``parse_method`` value.

    .. deprecated::
        Read ``parse_method`` directly. This alias will be removed in a
        future version.
    """
    from warnings import warn

    notice = "mineru_parse_method is deprecated. Use parse_method instead."
    warn(notice, category=DeprecationWarning, stacklevel=2)
    return self.parse_method

@mineru_parse_method.setter
def mineru_parse_method(self, value: str):
    """Deprecated write alias: warn, then store ``value`` in ``parse_method``."""
    from warnings import warn

    notice = "mineru_parse_method is deprecated. Use parse_method instead."
    warn(notice, category=DeprecationWarning, stacklevel=2)
    self.parse_method = value

View File

@@ -626,7 +626,7 @@ class MineruParser(Parser):
result = subprocess.run(cmd, **subprocess_kwargs)
logging.info("MinerU command executed successfully")
if result.stdout:
logging.debug(f"Output: {result.stdout}")
logging.debug(f"MinerU output: {result.stdout}")
except subprocess.CalledProcessError as e:
logging.error(f"Error running mineru command: {e}")
if e.stderr:

View File

@@ -31,8 +31,8 @@ class ProcessorMixin:
Args:
file_path: Path to the file to parse
output_dir: Output directory (defaults to config.output_dir)
parse_method: Parse method (defaults to config.mineru_parse_method)
output_dir: Output directory (defaults to config.parser_output_dir)
parse_method: Parse method (defaults to config.parse_method)
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
@@ -43,7 +43,7 @@ class ProcessorMixin:
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.mineru_parse_method
parse_method = self.config.parse_method
if display_stats is None:
display_stats = self.config.display_content_stats
@@ -60,10 +60,14 @@ class ProcessorMixin:
doc_parser = (
DoclingParser() if self.config.parser == "docling" else MineruParser()
)
# Log parser and method information
self.logger.info(
f"Using {self.config.parser} parser with method: {parse_method}"
)
if ext in [".pdf"]:
self.logger.info(
f"Detected PDF file, using PDF parser (method={parse_method})..."
)
self.logger.info("Detected PDF file, using parser for PDF...")
content_list = doc_parser.parse_pdf(
pdf_path=file_path,
output_dir=output_dir,
@@ -80,10 +84,20 @@ class ProcessorMixin:
".gif",
".webp",
]:
self.logger.info("Detected image file, using image parser...")
content_list = MineruParser.parse_image(
image_path=file_path, output_dir=output_dir, **kwargs
)
self.logger.info("Detected image file, using parser for images...")
# Use the selected parser's image parsing capability
if hasattr(doc_parser, "parse_image"):
content_list = doc_parser.parse_image(
image_path=file_path, output_dir=output_dir, **kwargs
)
else:
# Fallback to MinerU for image parsing if current parser doesn't support it
self.logger.warning(
f"{self.config.parser} parser doesn't support image parsing, falling back to MinerU"
)
content_list = MineruParser().parse_image(
image_path=file_path, output_dir=output_dir, **kwargs
)
elif ext in [
".doc",
".docx",
@@ -96,7 +110,7 @@ class ProcessorMixin:
".xhtml",
]:
self.logger.info(
"Detected Office or HTML document, using Office parser..."
"Detected Office or HTML document, using parser for Office/HTML..."
)
content_list = doc_parser.parse_office_doc(
doc_path=file_path, output_dir=output_dir, **kwargs
@@ -114,11 +128,12 @@ class ProcessorMixin:
)
except Exception as e:
self.logger.error(f"Error during parsing with specific parser: {str(e)}")
self.logger.warning("Falling back to generic parser...")
# If specific parser fails, fall back to generic parser
content_list = MineruParser.parse_document(
MineruParser(),
self.logger.error(
f"Error during parsing with {self.config.parser} parser: {str(e)}"
)
self.logger.warning("Falling back to MinerU parser...")
# If specific parser fails, fall back to MinerU parser
content_list = MineruParser().parse_document(
file_path=file_path,
method=parse_method,
output_dir=output_dir,
@@ -262,22 +277,22 @@ class ProcessorMixin:
Args:
file_path: Path to the file to process
output_dir: output directory (defaults to config.output_dir)
parse_method: Parse method (defaults to config.mineru_parse_method)
output_dir: output directory (defaults to config.parser_output_dir)
parse_method: Parse method (defaults to config.parse_method)
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
split_by_character: Optional character to split the text by
split_by_character_only: If True, split only by the specified character
doc_id: Optional document ID, if not provided MD5 hash will be generated
**kwargs: Additional parameters for MinerU parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
"""
# Ensure LightRAG is initialized
await self._ensure_lightrag_initialized()
# Use config defaults if not provided
if output_dir is None:
output_dir = self.config.mineru_output_dir
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.mineru_parse_method
parse_method = self.config.parse_method
if display_stats is None:
display_stats = self.config.display_content_stats

View File

@@ -1,8 +1,8 @@
"""
Complete MinerU parsing + multimodal content insertion Pipeline
Complete document parsing + multimodal content insertion Pipeline
This script integrates:
1. MinerU document parsing
1. Document parsing (using configurable parsers)
2. Pure text content LightRAG insertion
3. Specialized processing for multimodal content (using different processors)
"""
@@ -102,6 +102,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
self.logger.info("RAGAnything initialized with config:")
self.logger.info(f" Working directory: {self.config.working_dir}")
self.logger.info(f" Parser: {self.config.parser}")
self.logger.info(f" Parse method: {self.config.parse_method}")
self.logger.info(
f" Multimodal processing - Image: {self.config.enable_image_processing}, "
f"Table: {self.config.enable_table_processing}, "
@@ -194,7 +195,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
# Check parser installation
if not self.doc_parser.check_installation():
raise RuntimeError(
"Parser is not properly installed. "
f"Parser '{self.config.parser}' is not properly installed. "
"Please install it using pip install or uv pip install."
)
@@ -225,24 +226,25 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
self.logger.info("LightRAG and multimodal processors initialized")
def check_mineru_installation(self) -> bool:
def check_parser_installation(self) -> bool:
"""
Check if MinerU 2.0 is properly installed
Check if the configured parser is properly installed
Returns:
bool: True if MinerU 2.0 is properly installed
bool: True if the configured parser is properly installed
"""
return MineruParser.check_installation(MineruParser())
return self.doc_parser.check_installation()
def get_config_info(self) -> Dict[str, Any]:
"""Get current configuration information"""
return {
"directory": {
"working_dir": self.config.working_dir,
"mineru_output_dir": self.config.output_dir,
"parser_output_dir": self.config.parser_output_dir,
},
"parsing": {
"mineru_parse_method": self.config.mineru_parse_method,
"parser": self.config.parser,
"parse_method": self.config.parse_method,
"display_content_stats": self.config.display_content_stats,
},
"multimodal_processing": {