mirror of
https://github.com/HKUDS/RAG-Anything.git
synced 2025-08-20 19:01:34 +03:00
Update parser param
This commit is contained in:
25
README.md
25
README.md
@@ -297,7 +297,8 @@ async def main():
|
||||
# Create RAGAnything configuration
|
||||
config = RAGAnythingConfig(
|
||||
working_dir="./rag_storage",
|
||||
mineru_parse_method="auto",
|
||||
parser="mineru", # Parser selection: mineru or docling
|
||||
parse_method="auto", # Parse method: auto, ocr, or txt
|
||||
enable_image_processing=True,
|
||||
enable_table_processing=True,
|
||||
enable_equation_processing=True,
|
||||
@@ -762,14 +763,30 @@ OPENAI_API_KEY=your_openai_api_key
|
||||
OPENAI_BASE_URL=your_base_url # Optional
|
||||
OUTPUT_DIR=./output # Default output directory for parsed documents
|
||||
PARSER=mineru # Parser selection: mineru or docling
|
||||
PARSE_METHOD=auto # Parse method: auto, ocr, or txt
|
||||
```
|
||||
|
||||
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys.
|
||||
**Note:** For backward compatibility, legacy environment variable names are still supported:
|
||||
- `MINERU_PARSE_METHOD` is deprecated; please use `PARSE_METHOD` instead.
|
||||
|
||||
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
|
||||
|
||||
### Parser Configuration
|
||||
|
||||
RAG-Anything now supports multiple parsers, each with its own strengths:
|
||||
|
||||
#### MinerU Parser
|
||||
- Supports PDFs, images, Office documents, and other formats
|
||||
- Powerful OCR and table extraction capabilities
|
||||
- GPU acceleration support
|
||||
|
||||
#### Docling Parser
|
||||
- Optimized for Office documents and HTML files
|
||||
- Better document structure preservation
|
||||
- Native support for multiple Office formats
|
||||
|
||||
### MinerU Configuration
|
||||
|
||||
RAG-Anything now supports multiple parsers:
|
||||
|
||||
```bash
|
||||
# MinerU 2.0 uses command-line parameters instead of config files
|
||||
# Check available options:
|
||||
|
||||
23
README_zh.md
23
README_zh.md
@@ -293,7 +293,8 @@ async def main():
|
||||
# 创建 RAGAnything 配置
|
||||
config = RAGAnythingConfig(
|
||||
working_dir="./rag_storage",
|
||||
mineru_parse_method="auto",
|
||||
parser="mineru", # 选择解析器:mineru 或 docling
|
||||
parse_method="auto", # 解析方法:auto, ocr 或 txt
|
||||
enable_image_processing=True,
|
||||
enable_table_processing=True,
|
||||
enable_equation_processing=True,
|
||||
@@ -761,11 +762,27 @@ OPENAI_API_KEY=your_openai_api_key
|
||||
OPENAI_BASE_URL=your_base_url # 可选
|
||||
OUTPUT_DIR=./output # 解析文档的默认输出目录
|
||||
PARSER=mineru # 解析器选择:mineru 或 docling
|
||||
PARSE_METHOD=auto # 解析方法:auto, ocr 或 txt
|
||||
```
|
||||
|
||||
### MinerU配置
|
||||
**注意:** 为了向后兼容,旧的环境变量名称仍然有效:
|
||||
- `MINERU_PARSE_METHOD` 已弃用,请使用 `PARSE_METHOD`
|
||||
|
||||
RAG-Anything现在支持多种解析器:
|
||||
### 解析器配置
|
||||
|
||||
RAGAnything 现在支持多种解析器,每种解析器都有其特定的优势:
|
||||
|
||||
#### MinerU 解析器
|
||||
- 支持PDF、图像、Office文档等多种格式
|
||||
- 强大的OCR和表格提取能力
|
||||
- 支持GPU加速
|
||||
|
||||
#### Docling 解析器
|
||||
- 专门优化Office文档和HTML文件的解析
|
||||
- 更好的文档结构保持
|
||||
- 原生支持多种Office格式
|
||||
|
||||
### MinerU配置
|
||||
|
||||
```bash
|
||||
# MinerU 2.0使用命令行参数而不是配置文件
|
||||
|
||||
@@ -34,7 +34,7 @@ OLLAMA_EMULATING_MODEL_TAG=latest
|
||||
### RAGAnything Configuration (Multimodal Document Processing)
|
||||
### ---
|
||||
### Parser Configuration
|
||||
# MINERU_PARSE_METHOD=auto
|
||||
# PARSE_METHOD=auto
|
||||
# OUTPUT_DIR=./output
|
||||
# PARSER=mineru
|
||||
# DISPLAY_CONTENT_STATS=true
|
||||
|
||||
@@ -104,11 +104,11 @@ async def process_with_rag(
|
||||
# Create RAGAnything configuration
|
||||
config = RAGAnythingConfig(
|
||||
working_dir=working_dir or "./rag_storage",
|
||||
mineru_parse_method="auto",
|
||||
parser=parser, # Parser selection: mineru or docling
|
||||
parse_method="auto", # Parse method: auto, ocr, or txt
|
||||
enable_image_processing=True,
|
||||
enable_table_processing=True,
|
||||
enable_equation_processing=True,
|
||||
parser=parser,
|
||||
)
|
||||
|
||||
# Define LLM model function
|
||||
|
||||
@@ -29,8 +29,8 @@ class BatchMixin:
|
||||
|
||||
Args:
|
||||
folder_path: Path to the folder to process
|
||||
output_dir: MinerU output directory (defaults to config.mineru_output_dir)
|
||||
parse_method: Parse method (defaults to config.mineru_parse_method)
|
||||
output_dir: Parser output directory (defaults to config.parser_output_dir)
|
||||
parse_method: Parse method (defaults to config.parse_method)
|
||||
display_stats: Whether to display content statistics for each file (defaults to False for batch processing)
|
||||
split_by_character: Optional character to split text by
|
||||
split_by_character_only: If True, split only by the specified character
|
||||
@@ -43,9 +43,9 @@ class BatchMixin:
|
||||
|
||||
# Use config defaults if not provided
|
||||
if output_dir is None:
|
||||
output_dir = self.config.mineru_output_dir
|
||||
output_dir = self.config.parser_output_dir
|
||||
if parse_method is None:
|
||||
parse_method = self.config.mineru_parse_method
|
||||
parse_method = self.config.parse_method
|
||||
if display_stats is None:
|
||||
display_stats = False # Default to False for batch processing
|
||||
if recursive is None:
|
||||
|
||||
@@ -20,13 +20,11 @@ class RAGAnythingConfig:
|
||||
|
||||
# Parser Configuration
|
||||
# ---
|
||||
mineru_parse_method: str = field(
|
||||
default=get_env_value("MINERU_PARSE_METHOD", "auto", str)
|
||||
)
|
||||
"""Default parsing method for MinerU: 'auto', 'ocr', or 'txt'."""
|
||||
parse_method: str = field(default=get_env_value("PARSE_METHOD", "auto", str))
|
||||
"""Default parsing method for document parsing: 'auto', 'ocr', or 'txt'."""
|
||||
|
||||
parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
|
||||
"""Default output directory for MinerU parsed content."""
|
||||
"""Default output directory for parsed content."""
|
||||
|
||||
parser: str = field(default=get_env_value("PARSER", "mineru", str))
|
||||
"""Parser selection: 'mineru' or 'docling'."""
|
||||
@@ -104,3 +102,46 @@ class RAGAnythingConfig:
|
||||
|
||||
content_format: str = field(default=get_env_value("CONTENT_FORMAT", "minerU", str))
|
||||
"""Default content format for context extraction when processing documents."""
|
||||
|
||||
def __post_init__(self):
|
||||
"""Post-initialization setup for backward compatibility"""
|
||||
# Support legacy environment variable names for backward compatibility
|
||||
legacy_parse_method = get_env_value("MINERU_PARSE_METHOD", None, str)
|
||||
if legacy_parse_method and not get_env_value("PARSE_METHOD", None, str):
|
||||
self.parse_method = legacy_parse_method
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"MINERU_PARSE_METHOD is deprecated. Use PARSE_METHOD instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@property
|
||||
def mineru_parse_method(self) -> str:
|
||||
"""
|
||||
Backward compatibility property for old code.
|
||||
|
||||
.. deprecated::
|
||||
Use `parse_method` instead. This property will be removed in a future version.
|
||||
"""
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"mineru_parse_method is deprecated. Use parse_method instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return self.parse_method
|
||||
|
||||
@mineru_parse_method.setter
|
||||
def mineru_parse_method(self, value: str):
|
||||
"""Setter for backward compatibility"""
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"mineru_parse_method is deprecated. Use parse_method instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
self.parse_method = value
|
||||
|
||||
@@ -626,7 +626,7 @@ class MineruParser(Parser):
|
||||
result = subprocess.run(cmd, **subprocess_kwargs)
|
||||
logging.info("MinerU command executed successfully")
|
||||
if result.stdout:
|
||||
logging.debug(f"Output: {result.stdout}")
|
||||
logging.debug(f"MinerU output: {result.stdout}")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logging.error(f"Error running mineru command: {e}")
|
||||
if e.stderr:
|
||||
|
||||
@@ -31,8 +31,8 @@ class ProcessorMixin:
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to parse
|
||||
output_dir: Output directory (defaults to config.output_dir)
|
||||
parse_method: Parse method (defaults to config.mineru_parse_method)
|
||||
output_dir: Output directory (defaults to config.parser_output_dir)
|
||||
parse_method: Parse method (defaults to config.parse_method)
|
||||
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
|
||||
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
|
||||
|
||||
@@ -43,7 +43,7 @@ class ProcessorMixin:
|
||||
if output_dir is None:
|
||||
output_dir = self.config.parser_output_dir
|
||||
if parse_method is None:
|
||||
parse_method = self.config.mineru_parse_method
|
||||
parse_method = self.config.parse_method
|
||||
if display_stats is None:
|
||||
display_stats = self.config.display_content_stats
|
||||
|
||||
@@ -60,10 +60,14 @@ class ProcessorMixin:
|
||||
doc_parser = (
|
||||
DoclingParser() if self.config.parser == "docling" else MineruParser()
|
||||
)
|
||||
|
||||
# Log parser and method information
|
||||
self.logger.info(
|
||||
f"Using {self.config.parser} parser with method: {parse_method}"
|
||||
)
|
||||
|
||||
if ext in [".pdf"]:
|
||||
self.logger.info(
|
||||
f"Detected PDF file, using PDF parser (method={parse_method})..."
|
||||
)
|
||||
self.logger.info("Detected PDF file, using parser for PDF...")
|
||||
content_list = doc_parser.parse_pdf(
|
||||
pdf_path=file_path,
|
||||
output_dir=output_dir,
|
||||
@@ -80,10 +84,20 @@ class ProcessorMixin:
|
||||
".gif",
|
||||
".webp",
|
||||
]:
|
||||
self.logger.info("Detected image file, using image parser...")
|
||||
content_list = MineruParser.parse_image(
|
||||
image_path=file_path, output_dir=output_dir, **kwargs
|
||||
)
|
||||
self.logger.info("Detected image file, using parser for images...")
|
||||
# Use the selected parser's image parsing capability
|
||||
if hasattr(doc_parser, "parse_image"):
|
||||
content_list = doc_parser.parse_image(
|
||||
image_path=file_path, output_dir=output_dir, **kwargs
|
||||
)
|
||||
else:
|
||||
# Fallback to MinerU for image parsing if current parser doesn't support it
|
||||
self.logger.warning(
|
||||
f"{self.config.parser} parser doesn't support image parsing, falling back to MinerU"
|
||||
)
|
||||
content_list = MineruParser().parse_image(
|
||||
image_path=file_path, output_dir=output_dir, **kwargs
|
||||
)
|
||||
elif ext in [
|
||||
".doc",
|
||||
".docx",
|
||||
@@ -96,7 +110,7 @@ class ProcessorMixin:
|
||||
".xhtml",
|
||||
]:
|
||||
self.logger.info(
|
||||
"Detected Office or HTML document, using Office parser..."
|
||||
"Detected Office or HTML document, using parser for Office/HTML..."
|
||||
)
|
||||
content_list = doc_parser.parse_office_doc(
|
||||
doc_path=file_path, output_dir=output_dir, **kwargs
|
||||
@@ -114,11 +128,12 @@ class ProcessorMixin:
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error during parsing with specific parser: {str(e)}")
|
||||
self.logger.warning("Falling back to generic parser...")
|
||||
# If specific parser fails, fall back to generic parser
|
||||
content_list = MineruParser.parse_document(
|
||||
MineruParser(),
|
||||
self.logger.error(
|
||||
f"Error during parsing with {self.config.parser} parser: {str(e)}"
|
||||
)
|
||||
self.logger.warning("Falling back to MinerU parser...")
|
||||
# If specific parser fails, fall back to MinerU parser
|
||||
content_list = MineruParser().parse_document(
|
||||
file_path=file_path,
|
||||
method=parse_method,
|
||||
output_dir=output_dir,
|
||||
@@ -262,22 +277,22 @@ class ProcessorMixin:
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to process
|
||||
output_dir: output directory (defaults to config.output_dir)
|
||||
parse_method: Parse method (defaults to config.mineru_parse_method)
|
||||
output_dir: output directory (defaults to config.parser_output_dir)
|
||||
parse_method: Parse method (defaults to config.parse_method)
|
||||
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
|
||||
split_by_character: Optional character to split the text by
|
||||
split_by_character_only: If True, split only by the specified character
|
||||
doc_id: Optional document ID, if not provided MD5 hash will be generated
|
||||
**kwargs: Additional parameters for MinerU parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
|
||||
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
|
||||
"""
|
||||
# Ensure LightRAG is initialized
|
||||
await self._ensure_lightrag_initialized()
|
||||
|
||||
# Use config defaults if not provided
|
||||
if output_dir is None:
|
||||
output_dir = self.config.mineru_output_dir
|
||||
output_dir = self.config.parser_output_dir
|
||||
if parse_method is None:
|
||||
parse_method = self.config.mineru_parse_method
|
||||
parse_method = self.config.parse_method
|
||||
if display_stats is None:
|
||||
display_stats = self.config.display_content_stats
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
"""
|
||||
Complete MinerU parsing + multimodal content insertion Pipeline
|
||||
Complete document parsing + multimodal content insertion Pipeline
|
||||
|
||||
This script integrates:
|
||||
1. MinerU document parsing
|
||||
1. Document parsing (using configurable parsers)
|
||||
2. Pure text content LightRAG insertion
|
||||
3. Specialized processing for multimodal content (using different processors)
|
||||
"""
|
||||
@@ -102,6 +102,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
self.logger.info("RAGAnything initialized with config:")
|
||||
self.logger.info(f" Working directory: {self.config.working_dir}")
|
||||
self.logger.info(f" Parser: {self.config.parser}")
|
||||
self.logger.info(f" Parse method: {self.config.parse_method}")
|
||||
self.logger.info(
|
||||
f" Multimodal processing - Image: {self.config.enable_image_processing}, "
|
||||
f"Table: {self.config.enable_table_processing}, "
|
||||
@@ -194,7 +195,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
# Check parser installation
|
||||
if not self.doc_parser.check_installation():
|
||||
raise RuntimeError(
|
||||
"Parser is not properly installed. "
|
||||
f"Parser '{self.config.parser}' is not properly installed. "
|
||||
"Please install it using pip install or uv pip install."
|
||||
)
|
||||
|
||||
@@ -225,24 +226,25 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
|
||||
self.logger.info("LightRAG and multimodal processors initialized")
|
||||
|
||||
def check_mineru_installation(self) -> bool:
|
||||
def check_parser_installation(self) -> bool:
|
||||
"""
|
||||
Check if MinerU 2.0 is properly installed
|
||||
Check if the configured parser is properly installed
|
||||
|
||||
Returns:
|
||||
bool: True if MinerU 2.0 is properly installed
|
||||
bool: True if the configured parser is properly installed
|
||||
"""
|
||||
return MineruParser.check_installation(MineruParser())
|
||||
return self.doc_parser.check_installation()
|
||||
|
||||
def get_config_info(self) -> Dict[str, Any]:
|
||||
"""Get current configuration information"""
|
||||
return {
|
||||
"directory": {
|
||||
"working_dir": self.config.working_dir,
|
||||
"mineru_output_dir": self.config.output_dir,
|
||||
"parser_output_dir": self.config.parser_output_dir,
|
||||
},
|
||||
"parsing": {
|
||||
"mineru_parse_method": self.config.mineru_parse_method,
|
||||
"parser": self.config.parser,
|
||||
"parse_method": self.config.parse_method,
|
||||
"display_content_stats": self.config.display_content_stats,
|
||||
},
|
||||
"multimodal_processing": {
|
||||
|
||||
Reference in New Issue
Block a user