Merge pull request #58 from HKUDS/docling

Add Docling Parser
This commit is contained in:
zrguo
2025-07-21 12:30:43 +08:00
committed by GitHub
10 changed files with 1912 additions and 70 deletions

1
.gitignore vendored
View File

@@ -61,6 +61,7 @@ ignore_this.txt
dickens*/
book.txt
LightRAG.pdf
LightRAG_2-4.pdf
download_models_hf.py
lightrag-dev/
gui/

View File

@@ -722,8 +722,8 @@ The `examples/` directory contains comprehensive usage examples:
**Run examples:**
```bash
# End-to-end processing
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
# End-to-end processing with parser selection
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
# Direct modal processing
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
@@ -760,13 +760,15 @@ Create a `.env` file (refer to `.env.example`):
```bash
OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url # Optional
OUTPUT_DIR=./output # Default output directory for parsed documents
PARSER=mineru # Parser selection: mineru or docling
```
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys.
### MinerU Configuration
MinerU 2.0 uses a simplified configuration approach:
RAG-Anything now supports multiple parsers:
```bash
# MinerU 2.0 uses command-line parameters instead of config files
@@ -779,21 +781,23 @@ mineru -p input.pdf -o output_dir -m ocr # OCR-focused parsing
mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU acceleration
```
You can also configure MinerU through RAGAnything parameters:
You can also configure parsing through RAGAnything parameters:
```python
# Basic parsing configuration
# Basic parsing configuration with parser selection
await rag.process_document_complete(
file_path="document.pdf",
output_dir="./output/",
parse_method="auto", # or "ocr", "txt"
parser="mineru" # Optional: "mineru" or "docling"
)
# Advanced MinerU parsing configuration with special parameters
# Advanced parsing configuration with special parameters
await rag.process_document_complete(
file_path="document.pdf",
output_dir="./output/",
parse_method="auto", # Parsing method: "auto", "ocr", "txt"
parser="mineru", # Parser selection: "mineru" or "docling"
# MinerU special parameters - all supported kwargs:
lang="ch", # Document language for OCR optimization (e.g., "ch", "en", "ja")
@@ -813,7 +817,7 @@ await rag.process_document_complete(
)
```
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments.
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs.
### Processing Requirements

View File

@@ -719,8 +719,8 @@ if __name__ == "__main__":
**运行示例:**
```bash
# 端到端处理
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
# 端到端处理(包含解析器选择)
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
# 直接模态处理
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
@@ -759,11 +759,13 @@ python examples/text_format_test.py --check-reportlab --file dummy
```bash
OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url # 可选
OUTPUT_DIR=./output # 解析文档的默认输出目录
PARSER=mineru # 解析器选择：mineru 或 docling
```
### MinerU配置
MinerU 2.0使用简化的配置方式
RAG-Anything现在支持多种解析器：
```bash
# MinerU 2.0使用命令行参数而不是配置文件
@@ -776,21 +778,23 @@ mineru -p input.pdf -o output_dir -m ocr # OCR重点解析
mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU加速
```
你也可以通过RAGAnything参数配置MinerU
你也可以通过RAGAnything参数配置解析
```python
# 基础解析配置
# 基础解析配置和解析器选择
await rag.process_document_complete(
file_path="document.pdf",
output_dir="./output/",
parse_method="auto", # 或 "ocr", "txt"
parser="mineru" # 可选:"mineru" 或 "docling"
)
# MinerU高级解析配置(包含特殊参数)
# 高级解析配置(包含特殊参数)
await rag.process_document_complete(
file_path="document.pdf",
output_dir="./output/",
parse_method="auto", # 解析方法:"auto", "ocr", "txt"
parser="mineru", # 解析器选择:"mineru" 或 "docling"
# MinerU特殊参数 - 支持的所有kwargs
lang="ch", # 文档语言优化(如:"ch", "en", "ja")
@@ -802,6 +806,7 @@ await rag.process_document_complete(
backend="pipeline", # 解析后端pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client
source="huggingface", # 模型源:"huggingface", "modelscope", "local"
# vlm_url="http://127.0.0.1:3000" # 当backend=vlm-sglang-client时需指定服务地址
# RAGAnything标准参数
display_stats=True, # 显示内容统计信息
split_by_character=None, # 可选的文本分割字符
@@ -809,7 +814,7 @@ await rag.process_document_complete(
)
```
> **注意**：MinerU 2.0不再使用 `magic-pdf.json` 配置文件。所有设置现在通过命令行参数或函数参数传递。
> **注意**：MinerU 2.0不再使用 `magic-pdf.json` 配置文件。所有设置现在通过命令行参数或函数参数传递。RAG-Anything现在支持多种文档解析器 - 你可以根据需要在MinerU和Docling之间选择。
### 处理要求

View File

@@ -33,9 +33,10 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### RAGAnything Configuration (Multimodal Document Processing)
### ---
### MinerU Parser Configuration
### Parser Configuration
# MINERU_PARSE_METHOD=auto
# MINERU_OUTPUT_DIR=./output
# OUTPUT_DIR=./output
# PARSER=mineru
# DISPLAY_CONTENT_STATS=true
### Multimodal Processing Configuration

View File

@@ -88,6 +88,7 @@ async def process_with_rag(
api_key: str,
base_url: str = None,
working_dir: str = None,
parser: str = None,
):
"""
Process document with RAGAnything
@@ -107,6 +108,7 @@ async def process_with_rag(
enable_image_processing=True,
enable_table_processing=True,
enable_equation_processing=True,
parser=parser,
)
# Define LLM model function
@@ -250,17 +252,26 @@ def main():
)
parser.add_argument(
"--api-key",
default=os.getenv("OPENAI_API_KEY"),
help="OpenAI API key (defaults to OPENAI_API_KEY env var)",
default=os.getenv("LLM_BINDING_API_KEY"),
help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
)
parser.add_argument(
"--base-url",
default=os.getenv("LLM_BINDING_HOST"),
help="Optional base URL for API",
)
parser.add_argument(
"--parser",
default=os.getenv("PARSER", "mineru"),
help="Optional base URL for API",
)
parser.add_argument("--base-url", help="Optional base URL for API")
args = parser.parse_args()
# Check if API key is provided
if not args.api_key:
logger.error("Error: OpenAI API key is required")
logger.error("Set OPENAI_API_KEY environment variable or use --api-key option")
logger.error("Set api key environment variable or use --api-key option")
return
# Create output directory if specified
@@ -270,7 +281,12 @@ def main():
# Process with RAG
asyncio.run(
process_with_rag(
args.file_path, args.output, args.api_key, args.base_url, args.working_dir
args.file_path,
args.output,
args.api_key,
args.base_url,
args.working_dir,
args.parser,
)
)

View File

@@ -18,18 +18,19 @@ class RAGAnythingConfig:
working_dir: str = field(default=get_env_value("WORKING_DIR", "./rag_storage", str))
"""Directory where RAG storage and cache files are stored."""
# MinerU Parser Configuration
# Parser Configuration
# ---
mineru_parse_method: str = field(
default=get_env_value("MINERU_PARSE_METHOD", "auto", str)
)
"""Default parsing method for MinerU: 'auto', 'ocr', or 'txt'."""
mineru_output_dir: str = field(
default=get_env_value("MINERU_OUTPUT_DIR", "./output", str)
)
parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
"""Default output directory for MinerU parsed content."""
parser: str = field(default=get_env_value("PARSER", "mineru", str))
"""Parser selection: 'mineru' or 'docling'."""
display_content_stats: bool = field(
default=get_env_value("DISPLAY_CONTENT_STATS", True, bool)
)

View File

@@ -517,6 +517,144 @@ class BaseModalProcessor:
chunk_results,
)
def _robust_json_parse(self, response: str) -> dict:
"""Robust JSON parsing with multiple fallback strategies"""
# Strategy 1: Try direct parsing first
for json_candidate in self._extract_all_json_candidates(response):
result = self._try_parse_json(json_candidate)
if result:
return result
# Strategy 2: Try with basic cleanup
for json_candidate in self._extract_all_json_candidates(response):
cleaned = self._basic_json_cleanup(json_candidate)
result = self._try_parse_json(cleaned)
if result:
return result
# Strategy 3: Try progressive quote fixing
for json_candidate in self._extract_all_json_candidates(response):
fixed = self._progressive_quote_fix(json_candidate)
result = self._try_parse_json(fixed)
if result:
return result
# Strategy 4: Fallback to regex field extraction
return self._extract_fields_with_regex(response)
def _extract_all_json_candidates(self, response: str) -> list:
"""Extract all possible JSON candidates from response"""
candidates = []
# Method 1: JSON in code blocks
import re
json_blocks = re.findall(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
candidates.extend(json_blocks)
# Method 2: Balanced braces
brace_count = 0
start_pos = -1
for i, char in enumerate(response):
if char == "{":
if brace_count == 0:
start_pos = i
brace_count += 1
elif char == "}":
brace_count -= 1
if brace_count == 0 and start_pos != -1:
candidates.append(response[start_pos : i + 1])
# Method 3: Simple regex fallback
simple_match = re.search(r"\{.*\}", response, re.DOTALL)
if simple_match:
candidates.append(simple_match.group(0))
return candidates
def _try_parse_json(self, json_str: str) -> dict:
"""Try to parse JSON string, return None if failed"""
if not json_str or not json_str.strip():
return None
try:
return json.loads(json_str)
except (json.JSONDecodeError, ValueError):
return None
def _basic_json_cleanup(self, json_str: str) -> str:
"""Basic cleanup for common JSON issues"""
# Remove extra whitespace
json_str = json_str.strip()
# Fix common quote issues
json_str = json_str.replace('"', '"').replace('"', '"') # Smart quotes
json_str = json_str.replace(""", "'").replace(""", "'") # Smart apostrophes
# Fix trailing commas (simple case)
json_str = re.sub(r",(\s*[}\]])", r"\1", json_str)
return json_str
def _progressive_quote_fix(self, json_str: str) -> str:
"""Progressive fixing of quote and escape issues"""
# Only escape unescaped backslashes before quotes
json_str = re.sub(r'(?<!\\)\\(?=")', r"\\\\", json_str)
# Fix unescaped backslashes in string values (more conservative)
def fix_string_content(match):
content = match.group(1)
# Only escape obvious problematic patterns
content = re.sub(r"\\(?=[a-zA-Z])", r"\\\\", content) # \alpha -> \\alpha
return f'"{content}"'
json_str = re.sub(r'"([^"]*(?:\\.[^"]*)*)"', fix_string_content, json_str)
return json_str
def _extract_fields_with_regex(self, response: str) -> dict:
"""Extract required fields using regex as last resort"""
logger.warning("Using regex fallback for JSON parsing")
# Extract detailed_description
desc_match = re.search(
r'"detailed_description":\s*"([^"]*(?:\\.[^"]*)*)"', response, re.DOTALL
)
description = desc_match.group(1) if desc_match else ""
# Extract entity_name
name_match = re.search(r'"entity_name":\s*"([^"]*(?:\\.[^"]*)*)"', response)
entity_name = name_match.group(1) if name_match else "unknown_entity"
# Extract entity_type
type_match = re.search(r'"entity_type":\s*"([^"]*(?:\\.[^"]*)*)"', response)
entity_type = type_match.group(1) if type_match else "unknown"
# Extract summary
summary_match = re.search(
r'"summary":\s*"([^"]*(?:\\.[^"]*)*)"', response, re.DOTALL
)
summary = summary_match.group(1) if summary_match else description[:100]
return {
"detailed_description": description,
"entity_info": {
"entity_name": entity_name,
"entity_type": entity_type,
"summary": summary,
},
}
def _extract_json_from_response(self, response: str) -> str:
"""Legacy method - now handled by _extract_all_json_candidates"""
candidates = self._extract_all_json_candidates(response)
return candidates[0] if candidates else None
def _fix_json_escapes(self, json_str: str) -> str:
"""Legacy method - now handled by progressive strategies"""
return self._progressive_quote_fix(json_str)
async def _process_chunk_for_extraction(
self, chunk_id: str, modal_entity_name: str, batch_mode: bool = False
):
@@ -743,9 +881,7 @@ class ImageModalProcessor(BaseModalProcessor):
) -> Tuple[str, Dict[str, Any]]:
"""Parse model response"""
try:
response_data = json.loads(
re.search(r"\{.*\}", response, re.DOTALL).group(0)
)
response_data = self._robust_json_parse(response)
description = response_data.get("detailed_description", "")
entity_data = response_data.get("entity_info", {})
@@ -768,6 +904,7 @@ class ImageModalProcessor(BaseModalProcessor):
except (json.JSONDecodeError, AttributeError, ValueError) as e:
logger.error(f"Error parsing image analysis response: {e}")
logger.debug(f"Raw response: {response}")
fallback_entity = {
"entity_name": entity_name
if entity_name
@@ -867,9 +1004,7 @@ class TableModalProcessor(BaseModalProcessor):
) -> Tuple[str, Dict[str, Any]]:
"""Parse table analysis response"""
try:
response_data = json.loads(
re.search(r"\{.*\}", response, re.DOTALL).group(0)
)
response_data = self._robust_json_parse(response)
description = response_data.get("detailed_description", "")
entity_data = response_data.get("entity_info", {})
@@ -892,6 +1027,7 @@ class TableModalProcessor(BaseModalProcessor):
except (json.JSONDecodeError, AttributeError, ValueError) as e:
logger.error(f"Error parsing table analysis response: {e}")
logger.debug(f"Raw response: {response}")
fallback_entity = {
"entity_name": entity_name
if entity_name
@@ -979,11 +1115,9 @@ class EquationModalProcessor(BaseModalProcessor):
def _parse_equation_response(
self, response: str, entity_name: str = None
) -> Tuple[str, Dict[str, Any]]:
"""Parse equation analysis response"""
"""Parse equation analysis response with robust JSON handling"""
try:
response_data = json.loads(
re.search(r"\{.*\}", response, re.DOTALL).group(0)
)
response_data = self._robust_json_parse(response)
description = response_data.get("detailed_description", "")
entity_data = response_data.get("entity_info", {})
@@ -1006,6 +1140,7 @@ class EquationModalProcessor(BaseModalProcessor):
except (json.JSONDecodeError, AttributeError, ValueError) as e:
logger.error(f"Error parsing equation analysis response: {e}")
logger.debug(f"Raw response: {response}")
fallback_entity = {
"entity_name": entity_name
if entity_name
@@ -1085,9 +1220,7 @@ class GenericModalProcessor(BaseModalProcessor):
) -> Tuple[str, Dict[str, Any]]:
"""Parse generic analysis response"""
try:
response_data = json.loads(
re.search(r"\{.*\}", response, re.DOTALL).group(0)
)
response_data = self._robust_json_parse(response)
description = response_data.get("detailed_description", "")
entity_data = response_data.get("entity_info", {})
@@ -1109,7 +1242,8 @@ class GenericModalProcessor(BaseModalProcessor):
return description, entity_data
except (json.JSONDecodeError, AttributeError, ValueError) as e:
logger.error(f"Error parsing generic analysis response: {e}")
logger.error(f"Error parsing {content_type} analysis response: {e}")
logger.debug(f"Raw response: {response}")
fallback_entity = {
"entity_name": entity_name
if entity_name

1662
raganything/parser.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -5,9 +5,9 @@ Contains methods for parsing documents and processing multimodal content
"""
import os
from typing import Dict, List, Any, Tuple
from typing import Dict, List, Any
from pathlib import Path
from raganything.mineru_parser import MineruParser
from raganything.parser import MineruParser, DoclingParser
from raganything.utils import (
separate_content,
insert_text_content,
@@ -25,23 +25,23 @@ class ProcessorMixin:
parse_method: str = None,
display_stats: bool = None,
**kwargs,
) -> Tuple[List[Dict[str, Any]], str]:
) -> List[Dict[str, Any]]:
"""
Parse document using MinerU
Parse document
Args:
file_path: Path to the file to parse
output_dir: Output directory (defaults to config.mineru_output_dir)
output_dir: Output directory (defaults to config.output_dir)
parse_method: Parse method (defaults to config.mineru_parse_method)
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
**kwargs: Additional parameters for MinerU parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
Returns:
(content_list, md_content): Content list and markdown text
List[Dict[str, Any]]: Content list
"""
# Use config defaults if not provided
if output_dir is None:
output_dir = self.config.mineru_output_dir
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.mineru_parse_method
if display_stats is None:
@@ -57,11 +57,14 @@ class ProcessorMixin:
ext = file_path.suffix.lower()
try:
doc_parser = (
DoclingParser() if self.config.parser == "docling" else MineruParser()
)
if ext in [".pdf"]:
self.logger.info(
f"Detected PDF file, using PDF parser (method={parse_method})..."
)
content_list, md_content = MineruParser.parse_pdf(
content_list = doc_parser.parse_pdf(
pdf_path=file_path,
output_dir=output_dir,
method=parse_method,
@@ -78,12 +81,24 @@ class ProcessorMixin:
".webp",
]:
self.logger.info("Detected image file, using image parser...")
content_list, md_content = MineruParser.parse_image(
content_list = MineruParser.parse_image(
image_path=file_path, output_dir=output_dir, **kwargs
)
elif ext in [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]:
self.logger.info("Detected Office document, using Office parser...")
content_list, md_content = MineruParser.parse_office_doc(
elif ext in [
".doc",
".docx",
".ppt",
".pptx",
".xls",
".xlsx",
".html",
".htm",
".xhtml",
]:
self.logger.info(
"Detected Office or HTML document, using Office parser..."
)
content_list = doc_parser.parse_office_doc(
doc_path=file_path, output_dir=output_dir, **kwargs
)
else:
@@ -91,7 +106,7 @@ class ProcessorMixin:
self.logger.info(
f"Using generic parser for {ext} file (method={parse_method})..."
)
content_list, md_content = MineruParser.parse_document(
content_list = doc_parser.parse_document(
file_path=file_path,
method=parse_method,
output_dir=output_dir,
@@ -102,7 +117,8 @@ class ProcessorMixin:
self.logger.error(f"Error during parsing with specific parser: {str(e)}")
self.logger.warning("Falling back to generic parser...")
# If specific parser fails, fall back to generic parser
content_list, md_content = MineruParser.parse_document(
content_list = MineruParser.parse_document(
MineruParser(),
file_path=file_path,
method=parse_method,
output_dir=output_dir,
@@ -112,13 +128,11 @@ class ProcessorMixin:
self.logger.info(
f"Parsing complete! Extracted {len(content_list)} content blocks"
)
self.logger.info(f"Markdown text length: {len(md_content)} characters")
# Display content statistics if requested
if display_stats:
self.logger.info("\nContent Information:")
self.logger.info(f"* Total blocks in content_list: {len(content_list)}")
self.logger.info(f"* Markdown content length: {len(md_content)} characters")
# Count elements by type
block_types: Dict[str, int] = {}
@@ -132,7 +146,7 @@ class ProcessorMixin:
for block_type, count in block_types.items():
self.logger.info(f" - {block_type}: {count}")
return content_list, md_content
return content_list
async def _process_multimodal_content(
self, multimodal_items: List[Dict[str, Any]], file_path: str
@@ -248,7 +262,7 @@ class ProcessorMixin:
Args:
file_path: Path to the file to process
output_dir: MinerU output directory (defaults to config.mineru_output_dir)
output_dir: output directory (defaults to config.output_dir)
parse_method: Parse method (defaults to config.mineru_parse_method)
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
split_by_character: Optional character to split the text by
@@ -269,8 +283,8 @@ class ProcessorMixin:
self.logger.info(f"Starting complete document processing: {file_path}")
# Step 1: Parse document using MinerU
content_list, md_content = self.parse_document(
# Step 1: Parse document
content_list = self.parse_document(
file_path, output_dir, parse_method, display_stats, **kwargs
)

View File

@@ -30,7 +30,7 @@ from raganything.query import QueryMixin
from raganything.processor import ProcessorMixin
from raganything.batch import BatchMixin
from raganything.utils import get_processor_supports
from raganything.mineru_parser import MineruParser
from raganything.parser import MineruParser, DoclingParser
# Import specialized processors
from raganything.modalprocessors import (
@@ -84,6 +84,11 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
# Set up logger (use existing logger, don't configure it)
self.logger = logger
# Set up document parser
self.doc_parser = (
DoclingParser() if self.config.parser == "docling" else MineruParser()
)
# Create working directory if needed
if not os.path.exists(self.working_dir):
os.makedirs(self.working_dir)
@@ -96,7 +101,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
# Log configuration info
self.logger.info("RAGAnything initialized with config:")
self.logger.info(f" Working directory: {self.config.working_dir}")
self.logger.info(f" MinerU parse method: {self.config.mineru_parse_method}")
self.logger.info(f" Parser: {self.config.parser}")
self.logger.info(
f" Multimodal processing - Image: {self.config.enable_image_processing}, "
f"Table: {self.config.enable_table_processing}, "
@@ -186,12 +191,11 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
if self.lightrag is not None:
return
# Check MinerU 2.0 installation
if not MineruParser.check_installation():
# Check parser installation
if not self.doc_parser.check_installation():
raise RuntimeError(
"MinerU 2.0 is not properly installed. "
"Please install it using: pip install -U 'mineru[core]' "
"or uv pip install -U 'mineru[core]'"
"Parser is not properly installed. "
"Please install it using pip install or uv pip install."
)
# Validate required functions
@@ -228,14 +232,14 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
Returns:
bool: True if MinerU 2.0 is properly installed
"""
return MineruParser.check_installation()
return MineruParser.check_installation(MineruParser())
def get_config_info(self) -> Dict[str, Any]:
"""Get current configuration information"""
return {
"directory": {
"working_dir": self.config.working_dir,
"mineru_output_dir": self.config.mineru_output_dir,
"mineru_output_dir": self.config.output_dir,
},
"parsing": {
"mineru_parse_method": self.config.mineru_parse_method,
@@ -327,7 +331,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
def get_processor_info(self) -> Dict[str, Any]:
"""Get processor information"""
base_info = {
"mineru_installed": MineruParser.check_installation(),
"mineru_installed": MineruParser.check_installation(MineruParser()),
"config": self.get_config_info(),
"models": {
"llm_model": "External function"