mirror of
https://github.com/HKUDS/RAG-Anything.git
synced 2025-08-20 19:01:34 +03:00
1
.gitignore
vendored
1
.gitignore
vendored
@@ -61,6 +61,7 @@ ignore_this.txt
|
||||
dickens*/
|
||||
book.txt
|
||||
LightRAG.pdf
|
||||
LightRAG_2-4.pdf
|
||||
download_models_hf.py
|
||||
lightrag-dev/
|
||||
gui/
|
||||
|
||||
18
README.md
18
README.md
@@ -722,8 +722,8 @@ The `examples/` directory contains comprehensive usage examples:
|
||||
**Run examples:**
|
||||
|
||||
```bash
|
||||
# End-to-end processing
|
||||
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
|
||||
# End-to-end processing with parser selection
|
||||
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
|
||||
|
||||
# Direct modal processing
|
||||
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
|
||||
@@ -760,13 +760,15 @@ Create a `.env` file (refer to `.env.example`):
|
||||
```bash
|
||||
OPENAI_API_KEY=your_openai_api_key
|
||||
OPENAI_BASE_URL=your_base_url # Optional
|
||||
OUTPUT_DIR=./output # Default output directory for parsed documents
|
||||
PARSER=mineru # Parser selection: mineru or docling
|
||||
```
|
||||
|
||||
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test MinerU functionality and do not require API keys.
|
||||
|
||||
### MinerU Configuration
|
||||
|
||||
MinerU 2.0 uses a simplified configuration approach:
|
||||
RAG-Anything now supports multiple parsers:
|
||||
|
||||
```bash
|
||||
# MinerU 2.0 uses command-line parameters instead of config files
|
||||
@@ -779,21 +781,23 @@ mineru -p input.pdf -o output_dir -m ocr # OCR-focused parsing
|
||||
mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU acceleration
|
||||
```
|
||||
|
||||
You can also configure MinerU through RAGAnything parameters:
|
||||
You can also configure parsing through RAGAnything parameters:
|
||||
|
||||
```python
|
||||
# Basic parsing configuration
|
||||
# Basic parsing configuration with parser selection
|
||||
await rag.process_document_complete(
|
||||
file_path="document.pdf",
|
||||
output_dir="./output/",
|
||||
parse_method="auto", # or "ocr", "txt"
|
||||
parser="mineru" # Optional: "mineru" or "docling"
|
||||
)
|
||||
|
||||
# Advanced MinerU parsing configuration with special parameters
|
||||
# Advanced parsing configuration with special parameters
|
||||
await rag.process_document_complete(
|
||||
file_path="document.pdf",
|
||||
output_dir="./output/",
|
||||
parse_method="auto", # Parsing method: "auto", "ocr", "txt"
|
||||
parser="mineru", # Parser selection: "mineru" or "docling"
|
||||
|
||||
# MinerU special parameters - all supported kwargs:
|
||||
lang="ch", # Document language for OCR optimization (e.g., "ch", "en", "ja")
|
||||
@@ -813,7 +817,7 @@ await rag.process_document_complete(
|
||||
)
|
||||
```
|
||||
|
||||
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments.
|
||||
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs.
|
||||
|
||||
### Processing Requirements
|
||||
|
||||
|
||||
19
README_zh.md
19
README_zh.md
@@ -719,8 +719,8 @@ if __name__ == "__main__":
|
||||
**运行示例:**
|
||||
|
||||
```bash
|
||||
# 端到端处理
|
||||
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
|
||||
# 端到端处理(包含解析器选择)
|
||||
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
|
||||
|
||||
# 直接模态处理
|
||||
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
|
||||
@@ -759,11 +759,13 @@ python examples/text_format_test.py --check-reportlab --file dummy
|
||||
```bash
|
||||
OPENAI_API_KEY=your_openai_api_key
|
||||
OPENAI_BASE_URL=your_base_url # 可选
|
||||
OUTPUT_DIR=./output # 解析文档的默认输出目录
|
||||
PARSER=mineru # 解析器选择:mineru 或 docling
|
||||
```
|
||||
|
||||
### MinerU配置
|
||||
|
||||
MinerU 2.0使用简化的配置方式:
|
||||
RAG-Anything现在支持多种解析器:
|
||||
|
||||
```bash
|
||||
# MinerU 2.0使用命令行参数而不是配置文件
|
||||
@@ -776,21 +778,23 @@ mineru -p input.pdf -o output_dir -m ocr # OCR重点解析
|
||||
mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU加速
|
||||
```
|
||||
|
||||
你也可以通过RAGAnything参数配置MinerU:
|
||||
你也可以通过RAGAnything参数配置解析:
|
||||
|
||||
```python
|
||||
# 基础解析配置
|
||||
# 基础解析配置和解析器选择
|
||||
await rag.process_document_complete(
|
||||
file_path="document.pdf",
|
||||
output_dir="./output/",
|
||||
parse_method="auto", # 或 "ocr", "txt"
|
||||
parser="mineru" # 可选:"mineru" 或 "docling"
|
||||
)
|
||||
|
||||
# MinerU高级解析配置(包含特殊参数)
|
||||
# 高级解析配置(包含特殊参数)
|
||||
await rag.process_document_complete(
|
||||
file_path="document.pdf",
|
||||
output_dir="./output/",
|
||||
parse_method="auto", # 解析方法:"auto", "ocr", "txt"
|
||||
parser="mineru", # 解析器选择:"mineru" 或 "docling"
|
||||
|
||||
# MinerU特殊参数 - 支持的所有kwargs:
|
||||
lang="ch", # 文档语言优化(如:"ch", "en", "ja")
|
||||
@@ -802,6 +806,7 @@ await rag.process_document_complete(
|
||||
backend="pipeline", # 解析后端:pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client
|
||||
source="huggingface", # 模型源:"huggingface", "modelscope", "local"
|
||||
# vlm_url="http://127.0.0.1:3000" # 当backend=vlm-sglang-client时,需指定服务地址
|
||||
|
||||
# RAGAnything标准参数
|
||||
display_stats=True, # 显示内容统计信息
|
||||
split_by_character=None, # 可选的文本分割字符
|
||||
@@ -809,7 +814,7 @@ await rag.process_document_complete(
|
||||
)
|
||||
```
|
||||
|
||||
> **注意**:MinerU 2.0不再使用 `magic-pdf.json` 配置文件。所有设置现在通过命令行参数或函数参数传递。
|
||||
> **注意**:MinerU 2.0不再使用 `magic-pdf.json` 配置文件。所有设置现在通过命令行参数或函数参数传递。RAG-Anything现在支持多种文档解析器 - 你可以根据需要在MinerU和Docling之间选择。
|
||||
|
||||
### 处理要求
|
||||
|
||||
|
||||
@@ -33,9 +33,10 @@ OLLAMA_EMULATING_MODEL_TAG=latest
|
||||
|
||||
### RAGAnything Configuration (Multimodal Document Processing)
|
||||
### ---
|
||||
### MinerU Parser Configuration
|
||||
### Parser Configuration
|
||||
# MINERU_PARSE_METHOD=auto
|
||||
# MINERU_OUTPUT_DIR=./output
|
||||
# OUTPUT_DIR=./output
|
||||
# PARSER=mineru
|
||||
# DISPLAY_CONTENT_STATS=true
|
||||
|
||||
### Multimodal Processing Configuration
|
||||
|
||||
@@ -88,6 +88,7 @@ async def process_with_rag(
|
||||
api_key: str,
|
||||
base_url: str = None,
|
||||
working_dir: str = None,
|
||||
parser: str = None,
|
||||
):
|
||||
"""
|
||||
Process document with RAGAnything
|
||||
@@ -107,6 +108,7 @@ async def process_with_rag(
|
||||
enable_image_processing=True,
|
||||
enable_table_processing=True,
|
||||
enable_equation_processing=True,
|
||||
parser=parser,
|
||||
)
|
||||
|
||||
# Define LLM model function
|
||||
@@ -250,17 +252,26 @@ def main():
|
||||
)
|
||||
parser.add_argument(
|
||||
"--api-key",
|
||||
default=os.getenv("OPENAI_API_KEY"),
|
||||
help="OpenAI API key (defaults to OPENAI_API_KEY env var)",
|
||||
default=os.getenv("LLM_BINDING_API_KEY"),
|
||||
help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-url",
|
||||
default=os.getenv("LLM_BINDING_HOST"),
|
||||
help="Optional base URL for API",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--parser",
|
||||
default=os.getenv("PARSER", "mineru"),
|
||||
help="Optional base URL for API",
|
||||
)
|
||||
parser.add_argument("--base-url", help="Optional base URL for API")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Check if API key is provided
|
||||
if not args.api_key:
|
||||
logger.error("Error: OpenAI API key is required")
|
||||
logger.error("Set OPENAI_API_KEY environment variable or use --api-key option")
|
||||
logger.error("Set api key environment variable or use --api-key option")
|
||||
return
|
||||
|
||||
# Create output directory if specified
|
||||
@@ -270,7 +281,12 @@ def main():
|
||||
# Process with RAG
|
||||
asyncio.run(
|
||||
process_with_rag(
|
||||
args.file_path, args.output, args.api_key, args.base_url, args.working_dir
|
||||
args.file_path,
|
||||
args.output,
|
||||
args.api_key,
|
||||
args.base_url,
|
||||
args.working_dir,
|
||||
args.parser,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -18,18 +18,19 @@ class RAGAnythingConfig:
|
||||
working_dir: str = field(default=get_env_value("WORKING_DIR", "./rag_storage", str))
|
||||
"""Directory where RAG storage and cache files are stored."""
|
||||
|
||||
# MinerU Parser Configuration
|
||||
# Parser Configuration
|
||||
# ---
|
||||
mineru_parse_method: str = field(
|
||||
default=get_env_value("MINERU_PARSE_METHOD", "auto", str)
|
||||
)
|
||||
"""Default parsing method for MinerU: 'auto', 'ocr', or 'txt'."""
|
||||
|
||||
mineru_output_dir: str = field(
|
||||
default=get_env_value("MINERU_OUTPUT_DIR", "./output", str)
|
||||
)
|
||||
parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
|
||||
"""Default output directory for MinerU parsed content."""
|
||||
|
||||
parser: str = field(default=get_env_value("PARSER", "mineru", str))
|
||||
"""Parser selection: 'mineru' or 'docling'."""
|
||||
|
||||
display_content_stats: bool = field(
|
||||
default=get_env_value("DISPLAY_CONTENT_STATS", True, bool)
|
||||
)
|
||||
|
||||
@@ -517,6 +517,144 @@ class BaseModalProcessor:
|
||||
chunk_results,
|
||||
)
|
||||
|
||||
def _robust_json_parse(self, response: str) -> dict:
    """Parse a model response into a dict using progressively looser strategies.

    The strategies are tried in order, each over every extracted candidate:

    1. parse the candidate unchanged,
    2. parse it after ``_basic_json_cleanup``,
    3. parse it after ``_progressive_quote_fix``.

    If none of them yields a truthy result, the individual fields are
    recovered with ``_extract_fields_with_regex``.

    Args:
        response: Raw text returned by the model.

    Returns:
        The first truthy parse result, otherwise the regex-fallback dict.
    """
    # The candidate list depends only on `response`, so extract it once
    # instead of once per strategy.
    candidates = self._extract_all_json_candidates(response)

    repair_passes = (
        lambda text: text,  # Strategy 1: direct parsing
        self._basic_json_cleanup,  # Strategy 2: basic cleanup
        self._progressive_quote_fix,  # Strategy 3: quote/escape fixing
    )

    for repair in repair_passes:
        for candidate in candidates:
            parsed = self._try_parse_json(repair(candidate))
            if parsed:
                return parsed

    # Strategy 4: fall back to regex field extraction
    return self._extract_fields_with_regex(response)
|
||||
|
||||
def _extract_all_json_candidates(self, response: str) -> list:
|
||||
"""Extract all possible JSON candidates from response"""
|
||||
candidates = []
|
||||
|
||||
# Method 1: JSON in code blocks
|
||||
import re
|
||||
|
||||
json_blocks = re.findall(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
|
||||
candidates.extend(json_blocks)
|
||||
|
||||
# Method 2: Balanced braces
|
||||
brace_count = 0
|
||||
start_pos = -1
|
||||
|
||||
for i, char in enumerate(response):
|
||||
if char == "{":
|
||||
if brace_count == 0:
|
||||
start_pos = i
|
||||
brace_count += 1
|
||||
elif char == "}":
|
||||
brace_count -= 1
|
||||
if brace_count == 0 and start_pos != -1:
|
||||
candidates.append(response[start_pos : i + 1])
|
||||
|
||||
# Method 3: Simple regex fallback
|
||||
simple_match = re.search(r"\{.*\}", response, re.DOTALL)
|
||||
if simple_match:
|
||||
candidates.append(simple_match.group(0))
|
||||
|
||||
return candidates
|
||||
|
||||
def _try_parse_json(self, json_str: str) -> dict:
|
||||
"""Try to parse JSON string, return None if failed"""
|
||||
if not json_str or not json_str.strip():
|
||||
return None
|
||||
|
||||
try:
|
||||
return json.loads(json_str)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return None
|
||||
|
||||
def _basic_json_cleanup(self, json_str: str) -> str:
|
||||
"""Basic cleanup for common JSON issues"""
|
||||
# Remove extra whitespace
|
||||
json_str = json_str.strip()
|
||||
|
||||
# Fix common quote issues
|
||||
json_str = json_str.replace('"', '"').replace('"', '"') # Smart quotes
|
||||
json_str = json_str.replace(""", "'").replace(""", "'") # Smart apostrophes
|
||||
|
||||
# Fix trailing commas (simple case)
|
||||
json_str = re.sub(r",(\s*[}\]])", r"\1", json_str)
|
||||
|
||||
return json_str
|
||||
|
||||
def _progressive_quote_fix(self, json_str: str) -> str:
|
||||
"""Progressive fixing of quote and escape issues"""
|
||||
# Only escape unescaped backslashes before quotes
|
||||
json_str = re.sub(r'(?<!\\)\\(?=")', r"\\\\", json_str)
|
||||
|
||||
# Fix unescaped backslashes in string values (more conservative)
|
||||
def fix_string_content(match):
|
||||
content = match.group(1)
|
||||
# Only escape obvious problematic patterns
|
||||
content = re.sub(r"\\(?=[a-zA-Z])", r"\\\\", content) # \alpha -> \\alpha
|
||||
return f'"{content}"'
|
||||
|
||||
json_str = re.sub(r'"([^"]*(?:\\.[^"]*)*)"', fix_string_content, json_str)
|
||||
return json_str
|
||||
|
||||
def _extract_fields_with_regex(self, response: str) -> dict:
    """Recover the expected fields from ``response`` with regexes (last resort).

    Each field is located as ``"<key>": "<value>"`` anywhere in the text.
    Values are returned exactly as they appear (escape sequences are not
    decoded). Missing fields fall back to defaults: ``""`` for
    ``detailed_description``, ``"unknown_entity"`` for ``entity_name``,
    ``"unknown"`` for ``entity_type``, and the first 100 characters of the
    description for ``summary``.

    Args:
        response: Raw text returned by the model.

    Returns:
        Dict with ``detailed_description`` and a nested ``entity_info`` dict
        holding ``entity_name``, ``entity_type`` and ``summary``.
    """
    logger.warning("Using regex fallback for JSON parsing")

    # The character class must exclude the backslash; otherwise it swallows
    # the backslash of an escaped quote and the value is cut off at that
    # quote instead of at the real closing quote.
    string_body = r'"([^"\\]*(?:\\.[^"\\]*)*)"'

    def find_field(key, default):
        # DOTALL lets a backslash-newline pair be consumed like any other
        # escaped character, so multi-line values still match.
        match = re.search('"' + key + r'":\s*' + string_body, response, re.DOTALL)
        return match.group(1) if match else default

    description = find_field("detailed_description", "")
    entity_name = find_field("entity_name", "unknown_entity")
    entity_type = find_field("entity_type", "unknown")
    summary = find_field("summary", description[:100])

    return {
        "detailed_description": description,
        "entity_info": {
            "entity_name": entity_name,
            "entity_type": entity_type,
            "summary": summary,
        },
    }
|
||||
|
||||
def _extract_json_from_response(self, response: str) -> str:
    """Legacy entry point kept for compatibility.

    Delegates to ``_extract_all_json_candidates`` and returns its first
    candidate, or ``None`` when there is none.
    """
    found = self._extract_all_json_candidates(response)
    if not found:
        return None
    return found[0]
|
||||
|
||||
def _fix_json_escapes(self, json_str: str) -> str:
    """Legacy entry point kept for compatibility.

    Forwards to ``_progressive_quote_fix`` and returns its result unchanged.
    """
    repaired = self._progressive_quote_fix(json_str)
    return repaired
|
||||
|
||||
async def _process_chunk_for_extraction(
|
||||
self, chunk_id: str, modal_entity_name: str, batch_mode: bool = False
|
||||
):
|
||||
@@ -743,9 +881,7 @@ class ImageModalProcessor(BaseModalProcessor):
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Parse model response"""
|
||||
try:
|
||||
response_data = json.loads(
|
||||
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
||||
)
|
||||
response_data = self._robust_json_parse(response)
|
||||
|
||||
description = response_data.get("detailed_description", "")
|
||||
entity_data = response_data.get("entity_info", {})
|
||||
@@ -768,6 +904,7 @@ class ImageModalProcessor(BaseModalProcessor):
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
||||
logger.error(f"Error parsing image analysis response: {e}")
|
||||
logger.debug(f"Raw response: {response}")
|
||||
fallback_entity = {
|
||||
"entity_name": entity_name
|
||||
if entity_name
|
||||
@@ -867,9 +1004,7 @@ class TableModalProcessor(BaseModalProcessor):
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Parse table analysis response"""
|
||||
try:
|
||||
response_data = json.loads(
|
||||
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
||||
)
|
||||
response_data = self._robust_json_parse(response)
|
||||
|
||||
description = response_data.get("detailed_description", "")
|
||||
entity_data = response_data.get("entity_info", {})
|
||||
@@ -892,6 +1027,7 @@ class TableModalProcessor(BaseModalProcessor):
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
||||
logger.error(f"Error parsing table analysis response: {e}")
|
||||
logger.debug(f"Raw response: {response}")
|
||||
fallback_entity = {
|
||||
"entity_name": entity_name
|
||||
if entity_name
|
||||
@@ -979,11 +1115,9 @@ class EquationModalProcessor(BaseModalProcessor):
|
||||
def _parse_equation_response(
|
||||
self, response: str, entity_name: str = None
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Parse equation analysis response"""
|
||||
"""Parse equation analysis response with robust JSON handling"""
|
||||
try:
|
||||
response_data = json.loads(
|
||||
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
||||
)
|
||||
response_data = self._robust_json_parse(response)
|
||||
|
||||
description = response_data.get("detailed_description", "")
|
||||
entity_data = response_data.get("entity_info", {})
|
||||
@@ -1006,6 +1140,7 @@ class EquationModalProcessor(BaseModalProcessor):
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
||||
logger.error(f"Error parsing equation analysis response: {e}")
|
||||
logger.debug(f"Raw response: {response}")
|
||||
fallback_entity = {
|
||||
"entity_name": entity_name
|
||||
if entity_name
|
||||
@@ -1085,9 +1220,7 @@ class GenericModalProcessor(BaseModalProcessor):
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Parse generic analysis response"""
|
||||
try:
|
||||
response_data = json.loads(
|
||||
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
||||
)
|
||||
response_data = self._robust_json_parse(response)
|
||||
|
||||
description = response_data.get("detailed_description", "")
|
||||
entity_data = response_data.get("entity_info", {})
|
||||
@@ -1109,7 +1242,8 @@ class GenericModalProcessor(BaseModalProcessor):
|
||||
return description, entity_data
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
||||
logger.error(f"Error parsing generic analysis response: {e}")
|
||||
logger.error(f"Error parsing {content_type} analysis response: {e}")
|
||||
logger.debug(f"Raw response: {response}")
|
||||
fallback_entity = {
|
||||
"entity_name": entity_name
|
||||
if entity_name
|
||||
|
||||
1662
raganything/parser.py
Normal file
1662
raganything/parser.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -5,9 +5,9 @@ Contains methods for parsing documents and processing multimodal content
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from typing import Dict, List, Any
|
||||
from pathlib import Path
|
||||
from raganything.mineru_parser import MineruParser
|
||||
from raganything.parser import MineruParser, DoclingParser
|
||||
from raganything.utils import (
|
||||
separate_content,
|
||||
insert_text_content,
|
||||
@@ -25,23 +25,23 @@ class ProcessorMixin:
|
||||
parse_method: str = None,
|
||||
display_stats: bool = None,
|
||||
**kwargs,
|
||||
) -> Tuple[List[Dict[str, Any]], str]:
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Parse document using MinerU
|
||||
Parse document
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to parse
|
||||
output_dir: Output directory (defaults to config.mineru_output_dir)
|
||||
output_dir: Output directory (defaults to config.output_dir)
|
||||
parse_method: Parse method (defaults to config.mineru_parse_method)
|
||||
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
|
||||
**kwargs: Additional parameters for MinerU parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
|
||||
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
|
||||
|
||||
Returns:
|
||||
(content_list, md_content): Content list and markdown text
|
||||
List[Dict[str, Any]]: Content list
|
||||
"""
|
||||
# Use config defaults if not provided
|
||||
if output_dir is None:
|
||||
output_dir = self.config.mineru_output_dir
|
||||
output_dir = self.config.parser_output_dir
|
||||
if parse_method is None:
|
||||
parse_method = self.config.mineru_parse_method
|
||||
if display_stats is None:
|
||||
@@ -57,11 +57,14 @@ class ProcessorMixin:
|
||||
ext = file_path.suffix.lower()
|
||||
|
||||
try:
|
||||
doc_parser = (
|
||||
DoclingParser() if self.config.parser == "docling" else MineruParser()
|
||||
)
|
||||
if ext in [".pdf"]:
|
||||
self.logger.info(
|
||||
f"Detected PDF file, using PDF parser (method={parse_method})..."
|
||||
)
|
||||
content_list, md_content = MineruParser.parse_pdf(
|
||||
content_list = doc_parser.parse_pdf(
|
||||
pdf_path=file_path,
|
||||
output_dir=output_dir,
|
||||
method=parse_method,
|
||||
@@ -78,12 +81,24 @@ class ProcessorMixin:
|
||||
".webp",
|
||||
]:
|
||||
self.logger.info("Detected image file, using image parser...")
|
||||
content_list, md_content = MineruParser.parse_image(
|
||||
content_list = MineruParser.parse_image(
|
||||
image_path=file_path, output_dir=output_dir, **kwargs
|
||||
)
|
||||
elif ext in [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]:
|
||||
self.logger.info("Detected Office document, using Office parser...")
|
||||
content_list, md_content = MineruParser.parse_office_doc(
|
||||
elif ext in [
|
||||
".doc",
|
||||
".docx",
|
||||
".ppt",
|
||||
".pptx",
|
||||
".xls",
|
||||
".xlsx",
|
||||
".html",
|
||||
".htm",
|
||||
".xhtml",
|
||||
]:
|
||||
self.logger.info(
|
||||
"Detected Office or HTML document, using Office parser..."
|
||||
)
|
||||
content_list = doc_parser.parse_office_doc(
|
||||
doc_path=file_path, output_dir=output_dir, **kwargs
|
||||
)
|
||||
else:
|
||||
@@ -91,7 +106,7 @@ class ProcessorMixin:
|
||||
self.logger.info(
|
||||
f"Using generic parser for {ext} file (method={parse_method})..."
|
||||
)
|
||||
content_list, md_content = MineruParser.parse_document(
|
||||
content_list = doc_parser.parse_document(
|
||||
file_path=file_path,
|
||||
method=parse_method,
|
||||
output_dir=output_dir,
|
||||
@@ -102,7 +117,8 @@ class ProcessorMixin:
|
||||
self.logger.error(f"Error during parsing with specific parser: {str(e)}")
|
||||
self.logger.warning("Falling back to generic parser...")
|
||||
# If specific parser fails, fall back to generic parser
|
||||
content_list, md_content = MineruParser.parse_document(
|
||||
content_list = MineruParser.parse_document(
|
||||
MineruParser(),
|
||||
file_path=file_path,
|
||||
method=parse_method,
|
||||
output_dir=output_dir,
|
||||
@@ -112,13 +128,11 @@ class ProcessorMixin:
|
||||
self.logger.info(
|
||||
f"Parsing complete! Extracted {len(content_list)} content blocks"
|
||||
)
|
||||
self.logger.info(f"Markdown text length: {len(md_content)} characters")
|
||||
|
||||
# Display content statistics if requested
|
||||
if display_stats:
|
||||
self.logger.info("\nContent Information:")
|
||||
self.logger.info(f"* Total blocks in content_list: {len(content_list)}")
|
||||
self.logger.info(f"* Markdown content length: {len(md_content)} characters")
|
||||
|
||||
# Count elements by type
|
||||
block_types: Dict[str, int] = {}
|
||||
@@ -132,7 +146,7 @@ class ProcessorMixin:
|
||||
for block_type, count in block_types.items():
|
||||
self.logger.info(f" - {block_type}: {count}")
|
||||
|
||||
return content_list, md_content
|
||||
return content_list
|
||||
|
||||
async def _process_multimodal_content(
|
||||
self, multimodal_items: List[Dict[str, Any]], file_path: str
|
||||
@@ -248,7 +262,7 @@ class ProcessorMixin:
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to process
|
||||
output_dir: MinerU output directory (defaults to config.mineru_output_dir)
|
||||
output_dir: output directory (defaults to config.output_dir)
|
||||
parse_method: Parse method (defaults to config.mineru_parse_method)
|
||||
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
|
||||
split_by_character: Optional character to split the text by
|
||||
@@ -269,8 +283,8 @@ class ProcessorMixin:
|
||||
|
||||
self.logger.info(f"Starting complete document processing: {file_path}")
|
||||
|
||||
# Step 1: Parse document using MinerU
|
||||
content_list, md_content = self.parse_document(
|
||||
# Step 1: Parse document
|
||||
content_list = self.parse_document(
|
||||
file_path, output_dir, parse_method, display_stats, **kwargs
|
||||
)
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ from raganything.query import QueryMixin
|
||||
from raganything.processor import ProcessorMixin
|
||||
from raganything.batch import BatchMixin
|
||||
from raganything.utils import get_processor_supports
|
||||
from raganything.mineru_parser import MineruParser
|
||||
from raganything.parser import MineruParser, DoclingParser
|
||||
|
||||
# Import specialized processors
|
||||
from raganything.modalprocessors import (
|
||||
@@ -84,6 +84,11 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
# Set up logger (use existing logger, don't configure it)
|
||||
self.logger = logger
|
||||
|
||||
# Set up document parser
|
||||
self.doc_parser = (
|
||||
DoclingParser() if self.config.parser == "docling" else MineruParser()
|
||||
)
|
||||
|
||||
# Create working directory if needed
|
||||
if not os.path.exists(self.working_dir):
|
||||
os.makedirs(self.working_dir)
|
||||
@@ -96,7 +101,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
# Log configuration info
|
||||
self.logger.info("RAGAnything initialized with config:")
|
||||
self.logger.info(f" Working directory: {self.config.working_dir}")
|
||||
self.logger.info(f" MinerU parse method: {self.config.mineru_parse_method}")
|
||||
self.logger.info(f" Parser: {self.config.parser}")
|
||||
self.logger.info(
|
||||
f" Multimodal processing - Image: {self.config.enable_image_processing}, "
|
||||
f"Table: {self.config.enable_table_processing}, "
|
||||
@@ -186,12 +191,11 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
if self.lightrag is not None:
|
||||
return
|
||||
|
||||
# Check MinerU 2.0 installation
|
||||
if not MineruParser.check_installation():
|
||||
# Check parser installation
|
||||
if not self.doc_parser.check_installation():
|
||||
raise RuntimeError(
|
||||
"MinerU 2.0 is not properly installed. "
|
||||
"Please install it using: pip install -U 'mineru[core]' "
|
||||
"or uv pip install -U 'mineru[core]'"
|
||||
"Parser is not properly installed. "
|
||||
"Please install it using pip install or uv pip install."
|
||||
)
|
||||
|
||||
# Validate required functions
|
||||
@@ -228,14 +232,14 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
Returns:
|
||||
bool: True if MinerU 2.0 is properly installed
|
||||
"""
|
||||
return MineruParser.check_installation()
|
||||
return MineruParser.check_installation(MineruParser())
|
||||
|
||||
def get_config_info(self) -> Dict[str, Any]:
|
||||
"""Get current configuration information"""
|
||||
return {
|
||||
"directory": {
|
||||
"working_dir": self.config.working_dir,
|
||||
"mineru_output_dir": self.config.mineru_output_dir,
|
||||
"mineru_output_dir": self.config.output_dir,
|
||||
},
|
||||
"parsing": {
|
||||
"mineru_parse_method": self.config.mineru_parse_method,
|
||||
@@ -327,7 +331,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
def get_processor_info(self) -> Dict[str, Any]:
|
||||
"""Get processor information"""
|
||||
base_info = {
|
||||
"mineru_installed": MineruParser.check_installation(),
|
||||
"mineru_installed": MineruParser.check_installation(MineruParser()),
|
||||
"config": self.get_config_info(),
|
||||
"models": {
|
||||
"llm_model": "External function"
|
||||
|
||||
Reference in New Issue
Block a user