mirror of
https://github.com/HKUDS/RAG-Anything.git
synced 2025-08-20 19:01:34 +03:00
Merge branch 'main' into docling
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -61,6 +61,7 @@ ignore_this.txt
|
||||
dickens*/
|
||||
book.txt
|
||||
LightRAG.pdf
|
||||
LightRAG_2-4.pdf
|
||||
download_models_hf.py
|
||||
lightrag-dev/
|
||||
gui/
|
||||
|
||||
@@ -108,6 +108,7 @@ async def process_with_rag(
|
||||
enable_table_processing=True,
|
||||
enable_equation_processing=True,
|
||||
)
|
||||
|
||||
# Define LLM model function
|
||||
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
|
||||
return openai_complete_if_cache(
|
||||
@@ -252,7 +253,11 @@ def main():
|
||||
default=os.getenv("LLM_BINDING_API_KEY"),
|
||||
help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
|
||||
)
|
||||
parser.add_argument("--base-url", default=os.getenv("LLM_BINDING_HOST"), help="Optional base URL for API")
|
||||
parser.add_argument(
|
||||
"--base-url",
|
||||
default=os.getenv("LLM_BINDING_HOST"),
|
||||
help="Optional base URL for API",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from .raganything import RAGAnything as RAGAnything
|
||||
from .config import RAGAnythingConfig as RAGAnythingConfig
|
||||
|
||||
__version__ = "1.2.2"
|
||||
__version__ = "1.2.3"
|
||||
__author__ = "Zirui Guo"
|
||||
__url__ = "https://github.com/HKUDS/RAG-Anything"
|
||||
|
||||
|
||||
@@ -25,14 +25,10 @@ class RAGAnythingConfig:
|
||||
)
|
||||
"""Default parsing method for MinerU: 'auto', 'ocr', or 'txt'."""
|
||||
|
||||
parser_output_dir: str = field(
|
||||
default=get_env_value("OUTPUT_DIR", "./output", str)
|
||||
)
|
||||
parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
|
||||
"""Default output directory for MinerU parsed content."""
|
||||
|
||||
parser: str = field(
|
||||
default=get_env_value("PARSER", "mineru", str)
|
||||
)
|
||||
parser: str = field(default=get_env_value("PARSER", "mineru", str))
|
||||
"""Parser selection: 'mineru' or 'docling'."""
|
||||
|
||||
display_content_stats: bool = field(
|
||||
|
||||
@@ -75,7 +75,7 @@ class ContextExtractor:
|
||||
Returns:
|
||||
Extracted context text
|
||||
"""
|
||||
if not content_source:
|
||||
if not content_source and not self.config.context_window:
|
||||
return ""
|
||||
|
||||
try:
|
||||
@@ -517,6 +517,144 @@ class BaseModalProcessor:
|
||||
chunk_results,
|
||||
)
|
||||
|
||||
def _robust_json_parse(self, response: str) -> dict:
    """Parse a model response into a dict using progressively looser strategies.

    Strategies are tried in order, each over every extracted JSON candidate:

    1. parse the candidate unchanged;
    2. parse it after ``_basic_json_cleanup``;
    3. parse it after ``_progressive_quote_fix``;
    4. if nothing parsed, fall back to ``_extract_fields_with_regex`` on the
       raw response.

    Args:
        response: Raw text returned by the model.

    Returns:
        The first truthy value returned by ``_try_parse_json``; otherwise the
        result of ``_extract_fields_with_regex(response)``.
    """
    # The candidates depend only on ``response``; extract them once and reuse
    # the list for every strategy instead of re-scanning the text each time.
    candidates = list(self._extract_all_json_candidates(response))

    def _unchanged(candidate):
        return candidate

    repairs = (_unchanged, self._basic_json_cleanup, self._progressive_quote_fix)
    for repair in repairs:
        for candidate in candidates:
            result = self._try_parse_json(repair(candidate))
            # A falsy result (None or an empty object) counts as a miss, so
            # the next candidate or strategy still gets a chance.
            if result:
                return result

    # Last resort: pull the individual fields out of the raw text.
    return self._extract_fields_with_regex(response)
|
||||
|
||||
def _extract_all_json_candidates(self, response: str) -> list:
|
||||
"""Extract all possible JSON candidates from response"""
|
||||
candidates = []
|
||||
|
||||
# Method 1: JSON in code blocks
|
||||
import re
|
||||
|
||||
json_blocks = re.findall(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
|
||||
candidates.extend(json_blocks)
|
||||
|
||||
# Method 2: Balanced braces
|
||||
brace_count = 0
|
||||
start_pos = -1
|
||||
|
||||
for i, char in enumerate(response):
|
||||
if char == "{":
|
||||
if brace_count == 0:
|
||||
start_pos = i
|
||||
brace_count += 1
|
||||
elif char == "}":
|
||||
brace_count -= 1
|
||||
if brace_count == 0 and start_pos != -1:
|
||||
candidates.append(response[start_pos : i + 1])
|
||||
|
||||
# Method 3: Simple regex fallback
|
||||
simple_match = re.search(r"\{.*\}", response, re.DOTALL)
|
||||
if simple_match:
|
||||
candidates.append(simple_match.group(0))
|
||||
|
||||
return candidates
|
||||
|
||||
def _try_parse_json(self, json_str: str) -> dict:
|
||||
"""Try to parse JSON string, return None if failed"""
|
||||
if not json_str or not json_str.strip():
|
||||
return None
|
||||
|
||||
try:
|
||||
return json.loads(json_str)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return None
|
||||
|
||||
def _basic_json_cleanup(self, json_str: str) -> str:
|
||||
"""Basic cleanup for common JSON issues"""
|
||||
# Remove extra whitespace
|
||||
json_str = json_str.strip()
|
||||
|
||||
# Fix common quote issues
|
||||
json_str = json_str.replace('"', '"').replace('"', '"') # Smart quotes
|
||||
json_str = json_str.replace(""", "'").replace(""", "'") # Smart apostrophes
|
||||
|
||||
# Fix trailing commas (simple case)
|
||||
json_str = re.sub(r",(\s*[}\]])", r"\1", json_str)
|
||||
|
||||
return json_str
|
||||
|
||||
def _progressive_quote_fix(self, json_str: str) -> str:
|
||||
"""Progressive fixing of quote and escape issues"""
|
||||
# Only escape unescaped backslashes before quotes
|
||||
json_str = re.sub(r'(?<!\\)\\(?=")', r"\\\\", json_str)
|
||||
|
||||
# Fix unescaped backslashes in string values (more conservative)
|
||||
def fix_string_content(match):
|
||||
content = match.group(1)
|
||||
# Only escape obvious problematic patterns
|
||||
content = re.sub(r"\\(?=[a-zA-Z])", r"\\\\", content) # \alpha -> \\alpha
|
||||
return f'"{content}"'
|
||||
|
||||
json_str = re.sub(r'"([^"]*(?:\\.[^"]*)*)"', fix_string_content, json_str)
|
||||
return json_str
|
||||
|
||||
def _extract_fields_with_regex(self, response: str) -> dict:
    """Pull the expected fields straight out of the text as a last resort.

    Each field is located by searching for ``"<key>": "<value>"`` in the raw
    response, so no valid JSON structure is required. A warning is logged on
    every call.

    Args:
        response: Raw text returned by the model.

    Returns:
        A dict with ``detailed_description`` and a nested ``entity_info``
        holding ``entity_name``, ``entity_type`` and ``summary``. Fields that
        are not found default to ``""``, ``"unknown_entity"``, ``"unknown"``
        and the first 100 characters of the description respectively.
        Values are returned as matched, with any backslash escapes left in
        place.
    """
    logger.warning("Using regex fallback for JSON parsing")

    def _find(key, flags=0):
        # ``[^"\\]*`` stops at a backslash so ``\\.`` can consume the whole
        # escape sequence. With a plain ``[^"]*`` the backslash of ``\"`` is
        # swallowed first and the value is cut off at the escaped quote.
        pattern = r'"%s":\s*"([^"\\]*(?:\\.[^"\\]*)*)"' % key
        found = re.search(pattern, response, flags)
        return found.group(1) if found else None

    # Extract detailed_description
    description = _find("detailed_description", re.DOTALL)
    if description is None:
        description = ""

    # Extract entity_name
    entity_name = _find("entity_name")
    if entity_name is None:
        entity_name = "unknown_entity"

    # Extract entity_type
    entity_type = _find("entity_type")
    if entity_type is None:
        entity_type = "unknown"

    # Extract summary
    summary = _find("summary", re.DOTALL)
    if summary is None:
        summary = description[:100]

    return {
        "detailed_description": description,
        "entity_info": {
            "entity_name": entity_name,
            "entity_type": entity_type,
            "summary": summary,
        },
    }
|
||||
|
||||
def _extract_json_from_response(self, response: str) -> str:
    """Return the first JSON candidate found in ``response``.

    Kept as a legacy entry point; extraction itself is delegated to
    ``_extract_all_json_candidates``.

    Args:
        response: Raw text returned by the model.

    Returns:
        The first candidate string, or ``None`` when there are none.
    """
    found = self._extract_all_json_candidates(response)
    if not found:
        return None
    return found[0]
|
||||
|
||||
def _fix_json_escapes(self, json_str: str) -> str:
    """Repair escape sequences in ``json_str``.

    Kept as a legacy entry point; the work is delegated to
    ``_progressive_quote_fix``.

    Args:
        json_str: Candidate JSON text.

    Returns:
        Whatever ``_progressive_quote_fix`` returns for ``json_str``.
    """
    repaired = self._progressive_quote_fix(json_str)
    return repaired
|
||||
|
||||
async def _process_chunk_for_extraction(
|
||||
self, chunk_id: str, modal_entity_name: str, batch_mode: bool = False
|
||||
):
|
||||
@@ -695,31 +833,21 @@ class ImageModalProcessor(BaseModalProcessor):
|
||||
)
|
||||
|
||||
# If image path exists, try to encode image
|
||||
image_base64 = ""
|
||||
if image_path and Path(image_path).exists():
|
||||
image_base64 = self._encode_image_to_base64(image_path)
|
||||
logger.debug(f"Begin Analysis of Image: {image_path}")
|
||||
|
||||
# Call vision model
|
||||
if image_base64:
|
||||
# Use real image for analysis
|
||||
response = await self.modal_caption_func(
|
||||
vision_prompt,
|
||||
image_data=image_base64,
|
||||
system_prompt=PROMPTS["IMAGE_ANALYSIS_SYSTEM"],
|
||||
)
|
||||
else:
|
||||
# Analyze based on existing text information
|
||||
text_prompt = PROMPTS["text_prompt"].format(
|
||||
image_path=image_path,
|
||||
captions=captions,
|
||||
footnotes=footnotes,
|
||||
vision_prompt=vision_prompt,
|
||||
)
|
||||
if not image_path or not Path(image_path).exists():
|
||||
raise FileNotFoundError(f"Image file not found: {image_path}")
|
||||
|
||||
response = await self.modal_caption_func(
|
||||
text_prompt,
|
||||
system_prompt=PROMPTS["IMAGE_ANALYSIS_FALLBACK_SYSTEM"],
|
||||
)
|
||||
image_base64 = self._encode_image_to_base64(image_path)
|
||||
if not image_base64:
|
||||
raise RuntimeError(f"Failed to encode image to base64: {image_path}")
|
||||
|
||||
# Call vision model with encoded image
|
||||
response = await self.modal_caption_func(
|
||||
vision_prompt,
|
||||
image_data=image_base64,
|
||||
system_prompt=PROMPTS["IMAGE_ANALYSIS_SYSTEM"],
|
||||
)
|
||||
|
||||
# Parse response
|
||||
enhanced_caption, entity_info = self._parse_response(response, entity_name)
|
||||
@@ -753,9 +881,7 @@ class ImageModalProcessor(BaseModalProcessor):
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Parse model response"""
|
||||
try:
|
||||
response_data = json.loads(
|
||||
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
||||
)
|
||||
response_data = self._robust_json_parse(response)
|
||||
|
||||
description = response_data.get("detailed_description", "")
|
||||
entity_data = response_data.get("entity_info", {})
|
||||
@@ -778,6 +904,7 @@ class ImageModalProcessor(BaseModalProcessor):
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
||||
logger.error(f"Error parsing image analysis response: {e}")
|
||||
logger.debug(f"Raw response: {response}")
|
||||
fallback_entity = {
|
||||
"entity_name": entity_name
|
||||
if entity_name
|
||||
@@ -815,6 +942,8 @@ class TableModalProcessor(BaseModalProcessor):
|
||||
table_body = content_data.get("table_body", "")
|
||||
table_footnote = content_data.get("table_footnote", [])
|
||||
|
||||
logger.debug(f"Begin Analysis of Table: {table_img_path}")
|
||||
|
||||
# Extract context for current item
|
||||
context = ""
|
||||
if item_info:
|
||||
@@ -875,9 +1004,7 @@ class TableModalProcessor(BaseModalProcessor):
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Parse table analysis response"""
|
||||
try:
|
||||
response_data = json.loads(
|
||||
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
||||
)
|
||||
response_data = self._robust_json_parse(response)
|
||||
|
||||
description = response_data.get("detailed_description", "")
|
||||
entity_data = response_data.get("entity_info", {})
|
||||
@@ -900,6 +1027,7 @@ class TableModalProcessor(BaseModalProcessor):
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
||||
logger.error(f"Error parsing table analysis response: {e}")
|
||||
logger.debug(f"Raw response: {response}")
|
||||
fallback_entity = {
|
||||
"entity_name": entity_name
|
||||
if entity_name
|
||||
@@ -935,6 +1063,8 @@ class EquationModalProcessor(BaseModalProcessor):
|
||||
equation_text = content_data.get("text")
|
||||
equation_format = content_data.get("text_format", "")
|
||||
|
||||
logger.debug(f"Begin Analysis of Equation: {equation_text}")
|
||||
|
||||
# Extract context for current item
|
||||
context = ""
|
||||
if item_info:
|
||||
@@ -985,11 +1115,9 @@ class EquationModalProcessor(BaseModalProcessor):
|
||||
def _parse_equation_response(
|
||||
self, response: str, entity_name: str = None
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Parse equation analysis response"""
|
||||
"""Parse equation analysis response with robust JSON handling"""
|
||||
try:
|
||||
response_data = json.loads(
|
||||
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
||||
)
|
||||
response_data = self._robust_json_parse(response)
|
||||
|
||||
description = response_data.get("detailed_description", "")
|
||||
entity_data = response_data.get("entity_info", {})
|
||||
@@ -1012,6 +1140,7 @@ class EquationModalProcessor(BaseModalProcessor):
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
||||
logger.error(f"Error parsing equation analysis response: {e}")
|
||||
logger.debug(f"Raw response: {response}")
|
||||
fallback_entity = {
|
||||
"entity_name": entity_name
|
||||
if entity_name
|
||||
@@ -1035,6 +1164,8 @@ class GenericModalProcessor(BaseModalProcessor):
|
||||
batch_mode: bool = False,
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Process generic modal content with context support"""
|
||||
logger.debug(f"Begin Analysis of {content_type}: {modal_content}")
|
||||
|
||||
# Extract context for current item
|
||||
context = ""
|
||||
if item_info:
|
||||
@@ -1089,9 +1220,7 @@ class GenericModalProcessor(BaseModalProcessor):
|
||||
) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Parse generic analysis response"""
|
||||
try:
|
||||
response_data = json.loads(
|
||||
re.search(r"\{.*\}", response, re.DOTALL).group(0)
|
||||
)
|
||||
response_data = self._robust_json_parse(response)
|
||||
|
||||
description = response_data.get("detailed_description", "")
|
||||
entity_data = response_data.get("entity_info", {})
|
||||
@@ -1113,7 +1242,8 @@ class GenericModalProcessor(BaseModalProcessor):
|
||||
return description, entity_data
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, ValueError) as e:
|
||||
logger.error(f"Error parsing generic analysis response: {e}")
|
||||
logger.error(f"Error parsing {content_type} analysis response: {e}")
|
||||
logger.debug(f"Raw response: {response}")
|
||||
fallback_entity = {
|
||||
"entity_name": entity_name
|
||||
if entity_name
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,7 @@ Contains methods for parsing documents and processing multimodal content
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from typing import Dict, List, Any
|
||||
from pathlib import Path
|
||||
from raganything.parser import MineruParser, DoclingParser
|
||||
from raganything.utils import (
|
||||
@@ -57,7 +57,9 @@ class ProcessorMixin:
|
||||
ext = file_path.suffix.lower()
|
||||
|
||||
try:
|
||||
doc_parser = DoclingParser() if self.config.parser == "docling" else MineruParser()
|
||||
doc_parser = (
|
||||
DoclingParser() if self.config.parser == "docling" else MineruParser()
|
||||
)
|
||||
if ext in [".pdf"]:
|
||||
self.logger.info(
|
||||
f"Detected PDF file, using PDF parser (method={parse_method})..."
|
||||
@@ -82,8 +84,20 @@ class ProcessorMixin:
|
||||
content_list = MineruParser.parse_image(
|
||||
image_path=file_path, output_dir=output_dir, **kwargs
|
||||
)
|
||||
elif ext in [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".html", ".htm", ".xhtml"]:
|
||||
self.logger.info("Detected Office or HTML document, using Office parser...")
|
||||
elif ext in [
|
||||
".doc",
|
||||
".docx",
|
||||
".ppt",
|
||||
".pptx",
|
||||
".xls",
|
||||
".xlsx",
|
||||
".html",
|
||||
".htm",
|
||||
".xhtml",
|
||||
]:
|
||||
self.logger.info(
|
||||
"Detected Office or HTML document, using Office parser..."
|
||||
)
|
||||
content_list = doc_parser.parse_office_doc(
|
||||
doc_path=file_path, output_dir=output_dir, **kwargs
|
||||
)
|
||||
|
||||
@@ -30,7 +30,7 @@ from raganything.query import QueryMixin
|
||||
from raganything.processor import ProcessorMixin
|
||||
from raganything.batch import BatchMixin
|
||||
from raganything.utils import get_processor_supports
|
||||
from raganything.parser import Parser, MineruParser, DoclingParser
|
||||
from raganything.parser import MineruParser, DoclingParser
|
||||
|
||||
# Import specialized processors
|
||||
from raganything.modalprocessors import (
|
||||
@@ -63,7 +63,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
|
||||
config: Optional[RAGAnythingConfig] = field(default=None)
|
||||
"""Configuration object, if None will create with environment variables."""
|
||||
|
||||
|
||||
# Internal State
|
||||
# ---
|
||||
modal_processors: Dict[str, Any] = field(default_factory=dict, init=False)
|
||||
@@ -83,9 +83,11 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
||||
|
||||
# Set up logger (use existing logger, don't configure it)
|
||||
self.logger = logger
|
||||
|
||||
|
||||
# Set up document parser
|
||||
self.doc_parser = DoclingParser() if self.config.parser == "docling" else MineruParser()
|
||||
self.doc_parser = (
|
||||
DoclingParser() if self.config.parser == "docling" else MineruParser()
|
||||
)
|
||||
|
||||
# Create working directory if needed
|
||||
if not os.path.exists(self.working_dir):
|
||||
|
||||
Reference in New Issue
Block a user