Merge branch 'main' into docling

Author: zrguo
Date: 2025-07-17 23:02:39 +08:00
parent e8f9a877e2
commit 656c4cdae6
8 changed files with 763 additions and 275 deletions

.gitignore (vendored)

@@ -61,6 +61,7 @@ ignore_this.txt
 dickens*/
 book.txt
 LightRAG.pdf
+LightRAG_2-4.pdf
 download_models_hf.py
 lightrag-dev/
 gui/

examples/raganything_example.py

@@ -108,6 +108,7 @@ async def process_with_rag(
         enable_table_processing=True,
         enable_equation_processing=True,
     )

+    # Define LLM model function
    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
        return openai_complete_if_cache(
@@ -252,7 +253,11 @@ def main():
         default=os.getenv("LLM_BINDING_API_KEY"),
         help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
     )
-    parser.add_argument("--base-url", default=os.getenv("LLM_BINDING_HOST"), help="Optional base URL for API")
+    parser.add_argument(
+        "--base-url",
+        default=os.getenv("LLM_BINDING_HOST"),
+        help="Optional base URL for API",
+    )
     args = parser.parse_args()
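The reflowed parser.add_argument call above is purely cosmetic, but the pattern it touches is worth noting: every CLI flag defaults to an environment variable, so the script can be driven either way. A minimal standalone sketch of that pattern (the demo script itself is illustrative, not part of the repo):

import argparse
import os

parser = argparse.ArgumentParser(description="env-var-backed CLI defaults")
parser.add_argument(
    "--base-url",
    default=os.getenv("LLM_BINDING_HOST"),  # used only when the flag is omitted
    help="Optional base URL for API",
)
args = parser.parse_args()

# Precedence: explicit --base-url > LLM_BINDING_HOST > None.
print(f"base_url = {args.base_url!r}")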

raganything/__init__.py

@@ -1,7 +1,7 @@
 from .raganything import RAGAnything as RAGAnything
 from .config import RAGAnythingConfig as RAGAnythingConfig

-__version__ = "1.2.2"
+__version__ = "1.2.3"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/RAG-Anything"

raganything/config.py

@@ -25,14 +25,10 @@ class RAGAnythingConfig:
     )
     """Default parsing method for MinerU: 'auto', 'ocr', or 'txt'."""

-    parser_output_dir: str = field(
-        default=get_env_value("OUTPUT_DIR", "./output", str)
-    )
+    parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))
     """Default output directory for MinerU parsed content."""

-    parser: str = field(
-        default=get_env_value("PARSER", "mineru", str)
-    )
+    parser: str = field(default=get_env_value("PARSER", "mineru", str))
     """Parser selection: 'mineru' or 'docling'."""

     display_content_stats: bool = field(
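These hunks rely on a get_env_value helper to seed dataclass fields from the environment. The helper's implementation is not shown in this diff, so the following is a hedged sketch of how such a function typically works, paired with the field pattern from the hunk:

import os
from dataclasses import dataclass, field

def get_env_value(name, default, value_type=str):
    # Hypothetical stand-in for raganything's helper: coerce an env var to a type.
    raw = os.getenv(name)
    if raw is None:
        return default
    if value_type is bool:
        return raw.strip().lower() in ("1", "true", "yes", "on")
    return value_type(raw)

@dataclass
class DemoConfig:
    parser: str = field(default=get_env_value("PARSER", "mineru", str))
    parser_output_dir: str = field(default=get_env_value("OUTPUT_DIR", "./output", str))

print(DemoConfig())  # e.g. DemoConfig(parser='mineru', parser_output_dir='./output')

Note that field defaults are evaluated once, at class-definition time, so the environment variables must be set before the module is imported.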

raganything/modalprocessors.py

@@ -75,7 +75,7 @@ class ContextExtractor:
         Returns:
             Extracted context text
         """
-        if not content_source:
+        if not content_source and not self.config.context_window:
             return ""

         try:
@@ -517,6 +517,144 @@ class BaseModalProcessor:
             chunk_results,
         )

+    def _robust_json_parse(self, response: str) -> dict:
+        """Robust JSON parsing with multiple fallback strategies"""
+        # Strategy 1: Try direct parsing first
+        for json_candidate in self._extract_all_json_candidates(response):
+            result = self._try_parse_json(json_candidate)
+            if result:
+                return result
+
+        # Strategy 2: Try with basic cleanup
+        for json_candidate in self._extract_all_json_candidates(response):
+            cleaned = self._basic_json_cleanup(json_candidate)
+            result = self._try_parse_json(cleaned)
+            if result:
+                return result
+
+        # Strategy 3: Try progressive quote fixing
+        for json_candidate in self._extract_all_json_candidates(response):
+            fixed = self._progressive_quote_fix(json_candidate)
+            result = self._try_parse_json(fixed)
+            if result:
+                return result
+
+        # Strategy 4: Fallback to regex field extraction
+        return self._extract_fields_with_regex(response)
+
+    def _extract_all_json_candidates(self, response: str) -> list:
+        """Extract all possible JSON candidates from response"""
+        candidates = []
+
+        # Method 1: JSON in code blocks
+        import re
+
+        json_blocks = re.findall(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
+        candidates.extend(json_blocks)
+
+        # Method 2: Balanced braces
+        brace_count = 0
+        start_pos = -1
+        for i, char in enumerate(response):
+            if char == "{":
+                if brace_count == 0:
+                    start_pos = i
+                brace_count += 1
+            elif char == "}":
+                brace_count -= 1
+                if brace_count == 0 and start_pos != -1:
+                    candidates.append(response[start_pos : i + 1])
+
+        # Method 3: Simple regex fallback
+        simple_match = re.search(r"\{.*\}", response, re.DOTALL)
+        if simple_match:
+            candidates.append(simple_match.group(0))
+
+        return candidates
+
+    def _try_parse_json(self, json_str: str) -> dict:
+        """Try to parse JSON string, return None if failed"""
+        if not json_str or not json_str.strip():
+            return None
+        try:
+            return json.loads(json_str)
+        except (json.JSONDecodeError, ValueError):
+            return None
+
+    def _basic_json_cleanup(self, json_str: str) -> str:
+        """Basic cleanup for common JSON issues"""
+        # Remove extra whitespace
+        json_str = json_str.strip()
+
+        # Fix common quote issues
+        json_str = json_str.replace("“", '"').replace("”", '"')  # Smart quotes
+        json_str = json_str.replace("‘", "'").replace("’", "'")  # Smart apostrophes
+
+        # Fix trailing commas (simple case)
+        json_str = re.sub(r",(\s*[}\]])", r"\1", json_str)
+
+        return json_str
+
+    def _progressive_quote_fix(self, json_str: str) -> str:
+        """Progressive fixing of quote and escape issues"""
+        # Only escape unescaped backslashes before quotes
+        json_str = re.sub(r'(?<!\\)\\(?=")', r"\\\\", json_str)
+
+        # Fix unescaped backslashes in string values (more conservative)
+        def fix_string_content(match):
+            content = match.group(1)
+            # Only escape obvious problematic patterns
+            content = re.sub(r"\\(?=[a-zA-Z])", r"\\\\", content)  # \alpha -> \\alpha
+            return f'"{content}"'
+
+        json_str = re.sub(r'"([^"]*(?:\\.[^"]*)*)"', fix_string_content, json_str)
+
+        return json_str
+
+    def _extract_fields_with_regex(self, response: str) -> dict:
+        """Extract required fields using regex as last resort"""
+        logger.warning("Using regex fallback for JSON parsing")
+
+        # Extract detailed_description
+        desc_match = re.search(
+            r'"detailed_description":\s*"([^"]*(?:\\.[^"]*)*)"', response, re.DOTALL
+        )
+        description = desc_match.group(1) if desc_match else ""
+
+        # Extract entity_name
+        name_match = re.search(r'"entity_name":\s*"([^"]*(?:\\.[^"]*)*)"', response)
+        entity_name = name_match.group(1) if name_match else "unknown_entity"
+
+        # Extract entity_type
+        type_match = re.search(r'"entity_type":\s*"([^"]*(?:\\.[^"]*)*)"', response)
+        entity_type = type_match.group(1) if type_match else "unknown"
+
+        # Extract summary
+        summary_match = re.search(
+            r'"summary":\s*"([^"]*(?:\\.[^"]*)*)"', response, re.DOTALL
+        )
+        summary = summary_match.group(1) if summary_match else description[:100]
+
+        return {
+            "detailed_description": description,
+            "entity_info": {
+                "entity_name": entity_name,
+                "entity_type": entity_type,
+                "summary": summary,
+            },
+        }
+
+    def _extract_json_from_response(self, response: str) -> str:
+        """Legacy method - now handled by _extract_all_json_candidates"""
+        candidates = self._extract_all_json_candidates(response)
+        return candidates[0] if candidates else None
+
+    def _fix_json_escapes(self, json_str: str) -> str:
+        """Legacy method - now handled by progressive strategies"""
+        return self._progressive_quote_fix(json_str)
+
     async def _process_chunk_for_extraction(
         self, chunk_id: str, modal_entity_name: str, batch_mode: bool = False
     ):
@@ -695,31 +833,21 @@ class ImageModalProcessor(BaseModalProcessor):
         )

-        # If image path exists, try to encode image
-        image_base64 = ""
-        if image_path and Path(image_path).exists():
-            image_base64 = self._encode_image_to_base64(image_path)
+        logger.debug(f"Begin Analysis of Image: {image_path}")

-        # Call vision model
-        if image_base64:
-            # Use real image for analysis
-            response = await self.modal_caption_func(
-                vision_prompt,
-                image_data=image_base64,
-                system_prompt=PROMPTS["IMAGE_ANALYSIS_SYSTEM"],
-            )
-        else:
-            # Analyze based on existing text information
-            text_prompt = PROMPTS["text_prompt"].format(
-                image_path=image_path,
-                captions=captions,
-                footnotes=footnotes,
-                vision_prompt=vision_prompt,
-            )
-            response = await self.modal_caption_func(
-                text_prompt,
-                system_prompt=PROMPTS["IMAGE_ANALYSIS_FALLBACK_SYSTEM"],
-            )
+        if not image_path or not Path(image_path).exists():
+            raise FileNotFoundError(f"Image file not found: {image_path}")
+
+        image_base64 = self._encode_image_to_base64(image_path)
+        if not image_base64:
+            raise RuntimeError(f"Failed to encode image to base64: {image_path}")
+
+        # Call vision model with encoded image
+        response = await self.modal_caption_func(
+            vision_prompt,
+            image_data=image_base64,
+            system_prompt=PROMPTS["IMAGE_ANALYSIS_SYSTEM"],
+        )

         # Parse response
         enhanced_caption, entity_info = self._parse_response(response, entity_name)
@@ -753,9 +881,7 @@ class ImageModalProcessor(BaseModalProcessor):
     ) -> Tuple[str, Dict[str, Any]]:
         """Parse model response"""
         try:
-            response_data = json.loads(
-                re.search(r"\{.*\}", response, re.DOTALL).group(0)
-            )
+            response_data = self._robust_json_parse(response)

             description = response_data.get("detailed_description", "")
             entity_data = response_data.get("entity_info", {})

@@ -778,6 +904,7 @@ class ImageModalProcessor(BaseModalProcessor):
         except (json.JSONDecodeError, AttributeError, ValueError) as e:
             logger.error(f"Error parsing image analysis response: {e}")
+            logger.debug(f"Raw response: {response}")
             fallback_entity = {
                 "entity_name": entity_name
                 if entity_name
@@ -815,6 +942,8 @@ class TableModalProcessor(BaseModalProcessor):
         table_body = content_data.get("table_body", "")
         table_footnote = content_data.get("table_footnote", [])

+        logger.debug(f"Begin Analysis of Table: {table_img_path}")
+
         # Extract context for current item
         context = ""
         if item_info:

@@ -875,9 +1004,7 @@ class TableModalProcessor(BaseModalProcessor):
     ) -> Tuple[str, Dict[str, Any]]:
         """Parse table analysis response"""
         try:
-            response_data = json.loads(
-                re.search(r"\{.*\}", response, re.DOTALL).group(0)
-            )
+            response_data = self._robust_json_parse(response)

             description = response_data.get("detailed_description", "")
             entity_data = response_data.get("entity_info", {})

@@ -900,6 +1027,7 @@ class TableModalProcessor(BaseModalProcessor):
         except (json.JSONDecodeError, AttributeError, ValueError) as e:
             logger.error(f"Error parsing table analysis response: {e}")
+            logger.debug(f"Raw response: {response}")
             fallback_entity = {
                 "entity_name": entity_name
                 if entity_name
@@ -935,6 +1063,8 @@ class EquationModalProcessor(BaseModalProcessor):
         equation_text = content_data.get("text")
         equation_format = content_data.get("text_format", "")

+        logger.debug(f"Begin Analysis of Equation: {equation_text}")
+
         # Extract context for current item
         context = ""
         if item_info:

@@ -985,11 +1115,9 @@ class EquationModalProcessor(BaseModalProcessor):
     def _parse_equation_response(
         self, response: str, entity_name: str = None
     ) -> Tuple[str, Dict[str, Any]]:
-        """Parse equation analysis response"""
+        """Parse equation analysis response with robust JSON handling"""
         try:
-            response_data = json.loads(
-                re.search(r"\{.*\}", response, re.DOTALL).group(0)
-            )
+            response_data = self._robust_json_parse(response)

             description = response_data.get("detailed_description", "")
             entity_data = response_data.get("entity_info", {})

@@ -1012,6 +1140,7 @@ class EquationModalProcessor(BaseModalProcessor):
         except (json.JSONDecodeError, AttributeError, ValueError) as e:
             logger.error(f"Error parsing equation analysis response: {e}")
+            logger.debug(f"Raw response: {response}")
             fallback_entity = {
                 "entity_name": entity_name
                 if entity_name
@@ -1035,6 +1164,8 @@ class GenericModalProcessor(BaseModalProcessor):
         batch_mode: bool = False,
     ) -> Tuple[str, Dict[str, Any]]:
         """Process generic modal content with context support"""
+        logger.debug(f"Begin Analysis of {content_type}: {modal_content}")
+
         # Extract context for current item
         context = ""
         if item_info:

@@ -1089,9 +1220,7 @@ class GenericModalProcessor(BaseModalProcessor):
     ) -> Tuple[str, Dict[str, Any]]:
         """Parse generic analysis response"""
         try:
-            response_data = json.loads(
-                re.search(r"\{.*\}", response, re.DOTALL).group(0)
-            )
+            response_data = self._robust_json_parse(response)

             description = response_data.get("detailed_description", "")
             entity_data = response_data.get("entity_info", {})

@@ -1113,7 +1242,8 @@ class GenericModalProcessor(BaseModalProcessor):
             return description, entity_data

         except (json.JSONDecodeError, AttributeError, ValueError) as e:
-            logger.error(f"Error parsing generic analysis response: {e}")
+            logger.error(f"Error parsing {content_type} analysis response: {e}")
+            logger.debug(f"Raw response: {response}")
             fallback_entity = {
                 "entity_name": entity_name
                 if entity_name
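All four _parse_*_response methods now funnel through the _robust_json_parse cascade added above: strict parsing first, then cleanup, then quote fixing, then regex field extraction as a last resort. A simplified, self-contained sketch of the same idea (two stages rather than four, outside the class; not the module's exact code):

import json
import re
from typing import Optional

def robust_json_parse(response: str) -> Optional[dict]:
    # Collect candidates: fenced ```json blocks first, then any brace-delimited span.
    candidates = re.findall(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if match:
        candidates.append(match.group(0))
    fixes = (
        lambda s: s,                                 # stage 1: strict parse
        lambda s: re.sub(r",(\s*[}\]])", r"\1", s),  # stage 2: drop trailing commas
    )
    for candidate in candidates:
        for fix in fixes:
            try:
                return json.loads(fix(candidate))
            except json.JSONDecodeError:
                continue
    return None  # caller would fall back to regex field extraction

messy = 'Sure!\n```json\n{"entity_name": "Figure 1", "entity_type": "image",}\n```'
print(robust_json_parse(messy))  # parses on the trailing-comma cleanup pass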

File diff suppressed because it is too large.

raganything/processor.py

@@ -5,7 +5,7 @@ Contains methods for parsing documents and processing multimodal content
 """

 import os
-from typing import Dict, List, Any, Tuple
+from typing import Dict, List, Any
 from pathlib import Path

 from raganything.parser import MineruParser, DoclingParser
 from raganything.utils import (
@@ -57,7 +57,9 @@ class ProcessorMixin:
         ext = file_path.suffix.lower()

         try:
-            doc_parser = DoclingParser() if self.config.parser == "docling" else MineruParser()
+            doc_parser = (
+                DoclingParser() if self.config.parser == "docling" else MineruParser()
+            )

             if ext in [".pdf"]:
                 self.logger.info(
                     f"Detected PDF file, using PDF parser (method={parse_method})..."
@@ -82,8 +84,20 @@ class ProcessorMixin:
                 content_list = MineruParser.parse_image(
                     image_path=file_path, output_dir=output_dir, **kwargs
                 )
-            elif ext in [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".html", ".htm", ".xhtml"]:
-                self.logger.info("Detected Office or HTML document, using Office parser...")
+            elif ext in [
+                ".doc",
+                ".docx",
+                ".ppt",
+                ".pptx",
+                ".xls",
+                ".xlsx",
+                ".html",
+                ".htm",
+                ".xhtml",
+            ]:
+                self.logger.info(
+                    "Detected Office or HTML document, using Office parser..."
+                )
                 content_list = doc_parser.parse_office_doc(
                     doc_path=file_path, output_dir=output_dir, **kwargs
                 )
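The routing above keys off the file extension and the configured parser. A reduced sketch of that dispatch (only the selection logic mirrors the diff; the parser classes are stubbed for illustration):

from pathlib import Path

OFFICE_EXTS = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".html", ".htm", ".xhtml"}

class MineruParser:   # stub for illustration
    name = "mineru"

class DoclingParser:  # stub for illustration
    name = "docling"

def route(file_path: str, parser_setting: str = "mineru") -> str:
    # Same ternary as the diff: DoclingParser only when explicitly configured.
    parser = DoclingParser() if parser_setting == "docling" else MineruParser()
    ext = Path(file_path).suffix.lower()
    if ext == ".pdf":
        return f"{parser.name}: PDF parser"
    if ext in OFFICE_EXTS:
        return f"{parser.name}: Office/HTML parser"
    return f"{parser.name}: generic parser"

print(route("report.docx", "docling"))  # -> docling: Office/HTML parser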

raganything/raganything.py

@@ -30,7 +30,7 @@ from raganything.query import QueryMixin
 from raganything.processor import ProcessorMixin
 from raganything.batch import BatchMixin
 from raganything.utils import get_processor_supports
-from raganything.parser import Parser, MineruParser, DoclingParser
+from raganything.parser import MineruParser, DoclingParser

 # Import specialized processors
 from raganything.modalprocessors import (
@@ -63,7 +63,7 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
     config: Optional[RAGAnythingConfig] = field(default=None)
     """Configuration object, if None will create with environment variables."""

-    # Internal State
+    # ---
     modal_processors: Dict[str, Any] = field(default_factory=dict, init=False)
@@ -83,9 +83,11 @@ class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
         # Set up logger (use existing logger, don't configure it)
         self.logger = logger

         # Set up document parser
-        self.doc_parser = DoclingParser() if self.config.parser == "docling" else MineruParser()
+        self.doc_parser = (
+            DoclingParser() if self.config.parser == "docling" else MineruParser()
+        )

         # Create working directory if needed
         if not os.path.exists(self.working_dir):
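Taken together, the docling integration stays opt-in through configuration. A hedged end-to-end usage sketch, assuming only what the diffs above show (RAGAnything accepts further arguments in practice, such as model functions, which are omitted here):

import os

# Must be set before import: RAGAnythingConfig reads PARSER at class-definition time.
os.environ["PARSER"] = "docling"

from raganything import RAGAnything, RAGAnythingConfig

config = RAGAnythingConfig()      # parser="docling" picked up from the environment
rag = RAGAnything(config=config)  # __post_init__ then selects DoclingParser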