vlm_enhanced_query

This commit is contained in:
zrguo
2025-08-12 15:59:50 +08:00
parent cf2aa70cfd
commit dfd9ec855e
6 changed files with 514 additions and 28 deletions

View File

@@ -48,6 +48,7 @@
---
## 🎉 News
- [X] [2025.08.12]🎯📢 🔍 RAG-Anything now supports **VLM Enhanced Query** mode! When documents contain images, the system can automatically pass them directly to the VLM for comprehensive multimodal analysis alongside the text context.
- [X] [2025.07.05]🎯📢 RAG-Anything now features a [context configuration module](docs/context_aware_processing.md), enabling intelligent integration of relevant contextual information to enhance multimodal content processing.
- [X] [2025.07.04]🎯📢 🚀 RAG-Anything now supports multimodal query capabilities, enabling enhanced RAG with seamless processing of text, images, tables, and equations.
- [X] [2025.07.03]🎯📢 🎉 RAG-Anything has reached 1k🌟 stars on GitHub! Thank you for your incredible support and valuable contributions to the project.
@@ -319,9 +320,22 @@ async def main():
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
if image_data:
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -350,6 +364,7 @@ async def main():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
@@ -549,7 +564,7 @@ class CustomModalProcessor(GenericModalProcessor):
#### 5. Query Options
RAG-Anything provides two types of query methods:
RAG-Anything provides three types of query methods:
**Pure Text Queries** - Direct knowledge base search using LightRAG:
```python
@@ -563,7 +578,36 @@ text_result_naive = await rag.aquery("Your question", mode="naive")
sync_text_result = rag.query("Your question", mode="hybrid")
```
**Multimodal Queries** - Enhanced queries with multimodal content analysis:
**VLM Enhanced Queries** - Automatically analyze images in the retrieved context using the VLM:
```python
# VLM enhanced query (automatically enabled when vision_model_func is provided)
vlm_result = await rag.aquery(
"Analyze the charts and figures in the document",
mode="hybrid"
# vlm_enhanced=True is automatically set when vision_model_func is available
)
# Manually control VLM enhancement
vlm_enabled = await rag.aquery(
"What do the images show in this document?",
mode="hybrid",
vlm_enhanced=True # Force enable VLM enhancement
)
vlm_disabled = await rag.aquery(
"What do the images show in this document?",
mode="hybrid",
vlm_enhanced=False # Force disable VLM enhancement
)
# When documents contain images, VLM can see and analyze them directly
# The system will automatically:
# 1. Retrieve relevant context containing image paths
# 2. Load and encode images as base64
# 3. Send both text context and images to VLM for comprehensive analysis
```
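For reference, the payload the system hands to the VLM follows the OpenAI-style multimodal chat format assembled by `_build_vlm_messages_with_images`; a minimal sketch of its shape (placeholder values only):

```python
# Sketch of the messages structure assembled for the VLM (placeholder values)
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that can analyze both text and image content.",
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Context:\n...retrieved text surrounding the image..."},
            {
                "type": "image_url",
                # Images are inlined as base64 data URLs rather than fetched remotely
                "image_url": {"url": "data:image/jpeg;base64,<BASE64_DATA>"},
            },
            {"type": "text", "text": "User Question: What does the chart show?"},
        ],
    },
]
```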
**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
```python
# Query with table data
table_result = await rag.aquery_with_multimodal(
@@ -645,9 +689,22 @@ async def load_existing_lightrag():
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
if image_data:
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -676,6 +733,7 @@ async def load_existing_lightrag():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
@@ -738,8 +796,21 @@ async def insert_content_list_example():
**kwargs,
)
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs):
if image_data:
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -759,6 +830,7 @@ async def insert_content_list_example():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
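For reference, the three branches shown in the diffs above combine into one dispatcher. A minimal sketch, assuming `openai_complete_if_cache` is imported from `lightrag.llm.openai` and that `api_key`, `base_url`, and a text-only `llm_model_func` are already defined in scope:

```python
from lightrag.llm.openai import openai_complete_if_cache  # assumed import path

def vision_model_func(
    prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
    # 1) Pre-built multimodal messages (VLM enhanced query) take priority
    if messages:
        return openai_complete_if_cache(
            "gpt-4o", "", system_prompt=None, history_messages=[],
            messages=messages, api_key=api_key, base_url=base_url, **kwargs,
        )
    # 2) Traditional single-image format: wrap the prompt and one base64 image
    elif image_data:
        msgs = []
        if system_prompt:
            msgs.append({"role": "system", "content": system_prompt})
        msgs.append({
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
            ],
        })
        return openai_complete_if_cache(
            "gpt-4o", "", system_prompt=None, history_messages=[],
            messages=msgs, api_key=api_key, base_url=base_url, **kwargs,
        )
    # 3) Pure text: defer to the regular text LLM
    else:
        return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
```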

View File

@@ -48,6 +48,7 @@
---
## 🎉 News
- [X] [2025.08.12]🎯📢 🔍 RAG-Anything now supports **VLM Enhanced Query** mode! When documents contain images, the system can automatically pass them, together with the text context, directly to the VLM for comprehensive multimodal analysis.
- [X] [2025.07.05]🎯📢 RAG-Anything adds a [context configuration module](docs/context_aware_processing.md), enabling relevant contextual information to be incorporated into multimodal content processing.
- [X] [2025.07.04]🎯📢 RAG-Anything now supports multimodal content queries, delivering enhanced retrieval-augmented generation with integrated processing of text, images, tables, and equations.
- [X] [2025.07.03]🎯📢 RAG-Anything has reached 1K stars 🌟 on GitHub! Thank you for your support and contributions.
@@ -315,9 +316,22 @@ async def main():
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
if image_data:
# If the messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single-image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -346,6 +360,7 @@ async def main():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
@@ -547,7 +562,7 @@ class CustomModalProcessor(GenericModalProcessor):
#### 5. Query Options
RAG-Anything provides two types of query methods:
RAG-Anything provides three types of query methods:
**Pure Text Queries** - Direct knowledge base search using LightRAG:
```python
@@ -561,7 +576,36 @@ text_result_naive = await rag.aquery("Your question", mode="naive")
sync_text_result = rag.query("Your question", mode="hybrid")
```
**Multimodal Queries** - Enhanced queries with multimodal content analysis:
**VLM Enhanced Queries** - Automatically analyze images in the retrieved context using the VLM:
```python
# VLM enhanced query (automatically enabled when vision_model_func is provided)
vlm_result = await rag.aquery(
"Analyze the charts and figures in the document",
mode="hybrid"
# vlm_enhanced=True is automatically set when vision_model_func is available
)
# Manually control VLM enhancement
vlm_enabled = await rag.aquery(
"What do the images in this document show?",
mode="hybrid",
vlm_enhanced=True  # Force enable VLM enhancement
)
vlm_disabled = await rag.aquery(
"What do the images in this document show?",
mode="hybrid",
vlm_enhanced=False  # Force disable VLM enhancement
)
# When documents contain images, the VLM can view and analyze them directly
# The system will automatically:
# 1. Retrieve relevant context containing image paths
# 2. Load images and encode them as base64
# 3. Send both the text context and the images to the VLM for comprehensive analysis
```
**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
```python
# Query with table data
table_result = await rag.aquery_with_multimodal(
@@ -642,9 +686,22 @@ async def load_existing_lightrag():
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
if image_data:
# If the messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single-image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -673,6 +730,7 @@ async def load_existing_lightrag():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
@@ -735,8 +793,21 @@ async def insert_content_list_example():
**kwargs,
)
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs):
if image_data:
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
# If the messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single-image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -756,6 +827,7 @@ async def insert_content_list_example():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

View File

@@ -129,9 +129,27 @@ async def process_with_rag(
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt,
system_prompt=None,
history_messages=[],
image_data=None,
messages=None,
**kwargs,
):
if image_data:
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -160,6 +178,7 @@ async def process_with_rag(
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

View File

@@ -1,7 +1,7 @@
from .raganything import RAGAnything as RAGAnything
from .config import RAGAnythingConfig as RAGAnythingConfig
__version__ = "1.2.6"
__version__ = "1.2.7"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/RAG-Anything"

View File

@@ -6,12 +6,17 @@ Contains all query-related methods for both text and multimodal queries
import json
import hashlib
import re
from typing import Dict, List, Any
from pathlib import Path
from lightrag import QueryParam
from lightrag.utils import always_get_an_event_loop
from raganything.prompt import PROMPTS
from raganything.utils import get_processor_for_type
from raganything.utils import (
get_processor_for_type,
encode_image_to_base64,
validate_image_file,
)
class QueryMixin:
@@ -92,7 +97,7 @@ class QueryMixin:
return f"multimodal_query:{cache_hash}"
async def aquery(self, query: str, mode: str = "hybrid", **kwargs) -> str:
async def aquery(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
Pure text query - directly calls LightRAG's query functionality
@@ -100,6 +105,9 @@ class QueryMixin:
query: Query text
mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
**kwargs: Other query parameters, will be passed to QueryParam
- vlm_enhanced: bool, default True when vision_model_func is available.
If True, will parse image paths in retrieved context and replace them
with base64 encoded images for VLM processing.
Returns:
str: Query result
@@ -109,6 +117,30 @@ class QueryMixin:
"No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance."
)
# Check if VLM enhanced query should be used
vlm_enhanced = kwargs.pop("vlm_enhanced", None)
# Auto-determine VLM enhanced based on availability
if vlm_enhanced is None:
vlm_enhanced = (
hasattr(self, "vision_model_func")
and self.vision_model_func is not None
)
# Use VLM enhanced query if enabled and available
if (
vlm_enhanced
and hasattr(self, "vision_model_func")
and self.vision_model_func
):
return await self.aquery_vlm_enhanced(query, mode=mode, **kwargs)
elif vlm_enhanced and (
not hasattr(self, "vision_model_func") or not self.vision_model_func
):
self.logger.warning(
"VLM enhanced query requested but vision_model_func is not available, falling back to normal query"
)
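# Resolution of vlm_enhanced (summary of the branches above):
#   None  -> enabled iff self.vision_model_func is set
#   True  -> VLM path when vision_model_func exists; otherwise warn and fall back
#   False -> always the plain LightRAG query below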
# Create query parameters
query_param = QueryParam(mode=mode, **kwargs)
@@ -125,7 +157,7 @@ class QueryMixin:
self,
query: str,
multimodal_content: List[Dict[str, Any]] = None,
mode: str = "hybrid",
mode: str = "mix",
**kwargs,
) -> str:
"""
@@ -210,15 +242,12 @@ class QueryMixin:
query, multimodal_content
)
# Create query parameters
query_param = QueryParam(mode=mode, **kwargs)
self.logger.info(
f"Generated enhanced query length: {len(enhanced_query)} characters"
)
# Execute enhanced query
result = await self.lightrag.aquery(enhanced_query, param=query_param)
result = await self.aquery(enhanced_query, mode=mode, **kwargs)
# Save to cache if available and enabled
if (
@@ -264,6 +293,61 @@ class QueryMixin:
self.logger.info("Multimodal query completed")
return result
async def aquery_vlm_enhanced(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing
Args:
query: User query
mode: Underlying LightRAG query mode
**kwargs: Other query parameters
Returns:
str: VLM query result
"""
# Ensure VLM is available
if not hasattr(self, "vision_model_func") or not self.vision_model_func:
raise ValueError(
"VLM enhanced query requires vision_model_func. "
"Please provide a vision model function when initializing RAGAnything."
)
# Ensure LightRAG is initialized
await self._ensure_lightrag_initialized()
self.logger.info(f"Executing VLM enhanced query: {query[:100]}...")
# Clear previous image cache
if hasattr(self, "_current_images_base64"):
delattr(self, "_current_images_base64")
# 1. Get original retrieval prompt (without generating final answer)
query_param = QueryParam(mode=mode, only_need_prompt=True, **kwargs)
raw_prompt = await self.lightrag.aquery(query, param=query_param)
self.logger.info("Retrieved raw prompt from LightRAG")
# 2. Extract and process image paths
enhanced_prompt, images_found = await self._process_image_paths_for_vlm(
raw_prompt
)
if not images_found:
self.logger.info("No valid images found, falling back to normal query")
# Fallback to normal query
return await self.aquery(query, mode=mode, **kwargs)
self.logger.info(f"Processed {images_found} images for VLM")
# 3. Build VLM message format
messages = self._build_vlm_messages_with_images(enhanced_prompt, query)
# 4. Call VLM for question answering
result = await self._call_vlm_with_multimodal_content(messages)
self.logger.info("VLM enhanced query completed")
return result
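# Usage sketch (illustrative; assumes an initialized RAGAnything instance `rag`
# constructed with a vision_model_func):
#
#     result = await rag.aquery_vlm_enhanced(
#         "What trend does the revenue chart show?", mode="mix"
#     )
#
# Step 1 uses LightRAG's only_need_prompt flag, which returns the assembled
# retrieval prompt instead of a generated answer, so images referenced in the
# context can be spliced in before any generation happens.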
async def _process_multimodal_query_content(
self, base_query: str, multimodal_content: List[Dict[str, Any]]
) -> str:
@@ -431,8 +515,175 @@ class QueryMixin:
return description
async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]:
"""
Process image paths in prompt, keeping original paths and adding VLM markers
Args:
prompt: Original prompt
Returns:
tuple: (processed prompt, image count)
"""
enhanced_prompt = prompt
images_processed = 0
# Initialize image cache
self._current_images_base64 = []
# Enhanced regex pattern for matching image paths
# Matches patterns like "Image Path: \path\to\image.jpg"
image_path_pattern = r"Image Path:\s*([^\r\n]+)"
def replace_image_path(match):
nonlocal images_processed
image_path = match.group(1).strip()
# Validate path format (basic check)
if not image_path or len(image_path) < 3:
self.logger.warning(f"Invalid image path format: {image_path}")
return match.group(0) # Keep original
# Use utility function to validate image file
if not validate_image_file(image_path):
return match.group(0) # Keep original if validation fails
try:
# Encode image to base64 using utility function
image_base64 = encode_image_to_base64(image_path)
if image_base64:
images_processed += 1
# Save base64 to instance variable for later use
self._current_images_base64.append(image_base64)
# Keep original path info and add VLM marker
return f"Image Path: {image_path}\n[VLM_IMAGE_{images_processed}]"
else:
self.logger.error(f"Failed to encode image: {image_path}")
return match.group(0) # Keep original if encoding failed
except Exception as e:
self.logger.error(f"Failed to process image {image_path}: {e}")
return match.group(0) # Keep original
# Execute replacement
enhanced_prompt = re.sub(
image_path_pattern, replace_image_path, enhanced_prompt
)
return enhanced_prompt, images_processed
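# Example (illustrative): given a retrieved chunk containing
#     "Image Path: /data/figs/chart1.png"
# and assuming the file exists and validates, the prompt becomes
#     "Image Path: /data/figs/chart1.png\n[VLM_IMAGE_1]"
# while self._current_images_base64[0] holds the encoded bytes; the marker is
# later matched by _build_vlm_messages_with_images to splice the image back in.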
def _build_vlm_messages_with_images(
self, enhanced_prompt: str, user_query: str
) -> List[Dict]:
"""
Build the VLM message format, using markers to align images with their positions in the text
Args:
enhanced_prompt: Enhanced prompt with image markers
user_query: User query
Returns:
List[Dict]: VLM message format
"""
images_base64 = getattr(self, "_current_images_base64", [])
if not images_base64:
# Pure text mode
return [
{
"role": "user",
"content": f"Context:\n{enhanced_prompt}\n\nUser Question: {user_query}",
}
]
# Build multimodal content
content_parts = []
# Split text at image markers and insert images
text_parts = enhanced_prompt.split("[VLM_IMAGE_")
for i, text_part in enumerate(text_parts):
if i == 0:
# First text part
if text_part.strip():
content_parts.append({"type": "text", "text": text_part})
else:
# Find marker number and insert corresponding image
marker_match = re.match(r"(\d+)\](.*)", text_part, re.DOTALL)
if marker_match:
image_num = (
int(marker_match.group(1)) - 1
) # Convert to 0-based index
remaining_text = marker_match.group(2)
# Insert corresponding image
if 0 <= image_num < len(images_base64):
content_parts.append(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{images_base64[image_num]}"
},
}
)
# Insert remaining text
if remaining_text.strip():
content_parts.append({"type": "text", "text": remaining_text})
# Add user question
content_parts.append(
{
"type": "text",
"text": f"\n\nUser Question: {user_query}\n\nPlease answer based on the context and images provided.",
}
)
return [
{
"role": "system",
"content": "You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.",
},
{"role": "user", "content": content_parts},
]
async def _call_vlm_with_multimodal_content(self, messages: List[Dict]) -> str:
"""
Call VLM to process multimodal content
Args:
messages: VLM message format
Returns:
str: VLM response result
"""
try:
user_message = messages[1]
content = user_message["content"]
system_prompt = messages[0]["content"]
if isinstance(content, str):
# Pure text mode
result = await self.vision_model_func(
content, system_prompt=system_prompt
)
else:
# Multimodal mode - pass complete messages directly to VLM
result = await self.vision_model_func(
"", # Empty prompt since we're using messages format
messages=messages,
)
return result
except Exception as e:
self.logger.error(f"VLM call failed: {e}")
raise
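# The two call styles above mirror the vision_model_func branches in the README
# examples: a plain-text prompt with system_prompt for text-only context, or a
# full OpenAI-style messages list (prompt passed as "") when images are present.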
# Synchronous versions of query methods
def query(self, query: str, mode: str = "hybrid", **kwargs) -> str:
def query(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
Synchronous version of pure text query
@@ -440,6 +691,9 @@ class QueryMixin:
query: Query text
mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
**kwargs: Other query parameters, will be passed to QueryParam
- vlm_enhanced: bool, default True when vision_model_func is available.
If True, will parse image paths in retrieved context and replace them
with base64 encoded images for VLM processing.
Returns:
str: Query result
@@ -451,7 +705,7 @@ class QueryMixin:
self,
query: str,
multimodal_content: List[Dict[str, Any]] = None,
mode: str = "hybrid",
mode: str = "mix",
**kwargs,
) -> str:
"""

View File

@@ -4,7 +4,9 @@ Utility functions for RAGAnything
Contains helper functions for content separation, text insertion, and other utilities
"""
import base64
from typing import Dict, List, Any, Tuple
from pathlib import Path
from lightrag.utils import logger
@@ -54,6 +56,73 @@ def separate_content(
return text_content, multimodal_items
def encode_image_to_base64(image_path: str) -> str:
"""
Encode image file to base64 string
Args:
image_path: Path to the image file
Returns:
str: Base64 encoded string, empty string if encoding fails
"""
try:
with open(image_path, "rb") as image_file:
encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
return encoded_string
except Exception as e:
logger.error(f"Failed to encode image {image_path}: {e}")
return ""
def validate_image_file(image_path: str, max_size_mb: int = 50) -> bool:
"""
Validate if a file is a valid image file
Args:
image_path: Path to the image file
max_size_mb: Maximum file size in MB
Returns:
bool: True if valid, False otherwise
"""
try:
path = Path(image_path)
# Check if file exists
if not path.exists():
logger.warning(f"Image file not found: {image_path}")
return False
# Check file extension
image_extensions = [
".jpg",
".jpeg",
".png",
".gif",
".bmp",
".webp",
".tiff",
".tif",
]
if not any(str(path).lower().endswith(ext) for ext in image_extensions):
logger.warning(f"File does not appear to be an image: {image_path}")
return False
# Check file size
file_size = path.stat().st_size
max_size = max_size_mb * 1024 * 1024
if file_size > max_size:
logger.warning(f"Image file too large ({file_size} bytes): {image_path}")
return False
return True
except Exception as e:
logger.error(f"Failed to validate image {image_path}: {e}")
return False
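# Usage sketch (illustrative paths):
#
#     if validate_image_file("figs/chart.png", max_size_mb=50):
#         b64 = encode_image_to_base64("figs/chart.png")
#         if b64:
#             url = f"data:image/jpeg;base64,{b64}"
#
# Note that validate_image_file checks only existence, extension, and size; it
# does not open the file, so a corrupt image can still pass validation and
# fail later at the VLM call.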
async def insert_text_content(
lightrag,
input: str | list[str],