vlm_enhanced_query

This commit is contained in:
zrguo
2025-08-12 15:59:50 +08:00
parent cf2aa70cfd
commit dfd9ec855e
6 changed files with 514 additions and 28 deletions

View File

@@ -48,6 +48,7 @@
---
## 🎉 News
- [X] [2025.08.12]🎯📢 🔍 RAG-Anything now supports **VLM Enhanced Query** mode! When documents contain images, the system can automatically pass them directly to the VLM for comprehensive multimodal analysis alongside the text context.
- [X] [2025.07.05]🎯📢 RAG-Anything now features a [context configuration module](docs/context_aware_processing.md), enabling intelligent integration of relevant contextual information to enhance multimodal content processing.
- [X] [2025.07.04]🎯📢 🚀 RAG-Anything now supports multimodal query capabilities, enabling enhanced RAG with seamless processing of text, images, tables, and equations.
- [X] [2025.07.03]🎯📢 🎉 RAG-Anything has reached 1k🌟 stars on GitHub! Thank you for your incredible support and valuable contributions to the project.
@@ -319,9 +320,22 @@ async def main():
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
if image_data:
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -350,6 +364,7 @@ async def main():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
@@ -549,7 +564,7 @@ class CustomModalProcessor(GenericModalProcessor):
#### 5. Query Options
RAG-Anything provides two types of query methods:
RAG-Anything provides three types of query methods:
**Pure Text Queries** - Direct knowledge base search using LightRAG:
```python
@@ -563,7 +578,36 @@ text_result_naive = await rag.aquery("Your question", mode="naive")
sync_text_result = rag.query("Your question", mode="hybrid")
```
**Multimodal Queries** - Enhanced queries with multimodal content analysis:
**VLM Enhanced Queries** - Automatically analyze images in the retrieved context using the VLM:
```python
# VLM enhanced query (automatically enabled when vision_model_func is provided)
vlm_result = await rag.aquery(
"Analyze the charts and figures in the document",
mode="hybrid"
# vlm_enhanced=True is automatically set when vision_model_func is available
)
# Manually control VLM enhancement
vlm_enabled = await rag.aquery(
"What do the images show in this document?",
mode="hybrid",
vlm_enhanced=True # Force enable VLM enhancement
)
vlm_disabled = await rag.aquery(
"What do the images show in this document?",
mode="hybrid",
vlm_enhanced=False # Force disable VLM enhancement
)
# When documents contain images, VLM can see and analyze them directly
# The system will automatically:
# 1. Retrieve relevant context containing image paths
# 2. Load and encode images as base64
# 3. Send both text context and images to VLM for comprehensive analysis
```
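For reference, the payload the system hands to the VLM follows the OpenAI-style multimodal chat format assembled by `_build_vlm_messages_with_images`; a minimal sketch of its shape (placeholder values only):

```python
# Sketch of the messages structure assembled for the VLM (placeholder values)
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that can analyze both text and image content.",
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Context:\n...retrieved text surrounding the image..."},
            {
                "type": "image_url",
                # Images are inlined as base64 data URLs rather than fetched remotely
                "image_url": {"url": "data:image/jpeg;base64,<BASE64_DATA>"},
            },
            {"type": "text", "text": "User Question: What does the chart show?"},
        ],
    },
]
```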
**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
```python
# Query with table data
table_result = await rag.aquery_with_multimodal(
@@ -645,9 +689,22 @@ async def load_existing_lightrag():
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
if image_data:
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -676,6 +733,7 @@ async def load_existing_lightrag():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
@@ -738,8 +796,21 @@ async def insert_content_list_example():
**kwargs,
)
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs):
if image_data:
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -759,6 +830,7 @@ async def insert_content_list_example():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
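For reference, the three branches shown in the diffs above combine into one dispatcher. A minimal sketch, assuming `openai_complete_if_cache` is imported from `lightrag.llm.openai` and that `api_key`, `base_url`, and a text-only `llm_model_func` are already defined in scope:

```python
from lightrag.llm.openai import openai_complete_if_cache  # assumed import path

def vision_model_func(
    prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
    # 1) Pre-built multimodal messages (VLM enhanced query) take priority
    if messages:
        return openai_complete_if_cache(
            "gpt-4o", "", system_prompt=None, history_messages=[],
            messages=messages, api_key=api_key, base_url=base_url, **kwargs,
        )
    # 2) Traditional single-image format: wrap the prompt and one base64 image
    elif image_data:
        msgs = []
        if system_prompt:
            msgs.append({"role": "system", "content": system_prompt})
        msgs.append({
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
            ],
        })
        return openai_complete_if_cache(
            "gpt-4o", "", system_prompt=None, history_messages=[],
            messages=msgs, api_key=api_key, base_url=base_url, **kwargs,
        )
    # 3) Pure text: defer to the regular text LLM
    else:
        return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
```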

View File

@@ -48,6 +48,7 @@
---
## 🎉 News
- [X] [2025.08.12]🎯📢 🔍 RAG-Anything now supports **VLM Enhanced Query** mode! When documents contain images, the system can automatically pass them, together with the text context, directly to the VLM for comprehensive multimodal analysis.
- [X] [2025.07.05]🎯📢 RAG-Anything adds a [context configuration module](docs/context_aware_processing.md), enabling relevant contextual information to be incorporated into multimodal content processing.
- [X] [2025.07.04]🎯📢 RAG-Anything now supports multimodal content queries, delivering enhanced retrieval-augmented generation with integrated processing of text, images, tables, and equations.
- [X] [2025.07.03]🎯📢 RAG-Anything has reached 1K stars 🌟 on GitHub! Thank you for your support and contributions.
@@ -315,9 +316,22 @@ async def main():
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
if image_data:
# If the messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single-image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -346,6 +360,7 @@ async def main():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
@@ -547,7 +562,7 @@ class CustomModalProcessor(GenericModalProcessor):
#### 5. Query Options
RAG-Anything provides two types of query methods:
RAG-Anything provides three types of query methods:
**Pure Text Queries** - Direct knowledge base search using LightRAG:
```python
@@ -561,7 +576,36 @@ text_result_naive = await rag.aquery("Your question", mode="naive")
sync_text_result = rag.query("Your question", mode="hybrid")
```
**Multimodal Queries** - Enhanced queries with multimodal content analysis:
**VLM Enhanced Queries** - Automatically analyze images in the retrieved context using the VLM:
```python
# VLM enhanced query (automatically enabled when vision_model_func is provided)
vlm_result = await rag.aquery(
"Analyze the charts and figures in the document",
mode="hybrid"
# vlm_enhanced=True is automatically set when vision_model_func is available
)
# Manually control VLM enhancement
vlm_enabled = await rag.aquery(
"What do the images in this document show?",
mode="hybrid",
vlm_enhanced=True  # Force enable VLM enhancement
)
vlm_disabled = await rag.aquery(
"What do the images in this document show?",
mode="hybrid",
vlm_enhanced=False  # Force disable VLM enhancement
)
# When documents contain images, the VLM can view and analyze them directly
# The system will automatically:
# 1. Retrieve relevant context containing image paths
# 2. Load images and encode them as base64
# 3. Send both the text context and the images to the VLM for comprehensive analysis
```
**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
```python
# Query with table data
table_result = await rag.aquery_with_multimodal(
@@ -642,9 +686,22 @@ async def load_existing_lightrag():
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
):
if image_data:
# If the messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single-image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -673,6 +730,7 @@ async def load_existing_lightrag():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
@@ -735,8 +793,21 @@ async def insert_content_list_example():
**kwargs,
)
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs):
if image_data:
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
# If the messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single-image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -756,6 +827,7 @@ async def insert_content_list_example():
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

View File

@@ -129,9 +129,27 @@ async def process_with_rag(
# Define vision model function for image processing
def vision_model_func(
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
prompt,
system_prompt=None,
history_messages=[],
image_data=None,
messages=None,
**kwargs,
):
if image_data:
# If messages format is provided (for multimodal VLM enhanced query), use it directly
if messages:
return openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
# Traditional single image format
elif image_data:
return openai_complete_if_cache(
"gpt-4o",
"",
@@ -160,6 +178,7 @@ async def process_with_rag(
base_url=base_url,
**kwargs,
)
# Pure text format
else:
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

View File

@@ -1,7 +1,7 @@
from .raganything import RAGAnything as RAGAnything
from .config import RAGAnythingConfig as RAGAnythingConfig
__version__ = "1.2.6"
__version__ = "1.2.7"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/RAG-Anything"

View File

@@ -6,12 +6,17 @@ Contains all query-related methods for both text and multimodal queries
import json
import hashlib
import re
from typing import Dict, List, Any
from pathlib import Path
from lightrag import QueryParam
from lightrag.utils import always_get_an_event_loop
from raganything.prompt import PROMPTS
from raganything.utils import get_processor_for_type
from raganything.utils import (
get_processor_for_type,
encode_image_to_base64,
validate_image_file,
)
class QueryMixin:
@@ -92,7 +97,7 @@ class QueryMixin:
return f"multimodal_query:{cache_hash}"
async def aquery(self, query: str, mode: str = "hybrid", **kwargs) -> str:
async def aquery(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
Pure text query - directly calls LightRAG's query functionality
@@ -100,6 +105,9 @@ class QueryMixin:
query: Query text
mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
**kwargs: Other query parameters, will be passed to QueryParam
- vlm_enhanced: bool, default True when vision_model_func is available.
If True, will parse image paths in retrieved context and replace them
with base64 encoded images for VLM processing.
Returns:
str: Query result
@@ -109,6 +117,30 @@ class QueryMixin:
"No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance."
)
# Check if VLM enhanced query should be used
vlm_enhanced = kwargs.pop("vlm_enhanced", None)
# Auto-determine VLM enhanced based on availability
if vlm_enhanced is None:
vlm_enhanced = (
hasattr(self, "vision_model_func")
and self.vision_model_func is not None
)
# Use VLM enhanced query if enabled and available
if (
vlm_enhanced
and hasattr(self, "vision_model_func")
and self.vision_model_func
):
return await self.aquery_vlm_enhanced(query, mode=mode, **kwargs)
elif vlm_enhanced and (
not hasattr(self, "vision_model_func") or not self.vision_model_func
):
self.logger.warning(
"VLM enhanced query requested but vision_model_func is not available, falling back to normal query"
)
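# Resolution of vlm_enhanced (summary of the branches above):
#   None  -> enabled iff self.vision_model_func is set
#   True  -> VLM path when vision_model_func exists; otherwise warn and fall back
#   False -> always the plain LightRAG query below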
# Create query parameters
query_param = QueryParam(mode=mode, **kwargs)
@@ -125,7 +157,7 @@ class QueryMixin:
self,
query: str,
multimodal_content: List[Dict[str, Any]] = None,
mode: str = "hybrid",
mode: str = "mix",
**kwargs,
) -> str:
"""
@@ -210,15 +242,12 @@ class QueryMixin:
query, multimodal_content
)
# Create query parameters
query_param = QueryParam(mode=mode, **kwargs)
self.logger.info(
f"Generated enhanced query length: {len(enhanced_query)} characters"
)
# Execute enhanced query
result = await self.lightrag.aquery(enhanced_query, param=query_param)
result = await self.aquery(enhanced_query, mode=mode, **kwargs)
# Save to cache if available and enabled
if (
@@ -264,6 +293,61 @@ class QueryMixin:
self.logger.info("Multimodal query completed")
return result
async def aquery_vlm_enhanced(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing
Args:
query: User query
mode: Underlying LightRAG query mode
**kwargs: Other query parameters
Returns:
str: VLM query result
"""
# Ensure VLM is available
if not hasattr(self, "vision_model_func") or not self.vision_model_func:
raise ValueError(
"VLM enhanced query requires vision_model_func. "
"Please provide a vision model function when initializing RAGAnything."
)
# Ensure LightRAG is initialized
await self._ensure_lightrag_initialized()
self.logger.info(f"Executing VLM enhanced query: {query[:100]}...")
# Clear previous image cache
if hasattr(self, "_current_images_base64"):
delattr(self, "_current_images_base64")
# 1. Get original retrieval prompt (without generating final answer)
query_param = QueryParam(mode=mode, only_need_prompt=True, **kwargs)
raw_prompt = await self.lightrag.aquery(query, param=query_param)
self.logger.info("Retrieved raw prompt from LightRAG")
# 2. Extract and process image paths
enhanced_prompt, images_found = await self._process_image_paths_for_vlm(
raw_prompt
)
if not images_found:
self.logger.info("No valid images found, falling back to normal query")
# Fallback to normal query
return await self.aquery(query, mode=mode, **kwargs)
self.logger.info(f"Processed {images_found} images for VLM")
# 3. Build VLM message format
messages = self._build_vlm_messages_with_images(enhanced_prompt, query)
# 4. Call VLM for question answering
result = await self._call_vlm_with_multimodal_content(messages)
self.logger.info("VLM enhanced query completed")
return result
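# Usage sketch (illustrative; assumes an initialized RAGAnything instance `rag`
# constructed with a vision_model_func):
#
#     result = await rag.aquery_vlm_enhanced(
#         "What trend does the revenue chart show?", mode="mix"
#     )
#
# Step 1 uses LightRAG's only_need_prompt flag, which returns the assembled
# retrieval prompt instead of a generated answer, so images referenced in the
# context can be spliced in before any generation happens.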
async def _process_multimodal_query_content(
self, base_query: str, multimodal_content: List[Dict[str, Any]]
) -> str:
@@ -431,8 +515,175 @@ class QueryMixin:
return description
async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]:
"""
Process image paths in prompt, keeping original paths and adding VLM markers
Args:
prompt: Original prompt
Returns:
tuple: (processed prompt, image count)
"""
enhanced_prompt = prompt
images_processed = 0
# Initialize image cache
self._current_images_base64 = []
# Enhanced regex pattern for matching image paths
# Matches patterns like "Image Path: \path\to\image.jpg"
image_path_pattern = r"Image Path:\s*([^\r\n]+)"
def replace_image_path(match):
nonlocal images_processed
image_path = match.group(1).strip()
# Validate path format (basic check)
if not image_path or len(image_path) < 3:
self.logger.warning(f"Invalid image path format: {image_path}")
return match.group(0) # Keep original
# Use utility function to validate image file
if not validate_image_file(image_path):
return match.group(0) # Keep original if validation fails
try:
# Encode image to base64 using utility function
image_base64 = encode_image_to_base64(image_path)
if image_base64:
images_processed += 1
# Save base64 to instance variable for later use
self._current_images_base64.append(image_base64)
# Keep original path info and add VLM marker
return f"Image Path: {image_path}\n[VLM_IMAGE_{images_processed}]"
else:
self.logger.error(f"Failed to encode image: {image_path}")
return match.group(0) # Keep original if encoding failed
except Exception as e:
self.logger.error(f"Failed to process image {image_path}: {e}")
return match.group(0) # Keep original
# Execute replacement
enhanced_prompt = re.sub(
image_path_pattern, replace_image_path, enhanced_prompt
)
return enhanced_prompt, images_processed
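# Example (illustrative): given a retrieved chunk containing
#     "Image Path: /data/figs/chart1.png"
# and assuming the file exists and validates, the prompt becomes
#     "Image Path: /data/figs/chart1.png\n[VLM_IMAGE_1]"
# while self._current_images_base64[0] holds the encoded bytes; the marker is
# later matched by _build_vlm_messages_with_images to splice the image back in.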
def _build_vlm_messages_with_images(
self, enhanced_prompt: str, user_query: str
) -> List[Dict]:
"""
Build the VLM message format, using markers to align images with their positions in the text
Args:
enhanced_prompt: Enhanced prompt with image markers
user_query: User query
Returns:
List[Dict]: VLM message format
"""
images_base64 = getattr(self, "_current_images_base64", [])
if not images_base64:
# Pure text mode
return [
{
"role": "user",
"content": f"Context:\n{enhanced_prompt}\n\nUser Question: {user_query}",
}
]
# Build multimodal content
content_parts = []
# Split text at image markers and insert images
text_parts = enhanced_prompt.split("[VLM_IMAGE_")
for i, text_part in enumerate(text_parts):
if i == 0:
# First text part
if text_part.strip():
content_parts.append({"type": "text", "text": text_part})
else:
# Find marker number and insert corresponding image
marker_match = re.match(r"(\d+)\](.*)", text_part, re.DOTALL)
if marker_match:
image_num = (
int(marker_match.group(1)) - 1
) # Convert to 0-based index
remaining_text = marker_match.group(2)
# Insert corresponding image
if 0 <= image_num < len(images_base64):
content_parts.append(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{images_base64[image_num]}"
},
}
)
# Insert remaining text
if remaining_text.strip():
content_parts.append({"type": "text", "text": remaining_text})
# Add user question
content_parts.append(
{
"type": "text",
"text": f"\n\nUser Question: {user_query}\n\nPlease answer based on the context and images provided.",
}
)
return [
{
"role": "system",
"content": "You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.",
},
{"role": "user", "content": content_parts},
]
async def _call_vlm_with_multimodal_content(self, messages: List[Dict]) -> str:
"""
Call VLM to process multimodal content
Args:
messages: VLM message format
Returns:
str: VLM response result
"""
try:
user_message = messages[1]
content = user_message["content"]
system_prompt = messages[0]["content"]
if isinstance(content, str):
# Pure text mode
result = await self.vision_model_func(
content, system_prompt=system_prompt
)
else:
# Multimodal mode - pass complete messages directly to VLM
result = await self.vision_model_func(
"", # Empty prompt since we're using messages format
messages=messages,
)
return result
except Exception as e:
self.logger.error(f"VLM call failed: {e}")
raise
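# The two call styles above mirror the vision_model_func branches in the README
# examples: a plain-text prompt with system_prompt for text-only context, or a
# full OpenAI-style messages list (prompt passed as "") when images are present.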
# Synchronous versions of query methods
def query(self, query: str, mode: str = "hybrid", **kwargs) -> str:
def query(self, query: str, mode: str = "mix", **kwargs) -> str:
"""
Synchronous version of pure text query
@@ -440,6 +691,9 @@ class QueryMixin:
query: Query text
mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
**kwargs: Other query parameters, will be passed to QueryParam
- vlm_enhanced: bool, default True when vision_model_func is available.
If True, will parse image paths in retrieved context and replace them
with base64 encoded images for VLM processing.
Returns:
str: Query result
@@ -451,7 +705,7 @@ class QueryMixin:
self,
query: str,
multimodal_content: List[Dict[str, Any]] = None,
mode: str = "hybrid",
mode: str = "mix",
**kwargs,
) -> str:
"""

View File

@@ -4,7 +4,9 @@ Utility functions for RAGAnything
Contains helper functions for content separation, text insertion, and other utilities
"""
import base64
from typing import Dict, List, Any, Tuple
from pathlib import Path
from lightrag.utils import logger
@@ -54,6 +56,73 @@ def separate_content(
return text_content, multimodal_items
def encode_image_to_base64(image_path: str) -> str:
"""
Encode image file to base64 string
Args:
image_path: Path to the image file
Returns:
str: Base64 encoded string, empty string if encoding fails
"""
try:
with open(image_path, "rb") as image_file:
encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
return encoded_string
except Exception as e:
logger.error(f"Failed to encode image {image_path}: {e}")
return ""
def validate_image_file(image_path: str, max_size_mb: int = 50) -> bool:
"""
Validate if a file is a valid image file
Args:
image_path: Path to the image file
max_size_mb: Maximum file size in MB
Returns:
bool: True if valid, False otherwise
"""
try:
path = Path(image_path)
# Check if file exists
if not path.exists():
logger.warning(f"Image file not found: {image_path}")
return False
# Check file extension
image_extensions = [
".jpg",
".jpeg",
".png",
".gif",
".bmp",
".webp",
".tiff",
".tif",
]
if not any(str(path).lower().endswith(ext) for ext in image_extensions):
logger.warning(f"File does not appear to be an image: {image_path}")
return False
# Check file size
file_size = path.stat().st_size
max_size = max_size_mb * 1024 * 1024
if file_size > max_size:
logger.warning(f"Image file too large ({file_size} bytes): {image_path}")
return False
return True
except Exception as e:
logger.error(f"Failed to validate image {image_path}: {e}")
return False
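# Usage sketch (illustrative paths):
#
#     if validate_image_file("figs/chart.png", max_size_mb=50):
#         b64 = encode_image_to_base64("figs/chart.png")
#         if b64:
#             url = f"data:image/jpeg;base64,{b64}"
#
# Note that validate_image_file checks only existence, extension, and size; it
# does not open the file, so a corrupt image can still pass validation and
# fail later at the VLM call.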
async def insert_text_content(
lightrag,
input: str | list[str],