Mirror of https://github.com/HKUDS/RAG-Anything.git

Commit: vlm_enhanced_query

README.md
@@ -48,6 +48,7 @@
 ---
 
 ## 🎉 News
+- [X] [2025.08.12]🎯📢 🔍 RAG-Anything now supports **VLM Enhanced Query** mode! When documents contain images, the system can automatically pass images directly to VLM for comprehensive multimodal analysis alongside text context.
 - [X] [2025.07.05]🎯📢 RAG-Anything now features a [context configuration module](docs/context_aware_processing.md), enabling intelligent integration of relevant contextual information to enhance multimodal content processing.
 - [X] [2025.07.04]🎯📢 🚀 RAG-Anything now supports multimodal query capabilities, enabling enhanced RAG with seamless processing of text, images, tables, and equations.
 - [X] [2025.07.03]🎯📢 🎉 RAG-Anything has reached 1k🌟 stars on GitHub! Thank you for your incredible support and valuable contributions to the project.
@@ -319,9 +320,22 @@ async def main():
 
     # Define vision model function for image processing
     def vision_model_func(
-        prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
+        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
     ):
-        if image_data:
+        # If messages format is provided (for multimodal VLM enhanced query), use it directly
+        if messages:
+            return openai_complete_if_cache(
+                "gpt-4o",
+                "",
+                system_prompt=None,
+                history_messages=[],
+                messages=messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Traditional single image format
+        elif image_data:
             return openai_complete_if_cache(
                 "gpt-4o",
                 "",
@@ -350,6 +364,7 @@ async def main():
                 base_url=base_url,
                 **kwargs,
             )
+        # Pure text format
         else:
             return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
 
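Note on the pattern above: after this change `vision_model_func` has three entry points - a pre-built `messages` list (used by VLM enhanced queries), a single `image_data` payload (used during document parsing), and plain text. A minimal offline sketch of the dispatch, with a hypothetical `fake_complete` standing in for `openai_complete_if_cache` so it runs without an API key:

```python
import asyncio
import base64

# Hypothetical stand-in for openai_complete_if_cache; it only reports
# which branch of the dispatch was taken.
async def fake_complete(model, prompt, messages=None, **kwargs):
    branch = "messages" if messages else "prompt"
    return f"[{model}] called via {branch}"

async def vision_model_func(prompt, system_prompt=None, history_messages=[],
                            image_data=None, messages=None, **kwargs):
    if messages:          # VLM enhanced query path: forward the full chat payload
        return await fake_complete("gpt-4o", "", messages=messages)
    elif image_data:      # traditional single-image path
        return await fake_complete("gpt-4o", prompt)
    else:                 # pure text fallback
        return await fake_complete("gpt-4o", prompt)

async def demo():
    chat = [{"role": "user", "content": [{"type": "text", "text": "Context ..."}]}]
    print(await vision_model_func("", messages=chat))
    stub_image = base64.b64encode(b"not a real JPEG").decode()
    print(await vision_model_func("Describe this image", image_data=stub_image))

asyncio.run(demo())
```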
@@ -549,7 +564,7 @@ class CustomModalProcessor(GenericModalProcessor):
 
 #### 5. Query Options
 
-RAG-Anything provides two types of query methods:
+RAG-Anything provides three types of query methods:
 
 **Pure Text Queries** - Direct knowledge base search using LightRAG:
 ```python
@@ -563,7 +578,36 @@ text_result_naive = await rag.aquery("Your question", mode="naive")
 sync_text_result = rag.query("Your question", mode="hybrid")
 ```
 
-**Multimodal Queries** - Enhanced queries with multimodal content analysis:
+**VLM Enhanced Queries** - Automatically analyze images in retrieved context using VLM:
+```python
+# VLM enhanced query (automatically enabled when vision_model_func is provided)
+vlm_result = await rag.aquery(
+    "Analyze the charts and figures in the document",
+    mode="hybrid"
+    # vlm_enhanced=True is automatically set when vision_model_func is available
+)
+
+# Manually control VLM enhancement
+vlm_enabled = await rag.aquery(
+    "What do the images show in this document?",
+    mode="hybrid",
+    vlm_enhanced=True  # Force enable VLM enhancement
+)
+
+vlm_disabled = await rag.aquery(
+    "What do the images show in this document?",
+    mode="hybrid",
+    vlm_enhanced=False  # Force disable VLM enhancement
+)
+
+# When documents contain images, VLM can see and analyze them directly
+# The system will automatically:
+# 1. Retrieve relevant context containing image paths
+# 2. Load and encode images as base64
+# 3. Send both text context and images to VLM for comprehensive analysis
+```
+
+**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
 ```python
 # Query with table data
 table_result = await rag.aquery_with_multimodal(
@@ -645,9 +689,22 @@ async def load_existing_lightrag():
 
     # Define vision model function for image processing
    def vision_model_func(
-        prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
+        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
     ):
-        if image_data:
+        # If messages format is provided (for multimodal VLM enhanced query), use it directly
+        if messages:
+            return openai_complete_if_cache(
+                "gpt-4o",
+                "",
+                system_prompt=None,
+                history_messages=[],
+                messages=messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Traditional single image format
+        elif image_data:
             return openai_complete_if_cache(
                 "gpt-4o",
                 "",
@@ -676,6 +733,7 @@ async def load_existing_lightrag():
                 base_url=base_url,
                 **kwargs,
             )
+        # Pure text format
         else:
             return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
 
@@ -738,8 +796,21 @@ async def insert_content_list_example():
             **kwargs,
         )
 
-    def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs):
-        if image_data:
+    def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
+        # If messages format is provided (for multimodal VLM enhanced query), use it directly
+        if messages:
+            return openai_complete_if_cache(
+                "gpt-4o",
+                "",
+                system_prompt=None,
+                history_messages=[],
+                messages=messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Traditional single image format
+        elif image_data:
             return openai_complete_if_cache(
                 "gpt-4o",
                 "",
@@ -759,6 +830,7 @@ async def insert_content_list_example():
                 base_url=base_url,
                 **kwargs,
             )
+        # Pure text format
         else:
             return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
 
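All three README examples wire the same dual-interface function into RAG-Anything; VLM enhanced queries activate whenever a `vision_model_func` is supplied at initialization. A sketch of the wiring, assuming the constructor keywords used elsewhere in the README (the stub bodies are placeholders):

```python
from raganything import RAGAnything, RAGAnythingConfig

async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
    ...  # your text-only LLM call

async def vision_model_func(prompt, system_prompt=None, history_messages=[],
                            image_data=None, messages=None, **kwargs):
    ...  # the dual-interface function shown in the diffs above

rag = RAGAnything(
    config=RAGAnythingConfig(working_dir="./rag_storage"),
    llm_model_func=llm_model_func,
    vision_model_func=vision_model_func,  # its presence enables VLM enhanced queries
)
```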
README_zh.md
@@ -48,6 +48,7 @@
 ---
 
 ## 🎉 News
+- [X] [2025.08.12]🎯📢 🔍 RAGAnything now supports **VLM Enhanced Query** mode! When documents contain images, the system can automatically pass the images, together with the text context, directly to the VLM for comprehensive multimodal analysis.
 - [X] [2025.07.05]🎯📢 RAGAnything adds a [context configuration module](docs/context_aware_processing.md) that supplies relevant contextual information for multimodal content processing.
 - [X] [2025.07.04]🎯📢 RAGAnything now supports multimodal content queries, enabling enhanced retrieval-augmented generation that integrates text, image, table, and equation processing.
 - [X] [2025.07.03]🎯📢 RAGAnything has reached 1K stars 🌟 on GitHub! Thank you for your support and contributions.
@@ -315,9 +316,22 @@ async def main():
 
     # Define vision model function for image processing
     def vision_model_func(
-        prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
+        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
     ):
-        if image_data:
+        # If the messages format is provided (for multimodal VLM enhanced query), use it directly
+        if messages:
+            return openai_complete_if_cache(
+                "gpt-4o",
+                "",
+                system_prompt=None,
+                history_messages=[],
+                messages=messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Traditional single-image format
+        elif image_data:
             return openai_complete_if_cache(
                 "gpt-4o",
                 "",
@@ -346,6 +360,7 @@ async def main():
                 base_url=base_url,
                 **kwargs,
             )
+        # Pure text format
         else:
             return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
 
@@ -547,7 +562,7 @@ class CustomModalProcessor(GenericModalProcessor):
 
 #### 5. Query Options
 
-RAG-Anything provides two types of query methods:
+RAG-Anything provides three types of query methods:
 
 **Pure Text Queries** - Direct knowledge base search using LightRAG:
 ```python
@@ -561,7 +576,36 @@ text_result_naive = await rag.aquery("Your question", mode="naive")
 sync_text_result = rag.query("Your question", mode="hybrid")
 ```
 
-**Multimodal Queries** - Enhanced queries with multimodal content analysis:
+**VLM Enhanced Queries** - Automatically analyze images in the retrieved context using the VLM:
+```python
+# VLM enhanced query (automatically enabled when vision_model_func is provided)
+vlm_result = await rag.aquery(
+    "Analyze the charts and data in the document",
+    mode="hybrid"
+    # vlm_enhanced=True is set automatically when vision_model_func is available
+)
+
+# Manually control VLM enhancement
+vlm_enabled = await rag.aquery(
+    "What do the images in this document show?",
+    mode="hybrid",
+    vlm_enhanced=True  # Force enable VLM enhancement
+)
+
+vlm_disabled = await rag.aquery(
+    "What do the images in this document show?",
+    mode="hybrid",
+    vlm_enhanced=False  # Force disable VLM enhancement
+)
+
+# When the document contains images, the VLM can view and analyze them directly
+# The system will automatically:
+# 1. Retrieve relevant context containing image paths
+# 2. Load images and encode them as base64
+# 3. Send the text context and images together to the VLM for comprehensive analysis
+```
+
+**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
 ```python
 # Query with table data
 table_result = await rag.aquery_with_multimodal(
@@ -642,9 +686,22 @@ async def load_existing_lightrag():
 
     # Define vision model function for image processing
     def vision_model_func(
-        prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
+        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
     ):
-        if image_data:
+        # If the messages format is provided (for multimodal VLM enhanced query), use it directly
+        if messages:
+            return openai_complete_if_cache(
+                "gpt-4o",
+                "",
+                system_prompt=None,
+                history_messages=[],
+                messages=messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Traditional single-image format
+        elif image_data:
             return openai_complete_if_cache(
                 "gpt-4o",
                 "",
@@ -673,6 +730,7 @@ async def load_existing_lightrag():
                 base_url=base_url,
                 **kwargs,
             )
+        # Pure text format
         else:
             return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
 
@@ -735,8 +793,21 @@ async def insert_content_list_example():
             **kwargs,
         )
 
-    def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs):
-        if image_data:
+    def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
+        # If the messages format is provided (for multimodal VLM enhanced query), use it directly
+        if messages:
+            return openai_complete_if_cache(
+                "gpt-4o",
+                "",
+                system_prompt=None,
+                history_messages=[],
+                messages=messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Traditional single-image format
+        elif image_data:
             return openai_complete_if_cache(
                 "gpt-4o",
                 "",
@@ -756,6 +827,7 @@ async def insert_content_list_example():
                 base_url=base_url,
                 **kwargs,
             )
+        # Pure text format
         else:
             return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
 
@@ -129,9 +129,27 @@ async def process_with_rag(
 
     # Define vision model function for image processing
    def vision_model_func(
-        prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
+        prompt,
+        system_prompt=None,
+        history_messages=[],
+        image_data=None,
+        messages=None,
+        **kwargs,
     ):
-        if image_data:
+        # If messages format is provided (for multimodal VLM enhanced query), use it directly
+        if messages:
+            return openai_complete_if_cache(
+                "gpt-4o",
+                "",
+                system_prompt=None,
+                history_messages=[],
+                messages=messages,
+                api_key=api_key,
+                base_url=base_url,
+                **kwargs,
+            )
+        # Traditional single image format
+        elif image_data:
             return openai_complete_if_cache(
                 "gpt-4o",
                 "",
@@ -160,6 +178,7 @@ async def process_with_rag(
                 base_url=base_url,
                 **kwargs,
             )
+        # Pure text format
         else:
             return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
 
@@ -1,7 +1,7 @@
 from .raganything import RAGAnything as RAGAnything
 from .config import RAGAnythingConfig as RAGAnythingConfig
 
-__version__ = "1.2.6"
+__version__ = "1.2.7"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/RAG-Anything"
@@ -6,12 +6,17 @@ Contains all query-related methods for both text and multimodal queries
 
 import json
 import hashlib
+import re
 from typing import Dict, List, Any
 from pathlib import Path
 from lightrag import QueryParam
 from lightrag.utils import always_get_an_event_loop
 from raganything.prompt import PROMPTS
-from raganything.utils import get_processor_for_type
+from raganything.utils import (
+    get_processor_for_type,
+    encode_image_to_base64,
+    validate_image_file,
+)
 
 
 class QueryMixin:
@@ -92,7 +97,7 @@ class QueryMixin:
 
         return f"multimodal_query:{cache_hash}"
 
-    async def aquery(self, query: str, mode: str = "hybrid", **kwargs) -> str:
+    async def aquery(self, query: str, mode: str = "mix", **kwargs) -> str:
         """
         Pure text query - directly calls LightRAG's query functionality
 
@@ -100,6 +105,9 @@ class QueryMixin:
             query: Query text
             mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
             **kwargs: Other query parameters, will be passed to QueryParam
+                - vlm_enhanced: bool, default True when vision_model_func is available.
+                  If True, will parse image paths in retrieved context and replace them
+                  with base64 encoded images for VLM processing.
 
         Returns:
             str: Query result
@@ -109,6 +117,30 @@ class QueryMixin:
                 "No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance."
             )
 
+        # Check if VLM enhanced query should be used
+        vlm_enhanced = kwargs.pop("vlm_enhanced", None)
+
+        # Auto-determine VLM enhanced based on availability
+        if vlm_enhanced is None:
+            vlm_enhanced = (
+                hasattr(self, "vision_model_func")
+                and self.vision_model_func is not None
+            )
+
+        # Use VLM enhanced query if enabled and available
+        if (
+            vlm_enhanced
+            and hasattr(self, "vision_model_func")
+            and self.vision_model_func
+        ):
+            return await self.aquery_vlm_enhanced(query, mode=mode, **kwargs)
+        elif vlm_enhanced and (
+            not hasattr(self, "vision_model_func") or not self.vision_model_func
+        ):
+            self.logger.warning(
+                "VLM enhanced query requested but vision_model_func is not available, falling back to normal query"
+            )
+
         # Create query parameters
         query_param = QueryParam(mode=mode, **kwargs)
 
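The dispatch above yields three observable behaviors at the call site:

```python
# Auto mode: vlm_enhanced defaults to None and is enabled
# if and only if vision_model_func is configured
answer = await rag.aquery("Summarize the findings", mode="mix")

# Explicit opt-out: the flag is popped from kwargs before QueryParam is
# built, so LightRAG itself never sees it
answer = await rag.aquery("Summarize the findings", mode="mix", vlm_enhanced=False)

# Explicit opt-in without a vision_model_func: logs a warning and falls
# through to the normal text-only query
answer = await rag.aquery("Summarize the findings", mode="mix", vlm_enhanced=True)
```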
@@ -125,7 +157,7 @@ class QueryMixin:
         self,
         query: str,
         multimodal_content: List[Dict[str, Any]] = None,
-        mode: str = "hybrid",
+        mode: str = "mix",
         **kwargs,
     ) -> str:
         """
@@ -210,15 +242,12 @@ class QueryMixin:
             query, multimodal_content
         )
 
-        # Create query parameters
-        query_param = QueryParam(mode=mode, **kwargs)
-
         self.logger.info(
             f"Generated enhanced query length: {len(enhanced_query)} characters"
         )
 
         # Execute enhanced query
-        result = await self.lightrag.aquery(enhanced_query, param=query_param)
+        result = await self.aquery(enhanced_query, mode=mode, **kwargs)
 
         # Save to cache if available and enabled
         if (
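Design note: by routing through `self.aquery` rather than `self.lightrag.aquery`, multimodal queries now inherit the VLM dispatch shown earlier. A call like the following (field names follow the README's table example; the sample content is hypothetical) can therefore also end up on the VLM enhanced path when a vision model is configured:

```python
result = await rag.aquery_with_multimodal(
    "Compare these numbers with the figures in the document",
    multimodal_content=[{
        "type": "table",
        "table_data": "Quarter,Revenue\nQ1,120\nQ2,135",  # hypothetical sample
        "table_caption": "Quarterly revenue",
    }],
    mode="mix",  # forwarded to aquery together with the remaining kwargs
)
```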
@@ -264,6 +293,61 @@ class QueryMixin:
         self.logger.info("Multimodal query completed")
         return result
 
+    async def aquery_vlm_enhanced(self, query: str, mode: str = "mix", **kwargs) -> str:
+        """
+        VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing
+
+        Args:
+            query: User query
+            mode: Underlying LightRAG query mode
+            **kwargs: Other query parameters
+
+        Returns:
+            str: VLM query result
+        """
+        # Ensure VLM is available
+        if not hasattr(self, "vision_model_func") or not self.vision_model_func:
+            raise ValueError(
+                "VLM enhanced query requires vision_model_func. "
+                "Please provide a vision model function when initializing RAGAnything."
+            )
+
+        # Ensure LightRAG is initialized
+        await self._ensure_lightrag_initialized()
+
+        self.logger.info(f"Executing VLM enhanced query: {query[:100]}...")
+
+        # Clear previous image cache
+        if hasattr(self, "_current_images_base64"):
+            delattr(self, "_current_images_base64")
+
+        # 1. Get original retrieval prompt (without generating final answer)
+        query_param = QueryParam(mode=mode, only_need_prompt=True, **kwargs)
+        raw_prompt = await self.lightrag.aquery(query, param=query_param)
+
+        self.logger.info("Retrieved raw prompt from LightRAG")
+
+        # 2. Extract and process image paths
+        enhanced_prompt, images_found = await self._process_image_paths_for_vlm(
+            raw_prompt
+        )
+
+        if not images_found:
+            self.logger.info("No valid images found, falling back to normal query")
+            # Fallback to normal query
+            return await self.aquery(query, mode=mode, **kwargs)
+
+        self.logger.info(f"Processed {images_found} images for VLM")
+
+        # 3. Build VLM message format
+        messages = self._build_vlm_messages_with_images(enhanced_prompt, query)
+
+        # 4. Call VLM for question answering
+        result = await self._call_vlm_with_multimodal_content(messages)
+
+        self.logger.info("VLM enhanced query completed")
+        return result
+
     async def _process_multimodal_query_content(
         self, base_query: str, multimodal_content: List[Dict[str, Any]]
     ) -> str:
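The method can also be invoked directly, bypassing aquery's auto-detection. Note the `only_need_prompt=True` step: LightRAG performs retrieval and assembles the prompt, but final answer generation is deferred to the VLM. For example:

```python
# Raises ValueError when no vision_model_func was configured
result = await rag.aquery_vlm_enhanced(
    "Walk me through the figures in section 3",
    mode="mix",
)
```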
@@ -431,8 +515,175 @@ class QueryMixin:
 
         return description
 
+    async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]:
+        """
+        Process image paths in prompt, keeping original paths and adding VLM markers
+
+        Args:
+            prompt: Original prompt
+
+        Returns:
+            tuple: (processed prompt, image count)
+        """
+        enhanced_prompt = prompt
+        images_processed = 0
+
+        # Initialize image cache
+        self._current_images_base64 = []
+
+        # Enhanced regex pattern for matching image paths
+        # Matches patterns like "Image Path: \path\to\image.jpg"
+        image_path_pattern = r"Image Path:\s*([^\r\n]+)"
+
+        def replace_image_path(match):
+            nonlocal images_processed
+
+            image_path = match.group(1).strip()
+
+            # Validate path format (basic check)
+            if not image_path or len(image_path) < 3:
+                self.logger.warning(f"Invalid image path format: {image_path}")
+                return match.group(0)  # Keep original
+
+            # Use utility function to validate image file
+            if not validate_image_file(image_path):
+                return match.group(0)  # Keep original if validation fails
+
+            try:
+                # Encode image to base64 using utility function
+                image_base64 = encode_image_to_base64(image_path)
+                if image_base64:
+                    images_processed += 1
+                    # Save base64 to instance variable for later use
+                    self._current_images_base64.append(image_base64)
+
+                    # Keep original path info and add VLM marker
+                    return f"Image Path: {image_path}\n[VLM_IMAGE_{images_processed}]"
+                else:
+                    self.logger.error(f"Failed to encode image: {image_path}")
+                    return match.group(0)  # Keep original if encoding failed
+
+            except Exception as e:
+                self.logger.error(f"Failed to process image {image_path}: {e}")
+                return match.group(0)  # Keep original
+
+        # Execute replacement
+        enhanced_prompt = re.sub(
+            image_path_pattern, replace_image_path, enhanced_prompt
+        )
+
+        return enhanced_prompt, images_processed
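Only lines of the form `Image Path: <path>` are rewritten; the rest of the retrieved prompt passes through untouched. An illustrative, self-contained rerun of just the regex step (the paths are fake, so the file validation and base64 caching that the real code performs are omitted):

```python
import re

prompt = (
    "Retrieved chunk about revenue trends...\n"
    "Image Path: /data/figures/fig1.jpg\n"
    "More retrieved context...\n"
)

count = [0]  # mutable counter, mirroring the nonlocal in the real code

def mark(match):
    count[0] += 1
    # The real replace_image_path also validates the file and caches its
    # base64 payload before emitting the marker
    return f"{match.group(0)}\n[VLM_IMAGE_{count[0]}]"

print(re.sub(r"Image Path:\s*([^\r\n]+)", mark, prompt))
# ...prints the prompt with "[VLM_IMAGE_1]" inserted after the image path line
```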
+    def _build_vlm_messages_with_images(
+        self, enhanced_prompt: str, user_query: str
+    ) -> List[Dict]:
+        """
+        Build VLM message format, using markers to correspond images with text positions
+
+        Args:
+            enhanced_prompt: Enhanced prompt with image markers
+            user_query: User query
+
+        Returns:
+            List[Dict]: VLM message format
+        """
+        images_base64 = getattr(self, "_current_images_base64", [])
+
+        if not images_base64:
+            # Pure text mode
+            return [
+                {
+                    "role": "user",
+                    "content": f"Context:\n{enhanced_prompt}\n\nUser Question: {user_query}",
+                }
+            ]
+
+        # Build multimodal content
+        content_parts = []
+
+        # Split text at image markers and insert images
+        text_parts = enhanced_prompt.split("[VLM_IMAGE_")
+
+        for i, text_part in enumerate(text_parts):
+            if i == 0:
+                # First text part
+                if text_part.strip():
+                    content_parts.append({"type": "text", "text": text_part})
+            else:
+                # Find marker number and insert corresponding image
+                marker_match = re.match(r"(\d+)\](.*)", text_part, re.DOTALL)
+                if marker_match:
+                    image_num = (
+                        int(marker_match.group(1)) - 1
+                    )  # Convert to 0-based index
+                    remaining_text = marker_match.group(2)
+
+                    # Insert corresponding image
+                    if 0 <= image_num < len(images_base64):
+                        content_parts.append(
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{images_base64[image_num]}"
+                                },
+                            }
+                        )
+
+                    # Insert remaining text
+                    if remaining_text.strip():
+                        content_parts.append({"type": "text", "text": remaining_text})
+
+        # Add user question
+        content_parts.append(
+            {
+                "type": "text",
+                "text": f"\n\nUser Question: {user_query}\n\nPlease answer based on the context and images provided.",
+            }
+        )
+
+        return [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.",
+            },
+            {"role": "user", "content": content_parts},
+        ]
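For a prompt containing one valid image, the builder produces an OpenAI-style chat payload shaped roughly like this (base64 and context text truncated for display):

```python
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.",
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "...context...\nImage Path: /data/figures/fig1.jpg\n"},
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQ..."}},
            {"type": "text", "text": "...context after the image..."},
            {"type": "text", "text": "\n\nUser Question: What does Figure 1 show?\n\nPlease answer based on the context and images provided."},
        ],
    },
]
```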
+    async def _call_vlm_with_multimodal_content(self, messages: List[Dict]) -> str:
+        """
+        Call VLM to process multimodal content
+
+        Args:
+            messages: VLM message format
+
+        Returns:
+            str: VLM response result
+        """
+        try:
+            user_message = messages[1]
+            content = user_message["content"]
+            system_prompt = messages[0]["content"]
+
+            if isinstance(content, str):
+                # Pure text mode
+                result = await self.vision_model_func(
+                    content, system_prompt=system_prompt
+                )
+            else:
+                # Multimodal mode - pass complete messages directly to VLM
+                result = await self.vision_model_func(
+                    "",  # Empty prompt since we're using messages format
+                    messages=messages,
+                )
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"VLM call failed: {e}")
+            raise
+
     # Synchronous versions of query methods
-    def query(self, query: str, mode: str = "hybrid", **kwargs) -> str:
+    def query(self, query: str, mode: str = "mix", **kwargs) -> str:
         """
         Synchronous version of pure text query
 
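The synchronous wrapper's body is not shown in this diff; given the `always_get_an_event_loop` import at the top of the module, it presumably drives the async implementation on a shared event loop. A sketch under that assumption, not verbatim from the repo:

```python
def query(self, query: str, mode: str = "mix", **kwargs) -> str:
    # Assumed pattern: run the async method to completion on a shared loop
    loop = always_get_an_event_loop()
    return loop.run_until_complete(self.aquery(query, mode=mode, **kwargs))
```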
@@ -440,6 +691,9 @@ class QueryMixin:
             query: Query text
             mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
             **kwargs: Other query parameters, will be passed to QueryParam
+                - vlm_enhanced: bool, default True when vision_model_func is available.
+                  If True, will parse image paths in retrieved context and replace them
+                  with base64 encoded images for VLM processing.
 
         Returns:
             str: Query result
@@ -451,7 +705,7 @@ class QueryMixin:
         self,
         query: str,
         multimodal_content: List[Dict[str, Any]] = None,
-        mode: str = "hybrid",
+        mode: str = "mix",
         **kwargs,
     ) -> str:
         """
@@ -4,7 +4,9 @@ Utility functions for RAGAnything
 Contains helper functions for content separation, text insertion, and other utilities
 """
 
+import base64
 from typing import Dict, List, Any, Tuple
 from pathlib import Path
+from lightrag.utils import logger
 
 
@@ -54,6 +56,73 @@ def separate_content(
     return text_content, multimodal_items
 
 
+def encode_image_to_base64(image_path: str) -> str:
+    """
+    Encode image file to base64 string
+
+    Args:
+        image_path: Path to the image file
+
+    Returns:
+        str: Base64 encoded string, empty string if encoding fails
+    """
+    try:
+        with open(image_path, "rb") as image_file:
+            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+            return encoded_string
+    except Exception as e:
+        logger.error(f"Failed to encode image {image_path}: {e}")
+        return ""
+
+
+def validate_image_file(image_path: str, max_size_mb: int = 50) -> bool:
+    """
+    Validate if a file is a valid image file
+
+    Args:
+        image_path: Path to the image file
+        max_size_mb: Maximum file size in MB
+
+    Returns:
+        bool: True if valid, False otherwise
+    """
+    try:
+        path = Path(image_path)
+
+        # Check if file exists
+        if not path.exists():
+            logger.warning(f"Image file not found: {image_path}")
+            return False
+
+        # Check file extension
+        image_extensions = [
+            ".jpg",
+            ".jpeg",
+            ".png",
+            ".gif",
+            ".bmp",
+            ".webp",
+            ".tiff",
+            ".tif",
+        ]
+        if not any(str(path).lower().endswith(ext) for ext in image_extensions):
+            logger.warning(f"File does not appear to be an image: {image_path}")
+            return False
+
+        # Check file size
+        file_size = path.stat().st_size
+        max_size = max_size_mb * 1024 * 1024
+        if file_size > max_size:
+            logger.warning(f"Image file too large ({file_size} bytes): {image_path}")
+            return False
+
+        return True
+
+    except Exception as e:
+        logger.error(f"Failed to validate image {image_path}: {e}")
+        return False
+
+
 async def insert_text_content(
     lightrag,
     input: str | list[str],
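Together the two helpers form a validate-then-encode guard, which is exactly how query.py uses them. A minimal sketch of the intended call pattern (the path is a hypothetical example):

```python
from raganything.utils import encode_image_to_base64, validate_image_file

image_path = "output/images/figure_1.png"  # hypothetical path

if validate_image_file(image_path, max_size_mb=50):
    b64 = encode_image_to_base64(image_path)
    if b64:  # an empty string signals an encoding failure
        # query.py always uses the jpeg MIME prefix, regardless of the actual format
        data_uri = f"data:image/jpeg;base64,{b64}"
```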