From 710ed38d886f7885a4880138bab8a9c16c003353 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Wed, 23 Jul 2025 19:01:26 +0800 Subject: [PATCH 1/2] direct content list insertion --- README.md | 176 ++++++++++ README_zh.md | 176 ++++++++++ examples/insert_content_list_example.py | 411 ++++++++++++++++++++++++ examples/raganything_example.py | 4 + raganything/processor.py | 128 ++++++++ 5 files changed, 895 insertions(+) create mode 100644 examples/insert_content_list_example.py diff --git a/README.md b/README.md index 713f7b6..25dc602 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,7 @@ Users can query documents containing **interleaved text**, **visual diagrams**, - **🧠 Specialized Content Analysis** - Dedicated processors for images, tables, mathematical equations, and heterogeneous content types - **🔗 Multimodal Knowledge Graph** - Automatic entity extraction and cross-modal relationship discovery for enhanced understanding - **⚡ Adaptive Processing Modes** - Flexible MinerU-based parsing or direct multimodal content injection workflows +- **📋 Direct Content List Insertion** - Bypass document parsing by directly inserting pre-parsed content lists from external sources - **🎯 Hybrid Intelligent Retrieval** - Advanced search capabilities spanning textual and multimodal content with contextual understanding @@ -702,6 +703,181 @@ if __name__ == "__main__": asyncio.run(load_existing_lightrag()) ``` +#### 7. 
Direct Content List Insertion + +For scenarios where you already have a pre-parsed content list (e.g., from external parsers or previous processing), you can directly insert it into RAGAnything without document parsing: + +```python +import asyncio +from raganything import RAGAnything, RAGAnythingConfig +from lightrag.llm.openai import openai_complete_if_cache, openai_embed +from lightrag.utils import EmbeddingFunc + +async def insert_content_list_example(): + # Set up API configuration + api_key = "your-api-key" + base_url = "your-base-url" # Optional + + # Create RAGAnything configuration + config = RAGAnythingConfig( + working_dir="./rag_storage", + enable_image_processing=True, + enable_table_processing=True, + enable_equation_processing=True, + ) + + # Define model functions + def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): + return openai_complete_if_cache( + "gpt-4o-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=api_key, + base_url=base_url, + **kwargs, + ) + + def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs): + if image_data: + return openai_complete_if_cache( + "gpt-4o", + "", + system_prompt=None, + history_messages=[], + messages=[ + {"role": "system", "content": system_prompt} if system_prompt else None, + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}} + ], + } if image_data else {"role": "user", "content": prompt}, + ], + api_key=api_key, + base_url=base_url, + **kwargs, + ) + else: + return llm_model_func(prompt, system_prompt, history_messages, **kwargs) + + embedding_func = EmbeddingFunc( + embedding_dim=3072, + max_token_size=8192, + func=lambda texts: openai_embed( + texts, + model="text-embedding-3-large", + api_key=api_key, + base_url=base_url, + ), + ) + + # Initialize RAGAnything + rag = RAGAnything( + 
config=config, + llm_model_func=llm_model_func, + vision_model_func=vision_model_func, + embedding_func=embedding_func, + ) + + # Example: Pre-parsed content list from external source + content_list = [ + { + "type": "text", + "text": "This is the introduction section of our research paper.", + "page_idx": 0 # Page number where this content appears + }, + { + "type": "image", + "img_path": "/absolute/path/to/figure1.jpg", # IMPORTANT: Use absolute path + "img_caption": ["Figure 1: System Architecture"], + "img_footnote": ["Source: Authors' original design"], + "page_idx": 1 # Page number where this image appears + }, + { + "type": "table", + "table_body": "| Method | Accuracy | F1-Score |\n|--------|----------|----------|\n| Ours | 95.2% | 0.94 |\n| Baseline | 87.3% | 0.85 |", + "table_caption": ["Table 1: Performance Comparison"], + "table_footnote": ["Results on test dataset"], + "page_idx": 2 # Page number where this table appears + }, + { + "type": "equation", + "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}", + "text": "Document relevance probability formula", + "page_idx": 3 # Page number where this equation appears + }, + { + "type": "text", + "text": "In conclusion, our method demonstrates superior performance across all metrics.", + "page_idx": 4 # Page number where this content appears + } + ] + + # Insert the content list directly + await rag.insert_content_list( + content_list=content_list, + file_path="research_paper.pdf", # Reference file name for citation + split_by_character=None, # Optional text splitting + split_by_character_only=False, # Optional text splitting mode + doc_id=None, # Optional custom document ID (will be auto-generated if not provided) + display_stats=True # Show content statistics + ) + + # Query the inserted content + result = await rag.aquery( + "What are the key findings and performance metrics mentioned in the research?", + mode="hybrid" + ) + print("Query result:", result) + + # You can also insert multiple content lists 
with different document IDs + another_content_list = [ + { + "type": "text", + "text": "This is content from another document.", + "page_idx": 0 # Page number where this content appears + }, + { + "type": "table", + "table_body": "| Feature | Value |\n|---------|-------|\n| Speed | Fast |\n| Accuracy | High |", + "table_caption": ["Feature Comparison"], + "page_idx": 1 # Page number where this table appears + } + ] + + await rag.insert_content_list( + content_list=another_content_list, + file_path="another_document.pdf", + doc_id="custom-doc-id-123" # Custom document ID + ) + +if __name__ == "__main__": + asyncio.run(insert_content_list_example()) +``` + +**Content List Format:** + +The `content_list` should follow the standard format with each item being a dictionary containing: + +- **Text content**: `{"type": "text", "text": "content text", "page_idx": 0}` +- **Image content**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "img_caption": ["caption"], "img_footnote": ["note"], "page_idx": 1}` +- **Table content**: `{"type": "table", "table_body": "markdown table", "table_caption": ["caption"], "table_footnote": ["note"], "page_idx": 2}` +- **Equation content**: `{"type": "equation", "latex": "LaTeX formula", "text": "description", "page_idx": 3}` +- **Generic content**: `{"type": "custom_type", "content": "any content", "page_idx": 4}` + +**Important Notes:** +- **`img_path`**: Must be an absolute path to the image file (e.g., `/home/user/images/chart.jpg` or `C:\Users\user\images\chart.jpg`) +- **`page_idx`**: Represents the page number where the content appears in the original document (0-based indexing) +- **Content ordering**: Items are processed in the order they appear in the list + +This method is particularly useful when: +- You have content from external parsers (non-MinerU/Docling) +- You want to process programmatically generated content +- You need to insert content from multiple sources into a single knowledge base +- You have cached 
parsing results that you want to reuse + --- ## 🛠️ Examples diff --git a/README_zh.md b/README_zh.md index 61f4320..e85e62b 100644 --- a/README_zh.md +++ b/README_zh.md @@ -75,6 +75,7 @@ - **🧠 多模态内容分析引擎** - 针对图像、表格、公式和通用文本内容部署专门的处理器,确保各类内容的精准解析 - **🔗 基于知识图谱索引** - 实现自动化实体提取和关系构建,建立跨模态的语义连接网络 - **⚡ 灵活的处理架构** - 支持基于MinerU的智能解析模式和直接多模态内容插入模式,满足不同应用场景需求 +- **📋 直接内容列表插入** - 跳过文档解析,直接插入来自外部源的预解析内容列表,支持多种数据来源整合 - **🎯 跨模态检索机制** - 实现跨文本和多模态内容的智能检索,提供精准的信息定位和匹配能力 @@ -699,6 +700,181 @@ if __name__ == "__main__": asyncio.run(load_existing_lightrag()) ``` +#### 7. 直接插入内容列表 + +当您已经有预解析的内容列表(例如,来自外部解析器或之前的处理结果)时,可以直接插入到 RAGAnything 中而无需文档解析: + +```python +import asyncio +from raganything import RAGAnything, RAGAnythingConfig +from lightrag.llm.openai import openai_complete_if_cache, openai_embed +from lightrag.utils import EmbeddingFunc + +async def insert_content_list_example(): + # 设置 API 配置 + api_key = "your-api-key" + base_url = "your-base-url" # 可选 + + # 创建 RAGAnything 配置 + config = RAGAnythingConfig( + working_dir="./rag_storage", + enable_image_processing=True, + enable_table_processing=True, + enable_equation_processing=True, + ) + + # 定义模型函数 + def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): + return openai_complete_if_cache( + "gpt-4o-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=api_key, + base_url=base_url, + **kwargs, + ) + + def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs): + if image_data: + return openai_complete_if_cache( + "gpt-4o", + "", + system_prompt=None, + history_messages=[], + messages=[ + {"role": "system", "content": system_prompt} if system_prompt else None, + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}} + ], + } if image_data else {"role": "user", "content": prompt}, + ], + api_key=api_key, + base_url=base_url, + 
**kwargs, + ) + else: + return llm_model_func(prompt, system_prompt, history_messages, **kwargs) + + embedding_func = EmbeddingFunc( + embedding_dim=3072, + max_token_size=8192, + func=lambda texts: openai_embed( + texts, + model="text-embedding-3-large", + api_key=api_key, + base_url=base_url, + ), + ) + + # 初始化 RAGAnything + rag = RAGAnything( + config=config, + llm_model_func=llm_model_func, + vision_model_func=vision_model_func, + embedding_func=embedding_func, + ) + + # 示例:来自外部源的预解析内容列表 + content_list = [ + { + "type": "text", + "text": "这是我们研究论文的引言部分。", + "page_idx": 0 # 此内容出现的页码 + }, + { + "type": "image", + "img_path": "/absolute/path/to/figure1.jpg", # 重要:使用绝对路径 + "img_caption": ["图1:系统架构"], + "img_footnote": ["来源:作者原创设计"], + "page_idx": 1 # 此图像出现的页码 + }, + { + "type": "table", + "table_body": "| 方法 | 准确率 | F1分数 |\n|------|--------|--------|\n| 我们的方法 | 95.2% | 0.94 |\n| 基准方法 | 87.3% | 0.85 |", + "table_caption": ["表1:性能对比"], + "table_footnote": ["测试数据集结果"], + "page_idx": 2 # 此表格出现的页码 + }, + { + "type": "equation", + "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}", + "text": "文档相关性概率公式", + "page_idx": 3 # 此公式出现的页码 + }, + { + "type": "text", + "text": "总之,我们的方法在所有指标上都表现出优越的性能。", + "page_idx": 4 # 此内容出现的页码 + } + ] + + # 直接插入内容列表 + await rag.insert_content_list( + content_list=content_list, + file_path="research_paper.pdf", # 用于引用的参考文件名 + split_by_character=None, # 可选的文本分割 + split_by_character_only=False, # 可选的文本分割模式 + doc_id=None, # 可选的自定义文档ID(如果未提供将自动生成) + display_stats=True # 显示内容统计信息 + ) + + # 查询插入的内容 + result = await rag.aquery( + "研究中提到的主要发现和性能指标是什么?", + mode="hybrid" + ) + print("查询结果:", result) + + # 您也可以使用不同的文档ID插入多个内容列表 + another_content_list = [ + { + "type": "text", + "text": "这是来自另一个文档的内容。", + "page_idx": 0 # 此内容出现的页码 + }, + { + "type": "table", + "table_body": "| 特性 | 值 |\n|------|----|\n| 速度 | 快速 |\n| 准确性 | 高 |", + "table_caption": ["特性对比"], + "page_idx": 1 # 此表格出现的页码 + } + ] + + await rag.insert_content_list( + 
content_list=another_content_list, + file_path="another_document.pdf", + doc_id="custom-doc-id-123" # 自定义文档ID + ) + +if __name__ == "__main__": + asyncio.run(insert_content_list_example()) +``` + +**内容列表格式:** + +`content_list` 应遵循标准格式,每个项目都是包含以下内容的字典: + +- **文本内容**: `{"type": "text", "text": "内容文本", "page_idx": 0}` +- **图像内容**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "img_caption": ["标题"], "img_footnote": ["注释"], "page_idx": 1}` +- **表格内容**: `{"type": "table", "table_body": "markdown表格", "table_caption": ["标题"], "table_footnote": ["注释"], "page_idx": 2}` +- **公式内容**: `{"type": "equation", "latex": "LaTeX公式", "text": "描述", "page_idx": 3}` +- **通用内容**: `{"type": "custom_type", "content": "任何内容", "page_idx": 4}` + +**重要说明:** +- **`img_path`**: 必须是图像文件的绝对路径(例如:`/home/user/images/chart.jpg` 或 `C:\Users\user\images\chart.jpg`) +- **`page_idx`**: 表示内容在原始文档中出现的页码(从0开始的索引) +- **内容顺序**: 项目按照在列表中出现的顺序进行处理 + +此方法在以下情况下特别有用: +- 您有来自外部解析器的内容(非MinerU/Docling) +- 您想要处理程序化生成的内容 +- 您需要将来自多个源的内容插入到单个知识库中 +- 您有想要重用的缓存解析结果 + --- ## 🛠️ 示例 diff --git a/examples/insert_content_list_example.py b/examples/insert_content_list_example.py new file mode 100644 index 0000000..e9fe985 --- /dev/null +++ b/examples/insert_content_list_example.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python +""" +Example script demonstrating direct content list insertion with RAGAnything + +This example shows how to: +1. Create a simple content list with different content types +2. Insert content list directly without document parsing using insert_content_list() method +3. Perform pure text queries using aquery() method +4. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method +5. 
Handle different types of multimodal content in the inserted knowledge base +""" + +import os +import argparse +import asyncio +import logging +import logging.config +from pathlib import Path + +# Add project root directory to Python path +import sys + +sys.path.append(str(Path(__file__).parent.parent)) + +from lightrag.llm.openai import openai_complete_if_cache, openai_embed +from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug +from raganything import RAGAnything, RAGAnythingConfig + +from dotenv import load_dotenv + +load_dotenv(dotenv_path=".env", override=False) + + +def configure_logging(): + """Configure logging for the application""" + # Get log directory path from environment variable or use current directory + log_dir = os.getenv("LOG_DIR", os.getcwd()) + log_file_path = os.path.abspath(os.path.join(log_dir, "insert_content_list_example.log")) + + print(f"\nInsert Content List example log file: {log_file_path}\n") + os.makedirs(os.path.dirname(log_dir), exist_ok=True) + + # Get log file max size and backup count from environment variables + log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760)) # Default 10MB + log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5)) # Default 5 backups + + logging.config.dictConfig( + { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "default": { + "format": "%(levelname)s: %(message)s", + }, + "detailed": { + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + }, + }, + "handlers": { + "console": { + "formatter": "default", + "class": "logging.StreamHandler", + "stream": "ext://sys.stderr", + }, + "file": { + "formatter": "detailed", + "class": "logging.handlers.RotatingFileHandler", + "filename": log_file_path, + "maxBytes": log_max_bytes, + "backupCount": log_backup_count, + "encoding": "utf-8", + }, + }, + "loggers": { + "lightrag": { + "handlers": ["console", "file"], + "level": "INFO", + "propagate": False, + }, + }, + } + ) + + # Set the logger level to 
INFO + logger.setLevel(logging.INFO) + # Enable verbose debug if needed + set_verbose_debug(os.getenv("VERBOSE", "false").lower() == "true") + + +def create_sample_content_list(): + """ + Create a simple content list for testing insert_content_list functionality + + Returns: + List[Dict]: Sample content list with various content types + + Note: + - img_path should be absolute path to the image file + - page_idx represents the page number where the content appears (0-based) + """ + content_list = [ + # Introduction text + { + "type": "text", + "text": "Welcome to the RAGAnything System Documentation. This guide covers the advanced multimodal document processing capabilities and features of our comprehensive RAG system.", + "page_idx": 0 # Page number where this content appears + }, + + # System architecture image + { + "type": "image", + "img_path": "/absolute/path/to/system_architecture.jpg", # IMPORTANT: Use absolute path to image file + "img_caption": ["Figure 1: RAGAnything System Architecture"], + "img_footnote": ["The architecture shows the complete pipeline from document parsing to multimodal query processing"], + "page_idx": 1 # Page number where this image appears + }, + + # Performance comparison table + { + "type": "table", + "table_body": """| System | Accuracy | Processing Speed | Memory Usage | + |--------|----------|------------------|--------------| + | RAGAnything | 95.2% | 120ms | 2.1GB | + | Traditional RAG | 87.3% | 180ms | 3.2GB | + | Baseline System | 82.1% | 220ms | 4.1GB | + | Simple Retrieval | 76.5% | 95ms | 1.8GB |""", + "table_caption": ["Table 1: Performance Comparison of Different RAG Systems"], + "table_footnote": ["All tests conducted on the same hardware with identical test datasets"], + "page_idx": 2 # Page number where this table appears + }, + + # Mathematical formula + { + "type": "equation", + "latex": "Relevance(d, q) = \\sum_{i=1}^{n} w_i \\cdot sim(t_i^d, t_i^q) \\cdot \\alpha_i", + "text": "Document relevance scoring formula 
where w_i are term weights, sim() is similarity function, and α_i are modality importance factors", + "page_idx": 3 # Page number where this equation appears + }, + + # Feature description + { + "type": "text", + "text": "The system supports multiple content modalities including text, images, tables, and mathematical equations. Each modality is processed using specialized processors optimized for that content type.", + "page_idx": 4 # Page number where this content appears + }, + + # Technical specifications table + { + "type": "table", + "table_body": """| Feature | Specification | + |---------|---------------| + | Supported Formats | PDF, DOCX, PPTX, XLSX, Images | + | Max Document Size | 100MB | + | Concurrent Processing | Up to 8 documents | + | Query Response Time | <200ms average | + | Knowledge Graph Nodes | Up to 1M entities |""", + "table_caption": ["Table 2: Technical Specifications"], + "table_footnote": ["Specifications may vary based on hardware configuration"], + "page_idx": 5 # Page number where this table appears + }, + + # Conclusion + { + "type": "text", + "text": "RAGAnything represents a significant advancement in multimodal document processing, providing comprehensive solutions for complex knowledge extraction and retrieval tasks.", + "page_idx": 6 # Page number where this content appears + } + ] + + return content_list + + +async def demo_insert_content_list( + api_key: str, + base_url: str = None, + working_dir: str = None, +): + """ + Demonstrate content list insertion and querying with RAGAnything + + Args: + api_key: OpenAI API key + base_url: Optional base URL for API + working_dir: Working directory for RAG storage + """ + try: + # Create RAGAnything configuration + config = RAGAnythingConfig( + working_dir=working_dir or "./rag_storage", + enable_image_processing=True, + enable_table_processing=True, + enable_equation_processing=True, + display_content_stats=True, # Show content statistics + ) + + # Define LLM model function + def 
llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): + return openai_complete_if_cache( + "gpt-4o-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=api_key, + base_url=base_url, + **kwargs, + ) + + # Define vision model function for image processing + def vision_model_func( + prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs + ): + if image_data: + return openai_complete_if_cache( + "gpt-4o", + "", + system_prompt=None, + history_messages=[], + messages=[ + {"role": "system", "content": system_prompt} + if system_prompt + else None, + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_data}" + }, + }, + ], + } + if image_data + else {"role": "user", "content": prompt}, + ], + api_key=api_key, + base_url=base_url, + **kwargs, + ) + else: + return llm_model_func(prompt, system_prompt, history_messages, **kwargs) + + # Define embedding function + embedding_func = EmbeddingFunc( + embedding_dim=3072, + max_token_size=8192, + func=lambda texts: openai_embed( + texts, + model="text-embedding-3-large", + api_key=api_key, + base_url=base_url, + ), + ) + + # Initialize RAGAnything + rag = RAGAnything( + config=config, + llm_model_func=llm_model_func, + vision_model_func=vision_model_func, + embedding_func=embedding_func, + ) + + # Create sample content list + logger.info("Creating sample content list...") + content_list = create_sample_content_list() + logger.info(f"Created content list with {len(content_list)} items") + + # Insert content list directly + logger.info("\nInserting content list into RAGAnything...") + await rag.insert_content_list( + content_list=content_list, + file_path="raganything_documentation.pdf", # Reference file name for citation + split_by_character=None, # Optional text splitting + split_by_character_only=False, # Optional text splitting mode + 
doc_id="demo-doc-001", # Custom document ID + display_stats=True # Show content statistics + ) + logger.info("Content list insertion completed!") + + # Example queries - demonstrating different query approaches + logger.info("\nQuerying inserted content:") + + # 1. Pure text queries using aquery() + text_queries = [ + "What is RAGAnything and what are its main features?", + "How does RAGAnything compare to traditional RAG systems?", + "What are the technical specifications of the system?", + ] + + for query in text_queries: + logger.info(f"\n[Text Query]: {query}") + result = await rag.aquery(query, mode="hybrid") + logger.info(f"Answer: {result}") + + # 2. Multimodal query with specific multimodal content using aquery_with_multimodal() + logger.info( + "\n[Multimodal Query]: Analyzing new performance data against existing benchmarks" + ) + multimodal_result = await rag.aquery_with_multimodal( + "Compare this new performance data with the existing benchmark results in the documentation", + multimodal_content=[ + { + "type": "table", + "table_data": """Method,Accuracy,Speed,Memory + New_Approach,97.1%,110ms,1.9GB + Enhanced_RAG,91.4%,140ms,2.5GB""", + "table_caption": "Latest experimental results", + } + ], + mode="hybrid", + ) + logger.info(f"Answer: {multimodal_result}") + + # 3. Another multimodal query with equation content + logger.info("\n[Multimodal Query]: Mathematical formula analysis") + equation_result = await rag.aquery_with_multimodal( + "How does this similarity formula relate to the relevance scoring mentioned in the documentation?", + multimodal_content=[ + { + "type": "equation", + "latex": "sim(a, b) = \\frac{a \\cdot b}{||a|| \\times ||b||} + \\beta \\cdot context\\_weight", + "equation_caption": "Enhanced cosine similarity with context weighting", + } + ], + mode="hybrid", + ) + logger.info(f"Answer: {equation_result}") + + # 4. 
Insert another content list with different document ID + logger.info("\nInserting additional content list...") + additional_content = [ + { + "type": "text", + "text": "This is additional documentation about advanced features and configuration options.", + "page_idx": 0 # Page number where this content appears + }, + { + "type": "table", + "table_body": """| Configuration | Default Value | Range | + |---------------|---------------|-------| + | Chunk Size | 512 tokens | 128-2048 | + | Context Window | 4096 tokens | 1024-8192 | + | Batch Size | 32 | 1-128 |""", + "table_caption": ["Advanced Configuration Parameters"], + "page_idx": 1 # Page number where this table appears + } + ] + + await rag.insert_content_list( + content_list=additional_content, + file_path="advanced_configuration.pdf", + doc_id="demo-doc-002" # Different document ID + ) + + # Query combined knowledge base + logger.info(f"\n[Combined Query]: What configuration options are available?") + combined_result = await rag.aquery("What configuration options are available and what are their default values?", mode="hybrid") + logger.info(f"Answer: {combined_result}") + + except Exception as e: + logger.error(f"Error in content list insertion demo: {str(e)}") + import traceback + logger.error(traceback.format_exc()) + + +def main(): + """Main function to run the example""" + parser = argparse.ArgumentParser(description="Insert Content List Example") + parser.add_argument( + "--working_dir", "-w", default="./rag_storage", help="Working directory path" + ) + parser.add_argument( + "--api-key", + default=os.getenv("LLM_BINDING_API_KEY"), + help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)", + ) + parser.add_argument( + "--base-url", + default=os.getenv("LLM_BINDING_HOST"), + help="Optional base URL for API", + ) + + args = parser.parse_args() + + # Check if API key is provided + if not args.api_key: + logger.error("Error: OpenAI API key is required") + logger.error("Set api key environment variable 
or use --api-key option") + return + + # Run the demo + asyncio.run( + demo_insert_content_list( + args.api_key, + args.base_url, + args.working_dir, + ) + ) + + +if __name__ == "__main__": + # Configure logging first + configure_logging() + + print("RAGAnything Insert Content List Example") + print("=" * 45) + print("Demonstrating direct content list insertion without document parsing") + print("=" * 45) + + main() \ No newline at end of file diff --git a/examples/raganything_example.py b/examples/raganything_example.py index 572d8fa..ed8e9b7 100644 --- a/examples/raganything_example.py +++ b/examples/raganything_example.py @@ -25,6 +25,10 @@ from lightrag.llm.openai import openai_complete_if_cache, openai_embed from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug from raganything import RAGAnything, RAGAnythingConfig +from dotenv import load_dotenv + +load_dotenv(dotenv_path=".env", override=False) + def configure_logging(): """Configure logging for the application""" diff --git a/raganything/processor.py b/raganything/processor.py index 5de627b..f56515e 100644 --- a/raganything/processor.py +++ b/raganything/processor.py @@ -722,3 +722,131 @@ class ProcessorMixin: ) self.logger.info(f"Document {file_path} processing complete!") + + async def insert_content_list( + self, + content_list: List[Dict[str, Any]], + file_path: str = "unknown_document", + split_by_character: str | None = None, + split_by_character_only: bool = False, + doc_id: str | None = None, + display_stats: bool = None, + ): + """ + Insert content list directly without document parsing + + Args: + content_list: Pre-parsed content list containing text and multimodal items. 
+ Each item should be a dictionary with the following structure: + - Text: {"type": "text", "text": "content", "page_idx": 0} + - Image: {"type": "image", "img_path": "/absolute/path/to/image.jpg", + "img_caption": ["caption"], "img_footnote": ["note"], "page_idx": 1} + - Table: {"type": "table", "table_body": "markdown table", + "table_caption": ["caption"], "table_footnote": ["note"], "page_idx": 2} + - Equation: {"type": "equation", "latex": "LaTeX formula", + "text": "description", "page_idx": 3} + - Generic: {"type": "custom_type", "content": "any content", "page_idx": 4} + file_path: Reference file path/name for citation (defaults to "unknown_document") + split_by_character: Optional character to split the text by + split_by_character_only: If True, split only by the specified character + doc_id: Optional document ID, if not provided will be generated from content + display_stats: Whether to display content statistics (defaults to config.display_content_stats) + + Note: + - img_path must be an absolute path to the image file + - page_idx represents the page number where the content appears (0-based indexing) + - Items are processed in the order they appear in the list + """ + # Ensure LightRAG is initialized + await self._ensure_lightrag_initialized() + + # Use config defaults if not provided + if display_stats is None: + display_stats = self.config.display_content_stats + + self.logger.info( + f"Starting direct content list insertion for: {file_path} ({len(content_list)} items)" + ) + + # Generate doc_id based on content if not provided + if doc_id is None: + doc_id = self._generate_content_based_doc_id(content_list) + + # Display content statistics if requested + if display_stats: + self.logger.info("\nContent Information:") + self.logger.info(f"* Total blocks in content_list: {len(content_list)}") + + # Count elements by type + block_types: Dict[str, int] = {} + for block in content_list: + if isinstance(block, dict): + block_type = block.get("type", 
"unknown") + if isinstance(block_type, str): + block_types[block_type] = block_types.get(block_type, 0) + 1 + + self.logger.info("* Content block types:") + for block_type, count in block_types.items(): + self.logger.info(f" - {block_type}: {count}") + + # Step 1: Separate text and multimodal content + text_content, multimodal_items = separate_content(content_list) + + # Step 1.5: Set content source for context extraction in multimodal processing + if hasattr(self, "set_content_source_for_context") and multimodal_items: + self.logger.info( + "Setting content source for context-aware multimodal processing..." + ) + self.set_content_source_for_context( + content_list, self.config.content_format + ) + + # Step 2: Insert pure text content with all parameters + if text_content.strip(): + file_name = os.path.basename(file_path) + await insert_text_content( + self.lightrag, + text_content, + file_paths=file_name, + split_by_character=split_by_character, + split_by_character_only=split_by_character_only, + ids=doc_id, + ) + + # Step 3: Process multimodal content (using specialized processors) + if multimodal_items: + await self._process_multimodal_content(multimodal_items, file_path, doc_id) + else: + # If no multimodal content, mark as processed to avoid future checks + try: + existing_doc_status = await self.lightrag.doc_status.get_by_id(doc_id) + if existing_doc_status and not existing_doc_status.get( + "multimodal_processed", False + ): + existing_multimodal_chunks = existing_doc_status.get( + "multimodal_chunks_list", [] + ) + + await self.lightrag.doc_status.upsert( + { + doc_id: { + **existing_doc_status, + "multimodal_chunks_list": existing_multimodal_chunks, + "multimodal_chunks_count": len( + existing_multimodal_chunks + ), + "multimodal_processed": True, + "updated_at": time.strftime("%Y-%m-%dT%H:%M:%S+00:00", time.gmtime()), # format UTC so the literal +00:00 offset is truthful + } + } + ) + await self.lightrag.doc_status.index_done_callback() + self.logger.debug( + f"Marked document {doc_id[:8]}...
as having no multimodal content" + ) + except Exception as e: + self.logger.debug( + f"Error updating doc_status for no multimodal content: {e}" + ) + + self.logger.info(f"Content list insertion complete for: {file_path}") From 84276c38ca22313ffd14ffa179a5dac5520360f7 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Wed, 23 Jul 2025 19:04:11 +0800 Subject: [PATCH 2/2] fix lint --- examples/insert_content_list_example.py | 74 ++++++++++++++----------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/examples/insert_content_list_example.py b/examples/insert_content_list_example.py index e9fe985..d277d47 100644 --- a/examples/insert_content_list_example.py +++ b/examples/insert_content_list_example.py @@ -35,7 +35,9 @@ def configure_logging(): """Configure logging for the application""" # Get log directory path from environment variable or use current directory log_dir = os.getenv("LOG_DIR", os.getcwd()) - log_file_path = os.path.abspath(os.path.join(log_dir, "insert_content_list_example.log")) + log_file_path = os.path.abspath( + os.path.join(log_dir, "insert_content_list_example.log") + ) print(f"\nInsert Content List example log file: {log_file_path}\n") os.makedirs(os.path.dirname(log_dir), exist_ok=True) @@ -90,10 +92,10 @@ def configure_logging(): def create_sample_content_list(): """ Create a simple content list for testing insert_content_list functionality - + Returns: List[Dict]: Sample content list with various content types - + Note: - img_path should be absolute path to the image file - page_idx represents the page number where the content appears (0-based) @@ -103,18 +105,18 @@ def create_sample_content_list(): { "type": "text", "text": "Welcome to the RAGAnything System Documentation. 
This guide covers the advanced multimodal document processing capabilities and features of our comprehensive RAG system.", - "page_idx": 0 # Page number where this content appears + "page_idx": 0, # Page number where this content appears }, - # System architecture image { "type": "image", "img_path": "/absolute/path/to/system_architecture.jpg", # IMPORTANT: Use absolute path to image file "img_caption": ["Figure 1: RAGAnything System Architecture"], - "img_footnote": ["The architecture shows the complete pipeline from document parsing to multimodal query processing"], - "page_idx": 1 # Page number where this image appears + "img_footnote": [ + "The architecture shows the complete pipeline from document parsing to multimodal query processing" + ], + "page_idx": 1, # Page number where this image appears }, - # Performance comparison table { "type": "table", @@ -124,26 +126,27 @@ def create_sample_content_list(): | Traditional RAG | 87.3% | 180ms | 3.2GB | | Baseline System | 82.1% | 220ms | 4.1GB | | Simple Retrieval | 76.5% | 95ms | 1.8GB |""", - "table_caption": ["Table 1: Performance Comparison of Different RAG Systems"], - "table_footnote": ["All tests conducted on the same hardware with identical test datasets"], - "page_idx": 2 # Page number where this table appears + "table_caption": [ + "Table 1: Performance Comparison of Different RAG Systems" + ], + "table_footnote": [ + "All tests conducted on the same hardware with identical test datasets" + ], + "page_idx": 2, # Page number where this table appears }, - # Mathematical formula { "type": "equation", "latex": "Relevance(d, q) = \\sum_{i=1}^{n} w_i \\cdot sim(t_i^d, t_i^q) \\cdot \\alpha_i", "text": "Document relevance scoring formula where w_i are term weights, sim() is similarity function, and α_i are modality importance factors", - "page_idx": 3 # Page number where this equation appears + "page_idx": 3, # Page number where this equation appears }, - # Feature description { "type": "text", "text": "The 
system supports multiple content modalities including text, images, tables, and mathematical equations. Each modality is processed using specialized processors optimized for that content type.", - "page_idx": 4 # Page number where this content appears + "page_idx": 4, # Page number where this content appears }, - # Technical specifications table { "type": "table", @@ -155,18 +158,19 @@ def create_sample_content_list(): | Query Response Time | <200ms average | | Knowledge Graph Nodes | Up to 1M entities |""", "table_caption": ["Table 2: Technical Specifications"], - "table_footnote": ["Specifications may vary based on hardware configuration"], - "page_idx": 5 # Page number where this table appears + "table_footnote": [ + "Specifications may vary based on hardware configuration" + ], + "page_idx": 5, # Page number where this table appears }, - # Conclusion { "type": "text", "text": "RAGAnything represents a significant advancement in multimodal document processing, providing comprehensive solutions for complex knowledge extraction and retrieval tasks.", - "page_idx": 6 # Page number where this content appears - } + "page_idx": 6, # Page number where this content appears + }, ] - + return content_list @@ -271,10 +275,10 @@ async def demo_insert_content_list( await rag.insert_content_list( content_list=content_list, file_path="raganything_documentation.pdf", # Reference file name for citation - split_by_character=None, # Optional text splitting - split_by_character_only=False, # Optional text splitting mode - doc_id="demo-doc-001", # Custom document ID - display_stats=True # Show content statistics + split_by_character=None, # Optional text splitting + split_by_character_only=False, # Optional text splitting mode + doc_id="demo-doc-001", # Custom document ID + display_stats=True, # Show content statistics ) logger.info("Content list insertion completed!") @@ -333,7 +337,7 @@ async def demo_insert_content_list( { "type": "text", "text": "This is additional documentation 
about advanced features and configuration options.", - "page_idx": 0 # Page number where this content appears + "page_idx": 0, # Page number where this content appears }, { "type": "table", @@ -343,24 +347,28 @@ async def demo_insert_content_list( | Context Window | 4096 tokens | 1024-8192 | | Batch Size | 32 | 1-128 |""", "table_caption": ["Advanced Configuration Parameters"], - "page_idx": 1 # Page number where this table appears - } + "page_idx": 1, # Page number where this table appears + }, ] await rag.insert_content_list( content_list=additional_content, file_path="advanced_configuration.pdf", - doc_id="demo-doc-002" # Different document ID + doc_id="demo-doc-002", # Different document ID ) # Query combined knowledge base - logger.info(f"\n[Combined Query]: What configuration options are available?") - combined_result = await rag.aquery("What configuration options are available and what are their default values?", mode="hybrid") + logger.info("\n[Combined Query]: What configuration options are available?") + combined_result = await rag.aquery( + "What configuration options are available and what are their default values?", + mode="hybrid", + ) logger.info(f"Answer: {combined_result}") except Exception as e: logger.error(f"Error in content list insertion demo: {str(e)}") import traceback + logger.error(traceback.format_exc()) @@ -408,4 +416,4 @@ if __name__ == "__main__": print("Demonstrating direct content list insertion without document parsing") print("=" * 45) - main() \ No newline at end of file + main()