"""
Document processing functionality for RAGAnything
Contains methods for parsing documents and processing multimodal content
"""
import os
from typing import Dict, List, Any
from pathlib import Path
from raganything.parser import MineruParser, DoclingParser
from raganything.utils import (
separate_content,
insert_text_content,
get_processor_for_type,
)

class ProcessorMixin:
    """Mixin providing document parsing and multimodal content processing for RAGAnything."""

def parse_document(
self,
file_path: str,
        output_dir: str | None = None,
        parse_method: str | None = None,
        display_stats: bool | None = None,
**kwargs,
) -> List[Dict[str, Any]]:
"""
        Parse a document into a list of content blocks using the configured parser.
Args:
file_path: Path to the file to parse
output_dir: Output directory (defaults to config.parser_output_dir)
parse_method: Parse method (defaults to config.parse_method)
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
Returns:
List[Dict[str, Any]]: Content list
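
        Example:
            A minimal, illustrative call (assumes a configured RAGAnything
            instance named rag; the exact set of keys on each block depends
            on the parser backend):

                content_list = rag.parse_document("paper.pdf", parse_method="auto")
                for block in content_list:
                    print(block.get("type"), block.get("page_idx"))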
"""
# Use config defaults if not provided
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if display_stats is None:
display_stats = self.config.display_content_stats
self.logger.info(f"Starting document parsing: {file_path}")
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
# Choose appropriate parsing method based on file extension
ext = file_path.suffix.lower()
try:
doc_parser = (
DoclingParser() if self.config.parser == "docling" else MineruParser()
)
# Log parser and method information
self.logger.info(
f"Using {self.config.parser} parser with method: {parse_method}"
)
if ext in [".pdf"]:
self.logger.info("Detected PDF file, using parser for PDF...")
content_list = doc_parser.parse_pdf(
pdf_path=file_path,
output_dir=output_dir,
method=parse_method,
**kwargs,
)
elif ext in [
".jpg",
".jpeg",
".png",
".bmp",
".tiff",
".tif",
".gif",
".webp",
]:
self.logger.info("Detected image file, using parser for images...")
# Use the selected parser's image parsing capability
if hasattr(doc_parser, "parse_image"):
content_list = doc_parser.parse_image(
image_path=file_path, output_dir=output_dir, **kwargs
)
else:
# Fallback to MinerU for image parsing if current parser doesn't support it
self.logger.warning(
f"{self.config.parser} parser doesn't support image parsing, falling back to MinerU"
)
content_list = MineruParser().parse_image(
image_path=file_path, output_dir=output_dir, **kwargs
)
elif ext in [
".doc",
".docx",
".ppt",
".pptx",
".xls",
".xlsx",
".html",
".htm",
".xhtml",
]:
self.logger.info(
"Detected Office or HTML document, using parser for Office/HTML..."
)
content_list = doc_parser.parse_office_doc(
doc_path=file_path, output_dir=output_dir, **kwargs
)
else:
# For other or unknown formats, use generic parser
self.logger.info(
f"Using generic parser for {ext} file (method={parse_method})..."
)
content_list = doc_parser.parse_document(
file_path=file_path,
method=parse_method,
output_dir=output_dir,
**kwargs,
)
except Exception as e:
self.logger.error(
f"Error during parsing with {self.config.parser} parser: {str(e)}"
)
self.logger.warning("Falling back to MinerU parser...")
# If specific parser fails, fall back to MinerU parser
content_list = MineruParser().parse_document(
file_path=file_path,
method=parse_method,
output_dir=output_dir,
**kwargs,
)
self.logger.info(
f"Parsing complete! Extracted {len(content_list)} content blocks"
)
# Display content statistics if requested
if display_stats:
self.logger.info("\nContent Information:")
self.logger.info(f"* Total blocks in content_list: {len(content_list)}")
# Count elements by type
block_types: Dict[str, int] = {}
for block in content_list:
if isinstance(block, dict):
block_type = block.get("type", "unknown")
if isinstance(block_type, str):
block_types[block_type] = block_types.get(block_type, 0) + 1
self.logger.info("* Content block types:")
for block_type, count in block_types.items():
self.logger.info(f" - {block_type}: {count}")
return content_list

    async def _process_multimodal_content(
self, multimodal_items: List[Dict[str, Any]], file_path: str
):
"""
Process multimodal content (using specialized processors)
Args:
multimodal_items: List of multimodal items
file_path: File path (for reference)
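
        Example:
            Illustrative item shape; only "type" and "page_idx" are read
            directly here, any other keys are consumed by the matched modal
            processor (the extra key below is hypothetical):

                item = {"type": "image", "page_idx": 2, "img_path": "..."}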
"""
if not multimodal_items:
self.logger.debug("No multimodal content to process")
return
self.logger.info("Starting multimodal content processing...")
file_name = os.path.basename(file_path)
# Collect all chunk results for batch processing (similar to text content processing)
all_chunk_results = []
for i, item in enumerate(multimodal_items):
try:
content_type = item.get("type", "unknown")
self.logger.info(
f"Processing item {i+1}/{len(multimodal_items)}: {content_type} content"
)
# Select appropriate processor
processor = get_processor_for_type(self.modal_processors, content_type)
if processor:
# Prepare item info for context extraction
item_info = {
"page_idx": item.get("page_idx", 0),
"index": i,
"type": content_type,
}
# Process content and get chunk results instead of immediately merging
(
enhanced_caption,
entity_info,
chunk_results,
) = await processor.process_multimodal_content(
modal_content=item,
content_type=content_type,
file_path=file_name,
item_info=item_info, # Pass item info for context extraction
batch_mode=True,
)
# Collect chunk results for batch processing
all_chunk_results.extend(chunk_results)
self.logger.info(
f"{content_type} processing complete: {entity_info.get('entity_name', 'Unknown')}"
)
else:
self.logger.warning(
f"No suitable processor found for {content_type} type content"
)
except Exception as e:
self.logger.error(f"Error processing multimodal content: {str(e)}")
self.logger.debug("Exception details:", exc_info=True)
continue
# Batch merge all multimodal content results (similar to text content processing)
if all_chunk_results:
from lightrag.operate import merge_nodes_and_edges
from lightrag.kg.shared_storage import (
get_namespace_data,
get_pipeline_status_lock,
)
# Get pipeline status and lock from shared storage
pipeline_status = await get_namespace_data("pipeline_status")
pipeline_status_lock = get_pipeline_status_lock()
await merge_nodes_and_edges(
chunk_results=all_chunk_results,
knowledge_graph_inst=self.lightrag.chunk_entity_relation_graph,
entity_vdb=self.lightrag.entities_vdb,
relationships_vdb=self.lightrag.relationships_vdb,
global_config=self.lightrag.__dict__,
pipeline_status=pipeline_status,
pipeline_status_lock=pipeline_status_lock,
llm_response_cache=self.lightrag.llm_response_cache,
current_file_number=1,
total_files=1,
file_path=file_name,
)
await self.lightrag._insert_done()
self.logger.info("Multimodal content processing complete")

    async def process_document_complete(
self,
file_path: str,
        output_dir: str | None = None,
        parse_method: str | None = None,
        display_stats: bool | None = None,
split_by_character: str | None = None,
split_by_character_only: bool = False,
doc_id: str | None = None,
**kwargs,
):
"""
Complete document processing workflow
Args:
file_path: Path to the file to process
            output_dir: Output directory (defaults to config.parser_output_dir)
parse_method: Parse method (defaults to config.parse_method)
display_stats: Whether to display content statistics (defaults to config.display_content_stats)
split_by_character: Optional character to split the text by
split_by_character_only: If True, split only by the specified character
            doc_id: Optional document ID; if not provided, an MD5 hash will be generated
**kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)
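
        Example:
            A minimal end-to-end run (illustrative; assumes an initialized
            RAGAnything instance named rag):

                await rag.process_document_complete(
                    "report.docx",
                    doc_id="report-v1",  # hypothetical ID; omit to derive one from content
                )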
"""
# Ensure LightRAG is initialized
await self._ensure_lightrag_initialized()
# Use config defaults if not provided
if output_dir is None:
output_dir = self.config.parser_output_dir
if parse_method is None:
parse_method = self.config.parse_method
if display_stats is None:
display_stats = self.config.display_content_stats
self.logger.info(f"Starting complete document processing: {file_path}")
# Step 1: Parse document
content_list = self.parse_document(
file_path, output_dir, parse_method, display_stats, **kwargs
)
# Step 2: Separate text and multimodal content
text_content, multimodal_items = separate_content(content_list)
# Step 2.5: Set content source for context extraction in multimodal processing
if hasattr(self, "set_content_source_for_context") and multimodal_items:
self.logger.info(
"Setting content source for context-aware multimodal processing..."
)
self.set_content_source_for_context(
content_list, self.config.content_format
)
# Step 3: Insert pure text content with all parameters
if text_content.strip():
file_name = os.path.basename(file_path)
await insert_text_content(
self.lightrag,
text_content,
file_paths=file_name,
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
ids=doc_id,
)
# Step 4: Process multimodal content (using specialized processors)
if multimodal_items:
await self._process_multimodal_content(multimodal_items, file_path)
self.logger.info(f"Document {file_path} processing complete!")