Files
2025-05-11 21:09:46 +03:00

141 lines
4.0 KiB
Python
Executable File

import glob
import os
import statistics
import asyncio
from typing import Dict, List, Any
import numpy as np
from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc
from loguru import logger
# Setup environment and logging
# Configure LightRAG's internal logger at DEBUG before any RAG work happens.
setup_logger("lightrag", level="DEBUG")
def get_required_env(name):
    """Return the value of environment variable *name*.

    Raises:
        ValueError: if the variable is unset or empty.
    """
    value = os.environ.get(name)
    if value:
        return value
    raise ValueError(f"Missing required environment variable: {name}")
def read_text_file(file_path):
    """Return the entire UTF-8 contents of *file_path*.

    Any failure (missing file, permission, decode error) is logged
    and re-raised unchanged.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            contents = fh.read()
    except Exception as e:
        logger.error(f"Error reading file {file_path}: {e}")
        raise
    return contents
async def llm_model_func(prompt, system_prompt=None, history_messages=None, keyword_extraction=False, **kwargs):
    """Run a chat completion against the OpenAI-compatible vLLM endpoint.

    Args:
        prompt: The user prompt text.
        system_prompt: Optional system message.
        history_messages: Optional prior conversation turns; defaults to an
            empty list. (Was a mutable default `[]`, which is shared across
            calls in Python — normalized to a None sentinel.)
        keyword_extraction: Accepted for LightRAG's calling convention;
            not used by this wrapper.
        **kwargs: Forwarded verbatim to openai_complete_if_cache.

    Raises:
        Re-raises any exception from the completion call after logging it.
    """
    if history_messages is None:
        history_messages = []
    try:
        return await openai_complete_if_cache(
            model=os.environ["LLM_MODEL"],
            prompt=prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            # vLLM does not validate the key, but the OpenAI client requires one.
            api_key="anything",
            base_url=os.environ["VLLM_LLM_HOST"],
            **kwargs,
        )
    except Exception as e:
        logger.error(f"Error in LLM call: {e}")
        raise
async def embedding_func(texts: list[str]) -> np.ndarray:
    """Embed *texts* via the OpenAI-compatible vLLM embedding endpoint.

    Failures are logged and re-raised unchanged.
    """
    try:
        embeddings = await openai_embed(
            texts,
            model=os.environ["EMBEDDING_MODEL"],
            api_key="anything",
            base_url=os.environ["VLLM_EMBED_HOST"],
        )
    except Exception as e:
        logger.error(f"Error in embedding call: {e}")
        raise
    return embeddings
async def get_embedding_dim():
    """Probe the embedding service once and return the vector dimensionality."""
    probe = await embedding_func(["This is a test sentence."])
    return probe.shape[1]
async def initialize_rag():
    """Construct and fully initialize a LightRAG instance.

    The working directory comes from KNOWLEDGE_GRAPH_PATH and the embedding
    dimension is detected dynamically by probing the embedding service.
    Errors are logged and re-raised.
    """
    try:
        working_dir = get_required_env("KNOWLEDGE_GRAPH_PATH")

        # Probe the embedding endpoint so the vector store is sized correctly.
        dim = await get_embedding_dim()
        logger.info(f"Detected embedding dimension: {dim}")

        embedder = EmbeddingFunc(
            embedding_dim=dim,
            max_token_size=8192,
            func=embedding_func,
        )
        cache_cfg = {
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        }
        rag = LightRAG(
            working_dir=working_dir,
            graph_storage="NetworkXStorage",
            kv_storage="JsonKVStorage",
            vector_storage="FaissVectorDBStorage",
            vector_db_storage_cls_kwargs={"cosine_better_than_threshold": 0.2},
            embedding_func=embedder,
            llm_model_func=llm_model_func,
            enable_llm_cache=False,
            enable_llm_cache_for_entity_extract=False,
            embedding_cache_config=cache_cfg,
        )

        # Storage backends and the shared pipeline status must be initialized
        # before any insert/query is attempted.
        await rag.initialize_storages()
        await initialize_pipeline_status()
        return rag
    except Exception as e:
        logger.error(f"Error initializing RAG: {e}")
        raise
def main():
    """Entry point: build the RAG instance and ingest every *.txt file.

    Reads the input directory from KNOWLEDGE_GRAPH_INPUT_DIR_PATH, loads each
    text file, and inserts the batch into the knowledge graph. Errors are
    logged and re-raised.
    """
    try:
        logger.info("Initializing LightRAG instance")
        rag = asyncio.run(initialize_rag())

        input_dir_path = get_required_env("KNOWLEDGE_GRAPH_INPUT_DIR_PATH")
        txt_files = glob.glob(f"{input_dir_path}/*.txt")
        logger.info(f"Found {len(txt_files)} text files in {input_dir_path}")
        # Bail out before doing any file I/O when there is nothing to ingest
        # (the original read every file first, then checked emptiness).
        if not txt_files:
            logger.warning(f"No text files found in {input_dir_path}")
            return

        texts = [read_text_file(path) for path in txt_files]
        # insert() performs chunking/embedding internally; its return value
        # is not used (the previous unused `results` binding is dropped).
        rag.insert(texts)
    except Exception as e:
        logger.error(f"Error in main process: {e}")
        raise
# Run the ingestion pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()