# graphrag-ancient-history/create.py

import glob
import os
import statistics
import asyncio
from typing import Dict, List, Any
import numpy as np

from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc
from loguru import logger

# Configure the "lightrag" logger at DEBUG level. This is a module-level
# statement, so it runs as soon as the file is imported or executed.
setup_logger("lightrag", level="DEBUG")


def get_required_env(name):
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to look up.

    Returns:
        The variable's value.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    env_value = os.environ.get(name)
    if env_value:
        return env_value
    # Unset and empty are both treated as "missing".
    raise ValueError(f"Missing required environment variable: {name}")


def read_text_file(file_path):
    """Read ``file_path`` as UTF-8 text and return its full contents.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text of the whole file.

    Raises:
        Exception: Any error raised while opening or reading the file is
            logged and then re-raised unchanged.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as exc:
        logger.error(f"Error reading file {file_path}: {exc}")
        raise
    return contents


async def llm_model_func(prompt, system_prompt=None, history_messages=None, keyword_extraction=False, **kwargs):
    """Request a completion from the OpenAI-compatible LLM endpoint.

    The model name is read from the ``LLM_MODEL`` environment variable and the
    endpoint base URL from ``VLLM_LLM_HOST``; both are looked up on every call.

    Args:
        prompt: Prompt text forwarded to ``openai_complete_if_cache``.
        system_prompt: Optional system prompt, forwarded as-is.
        history_messages: Prior conversation messages to forward. ``None``
            (the default) is treated as an empty list.
        keyword_extraction: Accepted in the signature but not used or
            forwarded by this wrapper.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``openai_complete_if_cache``.

    Returns:
        Whatever ``openai_complete_if_cache`` returns.

    Raises:
        Exception: Any error (including a missing environment variable) is
            logged and re-raised unchanged.
    """
    # Use a fresh list per call instead of a mutable default argument, which
    # would be a single list object shared by every call.
    if history_messages is None:
        history_messages = []
    try:
        return await openai_complete_if_cache(
            model=os.environ["LLM_MODEL"],
            prompt=prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            # NOTE(review): placeholder key; presumably the endpoint does not
            # validate API keys - confirm against the deployment.
            api_key="anything",
            base_url=os.environ["VLLM_LLM_HOST"],
            **kwargs,
        )
    except Exception as e:
        logger.error(f"Error in LLM call: {e}")
        raise


async def embedding_func(texts: list[str]) -> np.ndarray:
    """Embed ``texts`` through the OpenAI-compatible embedding endpoint.

    The model name is read from ``EMBEDDING_MODEL`` and the endpoint base URL
    from ``VLLM_EMBED_HOST`` on every call.

    Args:
        texts: Strings to embed.

    Returns:
        The result of ``openai_embed`` for ``texts``.

    Raises:
        Exception: Any error (including a missing environment variable) is
            logged and re-raised unchanged.
    """
    try:
        model_name = os.environ["EMBEDDING_MODEL"]
        endpoint = os.environ["VLLM_EMBED_HOST"]
        vectors = await openai_embed(
            texts,
            model=model_name,
            api_key="anything",
            base_url=endpoint,
        )
    except Exception as exc:
        logger.error(f"Error in embedding call: {exc}")
        raise
    return vectors


async def get_embedding_dim():
    """Probe the embedding backend and return the size of its vectors.

    Embeds a single fixed sentence with ``embedding_func`` and returns the
    second entry of the result's ``shape``.
    """
    probe = await embedding_func(["This is a test sentence."])
    # shape[1] is read as the per-text vector length (result indexed as 2-D).
    return probe.shape[1]


async def initialize_rag():
    """Build a LightRAG instance and initialise its storages.

    The working directory comes from the ``KNOWLEDGE_GRAPH_PATH`` environment
    variable, and the embedding dimension is detected by probing the
    embedding backend through ``get_embedding_dim``.

    Returns:
        The LightRAG instance, after ``initialize_storages`` and
        ``initialize_pipeline_status`` have been awaited.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        working_dir = get_required_env("KNOWLEDGE_GRAPH_PATH")

        # Ask the embedding backend for its vector size rather than hard-coding it.
        dim = await get_embedding_dim()
        logger.info(f"Detected embedding dimension: {dim}")

        settings = {
            "working_dir": working_dir,
            "graph_storage": "NetworkXStorage",
            "kv_storage": "JsonKVStorage",
            "vector_storage": "FaissVectorDBStorage",
            "vector_db_storage_cls_kwargs": {
                "cosine_better_than_threshold": 0.2,
            },
            "embedding_func": EmbeddingFunc(
                embedding_dim=dim,
                max_token_size=8192,
                func=embedding_func,
            ),
            "llm_model_func": llm_model_func,
            # All caching is switched off.
            "enable_llm_cache": False,
            "enable_llm_cache_for_entity_extract": False,
            "embedding_cache_config": {
                "enabled": False,
                "similarity_threshold": 0.95,
                "use_llm_check": False,
            },
        }
        rag = LightRAG(**settings)

        # Storages and pipeline status are set up before the instance is returned.
        await rag.initialize_storages()
        await initialize_pipeline_status()
    except Exception as exc:
        logger.error(f"Error initializing RAG: {exc}")
        raise
    return rag


def main():
    """Load every ``*.txt`` file from the input directory into LightRAG.

    The input directory is read from the ``KNOWLEDGE_GRAPH_INPUT_DIR_PATH``
    environment variable. Input is located and read before the LightRAG
    instance is created, so a missing variable, an empty directory or an
    unreadable file is reported without first probing the embedding backend.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        input_dir_path = get_required_env("KNOWLEDGE_GRAPH_INPUT_DIR_PATH")

        # Escape the directory so glob metacharacters in its name are taken
        # literally, and sort so files are ingested in a reproducible order.
        pattern = os.path.join(glob.escape(input_dir_path), "*.txt")
        txt_files = sorted(glob.glob(pattern))
        logger.info(f"Found {len(txt_files)} text files in {input_dir_path}")

        if not txt_files:
            logger.warning(f"No text files found in {input_dir_path}")
            return

        texts = [read_text_file(txt_file) for txt_file in txt_files]

        # Only build the RAG instance once there is something to insert.
        logger.info("Initializing LightRAG instance")
        rag = asyncio.run(initialize_rag())

        rag.insert(texts)
    except Exception as e:
        logger.error(f"Error in main process: {e}")
        raise


# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()