import asyncio
import glob
import os

from lightrag import LightRAG, QueryParam
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.llm.azure_openai import azure_openai_complete, azure_openai_embed
from lightrag.llm.ollama import ollama_embed, ollama_model_complete
from lightrag.utils import EmbeddingFunc, setup_logger
from loguru import logger
from tqdm import tqdm

setup_logger("lightrag", level="INFO")


def read_text_file(file_path):
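    """Read a UTF-8 text file and return its contents as a single string."""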
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


async def initialize_rag_azure_openai():
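    """Build a LightRAG instance backed by Azure OpenAI for completions and embeddings."""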
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GPT4o"],
        graph_storage="NetworkXStorage",
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
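            # Drop vector matches whose cosine similarity falls below this threshold.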
            "cosine_better_than_threshold": 0.2
        },
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: azure_openai_embed(texts),
        ),
        llm_model_func=azure_openai_complete,
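        # All caching is disabled so every run re-queries the live model and embedder.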
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag


async def initialize_rag_ollama():
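    """Build a LightRAG instance backed by a local Ollama server for completions and embeddings."""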
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GEMMA327b"],
        graph_storage="NetworkXStorage",  # "Neo4JStorage",
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.25
        },
        llm_model_func=ollama_model_complete,
        llm_model_name=os.environ["OLLAMA_LLM_MODEL"],
        llm_model_kwargs={
            "host": os.environ["OLLAMA_LLM_HOST"],
            "options": {"num_ctx": 40000},
        },
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_func=EmbeddingFunc(
            embedding_dim=1024,
            max_token_size=8192,
            func=lambda texts: ollama_embed(
                texts,
                embed_model=os.environ["OLLAMA_EMBED_MODEL"],
                host=os.environ["OLLAMA_EMBED_HOST"],
            ),
        ),
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag


def main():
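    """Initialize the Ollama-backed RAG instance and ingest every transcript in the input directory."""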
    logger.info("Initializing LightRAG instance")
    rag = asyncio.run(initialize_rag_ollama())

    input_dir_path = "/Users/tcudikel/Dev/ancient-history/data/input/transcripts"
    txt_files = glob.glob(f"{input_dir_path}/*.txt")
    logger.debug(f"Found {len(txt_files)} files in {input_dir_path}")
    for file_path in tqdm(
        txt_files,
        desc="Processing files",
        unit="file",
        miniters=1,
        ncols=100,
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
    ):
        text = read_text_file(file_path)
        rag.insert(text)

    logger.success(f"{len(txt_files)} files inserted into the knowledge graph.")
    # rag.query(
    #     "What are the top themes in this story?",
    #     param=QueryParam(
    #         mode="mix",
    #         response_type="Single Paragraph",
    #         # conversation_history=,
    #         # history_turns=5,
    #     ),
    # )


if __name__ == "__main__":
    main()