import asyncio
import glob
import os

from lightrag import LightRAG, QueryParam
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.llm.azure_openai import azure_openai_complete, azure_openai_embed
from lightrag.llm.ollama import ollama_embed, ollama_model_complete
from lightrag.utils import EmbeddingFunc, setup_logger
from loguru import logger
from tqdm import tqdm

setup_logger("lightrag", level="INFO")


def read_text_file(file_path):
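    """Read a UTF-8 text file and return its contents as a single string."""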
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


async def initialize_rag_azure_openai():
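    """Build a LightRAG instance backed by Azure OpenAI for completions and embeddings."""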
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GPT4o"],
        graph_storage="NetworkXStorage",
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
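            # Drop vector matches whose cosine similarity falls below this threshold.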
            "cosine_better_than_threshold": 0.2
        },
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: azure_openai_embed(texts),
        ),
        llm_model_func=azure_openai_complete,
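        # All caching is disabled so every run re-queries the live model and embedder.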
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag


async def initialize_rag_ollama():
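    """Build a LightRAG instance backed by a local Ollama server for completions and embeddings."""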
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GEMMA327b"],
        graph_storage="NetworkXStorage",  # "Neo4JStorage",
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.25
        },
        llm_model_func=ollama_model_complete,
        llm_model_name=os.environ["OLLAMA_LLM_MODEL"],
        llm_model_kwargs={
            "host": os.environ["OLLAMA_LLM_HOST"],
            "options": {"num_ctx": 40000},
        },
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_func=EmbeddingFunc(
            embedding_dim=1024,
            max_token_size=8192,
            func=lambda texts: ollama_embed(
                texts,
                embed_model=os.environ["OLLAMA_EMBED_MODEL"],
                host=os.environ["OLLAMA_EMBED_HOST"],
            ),
        ),
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag


def main():
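    """Initialize the Ollama-backed RAG instance and ingest every transcript in the input directory."""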
    logger.info("Initializing LightRAG instance")
    rag = asyncio.run(initialize_rag_ollama())

    input_dir_path = "/Users/tcudikel/Dev/ancient-history/data/input/transcripts"
    txt_files = glob.glob(f"{input_dir_path}/*.txt")
    logger.debug(f"Found {len(txt_files)} files in {input_dir_path}")
    for file_path in tqdm(
        txt_files,
        desc="Processing files",
        unit="file",
        miniters=1,
        ncols=100,
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
    ):
        text = read_text_file(file_path)
        rag.insert(text)

    logger.success(f"{len(txt_files)} files inserted into the knowledge graph.")
    # rag.query(
    #     "What are the top themes in this story?",
    #     param=QueryParam(
    #         mode="mix",
    #         response_type="Single Paragraph",
    #         # conversation_history=,
    #         # history_turns=5,
    #     ),
    # )


if __name__ == "__main__":
    main()