From 7211c57b3613c461c925c6da1395ae325446b184 Mon Sep 17 00:00:00 2001
From: TCUDIKEL
Date: Sat, 19 Apr 2025 20:54:35 +0300
Subject: [PATCH] initial commit

---
 .gitignore                |  16 ++++
 create.py                 | 116 ++++++++++++++++++++++++++++++++++++++
 data/.gitkeep             |   0
 inference.py              |  58 ++++++++++++++++++
 requirements.txt          |  12 ++++
 scripts/download_audio.sh |   8 +++
 6 files changed, 210 insertions(+)
 create mode 100755 .gitignore
 create mode 100755 create.py
 create mode 100644 data/.gitkeep
 create mode 100755 inference.py
 create mode 100755 requirements.txt
 create mode 100755 scripts/download_audio.sh

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..bbc1d11
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+.DS_Store
+.idea/
+data/
+*.pyc
+*.pyo
+*.pyd
+__pycache__/
+*.egg-info/
+*.egg
+venv
+env
+.env
+*.log
+*.sqlite*
+*.db
+*.db-journal
diff --git a/create.py b/create.py
new file mode 100755
index 0000000..9dc2246
--- /dev/null
+++ b/create.py
@@ -0,0 +1,116 @@
+import glob
+import os
+import asyncio
+
+import aiofiles
+from lightrag import LightRAG, QueryParam
+from lightrag.llm.azure_openai import azure_openai_embed, azure_openai_complete
+from lightrag.llm.ollama import ollama_model_complete, ollama_embed
+from lightrag.llm.openai import gpt_4o_mini_complete, gpt_4o_complete, openai_embed
+from lightrag.kg.shared_storage import initialize_pipeline_status
+from lightrag.utils import setup_logger, EmbeddingFunc
+from tqdm import tqdm
+from loguru import logger
+
+setup_logger("lightrag", level="INFO")
+
+
+def read_text_file(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        text = file.read()
+    return text
+
+
+
+async def initialize_rag_azure_openai():
+    rag = LightRAG(
+        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GPT4o"],
+        graph_storage="NetworkXStorage",
+        kv_storage="JsonKVStorage",
+        vector_storage="FaissVectorDBStorage",
+        vector_db_storage_cls_kwargs={
+            "cosine_better_than_threshold": 0.2  # keep vector hits above this cosine similarity
+        },
+        embedding_func=EmbeddingFunc(
+            embedding_dim=3072,
+            max_token_size=8192,
+            func=lambda texts: azure_openai_embed(texts)
+        ),
+        llm_model_func=azure_openai_complete,
+        enable_llm_cache=False,
+        enable_llm_cache_for_entity_extract=False,
+        embedding_cache_config={
+            "enabled": False,
+            "similarity_threshold": 0.95,
+            "use_llm_check": False
+        },
+    )
+    await rag.initialize_storages()
+    await initialize_pipeline_status()
+    return rag
+
+async def initialize_rag_ollama():
+    rag = LightRAG(
+        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GEMMA327b"],
+        graph_storage="NetworkXStorage",  # "Neo4JStorage",
+        kv_storage="JsonKVStorage",
+        vector_storage="FaissVectorDBStorage",
+        vector_db_storage_cls_kwargs={
+            "cosine_better_than_threshold": 0.25
+        },
+        llm_model_func=ollama_model_complete,
+        llm_model_name=os.environ["OLLAMA_LLM_MODEL"],
+        llm_model_kwargs={
+            "host": os.environ["OLLAMA_LLM_HOST"],
+            "options": {"num_ctx": 40000},  # Ollama context window size
+        },
+        enable_llm_cache=False,
+        enable_llm_cache_for_entity_extract=False,
+        embedding_func=EmbeddingFunc(
+            embedding_dim=1024,
+            max_token_size=8192,
+            func=lambda texts: ollama_embed(
+                texts,
+                embed_model=os.environ["OLLAMA_EMBED_MODEL"],
+                host=os.environ["OLLAMA_EMBED_HOST"]
+            ),
+        ),
+        embedding_cache_config={
+            "enabled": False,
+            "similarity_threshold": 0.95,
+            "use_llm_check": False
+        },
+    )
+    await rag.initialize_storages()
+    await initialize_pipeline_status()
+    return rag
+
+
+
+
+def main():
+    logger.info("Initializing LightRAG instance")
+    rag = asyncio.run(initialize_rag_ollama())
+
+    input_dir_path = "/Users/tcudikel/Dev/ancient-history/data/input/transcripts"
"/Users/tcudikel/Dev/ancient-history/data/input/transcripts" + txt_files = glob.glob(f"{input_dir_path}/*.txt") + logger.debug(f"found {len(txt_files)} files in {input_dir_path}") + for file_path in tqdm(txt_files, desc="Processing files", unit="file", miniters=1, ncols=100, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'): + text = read_text_file(file_path) + rag.insert(text) + + logger.success(f"{len(txt_files)} files inserted into the knowledge graph.") + """mode="mix" + rag.query( + "What are the top themes in this story?", + param=QueryParam( + mode=mode, + response_type="Single Paragraph", +# conversation_history=, +# history_turns=5, + ) + )""" + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/inference.py b/inference.py new file mode 100755 index 0000000..0605570 --- /dev/null +++ b/inference.py @@ -0,0 +1,58 @@ +import glob +import os +import asyncio + +import aiofiles +from lightrag import LightRAG, QueryParam +from lightrag.llm.azure_openai import azure_openai_embed, azure_openai_complete +from lightrag.llm.openai import gpt_4o_mini_complete, gpt_4o_complete, openai_embed +from lightrag.kg.shared_storage import initialize_pipeline_status +from lightrag.utils import setup_logger, EmbeddingFunc +from tqdm import tqdm + +setup_logger("lightrag", level="INFO") + + + +async def initialize_rag(): + rag = LightRAG( + working_dir="/Users/tcudikel/Dev/ancient-history/data/storage/base_gpt4o", + graph_storage="NetworkXStorage", + vector_storage="ChromaVectorDBStorage", + vector_db_storage_cls_kwargs={ + "local_path": "/Users/tcudikel/Dev/ancient-history/data/storage/base_gpt4o/vdb", + "cosine_better_than_threshold": 0.5, + }, + embedding_func=EmbeddingFunc( + embedding_dim=3072, + max_token_size=8192, + func=lambda texts: azure_openai_embed(texts) + ), + llm_model_func=azure_openai_complete + ) + + await rag.initialize_storages() + await initialize_pipeline_status() + + return rag + + +def main(): + rag = asyncio.run(initialize_rag()) + + mode = "mix" + response = rag.query( + "Which prophets exist before Noah?", + param=QueryParam( + mode=mode, + response_type="Single Paragraphs", + only_need_context=False, + # conversation_history=, + # history_turns=5, + ) + ) + print(response) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100755 index 0000000..8e74b3f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +imgui_bundle +moderngl +networkx +numpy +pyglm +python-louvain +scipy +tk \ No newline at end of file diff --git a/scripts/download_audio.sh b/scripts/download_audio.sh new file mode 100755 index 0000000..37e60ee --- /dev/null +++ b/scripts/download_audio.sh @@ -0,0 +1,8 @@ +#!/bin/bash + + +yt-dlp -x --audio-format mp3 --audio-quality 0 --output "%(title)s.%(ext)s" --restrict-filenames --yes-playlist $1 && find . -maxdepth 1 -type f -name "*_*" -print0 | while IFS= read -r -d '' file; do new_file=$(echo "$file" | sed 's/_/ /g'); mv "$file" "$new_file"; done + +mv *.mp3 ./data/input + +https://www.youtube.com/@the5thkind https://www.youtube.com/@coasttocoastamofficial https://www.youtube.com/@zoharancientdiscovery https://www.youtube.com/@zoharancienthistory https://www.youtube.com/@ancientastronautarchive \ No newline at end of file