initial commit

TCUDIKEL
2025-04-19 20:54:35 +03:00
commit 7211c57b36
6 changed files with 206 additions and 0 deletions

.gitignore vendored Executable file

@@ -0,0 +1,16 @@
.DS_Store
.idea/
data/
*.pyc
*.pyo
*.pyd
__pycache__/
*.egg-info/
*.egg
venv
env
.env
*.log
*.sqlite*
*.db
*.db-journal

create.py Executable file

@@ -0,0 +1,116 @@
import asyncio
import glob
import os

from lightrag import LightRAG, QueryParam
from lightrag.llm.azure_openai import azure_openai_embed, azure_openai_complete
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc
from tqdm import tqdm
from loguru import logger

setup_logger("lightrag", level="INFO")

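# Read a UTF-8 transcript file and return its contents as a string.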
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

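# Build a LightRAG instance backed by Azure OpenAI for completions and
# 3072-dim embeddings, persisted with NetworkX/JSON/FAISS storage.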
async def initialize_rag_azure_openai():
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GPT4o"],
        graph_storage="NetworkXStorage",
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.2
        },
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: azure_openai_embed(texts),
        ),
        llm_model_func=azure_openai_complete,
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag

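# Build a LightRAG instance served by local Ollama models: the chat model
# and a 1024-dim embedding model, with names and hosts taken from env vars.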
async def initialize_rag_ollama():
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GEMMA327b"],
        graph_storage="NetworkXStorage",  # alternatively "Neo4JStorage"
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.25
        },
        llm_model_func=ollama_model_complete,
        llm_model_name=os.environ["OLLAMA_LLM_MODEL"],
        llm_model_kwargs={
            "host": os.environ["OLLAMA_LLM_HOST"],
            "options": {"num_ctx": 40000},
        },
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_func=EmbeddingFunc(
            embedding_dim=1024,
            max_token_size=8192,
            func=lambda texts: ollama_embed(
                texts,
                embed_model=os.environ["OLLAMA_EMBED_MODEL"],
                host=os.environ["OLLAMA_EMBED_HOST"],
            ),
        ),
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag

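# Entry point: build the Ollama-backed RAG instance and ingest every
# transcript found in the input directory.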
def main():
    logger.info("Initializing LightRAG instance")
    rag = asyncio.run(initialize_rag_ollama())

    input_dir_path = "/Users/tcudikel/Dev/ancient-history/data/input/transcripts"
    txt_files = glob.glob(f"{input_dir_path}/*.txt")
    logger.debug(f"Found {len(txt_files)} files in {input_dir_path}")

    for file_path in tqdm(txt_files, desc="Processing files", unit="file",
                          miniters=1, ncols=100,
                          bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'):
        text = read_text_file(file_path)
        rag.insert(text)
    logger.success(f"{len(txt_files)} files inserted into the knowledge graph.")

    # Example query (disabled):
    # rag.query(
    #     "What are the top themes in this story?",
    #     param=QueryParam(
    #         mode="mix",
    #         response_type="Single Paragraph",
    #         # conversation_history=,
    #         # history_turns=5,
    #     )
    # )


if __name__ == "__main__":
    main()

data/.gitkeep Normal file

inference.py Executable file

@@ -0,0 +1,58 @@
import asyncio

from lightrag import LightRAG, QueryParam
from lightrag.llm.azure_openai import azure_openai_embed, azure_openai_complete
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc

setup_logger("lightrag", level="INFO")

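# Load the existing GPT-4o knowledge graph, using Chroma for vector search
# and Azure OpenAI for embeddings and completions.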
async def initialize_rag():
    rag = LightRAG(
        working_dir="/Users/tcudikel/Dev/ancient-history/data/storage/base_gpt4o",
        graph_storage="NetworkXStorage",
        vector_storage="ChromaVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "local_path": "/Users/tcudikel/Dev/ancient-history/data/storage/base_gpt4o/vdb",
            "cosine_better_than_threshold": 0.5,
        },
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: azure_openai_embed(texts),
        ),
        llm_model_func=azure_openai_complete,
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag

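# Run a single hybrid ("mix") query against the knowledge graph and print
# the response.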
def main():
    rag = asyncio.run(initialize_rag())
    mode = "mix"
    response = rag.query(
        "Which prophets exist before Noah?",
        param=QueryParam(
            mode=mode,
            response_type="Single Paragraph",
            only_need_context=False,
            # conversation_history=,
            # history_turns=5,
        )
    )
    print(response)


if __name__ == "__main__":
    main()

requirements.txt Executable file

@@ -0,0 +1,8 @@
imgui_bundle
moderngl
networkx
numpy
pyglm
python-louvain
scipy
tk

scripts/download_audio.sh Executable file

@@ -0,0 +1,8 @@
#!/bin/bash
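# Usage: ./download_audio.sh <YouTube playlist or channel URL>
# Downloads best-quality mp3 audio for each video, then replaces
# underscores in the downloaded filenames with spaces.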
yt-dlp -x --audio-format mp3 --audio-quality 0 \
  --output "%(title)s.%(ext)s" --restrict-filenames --yes-playlist "$1" \
  && find . -maxdepth 1 -type f -name "*_*" -print0 |
  while IFS= read -r -d '' file; do
    new_file=$(echo "$file" | sed 's/_/ /g')
    mv "$file" "$new_file"
  done
mv *.mp3 ./data/input
# Source channels:
# https://www.youtube.com/@the5thkind
# https://www.youtube.com/@coasttocoastamofficial
# https://www.youtube.com/@zoharancientdiscovery
# https://www.youtube.com/@zoharancienthistory
# https://www.youtube.com/@ancientastronautarchive