initial commit

TCUDIKEL
2025-04-19 20:54:35 +03:00
commit 7211c57b36
6 changed files with 206 additions and 0 deletions

.gitignore vendored Executable file

@@ -0,0 +1,16 @@
.DS_Store
.idea/
data/
*.pyc
*.pyo
*.pyd
__pycache__/
*.egg-info/
*.egg
venv
env
.env
*.log
*.sqlite*
*.db
*.db-journal

create.py Executable file

@@ -0,0 +1,116 @@
import asyncio
import glob
import os

from lightrag import LightRAG, QueryParam
from lightrag.llm.azure_openai import azure_openai_embed, azure_openai_complete
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc
from tqdm import tqdm
from loguru import logger

setup_logger("lightrag", level="INFO")

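# Read a UTF-8 transcript file and return its contents as a string.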
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

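# Build a LightRAG instance backed by Azure OpenAI for completions and
# 3072-dim embeddings, persisted with NetworkX/JSON/FAISS storage.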
async def initialize_rag_azure_openai():
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GPT4o"],
        graph_storage="NetworkXStorage",
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.2
        },
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: azure_openai_embed(texts),
        ),
        llm_model_func=azure_openai_complete,
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag

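# Build a LightRAG instance served by local Ollama models: the chat model
# and a 1024-dim embedding model, with names and hosts taken from env vars.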
async def initialize_rag_ollama():
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GEMMA327b"],
        graph_storage="NetworkXStorage",  # alternatively "Neo4JStorage"
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.25
        },
        llm_model_func=ollama_model_complete,
        llm_model_name=os.environ["OLLAMA_LLM_MODEL"],
        llm_model_kwargs={
            "host": os.environ["OLLAMA_LLM_HOST"],
            "options": {"num_ctx": 40000},
        },
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_func=EmbeddingFunc(
            embedding_dim=1024,
            max_token_size=8192,
            func=lambda texts: ollama_embed(
                texts,
                embed_model=os.environ["OLLAMA_EMBED_MODEL"],
                host=os.environ["OLLAMA_EMBED_HOST"],
            ),
        ),
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag

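# Entry point: build the Ollama-backed RAG instance and ingest every
# transcript found in the input directory.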
def main():
    logger.info("Initializing LightRAG instance")
    rag = asyncio.run(initialize_rag_ollama())

    input_dir_path = "/Users/tcudikel/Dev/ancient-history/data/input/transcripts"
    txt_files = glob.glob(f"{input_dir_path}/*.txt")
    logger.debug(f"Found {len(txt_files)} files in {input_dir_path}")

    for file_path in tqdm(txt_files, desc="Processing files", unit="file",
                          miniters=1, ncols=100,
                          bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'):
        text = read_text_file(file_path)
        rag.insert(text)
    logger.success(f"{len(txt_files)} files inserted into the knowledge graph.")

    # Example query (disabled):
    # rag.query(
    #     "What are the top themes in this story?",
    #     param=QueryParam(
    #         mode="mix",
    #         response_type="Single Paragraph",
    #         # conversation_history=,
    #         # history_turns=5,
    #     )
    # )


if __name__ == "__main__":
    main()

data/.gitkeep Normal file

inference.py Executable file

@@ -0,0 +1,58 @@
import asyncio

from lightrag import LightRAG, QueryParam
from lightrag.llm.azure_openai import azure_openai_embed, azure_openai_complete
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc

setup_logger("lightrag", level="INFO")

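# Load the existing GPT-4o knowledge graph, using Chroma for vector search
# and Azure OpenAI for embeddings and completions.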
async def initialize_rag():
    rag = LightRAG(
        working_dir="/Users/tcudikel/Dev/ancient-history/data/storage/base_gpt4o",
        graph_storage="NetworkXStorage",
        vector_storage="ChromaVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "local_path": "/Users/tcudikel/Dev/ancient-history/data/storage/base_gpt4o/vdb",
            "cosine_better_than_threshold": 0.5,
        },
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: azure_openai_embed(texts),
        ),
        llm_model_func=azure_openai_complete,
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag

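# Run a single hybrid ("mix") query against the knowledge graph and print
# the response.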
def main():
    rag = asyncio.run(initialize_rag())
    mode = "mix"
    response = rag.query(
        "Which prophets exist before Noah?",
        param=QueryParam(
            mode=mode,
            response_type="Single Paragraph",
            only_need_context=False,
            # conversation_history=,
            # history_turns=5,
        )
    )
    print(response)


if __name__ == "__main__":
    main()

requirements.txt Executable file

@@ -0,0 +1,8 @@
imgui_bundle
moderngl
networkx
numpy
pyglm
python-louvain
scipy
tk

scripts/download_audio.sh Executable file

@@ -0,0 +1,8 @@
#!/bin/bash
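# Usage: ./download_audio.sh <YouTube playlist or channel URL>
# Downloads best-quality mp3 audio for each video, then replaces
# underscores in the downloaded filenames with spaces.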
yt-dlp -x --audio-format mp3 --audio-quality 0 \
  --output "%(title)s.%(ext)s" --restrict-filenames --yes-playlist "$1" \
  && find . -maxdepth 1 -type f -name "*_*" -print0 |
  while IFS= read -r -d '' file; do
    new_file=$(echo "$file" | sed 's/_/ /g')
    mv "$file" "$new_file"
  done
mv *.mp3 ./data/input
# Source channels:
# https://www.youtube.com/@the5thkind
# https://www.youtube.com/@coasttocoastamofficial
# https://www.youtube.com/@zoharancientdiscovery
# https://www.youtube.com/@zoharancienthistory
# https://www.youtube.com/@ancientastronautarchive