initial commit
16 .gitignore vendored Executable file
@@ -0,0 +1,16 @@
.DS_Store
.idea/
data/
*.pyc
*.pyo
*.pyd
__pycache__/
*.egg-info/
*.egg
venv
env
.env
*.log
*.sqlite*
*.db
*.db-journal
116 create.py Executable file
@@ -0,0 +1,116 @@
import glob
import os
import asyncio

import aiofiles
from lightrag import LightRAG, QueryParam
from lightrag.llm.azure_openai import azure_openai_embed, azure_openai_complete
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.llm.openai import gpt_4o_mini_complete, gpt_4o_complete, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc
from tqdm import tqdm
from loguru import logger

setup_logger("lightrag", level="INFO")


def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text


async def initialize_rag_azure_openai():
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GPT4o"],
        graph_storage="NetworkXStorage",
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.2
        },
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: azure_openai_embed(texts)
        ),
        llm_model_func=azure_openai_complete,
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag


async def initialize_rag_ollama():
    rag = LightRAG(
        working_dir=os.environ["KNOWLEDGE_GRAPH_PATH_GEMMA327b"],
        graph_storage="NetworkXStorage",  # "Neo4JStorage",
        kv_storage="JsonKVStorage",
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.25
        },
        llm_model_func=ollama_model_complete,
        llm_model_name=os.environ["OLLAMA_LLM_MODEL"],
        llm_model_kwargs={
            "host": os.environ["OLLAMA_LLM_HOST"],
            "options": {"num_ctx": 40000},
        },
        enable_llm_cache=False,
        enable_llm_cache_for_entity_extract=False,
        embedding_func=EmbeddingFunc(
            embedding_dim=1024,
            max_token_size=8192,
            func=lambda texts: ollama_embed(
                texts,
                embed_model=os.environ["OLLAMA_EMBED_MODEL"],
                host=os.environ["OLLAMA_EMBED_HOST"]
            ),
        ),
        embedding_cache_config={
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False
        },
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag


def main():
    logger.info("Initializing LightRAG instance")
    rag = asyncio.run(initialize_rag_ollama())

    input_dir_path = "/Users/tcudikel/Dev/ancient-history/data/input/transcripts"
    txt_files = glob.glob(f"{input_dir_path}/*.txt")
    logger.debug(f"found {len(txt_files)} files in {input_dir_path}")
    for file_path in tqdm(txt_files, desc="Processing files", unit="file", miniters=1, ncols=100,
                          bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'):
        text = read_text_file(file_path)
        rag.insert(text)

    logger.success(f"{len(txt_files)} files inserted into the knowledge graph.")
    """mode="mix"
    rag.query(
        "What are the top themes in this story?",
        param=QueryParam(
            mode=mode,
            response_type="Single Paragraph",
            # conversation_history=,
            # history_turns=5,
        )
    )"""


if __name__ == "__main__":
    main()
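Note: the loop above inserts one file per call through LightRAG's synchronous wrapper, which spins the event loop up and down around each insert. If the installed LightRAG version exposes the async ainsert API (recent releases do), insertion can stay inside one event loop — a minimal sketch, assuming the same read_text_file helper and txt_files list from above:

    async def build_graph(paths):
        # Initialize once, then insert each transcript without leaving the loop.
        rag = await initialize_rag_ollama()
        for p in tqdm(paths, desc="Processing files", unit="file"):
            await rag.ainsert(read_text_file(p))
        return rag

    # Usage sketch:
    # rag = asyncio.run(build_graph(txt_files))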
0 data/.gitkeep Normal file
58 inference.py Executable file
@@ -0,0 +1,58 @@
import glob
import os
import asyncio

import aiofiles
from lightrag import LightRAG, QueryParam
from lightrag.llm.azure_openai import azure_openai_embed, azure_openai_complete
from lightrag.llm.openai import gpt_4o_mini_complete, gpt_4o_complete, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger, EmbeddingFunc
from tqdm import tqdm

setup_logger("lightrag", level="INFO")


async def initialize_rag():
    rag = LightRAG(
        working_dir="/Users/tcudikel/Dev/ancient-history/data/storage/base_gpt4o",
        graph_storage="NetworkXStorage",
        vector_storage="ChromaVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "local_path": "/Users/tcudikel/Dev/ancient-history/data/storage/base_gpt4o/vdb",
            "cosine_better_than_threshold": 0.5,
        },
        embedding_func=EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: azure_openai_embed(texts)
        ),
        llm_model_func=azure_openai_complete
    )

    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


def main():
    rag = asyncio.run(initialize_rag())

    mode = "mix"
    response = rag.query(
        "Which prophets exist before Noah?",
        param=QueryParam(
            mode=mode,
            response_type="Single Paragraph",
            only_need_context=False,
            # conversation_history=,
            # history_turns=5,
        )
    )
    print(response)


if __name__ == "__main__":
    main()
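As a side note, QueryParam's mode field accepts "naive", "local", "global", and "hybrid" in addition to the "mix" used above (per the LightRAG docs; worth verifying against the installed version). A minimal sketch for comparing retrieval modes on the same question, assuming the same rag instance:

    for mode in ("naive", "local", "global", "hybrid", "mix"):
        # Same question, different retrieval strategy per pass.
        answer = rag.query(
            "Which prophets exist before Noah?",
            param=QueryParam(mode=mode, response_type="Single Paragraph"),
        )
        print(f"--- {mode} ---\n{answer}")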
8 requirements.txt Executable file
@@ -0,0 +1,8 @@
imgui_bundle
moderngl
networkx
numpy
pyglm
python-louvain
scipy
tk
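Note: the scripts in this commit also import packages that are not pinned here — lightrag, tqdm, loguru, and aiofiles in the Python files, plus yt-dlp for the download script — so they would need to be installed separately (e.g. `pip install lightrag-hku tqdm loguru aiofiles yt-dlp`, assuming lightrag-hku is the PyPI name of the LightRAG package in use).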
8 scripts/download_audio.sh Executable file
@@ -0,0 +1,8 @@
#!/bin/bash
# Usage: ./scripts/download_audio.sh <YouTube playlist or channel URL>

yt-dlp -x --audio-format mp3 --audio-quality 0 --output "%(title)s.%(ext)s" --restrict-filenames --yes-playlist "$1" && find . -maxdepth 1 -type f -name "*_*" -print0 | while IFS= read -r -d '' file; do new_file=$(echo "$file" | sed 's/_/ /g'); mv "$file" "$new_file"; done

mv ./*.mp3 ./data/input

# Channel sources:
# https://www.youtube.com/@the5thkind https://www.youtube.com/@coasttocoastamofficial https://www.youtube.com/@zoharancientdiscovery https://www.youtube.com/@zoharancienthistory https://www.youtube.com/@ancientastronautarchive