mirror of
https://github.com/BaranziniLab/KG_RAG.git
synced 2024-06-08 14:12:54 +03:00
56 lines
2.5 KiB
YAML
56 lines
2.5 KiB
YAML
---
|
|
|
|
# KG-RAG hyperparameters
CONTEXT_VOLUME: 150
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD: 75
QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY: 0.5
# Sentence-embedding models: one for node retrieval and one for context
# retrieval (as indicated by the key names).
SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL: 'sentence-transformers/all-MiniLM-L6-v2'
SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL: 'pritamdeka/S-PubMedBert-MS-MARCO'
|
|
|
|
# VectorDB hyperparameters
# NOTE(review): both paths below are absolute and machine-specific; adjust
# them to match your own environment.
VECTOR_DB_DISEASE_ENTITY_PATH: '/data/somank/KG_RAG/data/disease_with_relation_to_genes.pickle'
VECTOR_DB_PATH: '/data/somank/KG_RAG/data/vectorDB/disease_nodes_db'
VECTOR_DB_CHUNK_SIZE: 650
VECTOR_DB_CHUNK_OVERLAP: 200
VECTOR_DB_BATCH_SIZE: 200
VECTOR_DB_SENTENCE_EMBEDDING_MODEL: 'sentence-transformers/all-MiniLM-L6-v2'
|
|
|
|
# Path to the context file extracted from the SPOKE knowledge graph
NODE_CONTEXT_PATH: '/data/somank/KG_RAG/data/context_of_disease_which_has_relation_to_genes.csv'
|
|
|
|
# NOTE: this assumes your GPT config file is located in your $HOME directory;
# if it is not, change the path accordingly.
# The GPT '.env' file should contain a value for API_KEY, and optionally for
# API_VERSION and RESOURCE_ENDPOINT. Those parameters are not included in this
# YAML file.
# NOTE(review): YAML does not expand '$HOME' itself; presumably the consuming
# code expands it when reading this value - confirm.
GPT_CONFIG_FILE: '$HOME/.gpt_config.env'
# Allowed values: 'azure' or 'open_ai'.
GPT_API_TYPE: 'azure'
|
|
|
|
# Llama model name. Refer to Hugging Face for the correct name of the model
# version you would like to use, and make sure you have the right permission
# to use that model.
LLAMA_MODEL_NAME: 'meta-llama/Llama-2-13b-chat-hf'
LLAMA_MODEL_BRANCH: 'main'
|
|
|
|
# Directory for caching LLM model files: when a model is downloaded from
# Hugging Face, it is saved under this path.
LLM_CACHE_DIR: '/data/somank/llm_data/llm_models/huggingface'
LLM_TEMPERATURE: 0
|
|
|
|
# Directory where results are saved
SAVE_RESULTS_PATH: '/data/somank/kg_rag_fork/KG_RAG/data/results'
|
|
|
|
# File paths for the test questions
# NOTE(review): the first two paths are rooted at 'kg_rag_fork/KG_RAG' while
# the last two are rooted at 'KG_RAG' - confirm that this is intentional.
MCQ_PATH: '/data/somank/kg_rag_fork/KG_RAG/data/benchmark_data/mcq_questions.csv'
TRUE_FALSE_PATH: '/data/somank/kg_rag_fork/KG_RAG/data/benchmark_data/true_false_questions.csv'
SINGLE_DISEASE_ENTITY_FILE: '/data/somank/KG_RAG/data/hyperparam_tuning_data/single_disease_entity_prompts.csv'
TWO_DISEASE_ENTITY_FILE: '/data/somank/KG_RAG/data/hyperparam_tuning_data/two_disease_entity_prompts.csv'
|
|
|
|
# SPOKE API parameters
# Key names (including their mixed case) are kept exactly as written, since
# the consuming code looks them up by name.
BASE_URI: 'https://spoke.rbvi.ucsf.edu'
cutoff_Compound_max_phase: 3
cutoff_Protein_source:
  - 'SwissProt'
cutoff_DaG_diseases_sources:
  - 'knowledge'
  - 'experiments'
cutoff_DaG_textmining: 3
cutoff_CtD_phase: 3
cutoff_PiP_confidence: 0.7
cutoff_ACTeG_level:
  - 'Low'
  - 'Medium'
  - 'High'
depth: 1
cutoff_DpL_average_prevalence: 0.001
|
|
|