Files
KG_RAG/.test_notebooks/testing_edge_property_fetch.ipynb
2024-03-22 16:18:59 -07:00

449 lines
19 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "d153feab",
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"\n",
"import pandas as pd\n",
"import requests\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "00e1c456",
"metadata": {},
"outputs": [],
"source": [
"# Base URI of the SPOKE API plus the cutoff_* values that the cells below\n",
"# send as query parameters with each neighborhood request.\n",
"config_data = dict(\n",
"    BASE_URI='https://spoke.rbvi.ucsf.edu',\n",
"    cutoff_Compound_max_phase=3,\n",
"    cutoff_Protein_source=['SwissProt'],\n",
"    cutoff_DaG_diseases_sources=['knowledge', 'experiments'],\n",
"    cutoff_DaG_textmining=3,\n",
"    cutoff_CtD_phase=3,\n",
"    cutoff_PiP_confidence=0.7,\n",
"    cutoff_ACTeG_level=['Low', 'Medium', 'High'],\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "dcf1b00e",
"metadata": {},
"outputs": [],
"source": [
"def get_spoke_api_resp(base_uri, end_point, params=None, timeout=60):\n",
"    \"\"\"Send a GET request to ``base_uri + end_point`` and return the response.\n",
"\n",
"    Args:\n",
"        base_uri: Scheme and host of the API (e.g. ``config_data['BASE_URI']``).\n",
"        end_point: Path appended verbatim to ``base_uri``.\n",
"        params: Optional mapping of query parameters.\n",
"        timeout: Seconds to wait on the server before ``requests`` raises a\n",
"            timeout error; pass ``None`` to wait indefinitely, as this helper\n",
"            did before the parameter existed.\n",
"\n",
"    Returns:\n",
"        The ``requests.Response``; its status code is not checked here.\n",
"    \"\"\"\n",
"    uri = base_uri + end_point\n",
"    # A falsy ``params`` (None or empty) is sent as \"no query string\", which is\n",
"    # what the former separate ``requests.get(uri)`` branch did.\n",
"    return requests.get(uri, params=params or None, timeout=timeout)\n",
"\n",
" \n",
"def _spoke_node_label(node_type, properties):\n",
"    \"\"\"Return the display text for a node: ``description`` for Proteins, ``name`` otherwise, ``identifier`` as fallback.\"\"\"\n",
"    preferred_key = \"description\" if node_type == \"Protein\" else \"name\"\n",
"    try:\n",
"        return properties[preferred_key]\n",
"    except KeyError:\n",
"        return properties[\"identifier\"]\n",
"\n",
"\n",
"def _spoke_edge_provenance(properties):\n",
"    \"\"\"Return a provenance string for an edge, trying ``sources``, then ``source``, then the preprint/PMID lists.\"\"\"\n",
"    try:\n",
"        return \", \".join(properties[\"sources\"])\n",
"    except (KeyError, TypeError):\n",
"        pass\n",
"    try:\n",
"        provenance = properties[\"source\"]\n",
"        if isinstance(provenance, list):\n",
"            provenance = \", \".join(provenance)\n",
"        return provenance\n",
"    except (KeyError, TypeError):\n",
"        pass\n",
"    try:\n",
"        preprint_list = ast.literal_eval(properties[\"preprint_list\"])\n",
"        if len(preprint_list) > 0:\n",
"            return \", \".join(preprint_list)\n",
"        # Build a real list: calling len() on a lazy map object raises TypeError,\n",
"        # which previously sent every PMID-backed edge to the \"SPOKE-KG\" fallback.\n",
"        pmid_list = [\"pubmedId:\" + str(pmid) for pmid in ast.literal_eval(properties[\"pmid_list\"])]\n",
"        if len(pmid_list) > 0:\n",
"            return \", \".join(pmid_list)\n",
"        return \"Based on data from Institute For Systems Biology (ISB)\"\n",
"    except (KeyError, TypeError, ValueError, SyntaxError):\n",
"        # Missing keys or unparsable list literals: no usable provenance on the edge.\n",
"        return \"SPOKE-KG\"\n",
"\n",
"\n",
"def get_context_using_spoke_api(node_value):\n",
"    \"\"\"Build a plain-text context paragraph for a Disease node from the SPOKE API.\n",
"\n",
"    Reads the available node and edge types from ``/api/v1/types``, requests the\n",
"    neighborhood of the Disease whose ``name`` is ``node_value`` with the cutoffs\n",
"    from ``config_data``, and renders one sentence per edge. A closing sentence\n",
"    with the source and identifier of the first returned item is appended.\n",
"\n",
"    Args:\n",
"        node_value: Disease name placed in the neighborhood endpoint path.\n",
"\n",
"    Returns:\n",
"        str: All edge sentences joined by spaces, followed by the closing sentence.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If either API call answers with an error status.\n",
"    \"\"\"\n",
"    base_uri = config_data['BASE_URI']\n",
"\n",
"    types_response = get_spoke_api_resp(base_uri, \"/api/v1/types\")\n",
"    # Fail with the HTTP error itself rather than a confusing KeyError further down.\n",
"    types_response.raise_for_status()\n",
"    data_spoke_types = types_response.json()\n",
"    node_types_to_remove = [\"DatabaseTimestamp\", \"Version\"]\n",
"    filtered_node_types = [\n",
"        node_type for node_type in data_spoke_types[\"nodes\"] if node_type not in node_types_to_remove\n",
"    ]\n",
"    api_params = {\n",
"        'node_filters': filtered_node_types,\n",
"        'edge_filters': list(data_spoke_types[\"edges\"]),\n",
"        'cutoff_Compound_max_phase': config_data['cutoff_Compound_max_phase'],\n",
"        'cutoff_Protein_source': config_data['cutoff_Protein_source'],\n",
"        'cutoff_DaG_diseases_sources': config_data['cutoff_DaG_diseases_sources'],\n",
"        'cutoff_DaG_textmining': config_data['cutoff_DaG_textmining'],\n",
"        'cutoff_CtD_phase': config_data['cutoff_CtD_phase'],\n",
"        'cutoff_PiP_confidence': config_data['cutoff_PiP_confidence'],\n",
"        'cutoff_ACTeG_level': config_data['cutoff_ACTeG_level'],\n",
"    }\n",
"    nbr_end_point = \"/api/v1/neighborhood/{}/{}/{}\".format(\"Disease\", \"name\", node_value)\n",
"    nbr_response = get_spoke_api_resp(base_uri, nbr_end_point, params=api_params)\n",
"    nbr_response.raise_for_status()\n",
"    node_context = nbr_response.json()\n",
"\n",
"    # Items whose neo4j_type contains \"_\" are handled as edges, all others as nodes.\n",
"    nbr_nodes = []\n",
"    nbr_edges = []\n",
"    for item in node_context:\n",
"        data = item[\"data\"]\n",
"        neo4j_type = data[\"neo4j_type\"]\n",
"        if \"_\" not in neo4j_type:\n",
"            nbr_nodes.append((neo4j_type, data[\"id\"], _spoke_node_label(neo4j_type, data[\"properties\"])))\n",
"        else:\n",
"            provenance = _spoke_edge_provenance(data.get(\"properties\", {}))\n",
"            nbr_edges.append((data[\"source\"], neo4j_type, data[\"target\"], provenance))\n",
"\n",
"    nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=[\"node_type\", \"node_id\", \"node_name\"])\n",
"    nbr_edges_df = pd.DataFrame(nbr_edges, columns=[\"source\", \"edge_type\", \"target\", \"provenance\"])\n",
"\n",
"    # Replace each edge's source id with a \"<type> <name>\" label; the default inner\n",
"    # join drops edges whose source node is not among the returned nodes.\n",
"    merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on=\"source\", right_on=\"node_id\").drop(\"node_id\", axis=1)\n",
"    merge_1.loc[:, \"node_name\"] = merge_1.node_type + \" \" + merge_1.node_name\n",
"    merge_1 = merge_1.drop([\"source\", \"node_type\"], axis=1).rename(columns={\"node_name\": \"source\"})\n",
"\n",
"    # Same replacement for the target id.\n",
"    merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on=\"target\", right_on=\"node_id\").drop(\"node_id\", axis=1)\n",
"    merge_2.loc[:, \"node_name\"] = merge_2.node_type + \" \" + merge_2.node_name\n",
"    merge_2 = merge_2.drop([\"target\", \"node_type\"], axis=1).rename(columns={\"node_name\": \"target\"})\n",
"\n",
"    # Copy so the column assignments below write to an independent frame, not a slice.\n",
"    merge_2 = merge_2[[\"source\", \"edge_type\", \"target\", \"provenance\"]].copy()\n",
"    # The predicate is the part of the edge type before the first underscore.\n",
"    merge_2.loc[:, \"predicate\"] = merge_2.edge_type.apply(lambda x: x.split(\"_\")[0])\n",
"    merge_2.loc[:, \"context\"] = merge_2.source + \" \" + merge_2.predicate.str.lower() + \" \" + merge_2.target + \" and Provenance of this association is \" + merge_2.provenance + \". \"\n",
"    context = merge_2['context'].str.cat(sep=' ')\n",
"\n",
"    # The first response item is used as the queried node itself (raises IndexError on an empty response).\n",
"    root_properties = node_context[0][\"data\"][\"properties\"]\n",
"    context += node_value + \" has a \" + root_properties[\"source\"] + \" identifier of \" + root_properties[\"identifier\"] + \" and Provenance of this association is \" + root_properties[\"source\"] + \".\"\n",
"    return context\n"
]
},
{
"cell_type": "code",
"execution_count": 161,
"id": "5b7bd91c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 68.7 ms, sys: 6.93 ms, total: 75.6 ms\n",
"Wall time: 286 ms\n"
]
},
{
"data": {
"text/plain": [
"124"
]
},
"execution_count": 161,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"node_value = 'giant cell glioblastoma'\n",
"\n",
"# Ask the API which node and edge types exist so the query can request all of them.\n",
"type_end_point = \"/api/v1/types\"\n",
"result = get_spoke_api_resp(config_data['BASE_URI'], type_end_point)\n",
"data_spoke_types = result.json()\n",
"node_types = [*data_spoke_types[\"nodes\"]]\n",
"edge_types = [*data_spoke_types[\"edges\"]]\n",
"node_types_to_remove = [\"DatabaseTimestamp\", \"Version\"]\n",
"filtered_node_types = list(filter(lambda name: name not in node_types_to_remove, node_types))\n",
"\n",
"# Query parameters: type filters first, then the configured cutoffs, then the depth.\n",
"api_params = dict(node_filters=filtered_node_types, edge_filters=edge_types)\n",
"api_params.update({\n",
"    cutoff_key: config_data[cutoff_key]\n",
"    for cutoff_key in (\n",
"        'cutoff_Compound_max_phase',\n",
"        'cutoff_Protein_source',\n",
"        'cutoff_DaG_diseases_sources',\n",
"        'cutoff_DaG_textmining',\n",
"        'cutoff_CtD_phase',\n",
"        'cutoff_PiP_confidence',\n",
"        'cutoff_ACTeG_level',\n",
"    )\n",
"})\n",
"api_params['depth'] = 1\n",
"\n",
"# Fetch the neighborhood of the Disease node with this name and report how many items came back.\n",
"node_type, attribute = \"Disease\", \"name\"\n",
"nbr_end_point = f\"/api/v1/neighborhood/{node_type}/{attribute}/{node_value}\"\n",
"result = get_spoke_api_resp(config_data['BASE_URI'], nbr_end_point, params=api_params)\n",
"node_context = result.json()\n",
"len(node_context)\n"
]
},
{
"cell_type": "code",
"execution_count": 143,
"id": "100577db",
"metadata": {},
"outputs": [],
"source": [
"# Set to True to append each edge's property dict to its context sentence.\n",
"edge_evidence = False\n",
"\n",
"nbr_nodes = []\n",
"nbr_edges = []\n",
"for item in node_context:\n",
"    if \"_\" not in item[\"data\"][\"neo4j_type\"]:\n",
"        # Node: Proteins are labelled by description, everything else by name,\n",
"        # with the identifier as fallback when that key is missing.\n",
"        label_key = \"description\" if item[\"data\"][\"neo4j_type\"] == \"Protein\" else \"name\"\n",
"        try:\n",
"            node_label = item[\"data\"][\"properties\"][label_key]\n",
"        except KeyError:\n",
"            node_label = item[\"data\"][\"properties\"][\"identifier\"]\n",
"        nbr_nodes.append((item[\"data\"][\"neo4j_type\"], item[\"data\"][\"id\"], node_label))\n",
"    else:\n",
"        # Edge: keep the whole property dict as evidence (None when the edge has none).\n",
"        evidence = item[\"data\"].get(\"properties\")\n",
"        edge_properties = evidence or {}\n",
"        try:\n",
"            provenance = \", \".join(edge_properties[\"sources\"])\n",
"        except (KeyError, TypeError):\n",
"            try:\n",
"                provenance = edge_properties[\"source\"]\n",
"                if isinstance(provenance, list):\n",
"                    provenance = \", \".join(provenance)\n",
"            except (KeyError, TypeError):\n",
"                try:\n",
"                    preprint_list = ast.literal_eval(edge_properties[\"preprint_list\"])\n",
"                    if len(preprint_list) > 0:\n",
"                        provenance = \", \".join(preprint_list)\n",
"                    else:\n",
"                        # A real list, not a lazy map: len() of a map object raises TypeError,\n",
"                        # which previously sent every PMID-backed edge to the \"SPOKE-KG\" fallback.\n",
"                        pmid_list = [\"pubmedId:\" + str(pmid) for pmid in ast.literal_eval(edge_properties[\"pmid_list\"])]\n",
"                        if len(pmid_list) > 0:\n",
"                            provenance = \", \".join(pmid_list)\n",
"                        else:\n",
"                            provenance = \"Based on data from Institute For Systems Biology (ISB)\"\n",
"                except (KeyError, TypeError, ValueError, SyntaxError):\n",
"                    provenance = \"SPOKE-KG\"\n",
"        nbr_edges.append((item[\"data\"][\"source\"], item[\"data\"][\"neo4j_type\"], item[\"data\"][\"target\"], provenance, evidence))\n",
"\n",
"# Build the frames once, after the loop, instead of on every iteration.\n",
"nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=[\"node_type\", \"node_id\", \"node_name\"])\n",
"nbr_edges_df = pd.DataFrame(nbr_edges, columns=[\"source\", \"edge_type\", \"target\", \"provenance\", \"evidence\"])\n",
"\n",
"# Replace the source id, then the target id, with a \"<type> <name>\" label.\n",
"merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on=\"source\", right_on=\"node_id\").drop(\"node_id\", axis=1)\n",
"merge_1.loc[:, \"node_name\"] = merge_1.node_type + \" \" + merge_1.node_name\n",
"merge_1 = merge_1.drop([\"source\", \"node_type\"], axis=1).rename(columns={\"node_name\": \"source\"})\n",
"merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on=\"target\", right_on=\"node_id\").drop(\"node_id\", axis=1)\n",
"merge_2.loc[:, \"node_name\"] = merge_2.node_type + \" \" + merge_2.node_name\n",
"merge_2 = merge_2.drop([\"target\", \"node_type\"], axis=1).rename(columns={\"node_name\": \"target\"})\n",
"\n",
"# Copy so the column assignments below write to an independent frame, not a slice.\n",
"merge_2 = merge_2[[\"source\", \"edge_type\", \"target\", \"provenance\", \"evidence\"]].copy()\n",
"merge_2.loc[:, \"predicate\"] = merge_2.edge_type.apply(lambda x: x.split(\"_\")[0])\n",
"if edge_evidence:\n",
"    merge_2.loc[:, \"context\"] = merge_2.source + \" \" + merge_2.predicate.str.lower() + \" \" + merge_2.target + \" and Provenance of this association is \" + merge_2.provenance + \" and attributes associated with this association is in the following JSON format:\\n \" + merge_2.evidence.astype('str') + \"\\n\\n\"\n",
"else:\n",
"    merge_2.loc[:, \"context\"] = merge_2.source + \" \" + merge_2.predicate.str.lower() + \" \" + merge_2.target + \" and Provenance of this association is \" + merge_2.provenance + \". \"\n",
"context = merge_2.context.str.cat(sep=' ')\n",
"# The first response item is used as the queried node itself.\n",
"context += node_value + \" has a \" + node_context[0][\"data\"][\"properties\"][\"source\"] + \" identifier of \" + node_context[0][\"data\"][\"properties\"][\"identifier\"] + \" and Provenance of this is from \" + node_context[0][\"data\"][\"properties\"][\"source\"] + \".\"\n"
]
},
{
"cell_type": "code",
"execution_count": 154,
"id": "490adee9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Disease secondary progressive multiple sclerosis isa Disease multiple sclerosis and Provenance of this association is Disease Ontology. '"
]
},
"execution_count": 154,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"high_similarity_context = [\n",
"    'Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS. ',\n",
"    'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES. ',\n",
"    'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS. ',\n",
"]\n",
"\n",
"# Only the last expression of a cell is rendered automatically, so the filtered\n",
"# rows are displayed explicitly instead of being computed and thrown away.\n",
"display(merge_2[merge_2.context.isin(high_similarity_context)])\n",
"merge_2.context.values[0]\n"
]
},
{
"cell_type": "code",
"execution_count": 134,
"id": "e3db5024",
"metadata": {},
"outputs": [],
"source": [
"high_similarity_context = [\n",
"    'Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS.',\n",
"    'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES.',\n",
"    'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS.',\n",
"]\n",
"high_similarity_context.append('Gene Xs sds associates with Disease multiple sclerosis and Provenance of this association is GWAS.')\n",
"node_name = 'multiple sclerosis'\n",
"node_types = nbr_nodes_df.node_type.unique()\n",
"\n",
"# This notebook treats items whose neo4j_type contains \"_\" as edges and the rest as nodes.\n",
"nodes = [entry for entry in node_context if '_' not in entry['data']['neo4j_type']]\n",
"edges = [entry for entry in node_context if '_' in entry['data']['neo4j_type']]\n",
"\n",
"# Search every node for the matching name. Taking element [0] of the mapped list only\n",
"# worked when the match happened to be the first node, and nodes without a 'name'\n",
"# property raised KeyError. The result is None when no node matches.\n",
"source_node_id = next(\n",
"    (node['data']['id'] for node in nodes if node['data']['properties'].get('name') == node_name),\n",
"    None,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 132,
"id": "c7b6cf3f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(array([0]),)\n",
"(array([4]),)\n"
]
},
{
"data": {
"text/plain": [
"'HLA-DQA1 and Provenance of this association is GWAS.'"
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"# Work on the first sentence of the list.\n",
"sentence = high_similarity_context[0]\n",
"\n",
"# For every node type mentioned in the sentence, print the positions at which it\n",
"# appears as a space-delimited token.\n",
"for node_type in node_types:\n",
"    if node_type not in sentence:\n",
"        continue\n",
"    print(np.where(np.array(sentence.split(' ')) == node_type))\n",
"\n",
"# Text following the last 'Gene ' marker (the whole sentence when there is none).\n",
"sentence.rsplit('Gene ', 1)[-1]"
]
},
{
"cell_type": "code",
"execution_count": 133,
"id": "768d83c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS.',\n",
" 'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES.',\n",
" 'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS.']"
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "31239a88",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'diseases_identifiers': ['https://diseases.jensenlab.org/Entity?documents=10&type1=9606&id1=ENSP00000369889&type2=-26&id2=DOID:0080044',\n",
" 'MedlinePlus'],\n",
" 'diseases_scores': ['6.503', 'CURATED'],\n",
" 'sources': ['DISEASES'],\n",
" 'diseases_sources': ['textmining', 'knowledge'],\n",
" 'diseases_confidences': [3.252, 5.0]}"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the property dict of `item`. NOTE(review): `item` is presumably the loop variable\n",
"# left over from the `for item in node_context` cell, so this depends on that cell having run.\n",
"item[\"data\"][\"properties\"]"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "019ceba5",
"metadata": {},
"outputs": [],
"source": [
"source = 'multiple sclerosis'\n",
"target = 'COL2A1'\n",
"\n",
"# Partition the response: a neo4j_type without \"_\" goes to nodes, one with \"_\" to edges.\n",
"nodes = [entry for entry in node_context if '_' not in entry['data']['neo4j_type']]\n",
"edges = [entry for entry in node_context if '_' in entry['data']['neo4j_type']]\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "84481154",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"152375"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# def get_node_id(inp_node)\n",
"\n",
"# Id of the first node whose name equals `source`, or None when there is no match.\n",
"# Indexing the mapped list with [0] only inspected the first node and raised KeyError\n",
"# for nodes that have no 'name' property.\n",
"next((node['data']['id'] for node in nodes if node['data']['properties'].get('name') == source), None)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e811b4a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}