KG_RAG/.test_notebooks/testing_edge_property_fetch.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d153feab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "00e1c456",
   "metadata": {},
   "outputs": [],
   "source": [
    "config_data = {\n",
    "    'BASE_URI' : 'https://spoke.rbvi.ucsf.edu',\n",
    "    'cutoff_Compound_max_phase' : 3,\n",
    "    'cutoff_Protein_source' : ['SwissProt'],\n",
    "    'cutoff_DaG_diseases_sources' : ['knowledge', 'experiments'],\n",
    "    'cutoff_DaG_textmining' : 3,\n",
    "    'cutoff_CtD_phase' : 3,\n",
    "    'cutoff_PiP_confidence' : 0.7,\n",
    "    'cutoff_ACTeG_level' : ['Low', 'Medium', 'High']    \n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "dcf1b00e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_spoke_api_resp(base_uri, end_point, params=None):\n",
    "    uri = base_uri + end_point\n",
    "    if params:\n",
    "        return requests.get(uri, params=params)\n",
    "    else:\n",
    "        return requests.get(uri)\n",
    "\n",
    "    \n",
    "def get_context_using_spoke_api(node_value):\n",
    "    type_end_point = \"/api/v1/types\"\n",
    "    result = get_spoke_api_resp(config_data['BASE_URI'], type_end_point)\n",
    "    data_spoke_types = result.json()\n",
    "    node_types = list(data_spoke_types[\"nodes\"].keys())\n",
    "    edge_types = list(data_spoke_types[\"edges\"].keys())\n",
    "    node_types_to_remove = [\"DatabaseTimestamp\", \"Version\"]\n",
    "    filtered_node_types = [node_type for node_type in node_types if node_type not in node_types_to_remove]\n",
    "    api_params = {\n",
    "        'node_filters' : filtered_node_types,\n",
    "        'edge_filters': edge_types,\n",
    "        'cutoff_Compound_max_phase': config_data['cutoff_Compound_max_phase'],\n",
    "        'cutoff_Protein_source': config_data['cutoff_Protein_source'],\n",
    "        'cutoff_DaG_diseases_sources': config_data['cutoff_DaG_diseases_sources'],\n",
    "        'cutoff_DaG_textmining': config_data['cutoff_DaG_textmining'],\n",
    "        'cutoff_CtD_phase': config_data['cutoff_CtD_phase'],\n",
    "        'cutoff_PiP_confidence': config_data['cutoff_PiP_confidence'],\n",
    "        'cutoff_ACTeG_level': config_data['cutoff_ACTeG_level']\n",
    "    }\n",
    "    node_type = \"Disease\"\n",
    "    attribute = \"name\"\n",
    "    nbr_end_point = \"/api/v1/neighborhood/{}/{}/{}\".format(node_type, attribute, node_value)\n",
    "    result = get_spoke_api_resp(config_data['BASE_URI'], nbr_end_point, params=api_params)\n",
    "    node_context = result.json()\n",
    "    nbr_nodes = []\n",
    "    nbr_edges = []\n",
    "    for item in node_context:\n",
    "        if \"_\" not in item[\"data\"][\"neo4j_type\"]:\n",
    "            try:\n",
    "                if item[\"data\"][\"neo4j_type\"] == \"Protein\":\n",
    "                    nbr_nodes.append((item[\"data\"][\"neo4j_type\"], item[\"data\"][\"id\"], item[\"data\"][\"properties\"][\"description\"]))\n",
    "                else:\n",
    "                    nbr_nodes.append((item[\"data\"][\"neo4j_type\"], item[\"data\"][\"id\"], item[\"data\"][\"properties\"][\"name\"]))\n",
    "            except:\n",
    "                nbr_nodes.append((item[\"data\"][\"neo4j_type\"], item[\"data\"][\"id\"], item[\"data\"][\"properties\"][\"identifier\"]))\n",
    "        elif \"_\" in item[\"data\"][\"neo4j_type\"]:\n",
    "            try:\n",
    "                provenance = \", \".join(item[\"data\"][\"properties\"][\"sources\"])\n",
    "            except:\n",
    "                try:\n",
    "                    provenance = item[\"data\"][\"properties\"][\"source\"]\n",
    "                    if isinstance(provenance, list):\n",
    "                        provenance = \", \".join(provenance)                    \n",
    "                except:\n",
    "                    try:                    \n",
    "                        preprint_list = ast.literal_eval(item[\"data\"][\"properties\"][\"preprint_list\"])\n",
    "                        if len(preprint_list) > 0:                                                    \n",
    "                            provenance = \", \".join(preprint_list)\n",
    "                        else:\n",
    "                            pmid_list = ast.literal_eval(item[\"data\"][\"properties\"][\"pmid_list\"])\n",
    "                            pmid_list = map(lambda x:\"pubmedId:\"+x, pmid_list)\n",
    "                            if len(pmid_list) > 0:\n",
    "                                provenance = \", \".join(pmid_list)\n",
    "                            else:\n",
    "                                provenance = \"Based on data from Institute For Systems Biology (ISB)\"\n",
    "                    except:                                \n",
    "                        provenance = \"SPOKE-KG\"                                    \n",
    "            nbr_edges.append((item[\"data\"][\"source\"], item[\"data\"][\"neo4j_type\"], item[\"data\"][\"target\"], provenance))\n",
    "    nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=[\"node_type\", \"node_id\", \"node_name\"])\n",
    "    nbr_edges_df = pd.DataFrame(nbr_edges, columns=[\"source\", \"edge_type\", \"target\", \"provenance\"])\n",
    "    merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on=\"source\", right_on=\"node_id\").drop(\"node_id\", axis=1)\n",
    "    merge_1.loc[:,\"node_name\"] = merge_1.node_type + \" \" + merge_1.node_name\n",
    "    merge_1.drop([\"source\", \"node_type\"], axis=1, inplace=True)\n",
    "    merge_1 = merge_1.rename(columns={\"node_name\":\"source\"})\n",
    "    merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on=\"target\", right_on=\"node_id\").drop(\"node_id\", axis=1)\n",
    "    merge_2.loc[:,\"node_name\"] = merge_2.node_type + \" \" + merge_2.node_name\n",
    "    merge_2.drop([\"target\", \"node_type\"], axis=1, inplace=True)\n",
    "    merge_2 = merge_2.rename(columns={\"node_name\":\"target\"})\n",
    "    merge_2 = merge_2[[\"source\", \"edge_type\", \"target\", \"provenance\"]]\n",
    "    merge_2.loc[:, \"predicate\"] = merge_2.edge_type.apply(lambda x:x.split(\"_\")[0])\n",
    "    merge_2.loc[:, \"context\"] =  merge_2.source + \" \" + merge_2.predicate.str.lower() + \" \" + merge_2.target + \" and Provenance of this association is \" + merge_2.provenance + \". \"\n",
    "    context = merge_2['context'].str.cat(sep=' ')\n",
    "    context += node_value + \" has a \" + node_context[0][\"data\"][\"properties\"][\"source\"] + \" identifier of \" + node_context[0][\"data\"][\"properties\"][\"identifier\"] + \" and Provenance of this association is \" + node_context[0][\"data\"][\"properties\"][\"source\"] + \".\"\n",
    "    return context\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "id": "5b7bd91c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 68.7 ms, sys: 6.93 ms, total: 75.6 ms\n",
      "Wall time: 286 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "124"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "node_value = 'giant cell glioblastoma'\n",
    "\n",
    "type_end_point = \"/api/v1/types\"\n",
    "result = get_spoke_api_resp(config_data['BASE_URI'], type_end_point)\n",
    "data_spoke_types = result.json()\n",
    "node_types = list(data_spoke_types[\"nodes\"].keys())\n",
    "edge_types = list(data_spoke_types[\"edges\"].keys())\n",
    "node_types_to_remove = [\"DatabaseTimestamp\", \"Version\"]\n",
    "filtered_node_types = [node_type for node_type in node_types if node_type not in node_types_to_remove]\n",
    "api_params = {\n",
    "    'node_filters' : filtered_node_types,\n",
    "    'edge_filters': edge_types,\n",
    "    'cutoff_Compound_max_phase': config_data['cutoff_Compound_max_phase'],\n",
    "    'cutoff_Protein_source': config_data['cutoff_Protein_source'],\n",
    "    'cutoff_DaG_diseases_sources': config_data['cutoff_DaG_diseases_sources'],\n",
    "    'cutoff_DaG_textmining': config_data['cutoff_DaG_textmining'],\n",
    "    'cutoff_CtD_phase': config_data['cutoff_CtD_phase'],\n",
    "    'cutoff_PiP_confidence': config_data['cutoff_PiP_confidence'],\n",
    "    'cutoff_ACTeG_level': config_data['cutoff_ACTeG_level'],\n",
    "    'depth' : 1\n",
    "}\n",
    "node_type = \"Disease\"\n",
    "attribute = \"name\"\n",
    "nbr_end_point = \"/api/v1/neighborhood/{}/{}/{}\".format(node_type, attribute, node_value)\n",
    "result = get_spoke_api_resp(config_data['BASE_URI'], nbr_end_point, params=api_params)\n",
    "node_context = result.json()\n",
    "len(node_context)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "100577db",
   "metadata": {},
   "outputs": [],
   "source": [
    "edge_evidence = False\n",
    "\n",
    "nbr_nodes = []\n",
    "nbr_edges = []\n",
    "for item in node_context:\n",
    "    if \"_\" not in item[\"data\"][\"neo4j_type\"]:\n",
    "        try:\n",
    "            if item[\"data\"][\"neo4j_type\"] == \"Protein\":\n",
    "                nbr_nodes.append((item[\"data\"][\"neo4j_type\"], item[\"data\"][\"id\"], item[\"data\"][\"properties\"][\"description\"]))\n",
    "            else:\n",
    "                nbr_nodes.append((item[\"data\"][\"neo4j_type\"], item[\"data\"][\"id\"], item[\"data\"][\"properties\"][\"name\"]))\n",
    "        except:\n",
    "            nbr_nodes.append((item[\"data\"][\"neo4j_type\"], item[\"data\"][\"id\"], item[\"data\"][\"properties\"][\"identifier\"]))\n",
    "    elif \"_\" in item[\"data\"][\"neo4j_type\"]:\n",
    "        try:\n",
    "            provenance = \", \".join(item[\"data\"][\"properties\"][\"sources\"])\n",
    "        except:\n",
    "            try:\n",
    "                provenance = item[\"data\"][\"properties\"][\"source\"]\n",
    "                if isinstance(provenance, list):\n",
    "                    provenance = \", \".join(provenance)                    \n",
    "            except:\n",
    "                try:                    \n",
    "                    preprint_list = ast.literal_eval(item[\"data\"][\"properties\"][\"preprint_list\"])\n",
    "                    if len(preprint_list) > 0:                                                    \n",
    "                        provenance = \", \".join(preprint_list)\n",
    "                    else:\n",
    "                        pmid_list = ast.literal_eval(item[\"data\"][\"properties\"][\"pmid_list\"])\n",
    "                        pmid_list = map(lambda x:\"pubmedId:\"+x, pmid_list)\n",
    "                        if len(pmid_list) > 0:\n",
    "                            provenance = \", \".join(pmid_list)\n",
    "                        else:\n",
    "                            provenance = \"Based on data from Institute For Systems Biology (ISB)\"\n",
    "                except:                                \n",
    "                    provenance = \"SPOKE-KG\"     \n",
    "        try:\n",
    "            evidence = item[\"data\"][\"properties\"]\n",
    "        except:\n",
    "            evidence = None\n",
    "        nbr_edges.append((item[\"data\"][\"source\"], item[\"data\"][\"neo4j_type\"], item[\"data\"][\"target\"], provenance, evidence))\n",
    "    nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=[\"node_type\", \"node_id\", \"node_name\"])\n",
    "    nbr_edges_df = pd.DataFrame(nbr_edges, columns=[\"source\", \"edge_type\", \"target\", \"provenance\", \"evidence\"])\n",
    "    merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on=\"source\", right_on=\"node_id\").drop(\"node_id\", axis=1)\n",
    "    merge_1.loc[:,\"node_name\"] = merge_1.node_type + \" \" + merge_1.node_name\n",
    "    merge_1.drop([\"source\", \"node_type\"], axis=1, inplace=True)\n",
    "    merge_1 = merge_1.rename(columns={\"node_name\":\"source\"})\n",
    "    merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on=\"target\", right_on=\"node_id\").drop(\"node_id\", axis=1)\n",
    "    merge_2.loc[:,\"node_name\"] = merge_2.node_type + \" \" + merge_2.node_name\n",
    "    merge_2.drop([\"target\", \"node_type\"], axis=1, inplace=True)\n",
    "    merge_2 = merge_2.rename(columns={\"node_name\":\"target\"})\n",
    "    merge_2 = merge_2[[\"source\", \"edge_type\", \"target\", \"provenance\", \"evidence\"]]\n",
    "    merge_2.loc[:, \"predicate\"] = merge_2.edge_type.apply(lambda x:x.split(\"_\")[0])\n",
    "#     if edge_evidence:\n",
    "#         merge_2.loc[:, \"context\"] =  merge_2.source + \" \" + merge_2.predicate.str.lower() + \" \" + merge_2.target + \" and Provenance of this association is \" + merge_2.provenance + \" and attributes associated with this association is in the following JSON format:\\n \" + merge_2.evidence.astype('str') + \"\\n\\n\"\n",
    "#     else:\n",
    "    merge_2.loc[:, \"context\"] =  merge_2.source + \" \" + merge_2.predicate.str.lower() + \" \" + merge_2.target + \" and Provenance of this association is \" + merge_2.provenance + \". \"\n",
    "    context = merge_2.context.str.cat(sep=' ')\n",
    "    context += node_value + \" has a \" + node_context[0][\"data\"][\"properties\"][\"source\"] + \" identifier of \" + node_context[0][\"data\"][\"properties\"][\"identifier\"] + \" and Provenance of this is from \" + node_context[0][\"data\"][\"properties\"][\"source\"] + \".\"\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "490adee9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Disease secondary progressive multiple sclerosis isa Disease multiple sclerosis and Provenance of this association is Disease Ontology. '"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "high_similarity_context = ['Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS. ',\n",
    "                          'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES. ',\n",
    "                          'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS. ']\n",
    "\n",
    "merge_2[merge_2.context.isin(high_similarity_context)]\n",
    "merge_2.context.values[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "e3db5024",
   "metadata": {},
   "outputs": [],
   "source": [
    "high_similarity_context = ['Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS.',\n",
    "                          'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES.',\n",
    "                          'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS.']\n",
    "high_similarity_context.append('Gene Xs sds associates with Disease multiple sclerosis and Provenance of this association is GWAS.')\n",
    "node_name = 'multiple sclerosis'\n",
    "node_types = nbr_nodes_df.node_type.unique()\n",
    "\n",
    "\n",
    "nodes = list(filter(None, list(map(lambda x:x if '_' not in x['data']['neo4j_type'] else None, node_context))))\n",
    "edges = list(filter(None, list(map(lambda x:x if '_' in x['data']['neo4j_type'] else None, node_context))))\n",
    "\n",
    "source_node_id = list(map(lambda x:x['data']['id'] if x['data']['properties']['name'] == node_name else None, nodes))[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "c7b6cf3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([0]),)\n",
      "(array([4]),)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'HLA-DQA1 and Provenance of this association is GWAS.'"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "sentence = high_similarity_context[0]\n",
    "\n",
    "for node_type in node_types:\n",
    "    if node_type in sentence:        \n",
    "        print(np.where(node_type == np.array(sentence.split(' '))))\n",
    "\n",
    "sentence.split('Gene ')[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "768d83c6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS.',\n",
       " 'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES.',\n",
       " 'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS.']"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "31239a88",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'diseases_identifiers': ['https://diseases.jensenlab.org/Entity?documents=10&type1=9606&id1=ENSP00000369889&type2=-26&id2=DOID:0080044',\n",
       "  'MedlinePlus'],\n",
       " 'diseases_scores': ['6.503', 'CURATED'],\n",
       " 'sources': ['DISEASES'],\n",
       " 'diseases_sources': ['textmining', 'knowledge'],\n",
       " 'diseases_confidences': [3.252, 5.0]}"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "item[\"data\"][\"properties\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "019ceba5",
   "metadata": {},
   "outputs": [],
   "source": [
    "source = 'multiple sclerosis'\n",
    "target = 'COL2A1'\n",
    "\n",
    "nodes = list(filter(None, list(map(lambda x:x if '_' not in x['data']['neo4j_type'] else None, node_context))))\n",
    "edges = list(filter(None, list(map(lambda x:x if '_' in x['data']['neo4j_type'] else None, node_context))))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "84481154",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "152375"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# def get_node_id(inp_node)\n",
    "\n",
    "list(map(lambda x:x['data']['id'] if x['data']['properties']['name'] == source else None, nodes))[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e811b4a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}