added updated notebooks + code

2023-10-11 20:04:54 +03:00 · 2022-07-13 23:51:04 +02:00
parent a5cafb6e47
commit 51e36ccd37
4 changed files with 952 additions and 635 deletions
--- a/integrations/haystack/nhs-search/README.md
+++ b/integrations/haystack/nhs-search/README.md
@@ -1,3 +1,11 @@
-# NHS Search App \[Pinecone x Haystack\]
+# Doc Search App [Pinecone x Streamlit]

-Here is the code covering the NHS search app presented at the Pinecone x Haystack webinar on 14th July 2022.
+## Ideas
+
+* Doc search app
+* Using scraped technical docs from Streamlit
+* Could be cool to add technical docs from other places \[HuggingFace, Pinecone\]?
+  * With additional docs, we could show off metadata filtering
+* UI: a nice, simple search bar; metadata filtering could be dropdowns/checkboxes, or a recommendation shown after a search returns mostly docs from one of the sections
+* Q&A-style search would be best; if that is not possible, fall back to a vanilla semantic search, which we could easily fine-tune w/ TSDAE if performance is not optimal
+# Search Tool
--- a/integrations/haystack/nhs-search/notebooks/00_indexing.ipynb
+++ b/integrations/haystack/nhs-search/notebooks/00_indexing.ipynb
--- a/integrations/haystack/nhs-search/notebooks/01_test_pipeline.ipynb
+++ b/integrations/haystack/nhs-search/notebooks/01_test_pipeline.ipynb
@@ -36,7 +36,7 @@
    {
     "data": {
      "text/plain": [
-       "(100, 100)"
+       "(8521, 8521)"
      ]
     },
     "execution_count": 2,
@@ -59,16 +59,16 @@
     "text": [
      "INFO - haystack.modeling.utils -  Using devices: CPU\n",
      "INFO - haystack.modeling.utils -  Number of GPUs: 0\n",
-      "INFO - haystack.retriever.dense -  Init retriever using embeddings of model flax-sentence-embeddings/all_datasets_v3_mpnet-base\n"
+      "INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/multi-qa-mpnet-base-dot-v1\n"
     ]
    }
   ],
   "source": [
-    "from haystack.retriever.dense import EmbeddingRetriever\n",
+    "from haystack.nodes.retriever import EmbeddingRetriever\n",
    "\n",
    "retriever = EmbeddingRetriever(\n",
    "    document_store=document_store,\n",
-    "    embedding_model='flax-sentence-embeddings/all_datasets_v3_mpnet-base',\n",
+    "    embedding_model='sentence-transformers/multi-qa-mpnet-base-dot-v1',\n",
    "    model_format=\"sentence_transformers\"\n",
    ")"
   ]
@@ -86,9 +86,119 @@
      "INFO - haystack.modeling.utils -  Number of GPUs: 0\n",
      "INFO - haystack.modeling.model.language_model -  LOADING MODEL\n",
      "INFO - haystack.modeling.model.language_model -  =============\n",
-      "INFO - haystack.modeling.model.language_model -  Could not find deepset/electra-base-squad2 locally.\n",
-      "INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...\n",
-      "INFO - haystack.modeling.model.language_model -  Loaded deepset/electra-base-squad2\n",
+      "INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2-distilled locally.\n",
+      "INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fae8519fb44d405bacae249f96bb9d69",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/729 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "098a78dd37da48b5bccdf95c85e27bbc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2-distilled\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "da97f59cc2854872b11751048ab910bc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/295 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4670b856e53645eeb59a22922d3d082e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "938ab35200e54240870e2eee72d6bda2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "33780f9f6e0f495893004257e95d7c5e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bc09919fe2d64cae89b573fc93042b6b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
      "INFO - haystack.modeling.utils -  Using devices: CPU\n",
      "INFO - haystack.modeling.utils -  Number of GPUs: 0\n",
      "INFO - haystack.modeling.infer -  Got ya 9 parallel workers to do inference ...\n",
@@ -103,8 +213,7 @@
    "from haystack.pipelines import ExtractiveQAPipeline\n",
    "\n",
    "reader = FARMReader(\n",
-    "    model_name_or_path='deepset/electra-base-squad2',\n",
-    "    use_gpu=True\n",
+    "    model_name_or_path='deepset/roberta-base-squad2-distilled'\n",
    ")\n",
    "pipe = ExtractiveQAPipeline(reader, retriever)"
   ]
@@ -124,7 +233,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f03771f42ee040668e73c919b9ef6aeb",
+       "model_id": "baca227f02f647c8ad8f76bd121a220c",
       "version_major": 2,
       "version_minor": 0
      },
@@ -166,8 +275,8 @@
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
@@ -183,11 +292,11 @@
     "text": [
      "Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]/Users/jamesbriggs/opt/anaconda3/envs/ml/lib/python3.9/site-packages/haystack/modeling/model/prediction_head.py:483: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
      "  start_indices = flat_sorted_indices // max_seq_len\n",
-      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.17 Batches/s]\n",
-      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.61 Batches/s]\n",
-      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.45 Batches/s]\n",
-      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.51 Batches/s]\n",
-      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.63 Batches/s]\n"
+      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  8.38 Batches/s]\n",
+      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.36 Batches/s]\n",
+      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.45 Batches/s]\n",
+      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.58 Batches/s]\n",
+      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.61 Batches/s]\n"
     ]
    }
   ],
@@ -213,16 +322,14 @@
      "\n",
      "Query: Who is affected by pre-eclampsia?\n",
      "Answers:\n",
-      "[   {   'answer': 'the mother and her baby',\n",
-      "        'context': 'cause no problems and improve soon after the baby is '\n",
-      "                   \"delivered, there's a risk of serious complications that \"\n",
-      "                   'can affect both the mother and her baby. '},\n",
-      "    {   'answer': 'before you were pregnant having an autoimmune condition, '\n",
-      "                  'such as lupus or antiphospholipid syndrome having high '\n",
-      "                  'blood pressure or pre-eclampsia in a previous pregnancy',\n",
-      "        'context': ' before you were pregnant having an autoimmune condition, '\n",
-      "                   'such as lupus or antiphospholipid syndrome having high '\n",
-      "                   'blood pressure or pre-eclampsia in a previous pregnancy'}]\n"
+      "[   {   'answer': 'pregnant women',\n",
+      "        'context': 'atment Complications Pre-eclampsia is a condition that '\n",
+      "                   'affects some pregnant women, usually during the second '\n",
+      "                   'half of pregnancy (from 20 weeks) or soo'},\n",
+      "    {   'answer': 'mother and baby',\n",
+      "        'context': ' are mild, the condition can lead to serious complications '\n",
+      "                   \"for both mother and baby if it's not monitored and \"\n",
+      "                   'treated.  The earlier pre-eclampsia is d'}]\n"
     ]
    }
   ],
@@ -240,16 +347,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "\"cause no problems and improve soon after the baby is delivered, there's a risk of serious complications that can affect both the mother and her baby. \""
+       "'atment Complications Pre-eclampsia is a condition that affects some pregnant women, usually during the second half of pregnancy (from 20 weeks) or soo'"
      ]
     },
-     "execution_count": 14,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -260,16 +367,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "(125, 148)"
+       "(68, 82)"
      ]
     },
-     "execution_count": 26,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -283,16 +390,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "'the mother and her baby'"
+       "'pregnant women'"
      ]
     },
-     "execution_count": 27,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -303,16 +410,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "<Answer {'answer': 'the mother and her baby', 'type': 'extractive', 'score': 0.9425204694271088, 'context': \"cause no problems and improve soon after the baby is delivered, there's a risk of serious complications that can affect both the mother and her baby. \", 'offsets_in_document': [{'start': 631, 'end': 654}], 'offsets_in_context': [{'start': 125, 'end': 148}], 'document_id': 'f65f2a23620b9ea4761c43683729fc6c', 'meta': {'url': 'www.nhs.uk_conditions_pre-eclampsia_.txt'}}>"
+       "<Answer {'answer': 'pregnant women', 'type': 'extractive', 'score': 0.8104832470417023, 'context': 'atment Complications Pre-eclampsia is a condition that affects some pregnant women, usually during the second half of pregnancy (from 20 weeks) or soo', 'offsets_in_document': [{'start': 140, 'end': 154}], 'offsets_in_context': [{'start': 68, 'end': 82}], 'document_id': '3bc401b213c2720c83ee9bddb0e769b8', 'meta': {'url': 'www.nhs.uk_conditions_pre-eclampsia_.txt'}}>"
      ]
     },
-     "execution_count": 28,
+     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -323,16 +430,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "0.9425204694271088"
+       "0.8104832470417023"
      ]
     },
-     "execution_count": 29,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -343,7 +450,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@@ -352,7 +459,7 @@
       "'www.nhs.uk_conditions_pre-eclampsia_.txt'"
      ]
     },
-     "execution_count": 31,
+     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
--- a/integrations/haystack/nhs-search/src/doc_search/app.py
+++ b/integrations/haystack/nhs-search/src/doc_search/app.py
@@ -7,9 +7,9 @@ from haystack.pipelines import ExtractiveQAPipeline

 PINECONE_API_KEY = st.secrets["PINECONE_KEY"]
 RETRIEVER = 'mpnet'
-RETRIEVER_URL = 'flax-sentence-embeddings/all_datasets_v3_mpnet-base'
-READER = 'electra'
-READER_URL = 'deepset/electra-base-squad2'
+RETRIEVER_URL = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
+READER = 'roberta-distilled'
+READER_URL = 'deepset/roberta-base-squad2-distilled'
 INDEX = 'haystack-nhs-jul'
 DIMS = 768