QA Finetuning

2021-06-01 09:24:59 +03:00 · 2020-11-14 10:32:58 +03:00
parent e7f62add22
commit 60874724ca
1 changed files with 145 additions and 0 deletions
--- a/fine_tuning/QA_Finetuning.ipynb
+++ b/fine_tuning/QA_Finetuning.ipynb
@@ -0,0 +1,145 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "QA_finetuning",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "FW0KtxFbsKjP"
+      },
+      "source": [
+        "!pip install transformers"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "0LzPy9QOuXCp"
+      },
+      "source": [
+        "!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/question-answering/run_squad.py\n",
+        "\n",
+        "!git clone https://github.com/dmis-lab/biobert.git"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "-RQTc30woc10"
+      },
+      "source": [
+        "# Default BioASQ test files doesn't match SQUAD format. So add dummy keys \n",
+        "# to match it to make predictions.\n",
+        "with open(\"BioASQ/BioASQ/BioASQ-test-factoid-4b-1.json\") as f:\n",
+        "    data = json.load(f)\n",
+        "\n",
+        "new_paragraph = []\n",
+        "for questions in data[\"data\"][0][\"paragraphs\"]:\n",
+        "    new_questions = []\n",
+        "    for question in questions[\"qas\"]:\n",
+        "        new_question = {**question}\n",
+        "        # Adding dummy answer key.\n",
+        "        new_question[\"answers\"] = [{\"text\": \"\", \"answer_start\": 0}]\n",
+        "        new_questions.append(new_question)\n",
+        "        \n",
+        "    new_paragraph.append({**questions, \"qas\": new_questions})\n",
+        "    \n",
+        "data[\"data\"][0][\"paragraphs\"] = new_paragraph\n",
+        "\n",
+        "with open(\"BioASQ/BioASQ/Answered_BioASQ-test-factoid-4b-1.json\", \"w\") as f:\n",
+        "    json.dump(data, f)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "XSBhXzODoG9s"
+      },
+      "source": [
+        "# train_files directory format\n",
+        "# train_files\n",
+        "#     train-v1.1.json\n",
+        "#     dev-v1.1.json\n",
+        "# For the predictions on the raw data\n",
+        "#     Answered_BioASQ-test-factoid-4b-1.json"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "8XIPgsL8oeGq"
+      },
+      "source": [
+        "# google/electra-small-discriminator\n",
+        "# enelpi/med-electra-small-discriminator"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "e5IVU_WV_kc1"
+      },
+      "source": [
+        "!python run_squad.py \\\n",
+        "--model_type electra \\\n",
+        "--model_name_or_path enelpi/med-electra-small-discriminator \\\n",
+        "--do_train \\\n",
+        "--do_eval \\\n",
+        "--data_dir train_files \\\n",
+        "--predict_file Answered_BioASQ-test-factoid-4b-1.json \\\n",
+        "--cache_dir med_electra_cached_data \\\n",
+        "--per_gpu_train_batch_size 16 \\\n",
+        "--num_train_epochs 5 \\\n",
+        "--max_seq_length 128 \\\n",
+        "--doc_stride 128 \\\n",
+        "--output_dir ./med_electra_finetuned_model"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "nOKkXUpzGXdp"
+      },
+      "source": [
+        "!python biobert/biocodes/transform_nbset2bioasqform.py --nbest_path=med_electra_finetuned_model/nbest_predictions_.json --output_path=prediction_outputs"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "KRHTnIHhnt1c"
+      },
+      "source": [
+        "!java -Xmx10G -cp $CLASSPATH:./flat/BioASQEvaluation/dist/BioASQEvaluation.jar evaluation.EvaluatorTask1b -phaseB -e 5 BioASQ/BioASQ/4B1_golden.json prediction_outputs"
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}