QA Finetuning

This commit is contained in:
scalisir
2020-11-14 10:32:58 +03:00
parent e7f62add22
commit 60874724ca

View File

@@ -0,0 +1,145 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "QA_finetuning",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "FW0KtxFbsKjP"
},
"source": [
"!pip install transformers"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "0LzPy9QOuXCp"
},
"source": [
"!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/question-answering/run_squad.py\n",
"\n",
"!git clone https://github.com/dmis-lab/biobert.git"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "-RQTc30woc10"
},
"source": [
"# Default BioASQ test files doesn't match SQUAD format. So add dummy keys \n",
"# to match it to make predictions.\n",
"with open(\"BioASQ/BioASQ/BioASQ-test-factoid-4b-1.json\") as f:\n",
" data = json.load(f)\n",
"\n",
"new_paragraph = []\n",
"for questions in data[\"data\"][0][\"paragraphs\"]:\n",
" new_questions = []\n",
" for question in questions[\"qas\"]:\n",
" new_question = {**question}\n",
" # Adding dummy answer key.\n",
" new_question[\"answers\"] = [{\"text\": \"\", \"answer_start\": 0}]\n",
" new_questions.append(new_question)\n",
" \n",
" new_paragraph.append({**questions, \"qas\": new_questions})\n",
" \n",
"data[\"data\"][0][\"paragraphs\"] = new_paragraph\n",
"\n",
"with open(\"BioASQ/BioASQ/Answered_BioASQ-test-factoid-4b-1.json\", \"w\") as f:\n",
" json.dump(data, f)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "XSBhXzODoG9s"
},
"source": [
"# train_files directory format\n",
"# train_files\n",
"# train-v1.1.json\n",
"# dev-v1.1.json\n",
"# For the predictions on the raw data\n",
"# Answered_BioASQ-test-factoid-4b-1.json"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "8XIPgsL8oeGq"
},
"source": [
"# google/electra-small-discriminator\n",
"# enelpi/med-electra-small-discriminator"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "e5IVU_WV_kc1"
},
"source": [
"!python run_squad.py \\\n",
"--model_type electra \\\n",
"--model_name_or_path enelpi/med-electra-small-discriminator \\\n",
"--do_train \\\n",
"--do_eval \\\n",
"--data_dir train_files \\\n",
"--predict_file Answered_BioASQ-test-factoid-4b-1.json \\\n",
"--cache_dir med_electra_cached_data \\\n",
"--per_gpu_train_batch_size 16 \\\n",
"--num_train_epochs 5 \\\n",
"--max_seq_length 128 \\\n",
"--doc_stride 128 \\\n",
"--output_dir ./med_electra_finetuned_model"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "nOKkXUpzGXdp"
},
"source": [
"!python biobert/biocodes/transform_nbset2bioasqform.py --nbest_path=med_electra_finetuned_model/nbest_predictions_.json --output_path=prediction_outputs"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "KRHTnIHhnt1c"
},
"source": [
"!java -Xmx10G -cp $CLASSPATH:./flat/BioASQEvaluation/dist/BioASQEvaluation.jar evaluation.EvaluatorTask1b -phaseB -e 5 BioASQ/BioASQ/4B1_golden.json prediction_outputs"
],
"execution_count": null,
"outputs": []
}
]
}