add evaluation code & prompt
214
dataset/LiHua-World/evaluation.ipynb
Normal file
@@ -0,0 +1,214 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import login\n",
    "import os\n",
    "import sys\n",
    "import csv\n",
    "from tqdm import trange\n",
    "from transformers import AutoModel, AutoTokenizer\n",
    "FILE_PATH = './QA_results_GT.csv'\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"AAA\"  # replace with your OpenAI API key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ANA_FILE_PATH = './mthp_output.csv'\n",
    "\n",
    "naiveanswer_LIST = []\n",
    "lightraganswer_LIST = []\n",
    "minianswer_LIST = []\n",
    "QUESTION_LIST = []\n",
    "GA_LIST = []\n",
    "filelength = 0\n",
    "with open(ANA_FILE_PATH, mode='r', encoding='utf-8') as question_file:\n",
    "    reader = csv.DictReader(question_file)\n",
    "    for row in reader:\n",
    "        QUESTION_LIST.append(row['Question'])\n",
    "        GA_LIST.append(row['Gold Answer'])\n",
    "        naiveanswer_LIST.append(row['naive'])\n",
    "        lightraganswer_LIST.append(row['lightrag'])\n",
    "        minianswer_LIST.append(row['minirag'])\n",
    "        filelength = filelength + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT = \"\"\"\n",
    "Now, I'll give you a question, a gold answer to this question, and three answers provided by different students.\n",
    "\n",
    "Score each answer according to the following rules:\n",
    "If the answer is correct, it receives 1 point.\n",
    "If the answer is irrelevant to the question, it receives 0 points.\n",
    "If the answer is incorrect, it receives -1 point.\n",
    "\n",
    "Return your scores in JSON format.\n",
    "\n",
    "For example:\n",
    "\n",
    "Question:\n",
    "When does Li Hua arrive in the city?\n",
    "\n",
    "Gold Answer:\n",
    "20260105\n",
    "\n",
    "Answer1: LiHua arrived on the afternoon of January 5th\n",
    "Answer2: Sorry, there is no information about LiHua's arrival in the information you provided\n",
    "Answer3: There is no accurate answer in the information you provided, but according to the first information found, LiHua arrived on April 17th\n",
    "\n",
    "output:\n",
    "{{\n",
    "\"Score1\": 1,\n",
    "\"Score2\": 0,\n",
    "\"Score3\": -1\n",
    "}}\n",
    "\n",
    "Real data:\n",
    "\n",
    "Question:\n",
    "{question}\n",
    "Gold Answer:\n",
    "{ga}\n",
    "\n",
    "Answer1: {naive}\n",
    "Answer2: {light}\n",
    "Answer3: {mini}\n",
    "\n",
    "output:\n",
    "\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# deepseek\n",
    "from openai import OpenAI\n",
    "\n",
    "My_deepseek_key = \"AAA\"  # replace with your DeepSeek API key\n",
    "chatbot = OpenAI(api_key=My_deepseek_key, base_url=\"https://api.deepseek.com\")\n",
    "\n",
    "chat_list = []\n",
    "for i in range(filelength):\n",
    "    p = PROMPT.format(question=QUESTION_LIST[i], ga=GA_LIST[i], naive=naiveanswer_LIST[i], light=lightraganswer_LIST[i], mini=minianswer_LIST[i])\n",
    "    chat_completion = chatbot.chat.completions.create(\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": p,\n",
    "            },\n",
    "        ],\n",
    "        model=\"deepseek-chat\",\n",
    "        stream=False,\n",
    "    )\n",
    "    chat_list.append(chat_completion.choices[0].message.content.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# openai\n",
    "from openai import OpenAI\n",
    "from tqdm import trange\n",
    "\n",
    "chatbot = OpenAI()\n",
    "chat_list = []\n",
    "for i in trange(filelength):\n",
    "    p = PROMPT.format(question=QUESTION_LIST[i], ga=GA_LIST[i], naive=naiveanswer_LIST[i], light=lightraganswer_LIST[i], mini=minianswer_LIST[i])\n",
    "    chat_completion = chatbot.chat.completions.create(\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": p,\n",
    "            },\n",
    "        ],\n",
    "        model=\"gpt-4o\",\n",
    "    )\n",
    "    chat_list.append(chat_completion.choices[0].message.content.strip())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json_repair\n",
    "\n",
    "chat_score_list = []\n",
    "for chat in chat_list:\n",
    "    try:\n",
    "        # strip any Markdown code fences before repairing/parsing the JSON\n",
    "        data = json_repair.loads(chat.strip('```json').strip('```'))\n",
    "        chat_score_list.append(data)\n",
    "    except Exception:\n",
    "        # fall back to zero scores so the lists below stay aligned\n",
    "        chat_score_list.append({'Score1': 0, 'Score2': 0, 'Score3': 0})\n",
    "        print('Error in chat:', chat)\n",
    "\n",
    "all_score1 = [data['Score1'] for data in chat_score_list]\n",
    "all_score2 = [data['Score2'] for data in chat_score_list]\n",
    "all_score3 = [data['Score3'] for data in chat_score_list]\n",
    "\n",
    "all_score1_1 = all_score1.count(1)\n",
    "all_score1_0 = all_score1.count(0)\n",
    "all_score1_neg = all_score1.count(-1)\n",
    "\n",
    "all_score2_1 = all_score2.count(1)\n",
    "all_score2_0 = all_score2.count(0)\n",
    "all_score2_neg = all_score2.count(-1)\n",
    "\n",
    "all_score3_1 = all_score3.count(1)\n",
    "all_score3_0 = all_score3.count(0)\n",
    "all_score3_neg = all_score3.count(-1)\n",
    "\n",
    "total = len(all_score1)\n",
    "print(all_score1_1, all_score1_0, all_score1_neg)\n",
    "print(all_score2_1, all_score2_0, all_score2_neg)\n",
    "print(all_score3_1, all_score3_0, all_score3_neg)\n",
    "\n",
    "print(f\"Score1 1: {all_score1_1 / total * 100:.2f}%, Score1 0: {all_score1_0 / total * 100:.2f}%, Score1 -1: {all_score1_neg / total * 100:.2f}%\")\n",
    "print(f\"Score2 1: {all_score2_1 / total * 100:.2f}%, Score2 0: {all_score2_0 / total * 100:.2f}%, Score2 -1: {all_score2_neg / total * 100:.2f}%\")\n",
    "print(f\"Score3 1: {all_score3_1 / total * 100:.2f}%, Score3 0: {all_score3_0 / total * 100:.2f}%, Score3 -1: {all_score3_neg / total * 100:.2f}%\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Tianyu_agent",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}