add evaluation code&prompt

This commit is contained in:
tianyufan
2025-02-27 15:41:42 +08:00
parent 908b1a49d6
commit 5d520fc5ba

View File

@@ -0,0 +1,214 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import login\n",
"import os\n",
"import sys\n",
"import csv\n",
"from getpass import getpass\n",
"from tqdm import trange\n",
"from transformers import AutoModel,AutoTokenizer\n",
"\n",
"# Path constant kept from the original setup; no other cell of this notebook reads it.\n",
"FILE_PATH = './QA_results_GT.csv'\n",
"\n",
"# Never paste the key into the notebook. Reuse OPENAI_API_KEY when it is already\n",
"# exported; otherwise ask for it interactively so it is not saved with the file.\n",
"if not os.environ.get(\"OPENAI_API_KEY\"):\n",
"    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OPENAI_API_KEY: \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ANA_FILE_PATH = './mthp_output.csv'\n",
"\n",
"# Parallel lists: index i of every list comes from the same CSV row.\n",
"naiveanswer_LIST = []\n",
"lightraganswer_LIST = []\n",
"minianswer_LIST = []\n",
"QUESTION_LIST = []\n",
"GA_LIST = []\n",
"\n",
"# newline='' hands line-ending handling to the csv module, so answers with line\n",
"# breaks inside quoted fields are read back unchanged.\n",
"with open(ANA_FILE_PATH, mode='r', encoding='utf-8', newline='') as question_file:\n",
"    reader = csv.DictReader(question_file)\n",
"    for row in reader:\n",
"        QUESTION_LIST.append(row['Question'])\n",
"        GA_LIST.append(row['Gold Answer'])\n",
"        naiveanswer_LIST.append(row['naive'])\n",
"        lightraganswer_LIST.append(row['lightrag'])\n",
"        minianswer_LIST.append(row['minirag'])\n",
"\n",
"# Number of rows loaded; the judging cells iterate over range(filelength).\n",
"filelength = len(QUESTION_LIST)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Judge prompt, filled in with str.format(): {question}, {ga}, {naive}, {light} and\n",
"# {mini} are placeholders; the doubled braces {{ }} render as literal braces.\n",
"# The example output is valid JSON (no trailing comma) so the model imitates it.\n",
"PROMPT = \"\"\"\n",
"Now, I'll give you a question, a gold answer to this question, and three answers provided by different students.\n",
"\n",
"Determine the answer according to the following rules:\n",
"If the answer is correct, get 1 point.\n",
"If the answer is irrelevant to the question, it will receive 0 points.\n",
"If the answer is incorrect, get -1 point.\n",
"\n",
"Return your answer in JSON mode.\n",
"\n",
"For example:\n",
"\n",
"Question:\n",
"When does Li Hua arrive to the city?\n",
"\n",
"Gold Answer:\n",
"20260105\n",
"\n",
"Answer1: LiHua arrived on the afternoon of January 5th\n",
"Answer2: Sorry, there is no information about LiHua's arrival in the information you provided\n",
"Answer3: There is no accurate answer in the information you provided, but according to the first information found, LiHua arrived on April 17th\n",
"\n",
"output:\n",
"{{\n",
"\"Score1\": 1,\n",
"\"Score2\": 0,\n",
"\"Score3\": -1\n",
"}}\n",
"\n",
"\n",
"\n",
"Real data:\n",
"\n",
"Question:\n",
"{question}\n",
"Gold Answer:\n",
"{ga}\n",
"\n",
"Answer1: {naive}\n",
"Answer2: {light}\n",
"Answer3: {mini}\n",
"\n",
"output:\n",
"\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#deepseek\n",
"# Alternative to the OpenAI cell below: both cells rebuild `chat_list`, so run only\n",
"# the one whose judge model you want to score with.\n",
"from openai import OpenAI\n",
"\n",
"# The key comes from the DEEPSEEK_API_KEY environment variable instead of a name\n",
"# that is never defined in this notebook; a missing variable raises KeyError.\n",
"chatbot = OpenAI(api_key=os.environ[\"DEEPSEEK_API_KEY\"], base_url=\"https://api.deepseek.com\")\n",
"\n",
"# One raw judge reply per CSV row, in row order.\n",
"chat_list = []\n",
"for i in range(filelength):\n",
"    p = PROMPT.format(\n",
"        question=QUESTION_LIST[i],\n",
"        ga=GA_LIST[i],\n",
"        naive=naiveanswer_LIST[i],\n",
"        light=lightraganswer_LIST[i],\n",
"        mini=minianswer_LIST[i],\n",
"    )\n",
"    chat_completion = chatbot.chat.completions.create(\n",
"        messages=[\n",
"            {\n",
"                \"role\": \"system\",\n",
"                \"content\": p,\n",
"            },\n",
"        ],\n",
"        model=\"deepseek-chat\",\n",
"        stream=False,\n",
"    )\n",
"    chat_list.append(chat_completion.choices[0].message.content.strip())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#openai\n",
"from tqdm import trange\n",
"from openai import OpenAI\n",
"\n",
"# No key is passed explicitly; presumably the client picks up OPENAI_API_KEY,\n",
"# which the first cell places in os.environ.\n",
"chatbot = OpenAI()\n",
"\n",
"# One raw judge reply per CSV row, in row order; trange shows a progress bar.\n",
"chat_list = []\n",
"for i in trange(filelength):\n",
"    p = PROMPT.format(\n",
"        question=QUESTION_LIST[i],\n",
"        ga=GA_LIST[i],\n",
"        naive=naiveanswer_LIST[i],\n",
"        light=lightraganswer_LIST[i],\n",
"        mini=minianswer_LIST[i],\n",
"    )\n",
"    # The whole filled-in prompt is sent as a single system message.\n",
"    judge_messages = [{\"role\": \"system\", \"content\": p}]\n",
"    chat_completion = chatbot.chat.completions.create(\n",
"        model=\"gpt-4o\",\n",
"        messages=judge_messages,\n",
"    )\n",
"    reply_text = chat_completion.choices[0].message.content\n",
"    chat_list.append(reply_text.strip())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import json_repair\n",
"\n",
"# Score1/Score2/Score3 grade Answer1/Answer2/Answer3 of the prompt, i.e. the\n",
"# naive, lightrag and minirag answers respectively.\n",
"SCORE_KEYS = ('Score1', 'Score2', 'Score3')\n",
"\n",
"\n",
"def _strip_code_fence(text):\n",
"    \"\"\"Return `text` without a surrounding Markdown code fence (``` or ```json).\"\"\"\n",
"    text = text.strip()\n",
"    # Remove the fence as a whole prefix/suffix: str.strip('```json') would instead\n",
"    # strip any of the characters `, j, s, o, n from both ends of the reply.\n",
"    for fence in ('```json', '```'):\n",
"        if text.startswith(fence):\n",
"            text = text[len(fence):]\n",
"            break\n",
"    if text.endswith('```'):\n",
"        text = text[:-3]\n",
"    return text.strip()\n",
"\n",
"\n",
"def _parse_scores(chat):\n",
"    \"\"\"Parse one judge reply into {'Score1': int, 'Score2': int, 'Score3': int}.\n",
"\n",
"    Returns None (after printing the reply) when it cannot be turned into a JSON\n",
"    object holding all three integer scores.\n",
"    \"\"\"\n",
"    try:\n",
"        data = json_repair.loads(_strip_code_fence(chat))\n",
"        # Indexing fails for non-dict results and missing keys; int() also accepts\n",
"        # scores that the model returned as strings such as \"1\".\n",
"        return {key: int(data[key]) for key in SCORE_KEYS}\n",
"    except Exception:\n",
"        # Exception (not a bare except) so that interrupting the kernel still works.\n",
"        print('Error in chat:', chat)\n",
"        return None\n",
"\n",
"\n",
"def _print_share(label, pos, zero, neg, total):\n",
"    \"\"\"Print the shares of 1 / 0 / -1 scores for one system as percentages of `total`.\"\"\"\n",
"    # The doubled backslash emits the same literal \\% as before, without relying on\n",
"    # an invalid escape sequence.\n",
"    print(f\"{label} 1: {pos / total * 100:.2f}\\\\%, {label} 0: {zero / total * 100:.2f}\\\\%, {label} -1: {neg / total * 100:.2f}\\\\%\")\n",
"\n",
"\n",
"# One entry per judge reply, aligned with chat_list; None marks an unusable reply.\n",
"chat_score_list = [_parse_scores(chat) for chat in chat_list]\n",
"# Only usable replies are counted; an unusable one previously crashed the cell.\n",
"valid_scores = [data for data in chat_score_list if data is not None]\n",
"\n",
"all_score1 = [data['Score1'] for data in valid_scores]\n",
"all_score2 = [data['Score2'] for data in valid_scores]\n",
"all_score3 = [data['Score3'] for data in valid_scores]\n",
"\n",
"all_score1_1 = all_score1.count(1)\n",
"all_score1_0 = all_score1.count(0)\n",
"all_score1_neg = all_score1.count(-1)\n",
"\n",
"all_score2_1 = all_score2.count(1)\n",
"all_score2_0 = all_score2.count(0)\n",
"all_score2_neg = all_score2.count(-1)\n",
"\n",
"all_score3_1 = all_score3.count(1)\n",
"all_score3_0 = all_score3.count(0)\n",
"all_score3_neg = all_score3.count(-1)\n",
"\n",
"# Named `total` rather than `all` so the builtin all() is not shadowed.\n",
"total = len(valid_scores)\n",
"print(all_score1_1, all_score1_0, all_score1_neg)\n",
"print(all_score2_1, all_score2_0, all_score2_neg)\n",
"print(all_score3_1, all_score3_0, all_score3_neg)\n",
"\n",
"if total < len(chat_score_list):\n",
"    print('Unparsed replies:', len(chat_score_list) - total)\n",
"\n",
"if total:\n",
"    _print_share('Score1', all_score1_1, all_score1_0, all_score1_neg, total)\n",
"    _print_share('Score2', all_score2_1, all_score2_0, all_score2_neg, total)\n",
"    _print_share('Score3', all_score3_1, all_score3_0, all_score3_neg, total)\n",
"else:\n",
"    # Avoid ZeroDivisionError when there is nothing to average over.\n",
"    print('No judge reply could be parsed; percentages are undefined.')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Tianyu_agent",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}