RAG_Techniques/evaluation/define_evaluation_metrics.ipynb


{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI \n",
"from langchain.chains import LLMChain\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.evaluation import load_evaluator\n",
"from langchain_core.pydantic_v1 import BaseModel, Field\n",
"\n",
"# from langchain.evaluation.criteria import {\n",
"# CriteriaEvalChain,\n",
"# LabeledCriteriaEvalChain\n",
"# }\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import FAISS\n",
"# from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import os\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"llm = ChatOpenAI(temperature=0, model_name=\"gpt-4o\", max_tokens=4000)\n"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"class ResultScore(BaseModel):\n",
" score: float = Field(..., description=\"The score of the result, ranging from 0 to 1 where 1 is the best possible score.\")\n",
" # explanation: str = Field(..., description=\"An extensive explanation of the score.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"correctness_prompt = PromptTemplate(\n",
"input_variables=[\"question\", \"ground_truth\", \"generated_answer\"],\n",
"template=\"\"\"\n",
"Question: {question}\n",
"Ground Truth: {ground_truth}\n",
"Generated Answer: {generated_answer}\n",
"\n",
"Evaluate the correctness of the generated answer compared to the ground truth.\n",
"Score from 0 to 1, where 1 is perfectly correct and 0 is completely incorrect.\n",
"any score between 0 and 1 is acceptable and depends on how correct the generated answer is.\n",
"\n",
"Score:\n",
"\"\"\"\n",
")\n",
"correctness_chain = correctness_prompt | llm.with_structured_output(ResultScore)\n",
"\n",
"\n",
"def evaluate_correctness(question, ground_truth, generated_answer):\n",
" \"\"\"Evaluates the correctness of the generated answer compared to the ground truth.\n",
"\n",
" Args:\n",
" question: The question.\n",
" ground_truth: The ground truth answer.\n",
" generated_answer: The generated answer.\n",
"\n",
" Returns:\n",
" A float between 0 and 1, where 1 is the best possible score.\n",
" \"\"\"\n",
" result = correctness_chain.invoke({\"question\": question, \"ground_truth\": ground_truth, \"generated_answer\": generated_answer})\n",
" return result.score\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# test create_correctness_chain\n",
"question = \"What is the capital of France and Spain?\"\n",
"ground_truth = \"Paris and Barcelona\"\n",
"generated_answer = \"Paris\"\n",
"score = evaluate_correctness(question, ground_truth, generated_answer)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score"
]
},
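{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch (not part of the original evaluators) of a complementary,\n",
"# embedding-based correctness signal: cosine similarity between the ground truth\n",
"# and the generated answer. It reuses the OpenAIEmbeddings and numpy imports above;\n",
"# the helper name and the example inputs below are illustrative only.\n",
"embeddings = OpenAIEmbeddings()\n",
"\n",
"def evaluate_answer_similarity(ground_truth, generated_answer):\n",
"    \"\"\"Returns the cosine similarity between the embeddings of the two answers.\"\"\"\n",
"    gt_vec, gen_vec = embeddings.embed_documents([ground_truth, generated_answer])\n",
"    gt_vec, gen_vec = np.array(gt_vec), np.array(gen_vec)\n",
"    return float(np.dot(gt_vec, gen_vec) / (np.linalg.norm(gt_vec) * np.linalg.norm(gen_vec)))\n",
"\n",
"# example usage (uncomment to run):\n",
"# evaluate_answer_similarity(\"Paris and Madrid\", \"The capitals are Paris and Madrid.\")"
]
},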
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"faithfulness_prompt = PromptTemplate(\n",
"input_variables=[\"question\",\"context\", \"generated_answer\"],\n",
"template=\"\"\"\n",
"Question: {question}\n",
"Context: {context}\n",
"Generated Answer: {generated_answer}\n",
"\n",
"Evaluate if the generate answer to the question can be deduced from the context.\n",
"Score of 0 or 1, where 1 is perfectly faithful *AND CAN BE DERIVED FROM THE CONTEXT* and 0 otherwise.\n",
"you don't mind if the answer is correct, all you care about is if the answer can be deduced from the context.\n",
"\n",
"example:\n",
"Question: What are the capitals of France and Spain?\n",
"Context: Paris is the capital of France and Madrid is the capital of Spain.\n",
"Generated Answer: Paris\n",
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
"\n",
"example:\n",
"Question: What are the capital cities of France and Spain?\n",
"Context: London is the capital of France and Barcelona is the capital of Spain.\n",
"Generated Answer: London and Barcelona.\n",
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
"\n",
"example:\n",
"Question: What are the capital cities of France and Spain?\n",
"Context: Paris is the capital of France and Madrid is the capital of Spain.\n",
"Generated Answer: Paris.\n",
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
"\n",
"exmaple:\n",
"Question: What are the capitals of France and Spain?\n",
"Context: London is the capital of France and Madrid is the Capital of Spain.\n",
"Generated Answer: Paris and Madrid.\n",
"in this case the generated answer is based on the pretrained knowledge of the llm and is not faithful to the context so the score should be *0*.\n",
"\n",
"example:\n",
"Question: What is the capital of France and Spain?\n",
"Context: Monkeys like to eat bananas.\n",
"Generated Answer: Paris and Madrid.\n",
"in this case the generated answer is not based on the context so the score should be *0*.\n",
"\n",
"example:\n",
"Question: What is the capital of France?\n",
"Context: Paris.\n",
"Generated Answer: Paris.\n",
"in this case the context doesn't specify that Paris is the capital of France, and it cannot be deduced from the context, so the score should be *0*.\n",
"\n",
"\n",
"Example:\n",
"Question: What is 2+2?\n",
"Context: 4.\n",
"Generated Answer: 4.\n",
"In this case, the context states '4', but it does not provide information to deduce the answer to 'What is 2+2?', so the score should be *0*.\n",
"\"\"\"\n",
")\n",
"faithfulness_chain = faithfulness_prompt | llm.with_structured_output(ResultScore)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_faithfulness(question, context, generated_answer):\n",
" \"\"\"Evaluates if the generate answer to the question can be deduced from the context.\n",
"\n",
" Args:\n",
" question: The question.\n",
" context: The context.\n",
" generated_answer: The generated answer.\n",
"\n",
" Returns:\n",
" A float between 0 and 1, where 1 is the best possible score.\n",
" \"\"\"\n",
" result = faithfulness_chain.invoke({\"question\": question, \"context\": context, \"generated_answer\": generated_answer})\n",
" return result.score, result.explanation"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n",
"The context states '6', but it does not provide information to deduce the answer to 'What is 3+3?'. The answer is correct, but it cannot be derived from the context.\n"
]
}
],
"source": [
"# test create_faithfulness_chain\n",
"question = \"what is 3+3?\"\n",
"context = \"6\"\n",
"generated_answer = \"6\"\n",
"score, explanation = evaluate_faithfulness(question, context, generated_answer)\n",
"print(score)\n",
"print(explanation)"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"from langchain import PromptTemplate\n",
"\n",
"relevancy_score_prompt = PromptTemplate(\n",
" input_variables=[\"question\", \"contexts\"],\n",
" template=\"\"\"\n",
"Q: {question}\n",
"Docs: {contexts}\n",
"\n",
"Score each doc's relevance:\n",
"0.00 - Irrelevant: No relation to the question\n",
"0.33 - Somewhat relevant: Contains related keywords or concepts\n",
"0.66 - Relevant: Partially answers or strongly implies the answer\n",
"1.00 - Highly relevant: Directly and fully answers the question\n",
"\n",
"Consider: Relevance, Directness, Completeness, Accuracy\n",
"\n",
"Final Score: [Average of all scores]\n",
"\"\"\"\n",
")\n",
"ratio_of_relevant_docs_chain = ratio_of_relevant_docs_prompt | llm.with_structured_output(ResultScore)"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_ratio_of_relevant_docs(question, contexts):\n",
" \"\"\"Evaluates the ratio of relevant documents in the contexts to the question.\n",
"\n",
" Args:\n",
" question: The question.\n",
" contexts: A list of documents.\n",
"\n",
" Returns:\n",
" A float between 0 and 1, where 1 is the best possible score.\n",
" \"\"\"\n",
" result = ratio_of_relevant_docs_chain.invoke({\"question\": question, \"contexts\": contexts})\n",
" return result.score"
]
},
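{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A small convenience wrapper (not in the original notebook) that runs the three\n",
"# evaluators defined above on a single RAG output and collects the raw scores.\n",
"# The function name, argument names, and dictionary keys are illustrative only;\n",
"# weighting or aggregation of the scores is left to the caller.\n",
"def evaluate_rag_output(question, ground_truth, generated_answer, retrieved_docs):\n",
"    \"\"\"Returns correctness, faithfulness, and retrieval-relevance scores in one dict.\"\"\"\n",
"    faithfulness_score, _ = evaluate_faithfulness(question, \"\\n\".join(retrieved_docs), generated_answer)\n",
"    return {\n",
"        \"correctness\": evaluate_correctness(question, ground_truth, generated_answer),\n",
"        \"faithfulness\": faithfulness_score,\n",
"        \"relevant_docs_ratio\": evaluate_ratio_of_relevant_docs(question, retrieved_docs),\n",
"    }\n",
"\n",
"# example usage (uncomment to run):\n",
"# evaluate_rag_output(\"What is the capital of France?\", \"Paris\", \"Paris\", [\"Paris is the capital of France.\"])"
]
},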
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n"
]
}
],
"source": [
"# test create_ratio_of_relevant_docs_chain\n",
"question = \"What is the capital of France?\"\n",
"contexts = [\"Paris.\", \"i was traveling in France.\"]\n",
"score = evaluate_ratio_of_relevant_docs(question, contexts)\n",
"# score, explanation = evaluate_ratio_of_relevant_docs(question, contexts)\n",
"print(score)\n",
"# print(explanation)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}