RAG_Techniques/evaluation/define_evaluation_metrics.ipynb


{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI \n",
"from langchain.chains import LLMChain\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.evaluation import load_evaluator\n",
"from langchain_core.pydantic_v1 import BaseModel, Field\n",
"\n",
"# from langchain.evaluation.criteria import {\n",
"# CriteriaEvalChain,\n",
"# LabeledCriteriaEvalChain\n",
"# }\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import FAISS\n",
"# from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import os\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"llm = ChatOpenAI(temperature=0, model_name=\"gpt-4o\", max_tokens=4000)\n"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"class ResultScore(BaseModel):\n",
" score: float = Field(..., description=\"The score of the result, ranging from 0 to 1 where 1 is the best possible score.\")\n",
" # explanation: str = Field(..., description=\"An extensive explanation of the score.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"correctness_prompt = PromptTemplate(\n",
"input_variables=[\"question\", \"ground_truth\", \"generated_answer\"],\n",
"template=\"\"\"\n",
"Question: {question}\n",
"Ground Truth: {ground_truth}\n",
"Generated Answer: {generated_answer}\n",
"\n",
"Evaluate the correctness of the generated answer compared to the ground truth.\n",
"Score from 0 to 1, where 1 is perfectly correct and 0 is completely incorrect.\n",
"any score between 0 and 1 is acceptable and depends on how correct the generated answer is.\n",
"\n",
"Score:\n",
"\"\"\"\n",
")\n",
"correctness_chain = correctness_prompt | llm.with_structured_output(ResultScore)\n",
"\n",
"\n",
"def evaluate_correctness(question, ground_truth, generated_answer):\n",
" \"\"\"Evaluates the correctness of the generated answer compared to the ground truth.\n",
"\n",
" Args:\n",
" question: The question.\n",
" ground_truth: The ground truth answer.\n",
" generated_answer: The generated answer.\n",
"\n",
" Returns:\n",
" A float between 0 and 1, where 1 is the best possible score.\n",
" \"\"\"\n",
" result = correctness_chain.invoke({\"question\": question, \"ground_truth\": ground_truth, \"generated_answer\": generated_answer})\n",
" return result.score\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# test create_correctness_chain\n",
"question = \"What is the capital of France and Spain?\"\n",
"ground_truth = \"Paris and Barcelona\"\n",
"generated_answer = \"Paris\"\n",
"score = evaluate_correctness(question, ground_truth, generated_answer)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score"
]
},
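{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch (not part of the original evaluators) of a complementary,\n",
"# embedding-based correctness signal: cosine similarity between the ground truth\n",
"# and the generated answer. It reuses the OpenAIEmbeddings and numpy imports above;\n",
"# the helper name and the example inputs below are illustrative only.\n",
"embeddings = OpenAIEmbeddings()\n",
"\n",
"def evaluate_answer_similarity(ground_truth, generated_answer):\n",
"    \"\"\"Returns the cosine similarity between the embeddings of the two answers.\"\"\"\n",
"    gt_vec, gen_vec = embeddings.embed_documents([ground_truth, generated_answer])\n",
"    gt_vec, gen_vec = np.array(gt_vec), np.array(gen_vec)\n",
"    return float(np.dot(gt_vec, gen_vec) / (np.linalg.norm(gt_vec) * np.linalg.norm(gen_vec)))\n",
"\n",
"# example usage (uncomment to run):\n",
"# evaluate_answer_similarity(\"Paris and Madrid\", \"The capitals are Paris and Madrid.\")"
]
},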
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"faithfulness_prompt = PromptTemplate(\n",
"input_variables=[\"question\",\"context\", \"generated_answer\"],\n",
"template=\"\"\"\n",
"Question: {question}\n",
"Context: {context}\n",
"Generated Answer: {generated_answer}\n",
"\n",
"Evaluate if the generate answer to the question can be deduced from the context.\n",
"Score of 0 or 1, where 1 is perfectly faithful *AND CAN BE DERIVED FROM THE CONTEXT* and 0 otherwise.\n",
"you don't mind if the answer is correct, all you care about is if the answer can be deduced from the context.\n",
"\n",
"example:\n",
"Question: What are the capitals of France and Spain?\n",
"Context: Paris is the capital of France and Madrid is the capital of Spain.\n",
"Generated Answer: Paris\n",
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
"\n",
"example:\n",
"Question: What are the capital cities of France and Spain?\n",
"Context: London is the capital of France and Barcelona is the capital of Spain.\n",
"Generated Answer: London and Barcelona.\n",
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
"\n",
"example:\n",
"Question: What are the capital cities of France and Spain?\n",
"Context: Paris is the capital of France and Madrid is the capital of Spain.\n",
"Generated Answer: Paris.\n",
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
"\n",
"exmaple:\n",
"Question: What are the capitals of France and Spain?\n",
"Context: London is the capital of France and Madrid is the Capital of Spain.\n",
"Generated Answer: Paris and Madrid.\n",
"in this case the generated answer is based on the pretrained knowledge of the llm and is not faithful to the context so the score should be *0*.\n",
"\n",
"example:\n",
"Question: What is the capital of France and Spain?\n",
"Context: Monkeys like to eat bananas.\n",
"Generated Answer: Paris and Madrid.\n",
"in this case the generated answer is not based on the context so the score should be *0*.\n",
"\n",
"example:\n",
"Question: What is the capital of France?\n",
"Context: Paris.\n",
"Generated Answer: Paris.\n",
"in this case the context doesn't specify that Paris is the capital of France, and it cannot be deduced from the context, so the score should be *0*.\n",
"\n",
"\n",
"Example:\n",
"Question: What is 2+2?\n",
"Context: 4.\n",
"Generated Answer: 4.\n",
"In this case, the context states '4', but it does not provide information to deduce the answer to 'What is 2+2?', so the score should be *0*.\n",
"\"\"\"\n",
")\n",
"faithfulness_chain = faithfulness_prompt | llm.with_structured_output(ResultScore)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_faithfulness(question, context, generated_answer):\n",
" \"\"\"Evaluates if the generate answer to the question can be deduced from the context.\n",
"\n",
" Args:\n",
" question: The question.\n",
" context: The context.\n",
" generated_answer: The generated answer.\n",
"\n",
" Returns:\n",
" A float between 0 and 1, where 1 is the best possible score.\n",
" \"\"\"\n",
" result = faithfulness_chain.invoke({\"question\": question, \"context\": context, \"generated_answer\": generated_answer})\n",
" return result.score, result.explanation"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n",
"The context states '6', but it does not provide information to deduce the answer to 'What is 3+3?'. The answer is correct, but it cannot be derived from the context.\n"
]
}
],
"source": [
"# test create_faithfulness_chain\n",
"question = \"what is 3+3?\"\n",
"context = \"6\"\n",
"generated_answer = \"6\"\n",
"score, explanation = evaluate_faithfulness(question, context, generated_answer)\n",
"print(score)\n",
"print(explanation)"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"from langchain import PromptTemplate\n",
"\n",
"relevancy_score_prompt = PromptTemplate(\n",
" input_variables=[\"question\", \"contexts\"],\n",
" template=\"\"\"\n",
"Q: {question}\n",
"Docs: {contexts}\n",
"\n",
"Score each doc's relevance:\n",
"0.00 - Irrelevant: No relation to the question\n",
"0.33 - Somewhat relevant: Contains related keywords or concepts\n",
"0.66 - Relevant: Partially answers or strongly implies the answer\n",
"1.00 - Highly relevant: Directly and fully answers the question\n",
"\n",
"Consider: Relevance, Directness, Completeness, Accuracy\n",
"\n",
"Final Score: [Average of all scores]\n",
"\"\"\"\n",
")\n",
"ratio_of_relevant_docs_chain = ratio_of_relevant_docs_prompt | llm.with_structured_output(ResultScore)"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_ratio_of_relevant_docs(question, contexts):\n",
" \"\"\"Evaluates the ratio of relevant documents in the contexts to the question.\n",
"\n",
" Args:\n",
" question: The question.\n",
" contexts: A list of documents.\n",
"\n",
" Returns:\n",
" A float between 0 and 1, where 1 is the best possible score.\n",
" \"\"\"\n",
" result = ratio_of_relevant_docs_chain.invoke({\"question\": question, \"contexts\": contexts})\n",
" return result.score"
]
},
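{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A small convenience wrapper (not in the original notebook) that runs the three\n",
"# evaluators defined above on a single RAG output and collects the raw scores.\n",
"# The function name, argument names, and dictionary keys are illustrative only;\n",
"# weighting or aggregation of the scores is left to the caller.\n",
"def evaluate_rag_output(question, ground_truth, generated_answer, retrieved_docs):\n",
"    \"\"\"Returns correctness, faithfulness, and retrieval-relevance scores in one dict.\"\"\"\n",
"    faithfulness_score, _ = evaluate_faithfulness(question, \"\\n\".join(retrieved_docs), generated_answer)\n",
"    return {\n",
"        \"correctness\": evaluate_correctness(question, ground_truth, generated_answer),\n",
"        \"faithfulness\": faithfulness_score,\n",
"        \"relevant_docs_ratio\": evaluate_ratio_of_relevant_docs(question, retrieved_docs),\n",
"    }\n",
"\n",
"# example usage (uncomment to run):\n",
"# evaluate_rag_output(\"What is the capital of France?\", \"Paris\", \"Paris\", [\"Paris is the capital of France.\"])"
]
},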
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n"
]
}
],
"source": [
"# test create_ratio_of_relevant_docs_chain\n",
"question = \"What is the capital of France?\"\n",
"contexts = [\"Paris.\", \"i was traveling in France.\"]\n",
"score = evaluate_ratio_of_relevant_docs(question, contexts)\n",
"# score, explanation = evaluate_ratio_of_relevant_docs(question, contexts)\n",
"print(score)\n",
"# print(explanation)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}