mirror of
https://github.com/NirDiamant/RAG_Techniques.git
synced 2025-04-07 00:48:52 +03:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"from langchain.chains import LLMChain\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.evaluation import load_evaluator\n",
"from langchain_core.pydantic_v1 import BaseModel, Field\n",
"\n",
"# from langchain.evaluation.criteria import (\n",
"#     CriteriaEvalChain,\n",
"#     LabeledCriteriaEvalChain\n",
"# )\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import FAISS\n",
"# from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import os\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"llm = ChatOpenAI(temperature=0, model_name=\"gpt-4o\", max_tokens=4000)\n"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"class ResultScore(BaseModel):\n",
"    score: float = Field(..., description=\"The score of the result, ranging from 0 to 1 where 1 is the best possible score.\")\n",
"    explanation: str = Field(..., description=\"An extensive explanation of the score.\")  # needed by evaluate_faithfulness, which returns result.explanation\n"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"correctness_prompt = PromptTemplate(\n",
"    input_variables=[\"question\", \"ground_truth\", \"generated_answer\"],\n",
"    template=\"\"\"\n",
"Question: {question}\n",
"Ground Truth: {ground_truth}\n",
"Generated Answer: {generated_answer}\n",
"\n",
"Evaluate the correctness of the generated answer compared to the ground truth.\n",
"Score from 0 to 1, where 1 is perfectly correct and 0 is completely incorrect.\n",
"Any score between 0 and 1 is acceptable, depending on how correct the generated answer is.\n",
"\n",
"Score:\n",
"\"\"\"\n",
")\n",
"correctness_chain = correctness_prompt | llm.with_structured_output(ResultScore)\n",
"\n",
"\n",
"def evaluate_correctness(question, ground_truth, generated_answer):\n",
"    \"\"\"Evaluates the correctness of the generated answer compared to the ground truth.\n",
"\n",
"    Args:\n",
"        question: The question.\n",
"        ground_truth: The ground truth answer.\n",
"        generated_answer: The generated answer.\n",
"\n",
"    Returns:\n",
"        A float between 0 and 1, where 1 is the best possible score.\n",
"    \"\"\"\n",
"    result = correctness_chain.invoke({\"question\": question, \"ground_truth\": ground_truth, \"generated_answer\": generated_answer})\n",
"    return result.score\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# test create_correctness_chain\n",
"question = \"What is the capital of France and Spain?\"\n",
"ground_truth = \"Paris and Madrid\"\n",
"generated_answer = \"Paris\"\n",
"score = evaluate_correctness(question, ground_truth, generated_answer)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score"
]
},
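{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch of the full-credit end of the rubric: per the correctness prompt,\n",
"# an answer that exactly matches the ground truth should score 1.0. The question, ground\n",
"# truth, and variable name below are illustrative additions; this cell has not been\n",
"# executed, so no output is recorded.\n",
"score_full = evaluate_correctness(\n",
"    question=\"What is the capital of France?\",\n",
"    ground_truth=\"Paris\",\n",
"    generated_answer=\"Paris\",\n",
")\n",
"score_full"
]
},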
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"faithfulness_prompt = PromptTemplate(\n",
"    input_variables=[\"question\", \"context\", \"generated_answer\"],\n",
"    template=\"\"\"\n",
"Question: {question}\n",
"Context: {context}\n",
"Generated Answer: {generated_answer}\n",
"\n",
"Evaluate whether the generated answer to the question can be deduced from the context.\n",
"Score 0 or 1, where 1 means the answer is faithful *AND CAN BE DERIVED FROM THE CONTEXT* and 0 otherwise.\n",
"Do not consider whether the answer is factually correct; all that matters is whether it can be deduced from the context.\n",
"\n",
"Example:\n",
"Question: What are the capitals of France and Spain?\n",
"Context: Paris is the capital of France and Madrid is the capital of Spain.\n",
"Generated Answer: Paris\n",
"In this case the generated answer is faithful to the context, so the score should be *1*.\n",
"\n",
"Example:\n",
"Question: What are the capital cities of France and Spain?\n",
"Context: London is the capital of France and Barcelona is the capital of Spain.\n",
"Generated Answer: London and Barcelona.\n",
"In this case the generated answer is faithful to the context (even though the context is factually wrong), so the score should be *1*.\n",
"\n",
"Example:\n",
"Question: What are the capital cities of France and Spain?\n",
"Context: Paris is the capital of France and Madrid is the capital of Spain.\n",
"Generated Answer: Paris.\n",
"In this case the generated answer is faithful to the context, so the score should be *1*.\n",
"\n",
"Example:\n",
"Question: What are the capitals of France and Spain?\n",
"Context: London is the capital of France and Madrid is the capital of Spain.\n",
"Generated Answer: Paris and Madrid.\n",
"In this case the generated answer is based on the pretrained knowledge of the LLM and is not faithful to the context, so the score should be *0*.\n",
"\n",
"Example:\n",
"Question: What is the capital of France and Spain?\n",
"Context: Monkeys like to eat bananas.\n",
"Generated Answer: Paris and Madrid.\n",
"In this case the generated answer is not based on the context, so the score should be *0*.\n",
"\n",
"Example:\n",
"Question: What is the capital of France?\n",
"Context: Paris.\n",
"Generated Answer: Paris.\n",
"In this case the context doesn't specify that Paris is the capital of France, and it cannot be deduced from the context, so the score should be *0*.\n",
"\n",
"Example:\n",
"Question: What is 2+2?\n",
"Context: 4.\n",
"Generated Answer: 4.\n",
"In this case, the context states '4', but it does not provide information to deduce the answer to 'What is 2+2?', so the score should be *0*.\n",
"\"\"\"\n",
")\n",
"faithfulness_chain = faithfulness_prompt | llm.with_structured_output(ResultScore)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_faithfulness(question, context, generated_answer):\n",
"    \"\"\"Evaluates whether the generated answer to the question can be deduced from the context.\n",
"\n",
"    Args:\n",
"        question: The question.\n",
"        context: The context.\n",
"        generated_answer: The generated answer.\n",
"\n",
"    Returns:\n",
"        A tuple (score, explanation), where score is 1.0 if the answer can be deduced from the context and 0.0 otherwise.\n",
"    \"\"\"\n",
"    result = faithfulness_chain.invoke({\"question\": question, \"context\": context, \"generated_answer\": generated_answer})\n",
"    return result.score, result.explanation"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n",
"The context states '6', but it does not provide information to deduce the answer to 'What is 3+3?'. The answer is correct, but it cannot be derived from the context.\n"
]
}
],
"source": [
"# test create_faithfulness_chain\n",
"question = \"what is 3+3?\"\n",
"context = \"6\"\n",
"generated_answer = \"6\"\n",
"score, explanation = evaluate_faithfulness(question, context, generated_answer)\n",
"print(score)\n",
"print(explanation)"
]
},
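{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch of the opposite case: an answer that *is* grounded in the context.\n",
"# The question, context, and answer are taken from the first example in the faithfulness\n",
"# prompt, which states that this case should score *1*. The variable names are illustrative\n",
"# and this cell has not been executed, so no output is recorded.\n",
"faithful_score, faithful_explanation = evaluate_faithfulness(\n",
"    question=\"What are the capitals of France and Spain?\",\n",
"    context=\"Paris is the capital of France and Madrid is the capital of Spain.\",\n",
"    generated_answer=\"Paris\",\n",
")\n",
"print(faithful_score)\n",
"print(faithful_explanation)"
]
},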
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"from langchain.prompts import PromptTemplate\n",
"\n",
"relevancy_score_prompt = PromptTemplate(\n",
"    input_variables=[\"question\", \"contexts\"],\n",
"    template=\"\"\"\n",
"Q: {question}\n",
"Docs: {contexts}\n",
"\n",
"Score each doc's relevance:\n",
"0.00 - Irrelevant: No relation to the question\n",
"0.33 - Somewhat relevant: Contains related keywords or concepts\n",
"0.66 - Relevant: Partially answers or strongly implies the answer\n",
"1.00 - Highly relevant: Directly and fully answers the question\n",
"\n",
"Consider: Relevance, Directness, Completeness, Accuracy\n",
"\n",
"Final Score: [Average of all scores]\n",
"\"\"\"\n",
")\n",
"# Chain that scores each retrieved document's relevance and averages the scores.\n",
"ratio_of_relevant_docs_chain = relevancy_score_prompt | llm.with_structured_output(ResultScore)"
]
},
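{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick arithmetic illustration of the rubric above: the final score is the average of the\n",
"# per-document relevance scores. The per-doc scores below are assumed values chosen for the\n",
"# illustration, not output produced by the chain.\n",
"assumed_per_doc_scores = [1.00, 0.33]  # e.g. one highly relevant doc and one somewhat relevant doc\n",
"expected_final_score = float(np.mean(assumed_per_doc_scores))\n",
"expected_final_score  # 0.665 under these assumed per-doc scores\n"
]
},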
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_ratio_of_relevant_docs(question, contexts):\n",
"    \"\"\"Evaluates how relevant the documents in the contexts are to the question.\n",
"\n",
"    Args:\n",
"        question: The question.\n",
"        contexts: A list of documents.\n",
"\n",
"    Returns:\n",
"        A float between 0 and 1 (the average per-document relevance score), where 1 is the best possible score.\n",
"    \"\"\"\n",
"    result = ratio_of_relevant_docs_chain.invoke({\"question\": question, \"contexts\": contexts})\n",
"    return result.score"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n"
]
}
],
"source": [
"# test create_ratio_of_relevant_docs_chain\n",
"question = \"What is the capital of France?\"\n",
"contexts = [\"Paris.\", \"I was traveling in France.\"]\n",
"score = evaluate_ratio_of_relevant_docs(question, contexts)\n",
"# score, explanation = evaluate_ratio_of_relevant_docs(question, contexts)\n",
"print(score)\n",
"# print(explanation)\n"
]
}
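,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch tying the three evaluators together. evaluate_rag_answer is a hypothetical\n",
"# helper name introduced here for illustration; it simply calls the evaluators defined above\n",
"# on a single question/contexts/answer triple. Not executed, so no output is recorded.\n",
"def evaluate_rag_answer(question, contexts, ground_truth, generated_answer):\n",
"    # join the retrieved docs into one context string for the faithfulness check\n",
"    faithfulness_score, _ = evaluate_faithfulness(question, \" \".join(contexts), generated_answer)\n",
"    return {\n",
"        \"correctness\": evaluate_correctness(question, ground_truth, generated_answer),\n",
"        \"faithfulness\": faithfulness_score,\n",
"        \"relevance\": evaluate_ratio_of_relevant_docs(question, contexts),\n",
"    }\n"
]
}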
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}