"""
|
|
RAG Evaluation Script
|
|
|
|
This script evaluates the performance of a Retrieval-Augmented Generation (RAG) system
|
|
using various metrics from the deepeval library.
|
|
|
|
Dependencies:
|
|
- deepeval
|
|
- langchain_openai
|
|
- json
|
|
|
|
Custom modules:
|
|
- helper_functions (for RAG-specific operations)
|
|
"""
|
|
|
|
import json
from typing import List, Tuple, Dict, Any

from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# 09/15/24 kimmeyh Added path where helper functions is located to the path
# Add the parent directory to the path since we work with notebooks
import sys
import os

current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from helper_functions import (
    create_question_answer_from_context_chain,
    answer_question_from_context,
    retrieve_context_per_question
)


def create_deep_eval_test_cases(
    questions: List[str],
    gt_answers: List[str],
    generated_answers: List[str],
    retrieved_documents: List[str]
) -> List[LLMTestCase]:
    """
    Create a list of LLMTestCase objects for evaluation.

    Args:
        questions (List[str]): List of input questions.
        gt_answers (List[str]): List of ground truth answers.
        generated_answers (List[str]): List of generated answers.
        retrieved_documents (List[str]): List of retrieved documents.

    Returns:
        List[LLMTestCase]: List of LLMTestCase objects.
    """
    return [
        LLMTestCase(
            input=question,
            expected_output=gt_answer,
            actual_output=generated_answer,
            retrieval_context=retrieved_document
        )
        for question, gt_answer, generated_answer, retrieved_document in zip(
            questions, gt_answers, generated_answers, retrieved_documents
        )
    ]


# Define evaluation metrics
correctness_metric = GEval(
    name="Correctness",
    model="gpt-4o",
    evaluation_params=[
        LLMTestCaseParams.EXPECTED_OUTPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT
    ],
    evaluation_steps=[
        "Determine whether the actual output is factually correct based on the expected output."
    ],
)

faithfulness_metric = FaithfulnessMetric(
    threshold=0.7,
    model="gpt-4",
    include_reason=False
)

relevance_metric = ContextualRelevancyMetric(
    threshold=1,
    model="gpt-4",
    include_reason=True
)
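

# The deepeval metrics above are defined here but are not used by evaluate_rag()
# below, which scores retrieval with a prompt-based rubric instead. The function
# below is a minimal sketch of how the metrics could be combined with
# create_deep_eval_test_cases() and deepeval's evaluate(); the four input lists
# are assumed to be produced elsewhere (for example with the helper_functions
# imported at the top of this script).
def run_deepeval_metrics(
    questions: List[str],
    gt_answers: List[str],
    generated_answers: List[str],
    retrieved_documents: List[str]
):
    """Run the deepeval metrics over pre-computed RAG outputs (illustrative sketch)."""
    test_cases = create_deep_eval_test_cases(
        questions, gt_answers, generated_answers, retrieved_documents
    )
    return evaluate(
        test_cases=test_cases,
        metrics=[correctness_metric, faithfulness_metric, relevance_metric]
    )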


def evaluate_rag(retriever, num_questions: int = 5) -> Dict[str, Any]:
    """
    Evaluate a RAG system by generating test questions, retrieving context for each,
    and scoring the retrieval with an LLM-based rubric.

    Args:
        retriever: The retriever component to evaluate.
        num_questions: Number of test questions to generate.

    Returns:
        Dict containing the generated questions, the per-question evaluation results,
        and the average scores.
    """
    # Initialize LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo-preview")

    # Create evaluation prompt
    eval_prompt = PromptTemplate.from_template("""
    Evaluate the following retrieval results for the question.

    Question: {question}
    Retrieved Context: {context}

    Rate on a scale of 1-5 (5 being best) for:
    1. Relevance: How relevant is the retrieved information to the question?
    2. Completeness: Does the context contain all necessary information?
    3. Conciseness: Is the retrieved context focused and free of irrelevant information?

    Provide ratings in JSON format:
    """)

    # Create evaluation chain
    eval_chain = (
        eval_prompt
        | llm
        | StrOutputParser()
    )

    # Generate test questions
    question_gen_prompt = PromptTemplate.from_template(
        "Generate {num_questions} diverse test questions about climate change:"
    )
    question_chain = question_gen_prompt | llm | StrOutputParser()

    # Split the model output into individual questions, dropping any blank lines
    questions = [
        q.strip()
        for q in question_chain.invoke({"num_questions": num_questions}).split("\n")
        if q.strip()
    ]

    # Evaluate each question
    results = []
    for question in questions:
        # Get retrieval results
        context = retriever.get_relevant_documents(question)
        context_text = "\n".join([doc.page_content for doc in context])

        # Evaluate results
        eval_result = eval_chain.invoke({
            "question": question,
            "context": context_text
        })
        results.append(eval_result)

    return {
        "questions": questions,
        "results": results,
        "average_scores": calculate_average_scores(results)
    }


def calculate_average_scores(results: List[str]) -> Dict[str, float]:
    """Calculate average scores across all evaluation results.

    Assumes each result is a JSON string of numeric ratings (as requested by the
    evaluation prompt), e.g. '{"Relevance": 4, "Completeness": 3, "Conciseness": 5}';
    adjust the parsing if your model returns a different format.
    """
    parsed = [json.loads(result) for result in results]
    if not parsed:
        return {}
    return {key: sum(scores[key] for scores in parsed) / len(parsed) for key in parsed[0]}


if __name__ == "__main__":
    # Add any necessary setup or configuration here
    # Example: evaluate_rag(your_chunks_query_retriever_function)
    pass
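
# Usage sketch (comments only, not executed): evaluate_rag() needs a LangChain-style
# retriever exposing get_relevant_documents(). The FAISS / OpenAIEmbeddings
# construction below is an illustrative assumption; it requires the optional
# langchain_community and faiss-cpu packages and an OPENAI_API_KEY in the environment.
#
#   from langchain_openai import OpenAIEmbeddings
#   from langchain_community.vectorstores import FAISS
#
#   chunks = ["Climate change is driven by greenhouse gas emissions.", "..."]
#   retriever = FAISS.from_texts(chunks, OpenAIEmbeddings()).as_retriever()
#   report = evaluate_rag(retriever, num_questions=3)
#   print(report["average_scores"])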