updated code

nird
2025-02-02 12:24:43 +02:00
parent f50f0e4373
commit 76a529eccf
5 changed files with 185 additions and 34 deletions

View File

@@ -1,5 +1,6 @@
FORM 10-K FORM 10-KUNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C.
Washington, D.C. 20549
FORM 10-K
(Mark One)

View File

@@ -14,12 +14,14 @@ Custom modules:
"""
import json
from typing import List, Tuple
from typing import List, Tuple, Dict, Any
from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
# 09/15/24 kimmeyh Added the directory where the helper functions are located to the path
# Add the parent directory to the path since we work with notebooks
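(The sys.path line itself falls just outside this hunk. A minimal sketch of the setup this comment describes, assuming the evaluation module sits one directory below the helper functions; the exact line in the file may differ:)

import os
import sys

# Make the parent directory (where the helper functions live) importable from a notebook
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))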
@@ -90,41 +92,75 @@ relevance_metric = ContextualRelevancyMetric(
    include_reason=True
)
def evaluate_rag(chunks_query_retriever, num_questions: int = 5) -> None:
def evaluate_rag(retriever, num_questions: int = 5) -> Dict[str, Any]:
    """
    Evaluate the RAG system using predefined metrics.
    Evaluates a RAG system using LLM-generated test questions and predefined metrics.
    Args:
        chunks_query_retriever: Function to retrieve context chunks for a given query.
        num_questions (int): Number of questions to evaluate (default: 5).
        retriever: The retriever component to evaluate
        num_questions: Number of test questions to generate
    Returns:
        Dict containing evaluation metrics
    """
llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=2000)
question_answer_from_context_chain = create_question_answer_from_context_chain(llm)
# Load questions and answers from JSON file
q_a_file_name = "../data/q_a.json"
with open(q_a_file_name, "r", encoding="utf-8") as json_file:
q_a = json.load(json_file)
questions = [qa["question"] for qa in q_a][:num_questions]
ground_truth_answers = [qa["answer"] for qa in q_a][:num_questions]
generated_answers = []
retrieved_documents = []
# Generate answers and retrieve documents for each question
for question in questions:
context = retrieve_context_per_question(question, chunks_query_retriever)
retrieved_documents.append(context)
context_string = " ".join(context)
result = answer_question_from_context(question, context_string, question_answer_from_context_chain)
generated_answers.append(result["answer"])
# Create test cases and evaluate
test_cases = create_deep_eval_test_cases(questions, ground_truth_answers, generated_answers, retrieved_documents)
evaluate(
test_cases=test_cases,
metrics=[correctness_metric, faithfulness_metric, relevance_metric]
    # Initialize LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo-preview")
    # Create evaluation prompt
    eval_prompt = PromptTemplate.from_template("""
Evaluate the following retrieval results for the question.
Question: {question}
Retrieved Context: {context}
Rate on a scale of 1-5 (5 being best) for:
1. Relevance: How relevant is the retrieved information to the question?
2. Completeness: Does the context contain all necessary information?
3. Conciseness: Is the retrieved context focused and free of irrelevant information?
Provide ratings in JSON format:
""")
    # Create evaluation chain
    eval_chain = (
        eval_prompt
        | llm
        | StrOutputParser()
    )
    # Generate test questions
    question_gen_prompt = PromptTemplate.from_template(
        "Generate {num_questions} diverse test questions about climate change:"
    )
    question_chain = question_gen_prompt | llm | StrOutputParser()
    questions = question_chain.invoke({"num_questions": num_questions}).split("\n")
    # Evaluate each question
    results = []
    for question in questions:
        # Get retrieval results
        context = retriever.get_relevant_documents(question)
        context_text = "\n".join([doc.page_content for doc in context])
        # Evaluate results
        eval_result = eval_chain.invoke({
            "question": question,
            "context": context_text
        })
        results.append(eval_result)
    return {
        "questions": questions,
        "results": results,
        "average_scores": calculate_average_scores(results)
    }
def calculate_average_scores(results: List[Dict]) -> Dict[str, float]:
    """Calculate average scores across all evaluation results."""
    # Implementation depends on the exact format of your results
    pass
if __name__ == "__main__":
    # Add any necessary setup or configuration here
    pass
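(The commit leaves calculate_average_scores as a pass stub, the judge prompt never pins down which JSON keys to return, and the List[Dict] annotation does not match what is actually appended, since eval_chain returns raw strings via StrOutputParser. Below is a minimal sketch of how the stub could be filled in, assuming the judge replies with a JSON object keyed "relevance", "completeness" and "conciseness"; those key names are an assumption, not something this commit specifies.)

import json
import re
from typing import Dict, List

def calculate_average_scores(results: List[str]) -> Dict[str, float]:
    """Average the per-question 1-5 ratings returned by the judge LLM."""
    totals: Dict[str, float] = {}
    counts: Dict[str, int] = {}
    for raw in results:
        match = re.search(r"\{.*\}", raw, re.DOTALL)  # tolerate prose around the JSON
        if not match:
            continue
        try:
            scores = json.loads(match.group(0))
        except json.JSONDecodeError:
            continue
        for key, value in scores.items():
            if isinstance(value, (int, float)):  # assumed keys: relevance, completeness, conciseness
                totals[key] = totals.get(key, 0.0) + float(value)
                counts[key] = counts.get(key, 0) + 1
    return {key: totals[key] / counts[key] for key in totals}

(With a retriever in hand, the entry point would then be something like evaluate_rag(vector_store.as_retriever(), num_questions=5).)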

View File

@@ -17,7 +17,7 @@ from enum import Enum
def replace_t_with_space(list_of_documents):
    """
    Replaces all tab characters ('\t') with spaces in the page content of each document.
    Replaces all tab characters ('\t') with spaces in the page content of each document
    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.
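(This hunk only touches the docstring. For readers without the file open, a body matching that description would look roughly like the following sketch; it is not necessarily the file's exact implementation.)

def replace_t_with_space(list_of_documents):
    """Replaces all tab characters ('\t') with spaces in the page content of each document."""
    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace("\t", " ")
    return list_of_documents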

View File

@@ -208,3 +208,55 @@ nbformat==5.10.4
xxhash==3.5.0
yarl==1.10.0
zipp==3.20.1
# Core LangChain packages
langchain>=0.1.0
langchain-core>=0.1.17
langchain-community>=0.0.13
langchain-openai>=0.0.5
langchain-anthropic>=0.0.9
langchain-groq>=0.0.1
langchain-cohere>=0.0.1
# Vector stores and embeddings
faiss-cpu>=1.7.4
chromadb>=0.4.22
# Document processing
PyMuPDF>=1.23.8 # for fitz
python-docx>=1.0.1
pypdf>=3.17.4
rank-bm25>=0.2.2
# Machine Learning and Data Science
numpy>=1.24.3
pandas>=2.0.3
scikit-learn>=1.3.0
# API Clients
openai>=1.12.0
anthropic>=0.8.1
cohere>=4.48
groq>=0.4.2
# Testing and Evaluation
pytest>=7.4.0
deepeval>=0.20.12
grouse>=0.3.0
# Development Tools
python-dotenv>=1.0.0
jupyter>=1.0.0
notebook>=7.0.6
ipykernel>=6.29.2
# Type Checking
pydantic>=2.6.1
typing-extensions>=4.9.0
# Async Support
aiohttp>=3.9.1
# asyncio ships with the Python 3 standard library; the PyPI backport is not needed
# Utilities
tqdm>=4.66.1

View File

@@ -1,10 +1,18 @@
import pytest
import os
import sys
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import CharacterTextSplitter
from dotenv import load_dotenv
# Add the main folder to sys.path
sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../"))
# Load environment variables
load_dotenv()
def pytest_addoption(parser):
    parser.addoption(
        "--exclude", action="store", help="Comma-separated list of notebook or script files' paths to exclude"
@@ -40,4 +48,58 @@ def script_paths(request):
        path_with_full_address = [folder + s for s in include_scripts]
        return path_with_full_address
    return path_with_full_address
@pytest.fixture(scope="session")
def llm():
    """Fixture for ChatOpenAI model."""
    return ChatOpenAI(
        temperature=0,
        model_name="gpt-4-turbo-preview",
        max_tokens=4000
    )
@pytest.fixture(scope="session")
def embeddings():
    """Fixture for OpenAI embeddings."""
    return OpenAIEmbeddings()
@pytest.fixture(scope="session")
def text_splitter():
    """Fixture for text splitter."""
    return CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
@pytest.fixture(scope="session")
def sample_texts():
    """Fixture for sample test data."""
    return [
        "The Earth is the third planet from the Sun.",
        "Climate change is a significant global challenge.",
        "Renewable energy sources include solar and wind power."
    ]
@pytest.fixture(scope="session")
def vector_store(embeddings, sample_texts, text_splitter):
    """Fixture for vector store."""
    docs = text_splitter.create_documents(sample_texts)
    return FAISS.from_documents(docs, embeddings)
@pytest.fixture(scope="session")
def retriever(vector_store):
    """Fixture for retriever."""
    return vector_store.as_retriever(search_kwargs={"k": 2})
@pytest.fixture(scope="session")
def basic_prompt():
    """Fixture for basic prompt template."""
    return PromptTemplate.from_template("""
Answer the following question based on the context provided:
Context: {context}
Question: {question}
Answer:
""")