mirror of https://github.com/NirDiamant/RAG_Techniques.git (synced 2025-04-07 00:48:52 +03:00)

updated code
@@ -1,5 +1,6 @@
FORM 10-K
FORM 10-K
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C.
Washington, D.C. 20549
FORM 10-K
(Mark One)
@@ -14,12 +14,14 @@ Custom modules:
"""

import json
from typing import List, Tuple
from typing import List, Tuple, Dict, Any

from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# 09/15/24 kimmeyh Added the directory where the helper functions are located to the path
# Add the parent directory to the path since we work with notebooks

@@ -90,41 +92,75 @@ relevance_metric = ContextualRelevancyMetric(
    include_reason=True
)
def evaluate_rag(chunks_query_retriever, num_questions: int = 5) -> None:
def evaluate_rag(retriever, num_questions: int = 5) -> Dict[str, Any]:
    """
    Evaluate the RAG system using predefined metrics.
    Evaluates a RAG system using predefined test questions and metrics.

    Args:
        chunks_query_retriever: Function to retrieve context chunks for a given query.
        num_questions (int): Number of questions to evaluate (default: 5).
        retriever: The retriever component to evaluate
        num_questions: Number of test questions to generate

    Returns:
        Dict containing evaluation metrics
    """
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=2000)
    question_answer_from_context_chain = create_question_answer_from_context_chain(llm)

    # Load questions and answers from JSON file
    q_a_file_name = "../data/q_a.json"
    with open(q_a_file_name, "r", encoding="utf-8") as json_file:
        q_a = json.load(json_file)

    questions = [qa["question"] for qa in q_a][:num_questions]
    ground_truth_answers = [qa["answer"] for qa in q_a][:num_questions]
    generated_answers = []
    retrieved_documents = []

    # Generate answers and retrieve documents for each question
    for question in questions:
        context = retrieve_context_per_question(question, chunks_query_retriever)
        retrieved_documents.append(context)
        context_string = " ".join(context)
        result = answer_question_from_context(question, context_string, question_answer_from_context_chain)
        generated_answers.append(result["answer"])

    # Create test cases and evaluate
    test_cases = create_deep_eval_test_cases(questions, ground_truth_answers, generated_answers, retrieved_documents)
    evaluate(
        test_cases=test_cases,
        metrics=[correctness_metric, faithfulness_metric, relevance_metric]
    )

    # Initialize LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo-preview")

    # Create evaluation prompt
    eval_prompt = PromptTemplate.from_template("""
Evaluate the following retrieval results for the question.

Question: {question}
Retrieved Context: {context}

Rate on a scale of 1-5 (5 being best) for:
1. Relevance: How relevant is the retrieved information to the question?
2. Completeness: Does the context contain all necessary information?
3. Conciseness: Is the retrieved context focused and free of irrelevant information?

Provide ratings in JSON format:
""")

    # Create evaluation chain
    eval_chain = (
        eval_prompt
        | llm
        | StrOutputParser()
    )

    # Generate test questions
    question_gen_prompt = PromptTemplate.from_template(
        "Generate {num_questions} diverse test questions about climate change:"
    )
    question_chain = question_gen_prompt | llm | StrOutputParser()

    questions = question_chain.invoke({"num_questions": num_questions}).split("\n")

    # Evaluate each question
    results = []
    for question in questions:
        # Get retrieval results
        context = retriever.get_relevant_documents(question)
        context_text = "\n".join([doc.page_content for doc in context])

        # Evaluate results
        eval_result = eval_chain.invoke({
            "question": question,
            "context": context_text
        })
        results.append(eval_result)

    return {
        "questions": questions,
        "results": results,
        "average_scores": calculate_average_scores(results)
    }


def calculate_average_scores(results: List[Dict]) -> Dict[str, float]:
    """Calculate average scores across all evaluation results."""
    # Implementation depends on the exact format of your results
    pass
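

# A possible sketch (not part of this commit) of how the averages could be
# computed, assuming each entry in `results` is the raw JSON string returned by
# eval_chain, with integer ratings keyed by criterion (e.g. "Relevance").
# The helper name below is hypothetical.
def average_scores_from_json_results(results: List[str]) -> Dict[str, float]:
    totals: Dict[str, float] = {}
    counts: Dict[str, int] = {}
    for raw in results:
        try:
            ratings = json.loads(raw)
        except json.JSONDecodeError:
            continue  # skip any result the LLM did not return as valid JSON
        for criterion, score in ratings.items():
            if isinstance(score, (int, float)):
                totals[criterion] = totals.get(criterion, 0.0) + float(score)
                counts[criterion] = counts.get(criterion, 0) + 1
    return {name: totals[name] / counts[name] for name in totals}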


if __name__ == "__main__":
    # Add any necessary setup or configuration here
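    # A minimal, hypothetical wiring example (not part of this commit): it
    # assumes a FAISS index over a couple of sample texts, mirroring the
    # fixtures used in the test suite, and prints the raw evaluation output.
    from langchain_community.vectorstores import FAISS
    from langchain_openai import OpenAIEmbeddings

    sample_texts = [
        "Climate change is a significant global challenge.",
        "Renewable energy sources include solar and wind power.",
    ]
    vector_store = FAISS.from_texts(sample_texts, OpenAIEmbeddings())
    retriever = vector_store.as_retriever(search_kwargs={"k": 2})

    evaluation = evaluate_rag(retriever, num_questions=3)
    print(evaluation["results"])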
@@ -17,7 +17,7 @@ from enum import Enum

def replace_t_with_space(list_of_documents):
    """
    Replaces all tab characters ('\t') with spaces in the page content of each document.
    Replaces all tab characters ('\t') with spaces in the page content of each document

    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.
@@ -208,3 +208,55 @@ nbformat==5.10.4
xxhash==3.5.0
yarl==1.10.0
zipp==3.20.1

# Core LangChain packages
langchain>=0.1.0
langchain-core>=0.1.17
langchain-community>=0.0.13
langchain-openai>=0.0.5
langchain-anthropic>=0.0.9
langchain-groq>=0.0.1
langchain-cohere>=0.0.1

# Vector stores and embeddings
faiss-cpu>=1.7.4
chromadb>=0.4.22

# Document processing
PyMuPDF>=1.23.8  # for fitz
python-docx>=1.0.1
pypdf>=3.17.4
rank-bm25>=0.2.2

# Machine Learning and Data Science
numpy>=1.24.3
pandas>=2.0.3
scikit-learn>=1.3.0

# API Clients
openai>=1.12.0
anthropic>=0.8.1
cohere>=4.48
groq>=0.4.2

# Testing and Evaluation
pytest>=7.4.0
deepeval>=0.20.12
grouse>=0.3.0

# Development Tools
python-dotenv>=1.0.0
jupyter>=1.0.0
notebook>=7.0.6
ipykernel>=6.29.2

# Type Checking
pydantic>=2.6.1
typing-extensions>=4.9.0

# Async Support
aiohttp>=3.9.1
asyncio>=3.4.3

# Utilities
tqdm>=4.66.1
@@ -1,10 +1,18 @@
import pytest
import os
import sys
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import CharacterTextSplitter
from dotenv import load_dotenv

# Add the main folder to sys.path
sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../"))

# Load environment variables
load_dotenv()

def pytest_addoption(parser):
    parser.addoption(
        "--exclude", action="store", help="Comma-separated list of notebook or script files' paths to exclude"

@@ -40,4 +48,58 @@ def script_paths(request):

    path_with_full_address = [folder + s for s in include_scripts]

    return path_with_full_address
    return path_with_full_address

@pytest.fixture(scope="session")
|
||||
def llm():
|
||||
"""Fixture for ChatOpenAI model."""
|
||||
return ChatOpenAI(
|
||||
temperature=0,
|
||||
model_name="gpt-4-turbo-preview",
|
||||
max_tokens=4000
|
||||
)
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def embeddings():
|
||||
"""Fixture for OpenAI embeddings."""
|
||||
return OpenAIEmbeddings()
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def text_splitter():
|
||||
"""Fixture for text splitter."""
|
||||
return CharacterTextSplitter(
|
||||
chunk_size=1000,
|
||||
chunk_overlap=200
|
||||
)
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_texts():
|
||||
"""Fixture for sample test data."""
|
||||
return [
|
||||
"The Earth is the third planet from the Sun.",
|
||||
"Climate change is a significant global challenge.",
|
||||
"Renewable energy sources include solar and wind power."
|
||||
]
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def vector_store(embeddings, sample_texts, text_splitter):
|
||||
"""Fixture for vector store."""
|
||||
docs = text_splitter.create_documents(sample_texts)
|
||||
return FAISS.from_documents(docs, embeddings)
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def retriever(vector_store):
|
||||
"""Fixture for retriever."""
|
||||
return vector_store.as_retriever(search_kwargs={"k": 2})
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def basic_prompt():
|
||||
"""Fixture for basic prompt template."""
|
||||
return PromptTemplate.from_template("""
|
||||
Answer the following question based on the context provided:
|
||||
|
||||
Context: {context}
|
||||
Question: {question}
|
||||
|
||||
Answer:
|
||||
""")
|
||||
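

# A minimal sketch (not part of this commit) of a separate, hypothetical test
# module (e.g. tests/test_rag_smoke.py) showing how these fixtures might be
# composed end to end; the question and the expected keyword are illustrative.
from langchain_core.output_parsers import StrOutputParser


def test_retriever_and_prompt_answer(retriever, llm, basic_prompt):
    """Retrieve context for a question and answer it with the basic prompt."""
    question = "Which planet is third from the Sun?"
    docs = retriever.get_relevant_documents(question)
    assert docs, "expected at least one retrieved document"

    context = "\n".join(doc.page_content for doc in docs)
    chain = basic_prompt | llm | StrOutputParser()
    answer = chain.invoke({"context": context, "question": question})
    assert "Earth" in answer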