mirror of https://github.com/NirDiamant/RAG_Techniques.git, synced 2025-04-07 00:48:52 +03:00
Added context window enrichment method and semantic chunking
4
.gitignore
vendored
@@ -8,3 +8,7 @@
!*jpeg
!LICENSE
!*.gif
!all_rag_techniques
!data
!evaluation
0
__init__.py
Normal file
0
all_rag_techniques/__init__.py
Normal file
311
all_rag_techniques/context_enrichment_window_around_chunk.ipynb
Normal file
@@ -0,0 +1,311 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Import libraries and environment variables"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\N7\\PycharmProjects\\llm_tasks\\RAG_TECHNIQUES\\.venv\\Lib\\site-packages\\deepeval\\__init__.py:42: UserWarning: You are using deepeval version 0.21.65, however version 0.21.67 is available. You should consider upgrading via the \"pip install --upgrade deepeval\" command.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from langchain.docstore.document import Document\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..'))) # Add the parent directory to the path sicnce we work with notebooks\n",
|
||||
"from helper_functions import *\n",
|
||||
"from evaluation.evalute_rag import *\n",
|
||||
"\n",
|
||||
"# Load environment variables from a .env file\n",
|
||||
"load_dotenv()\n",
|
||||
"\n",
|
||||
"# Set the OpenAI API key environment variable\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Define path to PDF"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path = \"../data/Understanding_Climate_Change.pdf\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Read PDF to string"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"content = read_pdf_to_string(path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Function to split text into chunks with metadata of the chunk chronological index"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def split_text_to_chunks_with_indices(text: str, chunk_size: int, chunk_overlap: int) -> List[Document]:\n",
|
||||
" chunks = []\n",
|
||||
" start = 0\n",
|
||||
" while start < len(text):\n",
|
||||
" end = start + chunk_size\n",
|
||||
" chunk = text[start:end]\n",
|
||||
" chunks.append(Document(page_content=chunk, metadata={\"index\": len(chunks), \"text\": text}))\n",
|
||||
" start += chunk_size - chunk_overlap\n",
|
||||
" return chunks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Split our document accordingly"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chunks_size = 200\n",
|
||||
"chunk_overlap = 100\n",
|
||||
"docs = split_text_to_chunks_with_indices(content, chunks_size, chunk_overlap)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create vector store and retriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeddings = OpenAIEmbeddings()\n",
|
||||
"vectorstore = FAISS.from_documents(docs, embeddings)\n",
|
||||
"chunks_query_retriever = vectorstore.as_retriever(search_kwargs={\"k\": 2})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Function to draw the k<sup>th</sup> chunk (in the original order) from the vector store \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_chunk_by_index(vectorstore, target_index: int):\n",
|
||||
" \"\"\"\n",
|
||||
" Retrieve a chunk from the vectorstore based on its index in the metadata.\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" vectorstore (VectorStore): The vectorstore containing the chunks.\n",
|
||||
" target_index (int): The index of the chunk to retrieve.\n",
|
||||
" debug (bool): If True, print debug information.\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" Optional[Document]: The retrieved chunk as a Document object, or None if not found.\n",
|
||||
" \"\"\"\n",
|
||||
" # Retrieve all documents from the vectorstore\n",
|
||||
" all_docs = vectorstore.similarity_search(\"\", k=vectorstore.index.ntotal)\n",
|
||||
" \n",
|
||||
" # Search for the document with the matching index\n",
|
||||
" for doc in all_docs:\n",
|
||||
" # Check if 'index' is in metadata and matches the target index\n",
|
||||
" if 'index' in doc.metadata and doc.metadata['index'] == target_index:\n",
|
||||
" return doc\n",
|
||||
" \n",
|
||||
" # If we've gone through all documents and haven't found a match, return None\n",
|
||||
" return None"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Check the function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Understanding Climate Change \n",
|
||||
"Chapter 1: Introduction to Climate Change \n",
|
||||
"Climate change refers to significant, long-term changes in the global climate. The term \n",
|
||||
"\"global climate\" encompasses the plane\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chunk = get_chunk_by_index(vectorstore, 0)\n",
|
||||
"print(chunk.page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Function that retrieves from the vector stroe based on semantic similarity and then pads each retrieved chunk with its num_neighbors before and after, taking into account the chunk overlap to construct a meaningful wide window arround it"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def retrieve_with_context_overlap(vectorstore, query: str, k: int = 2, num_neighbors: int = 2, chunk_size: int = 200, chunk_overlap: int = 100) -> List[str]:\n",
|
||||
" \"\"\"\n",
|
||||
" Retrieve chunks based on a query, then fetch neighboring chunks and concatenate them, \n",
|
||||
" accounting for overlap and correct indexing.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" vectorstore (VectorStore): The vectorstore containing the chunks.\n",
|
||||
" query (str): The query to search for relevant chunks.\n",
|
||||
" k (int): The number of relevant chunks to retrieve.\n",
|
||||
" num_neighbors (int): The number of chunks to retrieve before and after each relevant chunk.\n",
|
||||
" chunk_size (int): The size of each chunk when originally split.\n",
|
||||
" chunk_overlap (int): The overlap between chunks when originally split.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" List[str]: List of concatenated chunk sequences, each centered on a relevant chunk.\n",
|
||||
" \"\"\"\n",
|
||||
" retriever = vectorstore.as_retriever(search_kwargs={\"k\": k})\n",
|
||||
" relevant_chunks = retriever.get_relevant_documents(query)\n",
|
||||
"\n",
|
||||
" result_sequences = []\n",
|
||||
"\n",
|
||||
" for chunk in relevant_chunks:\n",
|
||||
" current_index = chunk.metadata.get('index')\n",
|
||||
" if current_index is None:\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # Collect neighboring chunks\n",
|
||||
" neighbor_chunks = [chunk] # Include the current chunk\n",
|
||||
" for i in range(1, num_neighbors + 1):\n",
|
||||
" prev_chunk = get_chunk_by_index(vectorstore, current_index - i)\n",
|
||||
" if prev_chunk:\n",
|
||||
" neighbor_chunks.insert(0, prev_chunk)\n",
|
||||
" next_chunk = get_chunk_by_index(vectorstore, current_index + i)\n",
|
||||
" if next_chunk:\n",
|
||||
" neighbor_chunks.append(next_chunk)\n",
|
||||
"\n",
|
||||
" # Sort chunks by their index\n",
|
||||
" neighbor_chunks.sort(key=lambda x: x.metadata.get('index', 0))\n",
|
||||
"\n",
|
||||
" # Concatenate chunks accounting for overlap\n",
|
||||
" concatenated_text = neighbor_chunks[0].page_content\n",
|
||||
" for i in range(1, len(neighbor_chunks)):\n",
|
||||
" current_chunk = neighbor_chunks[i].page_content\n",
|
||||
" overlap_start = chunk_size - chunk_overlap\n",
|
||||
" concatenated_text += current_chunk[overlap_start:]\n",
|
||||
"\n",
|
||||
" result_sequences.append(concatenated_text)\n",
|
||||
"\n",
|
||||
" return result_sequences"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Comparing regular retrival and retrival with context window"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"climate change\"\n",
|
||||
"context = chunks_query_retriever.get_relevant_documents(query)\n",
|
||||
"context_pages_content = [doc.page_content for doc in context]\n",
|
||||
"\n",
|
||||
"print(\"regular retrieval:\\n\")\n",
|
||||
"show_context(context_pages_content)\n",
|
||||
"\n",
|
||||
"sequences = retrieve_with_context_overlap(vectorstore, query)\n",
|
||||
"print(\"retrieval with context overlap:\\n\")\n",
|
||||
"show_context(sequences)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
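A minimal end-to-end sketch of the context-enrichment flow implemented in the notebook above, assuming its functions (read_pdf_to_string, split_text_to_chunks_with_indices, retrieve_with_context_overlap) are in scope and OPENAI_API_KEY is set; the query string and import paths mirror the repo but are illustrative, not part of the commit:

# Hedged sketch: reuses the functions defined in the notebook above.
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS  # same import path helper_functions.py uses

chunk_size, chunk_overlap = 200, 100
content = read_pdf_to_string("../data/Understanding_Climate_Change.pdf")
docs = split_text_to_chunks_with_indices(content, chunk_size, chunk_overlap)
vectorstore = FAISS.from_documents(docs, OpenAIEmbeddings())

# Each returned string is a relevant chunk widened by up to num_neighbors chunks
# on each side, with the 100-character overlaps stripped so no text repeats.
windows = retrieve_with_context_overlap(
    vectorstore, "climate change", k=2, num_neighbors=2,
    chunk_size=chunk_size, chunk_overlap=chunk_overlap,
)
for window in windows:
    print(len(window), window[:80])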
172
all_rag_techniques/semantic_chunking.ipynb
Normal file
@@ -0,0 +1,172 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Import libraries "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"import fitz\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..'))) # Add the parent directory to the path sicnce we work with notebooks\n",
|
||||
"from helper_functions import *\n",
|
||||
"from evaluation.evalute_rag import *\n",
|
||||
"\n",
|
||||
"from langchain_experimental.text_splitter import SemanticChunker\n",
|
||||
"from langchain_openai.embeddings import OpenAIEmbeddings\n",
|
||||
"\n",
|
||||
"# Load environment variables from a .env file\n",
|
||||
"load_dotenv()\n",
|
||||
"\n",
|
||||
"# Set the OpenAI API key environment variable\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Define file path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path = \"../data/Understanding_Climate_Change.pdf\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Read PDF to string"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Open the PDF document located at the specified path\n",
|
||||
"doc = fitz.open(path)\n",
|
||||
"\n",
|
||||
"content = \"\"\n",
|
||||
"\n",
|
||||
"# Iterate over each page in the document\n",
|
||||
"for page_num in range(len(doc)):\n",
|
||||
" # Get the current page\n",
|
||||
" page = doc[page_num]\n",
|
||||
" # Extract the text content from the current page and append it to the content string\n",
|
||||
" content += page.get_text()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Breakpoint types: \n",
|
||||
"* 'percentile': all differences between sentences are calculated, and then any difference greater than the X percentile is split.\n",
|
||||
"* 'standard_deviation': any difference greater than X standard deviations is split.\n",
|
||||
"* 'interquartile': the interquartile distance is used to split chunks."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text_splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type='percentile', breakpoint_threshold_amount=90) # chose which embeddings and breakpoint type and threshold to use"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Split original text to semantic chunks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = text_splitter.create_documents([content])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create vector store and retriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeddings = OpenAIEmbeddings()\n",
|
||||
"vectorstore = FAISS.from_documents(docs, embeddings)\n",
|
||||
"chunks_query_retriever = vectorstore.as_retriever(search_kwargs={\"k\": 2})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Test the retriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_query = \"What is the main cause of climate change?\"\n",
|
||||
"context = retrieve_context_per_question(test_query, chunks_query_retriever)\n",
|
||||
"show_context(context)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
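A short sketch of the three breakpoint strategies listed in the notebook above, assuming langchain_experimental is installed, OPENAI_API_KEY is set, and content holds the PDF text read earlier; the threshold values are illustrative, not values fixed by the commit:

# Hedged sketch of SemanticChunker breakpoint types; thresholds are examples only.
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# Split wherever the sentence-to-sentence embedding distance exceeds the 90th percentile.
percentile_splitter = SemanticChunker(
    embeddings, breakpoint_threshold_type="percentile", breakpoint_threshold_amount=90)

# Split wherever the distance is more than 3 standard deviations above the mean.
std_splitter = SemanticChunker(
    embeddings, breakpoint_threshold_type="standard_deviation", breakpoint_threshold_amount=3)

# Split using the interquartile range of the distances.
iqr_splitter = SemanticChunker(
    embeddings, breakpoint_threshold_type="interquartile", breakpoint_threshold_amount=1.5)

docs = percentile_splitter.create_documents([content])
print(len(docs), docs[0].page_content[:80])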
@@ -9,22 +9,24 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"\n",
|
||||
"from evalute_rag import *\n",
|
||||
"import openai\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..'))) # Add the parent directory to the path sicnce we work with notebooks\n",
|
||||
"from helper_functions import *\n",
|
||||
"from evaluation.evalute_rag import *\n",
|
||||
"\n",
|
||||
"# Load environment variables from a .env file\n",
|
||||
"load_dotenv()\n",
|
||||
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")"
|
||||
"\n",
|
||||
"# Set the OpenAI API key environment variable\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -36,49 +38,98 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_dir = \"data\"\n",
|
||||
"documents = SimpleDirectoryReader(data_dir).load_data()"
|
||||
"path = \"../data/Understanding_Climate_Change.pdf\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Index docs"
|
||||
"### Encode document"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vector_index = VectorStoreIndex.from_documents(documents, chunk_size=256,chunk_overlap=64)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Read Q&A file"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"q_a_file_name = \"data/q_a.json\"\n",
|
||||
"with open(q_a_file_name, \"r\", encoding=\"utf-8\") as json_file:\n",
|
||||
" q_a = json.load(json_file)\n",
|
||||
"def encode_pdf(path, chunk_size=1000, chunk_overlap=200):\n",
|
||||
" \"\"\"\n",
|
||||
" Encodes a PDF book into a vector store using OpenAI embeddings.\n",
|
||||
"\n",
|
||||
"questions = [qa[\"question\"] for qa in q_a]\n",
|
||||
"ground_truth_answers = [qa[\"answer\"] for qa in q_a]"
|
||||
" Args:\n",
|
||||
" path: The path to the PDF file.\n",
|
||||
" chunk_size: The desired size of each text chunk.\n",
|
||||
" chunk_overlap: The amount of overlap between consecutive chunks.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" A FAISS vector store containing the encoded book content.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # Load PDF documents\n",
|
||||
" loader = PyPDFLoader(path)\n",
|
||||
" documents = loader.load()\n",
|
||||
"\n",
|
||||
" # Split documents into chunks\n",
|
||||
" text_splitter = RecursiveCharacterTextSplitter(\n",
|
||||
" chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len\n",
|
||||
" )\n",
|
||||
" texts = text_splitter.split_documents(documents)\n",
|
||||
" cleaned_texts = replace_t_with_space(texts)\n",
|
||||
"\n",
|
||||
" # Create embeddings and vector store\n",
|
||||
" embeddings = OpenAIEmbeddings()\n",
|
||||
" vectorstore = FAISS.from_documents(cleaned_texts, embeddings)\n",
|
||||
"\n",
|
||||
" return vectorstore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chunks_vector_store = encode_pdf(path, chunk_size=1000, chunk_overlap=200)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create retriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chunks_query_retriever = chunks_vector_store.as_retriever(search_kwargs={\"k\": 2})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Test retriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_query = \"What is the main cause of climate change?\"\n",
|
||||
"context = retrieve_context_per_question(test_query, chunks_query_retriever)\n",
|
||||
"show_context(context)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -90,39 +141,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"eval_results = await evaluate_rag(vector_index, questions, ground_truth_answers)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Display metrics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Faithfulness Score: 0.93\n",
|
||||
"Relevancy Score: 0.93\n",
|
||||
"Correctness Score: 0.93\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"faithfulness_score = get_eval_results(\"faithfulness\", eval_results)\n",
|
||||
"relevancy_score = get_eval_results(\"relevancy\", eval_results)\n",
|
||||
"correctness_score = get_eval_results(\"correctness\", eval_results)"
|
||||
"evaluate_rag(chunks_query_retriever)"
|
||||
]
|
||||
}
|
||||
],
0
data/__init__.py
Normal file
0
evaluation/__init__.py
Normal file
322
evaluation/define_evaluation_metrics.ipynb
Normal file
@@ -0,0 +1,322 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_openai import ChatOpenAI \n",
|
||||
"from langchain.chains import LLMChain\n",
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"from langchain_core.pydantic_v1 import BaseModel, Field\n",
|
||||
"\n",
|
||||
"# from langchain.evaluation.criteria import {\n",
|
||||
"# CriteriaEvalChain,\n",
|
||||
"# LabeledCriteriaEvalChain\n",
|
||||
"# }\n",
|
||||
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||||
"from langchain.vectorstores import FAISS\n",
|
||||
"# from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||
"import numpy as np\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"load_dotenv()\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = ChatOpenAI(temperature=0, model_name=\"gpt-4o\", max_tokens=4000)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 129,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class ResultScore(BaseModel):\n",
|
||||
" score: float = Field(..., description=\"The score of the result, ranging from 0 to 1 where 1 is the best possible score.\")\n",
|
||||
" # explanation: str = Field(..., description=\"An extensive explanation of the score.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 104,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"correctness_prompt = PromptTemplate(\n",
|
||||
"input_variables=[\"question\", \"ground_truth\", \"generated_answer\"],\n",
|
||||
"template=\"\"\"\n",
|
||||
"Question: {question}\n",
|
||||
"Ground Truth: {ground_truth}\n",
|
||||
"Generated Answer: {generated_answer}\n",
|
||||
"\n",
|
||||
"Evaluate the correctness of the generated answer compared to the ground truth.\n",
|
||||
"Score from 0 to 1, where 1 is perfectly correct and 0 is completely incorrect.\n",
|
||||
"any score between 0 and 1 is acceptable and depends on how correct the generated answer is.\n",
|
||||
"\n",
|
||||
"Score:\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"correctness_chain = correctness_prompt | llm.with_structured_output(ResultScore)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def evaluate_correctness(question, ground_truth, generated_answer):\n",
|
||||
" \"\"\"Evaluates the correctness of the generated answer compared to the ground truth.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" question: The question.\n",
|
||||
" ground_truth: The ground truth answer.\n",
|
||||
" generated_answer: The generated answer.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" A float between 0 and 1, where 1 is the best possible score.\n",
|
||||
" \"\"\"\n",
|
||||
" result = correctness_chain.invoke({\"question\": question, \"ground_truth\": ground_truth, \"generated_answer\": generated_answer})\n",
|
||||
" return result.score\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# test create_correctness_chain\n",
|
||||
"question = \"What is the capital of France and Spain?\"\n",
|
||||
"ground_truth = \"Paris and Barcelona\"\n",
|
||||
"generated_answer = \"Paris\"\n",
|
||||
"score = evaluate_correctness(question, ground_truth, generated_answer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.5"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 105,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"faithfulness_prompt = PromptTemplate(\n",
|
||||
"input_variables=[\"question\",\"context\", \"generated_answer\"],\n",
|
||||
"template=\"\"\"\n",
|
||||
"Question: {question}\n",
|
||||
"Context: {context}\n",
|
||||
"Generated Answer: {generated_answer}\n",
|
||||
"\n",
|
||||
"Evaluate if the generate answer to the question can be deduced from the context.\n",
|
||||
"Score of 0 or 1, where 1 is perfectly faithful *AND CAN BE DERIVED FROM THE CONTEXT* and 0 otherwise.\n",
|
||||
"you don't mind if the answer is correct, all you care about is if the answer can be deduced from the context.\n",
|
||||
"\n",
|
||||
"example:\n",
|
||||
"Question: What are the capitals of France and Spain?\n",
|
||||
"Context: Paris is the capital of France and Madrid is the capital of Spain.\n",
|
||||
"Generated Answer: Paris\n",
|
||||
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
|
||||
"\n",
|
||||
"example:\n",
|
||||
"Question: What are the capital cities of France and Spain?\n",
|
||||
"Context: London is the capital of France and Barcelona is the capital of Spain.\n",
|
||||
"Generated Answer: London and Barcelona.\n",
|
||||
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
|
||||
"\n",
|
||||
"example:\n",
|
||||
"Question: What are the capital cities of France and Spain?\n",
|
||||
"Context: Paris is the capital of France and Madrid is the capital of Spain.\n",
|
||||
"Generated Answer: Paris.\n",
|
||||
"in this case the generated answer is faithful to the context so the score should be *1*.\n",
|
||||
"\n",
|
||||
"exmaple:\n",
|
||||
"Question: What are the capitals of France and Spain?\n",
|
||||
"Context: London is the capital of France and Madrid is the Capital of Spain.\n",
|
||||
"Generated Answer: Paris and Madrid.\n",
|
||||
"in this case the generated answer is based on the pretrained knowledge of the llm and is not faithful to the context so the score should be *0*.\n",
|
||||
"\n",
|
||||
"example:\n",
|
||||
"Question: What is the capital of France and Spain?\n",
|
||||
"Context: Monkeys like to eat bananas.\n",
|
||||
"Generated Answer: Paris and Madrid.\n",
|
||||
"in this case the generated answer is not based on the context so the score should be *0*.\n",
|
||||
"\n",
|
||||
"example:\n",
|
||||
"Question: What is the capital of France?\n",
|
||||
"Context: Paris.\n",
|
||||
"Generated Answer: Paris.\n",
|
||||
"in this case the context doesn't specify that Paris is the capital of France, and it cannot be deduced from the context, so the score should be *0*.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Example:\n",
|
||||
"Question: What is 2+2?\n",
|
||||
"Context: 4.\n",
|
||||
"Generated Answer: 4.\n",
|
||||
"In this case, the context states '4', but it does not provide information to deduce the answer to 'What is 2+2?', so the score should be *0*.\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"faithfulness_chain = faithfulness_prompt | llm.with_structured_output(ResultScore)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 83,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def evaluate_faithfulness(question, context, generated_answer):\n",
|
||||
" \"\"\"Evaluates if the generate answer to the question can be deduced from the context.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" question: The question.\n",
|
||||
" context: The context.\n",
|
||||
" generated_answer: The generated answer.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" A float between 0 and 1, where 1 is the best possible score.\n",
|
||||
" \"\"\"\n",
|
||||
" result = faithfulness_chain.invoke({\"question\": question, \"context\": context, \"generated_answer\": generated_answer})\n",
|
||||
" return result.score, result.explanation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 86,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.0\n",
|
||||
"The context states '6', but it does not provide information to deduce the answer to 'What is 3+3?'. The answer is correct, but it cannot be derived from the context.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# test create_faithfulness_chain\n",
|
||||
"question = \"what is 3+3?\"\n",
|
||||
"context = \"6\"\n",
|
||||
"generated_answer = \"6\"\n",
|
||||
"score, explanation = evaluate_faithfulness(question, context, generated_answer)\n",
|
||||
"print(score)\n",
|
||||
"print(explanation)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 130,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain import PromptTemplate\n",
|
||||
"\n",
|
||||
"relevancy_score_prompt = PromptTemplate(\n",
|
||||
" input_variables=[\"question\", \"contexts\"],\n",
|
||||
" template=\"\"\"\n",
|
||||
"Q: {question}\n",
|
||||
"Docs: {contexts}\n",
|
||||
"\n",
|
||||
"Score each doc's relevance:\n",
|
||||
"0.00 - Irrelevant: No relation to the question\n",
|
||||
"0.33 - Somewhat relevant: Contains related keywords or concepts\n",
|
||||
"0.66 - Relevant: Partially answers or strongly implies the answer\n",
|
||||
"1.00 - Highly relevant: Directly and fully answers the question\n",
|
||||
"\n",
|
||||
"Consider: Relevance, Directness, Completeness, Accuracy\n",
|
||||
"\n",
|
||||
"Final Score: [Average of all scores]\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"ratio_of_relevant_docs_chain = ratio_of_relevant_docs_prompt | llm.with_structured_output(ResultScore)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 131,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def evaluate_ratio_of_relevant_docs(question, contexts):\n",
|
||||
" \"\"\"Evaluates the ratio of relevant documents in the contexts to the question.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" question: The question.\n",
|
||||
" contexts: A list of documents.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" A float between 0 and 1, where 1 is the best possible score.\n",
|
||||
" \"\"\"\n",
|
||||
" result = ratio_of_relevant_docs_chain.invoke({\"question\": question, \"contexts\": contexts})\n",
|
||||
" return result.score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 132,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# test create_ratio_of_relevant_docs_chain\n",
|
||||
"question = \"What is the capital of France?\"\n",
|
||||
"contexts = [\"Paris.\", \"i was traveling in France.\"]\n",
|
||||
"score = evaluate_ratio_of_relevant_docs(question, contexts)\n",
|
||||
"# score, explanation = evaluate_ratio_of_relevant_docs(question, contexts)\n",
|
||||
"print(score)\n",
|
||||
"# print(explanation)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
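A hedged sketch of how the three metric chains defined in this notebook (evaluate_correctness, evaluate_faithfulness, evaluate_ratio_of_relevant_docs) could be applied together over a small evaluation set; the sample data below is made up for illustration:

# Hedged sketch: toy data; assumes the evaluation functions defined above are in scope.
samples = [
    {
        "question": "What is the capital of France?",
        "ground_truth": "Paris",
        "generated_answer": "Paris",
        "contexts": ["Paris is the capital of France."],
    },
]

for sample in samples:
    correctness = evaluate_correctness(
        sample["question"], sample["ground_truth"], sample["generated_answer"])
    faithfulness, reason = evaluate_faithfulness(
        sample["question"], " ".join(sample["contexts"]), sample["generated_answer"])
    relevancy = evaluate_ratio_of_relevant_docs(sample["question"], sample["contexts"])
    print(f"correctness={correctness:.2f} faithfulness={faithfulness:.2f} relevancy={relevancy:.2f}")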
209
evaluation/evaluation_deep_eval.ipynb
Normal file
@@ -0,0 +1,209 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from deepeval import evaluate\n",
|
||||
"from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric\n",
|
||||
"from deepeval.test_case import LLMTestCase, LLMTestCaseParams"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Test Correctness"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"correctness_metric = GEval(\n",
|
||||
" name=\"Correctness\",\n",
|
||||
" model=\"gpt-4o\",\n",
|
||||
" evaluation_params=[\n",
|
||||
" LLMTestCaseParams.EXPECTED_OUTPUT,\n",
|
||||
" LLMTestCaseParams.ACTUAL_OUTPUT],\n",
|
||||
" evaluation_steps=[\n",
|
||||
" \"Determine whether the actual output is factually correct based on the expected output.\"\n",
|
||||
" ],\n",
|
||||
"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"gt_answer = \"Madrid is the capital of Spain.\"\n",
|
||||
"pred_answer = \"MadriD.\"\n",
|
||||
"\n",
|
||||
"test_case_correctness = LLMTestCase(\n",
|
||||
" input=\"What is the capital of Spain?\",\n",
|
||||
" expected_output=gt_answer,\n",
|
||||
" actual_output=pred_answer,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"correctness_metric.measure(test_case_correctness)\n",
|
||||
"print(correctness_metric.score)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Test faithfulness"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"question = \"what is 3+3?\"\n",
|
||||
"context = [\"6\"]\n",
|
||||
"generated_answer = \"6\"\n",
|
||||
"\n",
|
||||
"faithfulness_metric = FaithfulnessMetric(\n",
|
||||
" threshold=0.7,\n",
|
||||
" model=\"gpt-4\",\n",
|
||||
" include_reason=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"test_case = LLMTestCase(\n",
|
||||
" input = question,\n",
|
||||
" actual_output=generated_answer,\n",
|
||||
" retrieval_context=context\n",
|
||||
"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"faithfulness_metric.measure(test_case)\n",
|
||||
"print(faithfulness_metric.score)\n",
|
||||
"print(faithfulness_metric.reason)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Test contextual relevancy "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"actual_output = \"then go somewhere else.\"\n",
|
||||
"retrieval_context = [\"this is a test context\",\"mike is a cat\",\"if the shoes don't fit, then go somewhere else.\"]\n",
|
||||
"gt_answer = \"if the shoes don't fit, then go somewhere else.\"\n",
|
||||
"\n",
|
||||
"relevance_metric = ContextualRelevancyMetric(\n",
|
||||
" threshold=1,\n",
|
||||
" model=\"gpt-4\",\n",
|
||||
" include_reason=True\n",
|
||||
")\n",
|
||||
"relevance_test_case = LLMTestCase(\n",
|
||||
" input=\"What if these shoes don't fit?\",\n",
|
||||
" actual_output=actual_output,\n",
|
||||
" retrieval_context=retrieval_context,\n",
|
||||
" expected_output=gt_answer,\n",
|
||||
"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"relevance_metric.measure(relevance_test_case)\n",
|
||||
"print(relevance_metric.score)\n",
|
||||
"print(relevance_metric.reason)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"new_test_case = LLMTestCase(\n",
|
||||
" input=\"What is the capital of Spain?\",\n",
|
||||
" expected_output=\"Madrid is the capital of Spain.\",\n",
|
||||
" actual_output=\"MadriD.\",\n",
|
||||
" retrieval_context=[\"Madrid is the capital of Spain.\"]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Test two different cases together with several metrics together"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluate(\n",
|
||||
" test_cases=[relevance_test_case, new_test_case],\n",
|
||||
" metrics=[correctness_metric, faithfulness_metric, relevance_metric]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Funcion to create multiple LLMTestCases based on four lists: \n",
|
||||
"* Questions\n",
|
||||
"* Ground Truth Answers\n",
|
||||
"* Generated Answers\n",
|
||||
"* Retrieved Documents - Each element is a list"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_deep_eval_test_cases(questions, gt_answers, generated_answers, retrieved_documents):\n",
|
||||
" return [\n",
|
||||
" LLMTestCase(\n",
|
||||
" input=question,\n",
|
||||
" expected_output=gt_answer,\n",
|
||||
" actual_output=generated_answer,\n",
|
||||
" retrieval_context=retrieved_document\n",
|
||||
" )\n",
|
||||
" for question, gt_answer, generated_answer, retrieved_document in zip(\n",
|
||||
" questions, gt_answers, generated_answers, retrieved_documents\n",
|
||||
" )\n",
|
||||
" ]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
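A brief sketch wiring this notebook's pieces together: building test cases with create_deep_eval_test_cases and scoring them with the metrics defined above; the lists are toy data, not results from the repository:

# Hedged sketch: assumes correctness_metric, faithfulness_metric and relevance_metric
# from the cells above are in scope; data is illustrative only.
questions = ["What is the capital of Spain?"]
gt_answers = ["Madrid is the capital of Spain."]
generated_answers = ["Madrid."]
retrieved_documents = [["Madrid is the capital of Spain."]]

test_cases = create_deep_eval_test_cases(
    questions, gt_answers, generated_answers, retrieved_documents)

evaluate(
    test_cases=test_cases,
    metrics=[correctness_metric, faithfulness_metric, relevance_metric],
)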
124
evaluation/evalute_rag.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""
|
||||
RAG Evaluation Script
|
||||
|
||||
This script evaluates the performance of a Retrieval-Augmented Generation (RAG) system
|
||||
using various metrics from the deepeval library.
|
||||
|
||||
Dependencies:
|
||||
- deepeval
|
||||
- langchain_openai
|
||||
- json
|
||||
|
||||
Custom modules:
|
||||
- helper_functions (for RAG-specific operations)
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import List, Tuple
|
||||
|
||||
from deepeval import evaluate
|
||||
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
|
||||
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
from helper_functions import (
|
||||
create_question_answer_from_context_chain,
|
||||
answer_question_from_context,
|
||||
retrieve_context_per_question
|
||||
)
|
||||
|
||||
def create_deep_eval_test_cases(
|
||||
questions: List[str],
|
||||
gt_answers: List[str],
|
||||
generated_answers: List[str],
|
||||
retrieved_documents: List[str]
|
||||
) -> List[LLMTestCase]:
|
||||
"""
|
||||
Create a list of LLMTestCase objects for evaluation.
|
||||
|
||||
Args:
|
||||
questions (List[str]): List of input questions.
|
||||
gt_answers (List[str]): List of ground truth answers.
|
||||
generated_answers (List[str]): List of generated answers.
|
||||
retrieved_documents (List[str]): List of retrieved documents.
|
||||
|
||||
Returns:
|
||||
List[LLMTestCase]: List of LLMTestCase objects.
|
||||
"""
|
||||
return [
|
||||
LLMTestCase(
|
||||
input=question,
|
||||
expected_output=gt_answer,
|
||||
actual_output=generated_answer,
|
||||
retrieval_context=retrieved_document
|
||||
)
|
||||
for question, gt_answer, generated_answer, retrieved_document in zip(
|
||||
questions, gt_answers, generated_answers, retrieved_documents
|
||||
)
|
||||
]
|
||||
|
||||
# Define evaluation metrics
|
||||
correctness_metric = GEval(
|
||||
name="Correctness",
|
||||
model="gpt-4o",
|
||||
evaluation_params=[
|
||||
LLMTestCaseParams.EXPECTED_OUTPUT,
|
||||
LLMTestCaseParams.ACTUAL_OUTPUT
|
||||
],
|
||||
evaluation_steps=[
|
||||
"Determine whether the actual output is factually correct based on the expected output."
|
||||
],
|
||||
)
|
||||
|
||||
faithfulness_metric = FaithfulnessMetric(
|
||||
threshold=0.7,
|
||||
model="gpt-4",
|
||||
include_reason=False
|
||||
)
|
||||
|
||||
relevance_metric = ContextualRelevancyMetric(
|
||||
threshold=1,
|
||||
model="gpt-4",
|
||||
include_reason=True
|
||||
)
|
||||
|
||||
def evaluate_rag(chunks_query_retriever, num_questions: int = 5) -> None:
|
||||
"""
|
||||
Evaluate the RAG system using predefined metrics.
|
||||
|
||||
Args:
|
||||
chunks_query_retriever: Function to retrieve context chunks for a given query.
|
||||
num_questions (int): Number of questions to evaluate (default: 5).
|
||||
"""
|
||||
llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=2000)
|
||||
question_answer_from_context_chain = create_question_answer_from_context_chain(llm)
|
||||
|
||||
# Load questions and answers from JSON file
|
||||
q_a_file_name = "../data/q_a.json"
|
||||
with open(q_a_file_name, "r", encoding="utf-8") as json_file:
|
||||
q_a = json.load(json_file)
|
||||
|
||||
questions = [qa["question"] for qa in q_a][:num_questions]
|
||||
ground_truth_answers = [qa["answer"] for qa in q_a][:num_questions]
|
||||
generated_answers = []
|
||||
retrieved_documents = []
|
||||
|
||||
# Generate answers and retrieve documents for each question
|
||||
for question in questions:
|
||||
context = retrieve_context_per_question(question, chunks_query_retriever)
|
||||
retrieved_documents.append(context)
|
||||
context_string = " ".join(context)
|
||||
result = answer_question_from_context(question, context_string, question_answer_from_context_chain)
|
||||
generated_answers.append(result["answer"])
|
||||
|
||||
# Create test cases and evaluate
|
||||
test_cases = create_deep_eval_test_cases(questions, ground_truth_answers, generated_answers, retrieved_documents)
|
||||
evaluate(
|
||||
test_cases=test_cases,
|
||||
metrics=[correctness_metric, faithfulness_metric, relevance_metric]
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Add any necessary setup or configuration here
|
||||
# Example: evaluate_rag(your_chunks_query_retriever_function)
|
||||
pass
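A hedged usage sketch for this script, mirroring how the notebooks drive it; it assumes the code is run from a subdirectory of the repo (so that ../data/q_a.json and ../data/Understanding_Climate_Change.pdf resolve) and that OPENAI_API_KEY is set:

# Hedged sketch: not part of the commit; paths assume running from a repo subfolder,
# as the notebooks do.
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # add the repo root to the path

from helper_functions import encode_pdf
from evaluation.evalute_rag import evaluate_rag

chunks_vector_store = encode_pdf("../data/Understanding_Climate_Change.pdf", chunk_size=1000, chunk_overlap=200)
chunks_query_retriever = chunks_vector_store.as_retriever(search_kwargs={"k": 2})
evaluate_rag(chunks_query_retriever, num_questions=5)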
156
helper_functions.py
Normal file
@@ -0,0 +1,156 @@
|
||||
from langchain.document_loaders import PyPDFLoader
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from langchain.vectorstores import FAISS
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field
|
||||
from langchain import PromptTemplate
|
||||
import fitz
|
||||
|
||||
|
||||
|
||||
|
||||
def replace_t_with_space(list_of_documents):
|
||||
"""
|
||||
Replaces all tab characters ('\t') with spaces in the page content of each document.
|
||||
|
||||
Args:
|
||||
list_of_documents: A list of document objects, each with a 'page_content' attribute.
|
||||
|
||||
Returns:
|
||||
The modified list of documents with tab characters replaced by spaces.
|
||||
"""
|
||||
|
||||
for doc in list_of_documents:
|
||||
doc.page_content = doc.page_content.replace('\t', ' ') # Replace tabs with spaces
|
||||
return list_of_documents
|
||||
|
||||
|
||||
|
||||
|
||||
def encode_pdf(path, chunk_size=1000, chunk_overlap=200):
|
||||
"""
|
||||
Encodes a PDF book into a vector store using OpenAI embeddings.
|
||||
|
||||
Args:
|
||||
path: The path to the PDF file.
|
||||
chunk_size: The desired size of each text chunk.
|
||||
chunk_overlap: The amount of overlap between consecutive chunks.
|
||||
|
||||
Returns:
|
||||
A FAISS vector store containing the encoded book content.
|
||||
"""
|
||||
|
||||
# Load PDF documents
|
||||
loader = PyPDFLoader(path)
|
||||
documents = loader.load()
|
||||
|
||||
# Split documents into chunks
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
|
||||
)
|
||||
texts = text_splitter.split_documents(documents)
|
||||
cleaned_texts = replace_t_with_space(texts)
|
||||
|
||||
# Create embeddings and vector store
|
||||
embeddings = OpenAIEmbeddings()
|
||||
vectorstore = FAISS.from_documents(cleaned_texts, embeddings)
|
||||
|
||||
return vectorstore
|
||||
|
||||
|
||||
def retrieve_context_per_question(question, chunks_query_retriever):
|
||||
"""
|
||||
Retrieves relevant context for a given question using the chunks query retriever.
|
||||
|
||||
Args:
|
||||
question: The question for which to retrieve context.
chunks_query_retriever: The retriever used to fetch relevant document chunks.
|
||||
|
||||
Returns:
|
||||
A list containing the page content of each relevant document.
|
||||
"""
|
||||
|
||||
# Retrieve relevant documents for the given question
|
||||
docs = chunks_query_retriever.get_relevant_documents(question)
|
||||
|
||||
# Concatenate document content
|
||||
# context = " ".join(doc.page_content for doc in docs)
|
||||
context = [doc.page_content for doc in docs]
|
||||
|
||||
|
||||
return context
|
||||
|
||||
class QuestionAnswerFromContext(BaseModel):
|
||||
"""
|
||||
Model to generate an answer to a query based on a given context.
|
||||
|
||||
Attributes:
|
||||
answer_based_on_content (str): The generated answer based on the context.
|
||||
"""
|
||||
answer_based_on_content: str = Field(description="Generates an answer to a query based on a given context.")
|
||||
|
||||
def create_question_answer_from_context_chain(llm):
|
||||
|
||||
# Initialize the ChatOpenAI model with specific parameters
|
||||
question_answer_from_context_llm = llm
|
||||
|
||||
# Define the prompt template for chain-of-thought reasoning
|
||||
question_answer_prompt_template = """
|
||||
For the question below, provide a concise but sufficient answer based ONLY on the provided context:
|
||||
{context}
|
||||
Question
|
||||
{question}
|
||||
"""
|
||||
|
||||
# Create a PromptTemplate object with the specified template and input variables
|
||||
question_answer_from_context_prompt = PromptTemplate(
|
||||
template=question_answer_prompt_template,
|
||||
input_variables=["context", "question"],
|
||||
)
|
||||
|
||||
# Create a chain by combining the prompt template and the language model
|
||||
question_answer_from_context_cot_chain = question_answer_from_context_prompt | question_answer_from_context_llm.with_structured_output(QuestionAnswerFromContext)
|
||||
return question_answer_from_context_cot_chain
|
||||
|
||||
|
||||
|
||||
def answer_question_from_context(question, context, question_answer_from_context_chain):
|
||||
"""
|
||||
Answer a question using the given context by invoking a chain of reasoning.
|
||||
|
||||
Args:
|
||||
question: The question to be answered.
|
||||
context: The context to be used for answering the question.
question_answer_from_context_chain: The chain used to generate the answer from the context.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the answer, context, and question.
|
||||
"""
|
||||
input_data = {
|
||||
"question": question,
|
||||
"context": context
|
||||
}
|
||||
print("Answering the question from the retrieved context...")
|
||||
|
||||
output = question_answer_from_context_chain.invoke(input_data)
|
||||
answer = output.answer_based_on_content
|
||||
return {"answer": answer, "context": context, "question": question}
|
||||
|
||||
|
||||
def show_context(context):
|
||||
for i, c in enumerate(context):
|
||||
print(f"Context {i+1}:")
|
||||
print(c)
|
||||
print("\n")
|
||||
|
||||
def read_pdf_to_string(path):
|
||||
# Open the PDF document located at the specified path
|
||||
doc = fitz.open(path)
|
||||
content = ""
|
||||
# Iterate over each page in the document
|
||||
for page_num in range(len(doc)):
|
||||
# Get the current page
|
||||
page = doc[page_num]
|
||||
# Extract the text content from the current page and append it to the content string
|
||||
content += page.get_text()
|
||||
return content
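A hedged sketch chaining the helpers in this module into a minimal RAG round trip; the question, paths and model parameters are illustrative (the model settings mirror evalute_rag.py), and OPENAI_API_KEY must be set:

# Hedged sketch: minimal question-answering round trip using only functions from this module.
from langchain_openai import ChatOpenAI

vectorstore = encode_pdf("data/Understanding_Climate_Change.pdf", chunk_size=1000, chunk_overlap=200)
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

question = "What is the main cause of climate change?"
context = retrieve_context_per_question(question, retriever)  # list of page contents
show_context(context)

llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=2000)
qa_chain = create_question_answer_from_context_chain(llm)
result = answer_question_from_context(question, " ".join(context), qa_chain)
print(result["answer"])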