Mirror of https://github.com/NirDiamant/RAG_Techniques.git (synced 2025-04-07 00:48:52 +03:00)
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import libraries and environment variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\N7\\PycharmProjects\\llm_tasks\\RAG_TECHNIQUES\\.venv\\Lib\\site-packages\\deepeval\\__init__.py:42: UserWarning: You are using deepeval version 0.21.65, however version 0.21.67 is available. You should consider upgrading via the \"pip install --upgrade deepeval\" command.\n",
      " warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "from dotenv import load_dotenv\n",
    "from langchain.docstore.document import Document\n",
    "\n",
    "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # Add the parent directory to the path since we work with notebooks\n",
    "from helper_functions import *\n",
    "from evaluation.evalute_rag import *\n",
    "\n",
    "# Load environment variables from a .env file\n",
    "load_dotenv()\n",
    "\n",
    "# Set the OpenAI API key environment variable\n",
    "os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define path to PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../data/Understanding_Climate_Change.pdf\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read PDF to string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "content = read_pdf_to_string(path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function to split text into chunks, storing each chunk's chronological index in its metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_text_to_chunks_with_indices(text: str, chunk_size: int, chunk_overlap: int) -> List[Document]:\n",
    "    \"\"\"Split text into overlapping chunks, storing each chunk's chronological index in its metadata.\"\"\"\n",
    "    chunks = []\n",
    "    start = 0\n",
    "    while start < len(text):\n",
    "        end = start + chunk_size\n",
    "        chunk = text[start:end]\n",
    "        chunks.append(Document(page_content=chunk, metadata={\"index\": len(chunks), \"text\": text}))\n",
    "        start += chunk_size - chunk_overlap\n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split our document accordingly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks_size = 200\n",
    "chunk_overlap = 100\n",
    "docs = split_text_to_chunks_with_indices(content, chunks_size, chunk_overlap)"
   ]
  },
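  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: sanity-check the chunk overlap\n",
    "\n",
    "A minimal check (a sketch reusing `docs` and `chunk_overlap` from above, and assuming the document yields at least two chunks): consecutive chunks should share exactly `chunk_overlap` characters, which is what the window reconstruction later in this notebook relies on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: consecutive chunks should overlap by exactly `chunk_overlap` characters.\n",
    "# Reuses `docs` and `chunk_overlap` defined above; assumes at least two chunks exist.\n",
    "print(f\"Number of chunks: {len(docs)}\")\n",
    "first, second = docs[0].page_content, docs[1].page_content\n",
    "print(f\"Tail of chunk 0 equals head of chunk 1: {first[-chunk_overlap:] == second[:chunk_overlap]}\")"
   ]
  },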
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create vector store and retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = OpenAIEmbeddings()\n",
    "vectorstore = FAISS.from_documents(docs, embeddings)\n",
    "chunks_query_retriever = vectorstore.as_retriever(search_kwargs={\"k\": 2})"
   ]
  },
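  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: confirm the index size\n",
    "\n",
    "A small check (a sketch reusing `docs` and `vectorstore` from above): the FAISS index should hold one vector per chunk. `get_chunk_by_index` below pulls every stored document back out via `vectorstore.index.ntotal`, so it is worth confirming that count matches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: one vector per chunk. get_chunk_by_index below relies on\n",
    "# vectorstore.index.ntotal to fetch all stored documents.\n",
    "print(f\"Chunks created: {len(docs)}\")\n",
    "print(f\"Vectors indexed: {vectorstore.index.ntotal}\")"
   ]
  },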
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function to draw the k<sup>th</sup> chunk (in the original order) from the vector store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_chunk_by_index(vectorstore, target_index: int):\n",
    "    \"\"\"\n",
    "    Retrieve a chunk from the vectorstore based on its index in the metadata.\n",
    "\n",
    "    Args:\n",
    "        vectorstore (VectorStore): The vectorstore containing the chunks.\n",
    "        target_index (int): The index of the chunk to retrieve.\n",
    "\n",
    "    Returns:\n",
    "        Optional[Document]: The retrieved chunk as a Document object, or None if not found.\n",
    "    \"\"\"\n",
    "    # Retrieve all documents from the vectorstore\n",
    "    all_docs = vectorstore.similarity_search(\"\", k=vectorstore.index.ntotal)\n",
    "\n",
    "    # Search for the document with the matching index\n",
    "    for doc in all_docs:\n",
    "        # Check if 'index' is in metadata and matches the target index\n",
    "        if 'index' in doc.metadata and doc.metadata['index'] == target_index:\n",
    "            return doc\n",
    "\n",
    "    # If we've gone through all documents and haven't found a match, return None\n",
    "    return None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check the function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Understanding Climate Change \n",
      "Chapter 1: Introduction to Climate Change \n",
      "Climate change refers to significant, long-term changes in the global climate. The term \n",
      "\"global climate\" encompasses the plane\n"
     ]
    }
   ],
   "source": [
    "chunk = get_chunk_by_index(vectorstore, 0)\n",
    "print(chunk.page_content)"
   ]
  },
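  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an extra check (a sketch reusing `docs` and `vectorstore`), the same function can fetch any stored chunk by its chronological index, e.g. the last one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extra check: fetch the last chunk by its chronological index (reuses `docs` and `vectorstore`).\n",
    "last_chunk = get_chunk_by_index(vectorstore, len(docs) - 1)\n",
    "print(last_chunk.page_content)"
   ]
  },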
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function that retrieves from the vector store based on semantic similarity, then pads each retrieved chunk with its `num_neighbors` preceding and following chunks, accounting for the chunk overlap, to construct a meaningful wide window around it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def retrieve_with_context_overlap(vectorstore, query: str, k: int = 2, num_neighbors: int = 2, chunk_size: int = 200, chunk_overlap: int = 100) -> List[str]:\n",
    "    \"\"\"\n",
    "    Retrieve chunks based on a query, then fetch neighboring chunks and concatenate them,\n",
    "    accounting for overlap and correct indexing.\n",
    "\n",
    "    Args:\n",
    "        vectorstore (VectorStore): The vectorstore containing the chunks.\n",
    "        query (str): The query to search for relevant chunks.\n",
    "        k (int): The number of relevant chunks to retrieve.\n",
    "        num_neighbors (int): The number of chunks to retrieve before and after each relevant chunk.\n",
    "        chunk_size (int): The size of each chunk when originally split.\n",
    "        chunk_overlap (int): The overlap between chunks when originally split.\n",
    "\n",
    "    Returns:\n",
    "        List[str]: List of concatenated chunk sequences, each centered on a relevant chunk.\n",
    "    \"\"\"\n",
    "    retriever = vectorstore.as_retriever(search_kwargs={\"k\": k})\n",
    "    relevant_chunks = retriever.get_relevant_documents(query)\n",
    "\n",
    "    result_sequences = []\n",
    "\n",
    "    for chunk in relevant_chunks:\n",
    "        current_index = chunk.metadata.get('index')\n",
    "        if current_index is None:\n",
    "            continue\n",
    "\n",
    "        # Collect neighboring chunks\n",
    "        neighbor_chunks = [chunk]  # Include the current chunk\n",
    "        for i in range(1, num_neighbors + 1):\n",
    "            prev_chunk = get_chunk_by_index(vectorstore, current_index - i)\n",
    "            if prev_chunk:\n",
    "                neighbor_chunks.insert(0, prev_chunk)\n",
    "            next_chunk = get_chunk_by_index(vectorstore, current_index + i)\n",
    "            if next_chunk:\n",
    "                neighbor_chunks.append(next_chunk)\n",
    "\n",
    "        # Sort chunks by their index\n",
    "        neighbor_chunks.sort(key=lambda x: x.metadata.get('index', 0))\n",
    "\n",
    "        # Concatenate chunks accounting for overlap: each chunk after the first repeats\n",
    "        # the last `chunk_overlap` characters of its predecessor, so skip that prefix\n",
    "        concatenated_text = neighbor_chunks[0].page_content\n",
    "        for i in range(1, len(neighbor_chunks)):\n",
    "            current_chunk = neighbor_chunks[i].page_content\n",
    "            concatenated_text += current_chunk[chunk_overlap:]\n",
    "\n",
    "        result_sequences.append(concatenated_text)\n",
    "\n",
    "    return result_sequences"
   ]
  },
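  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: expected window width\n",
    "\n",
    "A rough back-of-the-envelope check (a sketch reusing `chunks_size` and `chunk_overlap` from above): beyond the central chunk, each neighbor contributes `chunk_size - chunk_overlap` new characters, so a window with `num_neighbors` chunks on each side should span about `chunk_size + 2 * num_neighbors * (chunk_size - chunk_overlap)` characters, except where the window is clipped at the start or end of the document."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough expected width of a reconstructed window (reuses `chunks_size` and `chunk_overlap` from above).\n",
    "# Each neighbor beyond the central chunk adds (chunk_size - chunk_overlap) new characters,\n",
    "# except near the document edges where the window is clipped.\n",
    "num_neighbors = 2  # the default used by retrieve_with_context_overlap\n",
    "expected_width = chunks_size + 2 * num_neighbors * (chunks_size - chunk_overlap)\n",
    "print(f\"Expected window width with {num_neighbors} neighbors per side: ~{expected_width} characters\")"
   ]
  },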
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Comparing regular retrieval and retrieval with context window"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"climate change\"\n",
    "context = chunks_query_retriever.get_relevant_documents(query)\n",
    "context_pages_content = [doc.page_content for doc in context]\n",
    "\n",
    "print(\"regular retrieval:\\n\")\n",
    "show_context(context_pages_content)\n",
    "\n",
    "sequences = retrieve_with_context_overlap(vectorstore, query)\n",
    "print(\"retrieval with context overlap:\\n\")\n",
    "show_context(sequences)"
   ]
  },
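  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: widen the context window\n",
    "\n",
    "A minimal follow-up sketch (reusing `vectorstore`, `query`, and `show_context` from above): increasing `num_neighbors` trades retrieval precision for more surrounding context, adding roughly `chunks_size - chunk_overlap` characters per extra neighbor on each side, subject to document boundaries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A sketch: same query, wider window (num_neighbors=3 instead of the default 2).\n",
    "wider_sequences = retrieve_with_context_overlap(vectorstore, query, k=2, num_neighbors=3)\n",
    "print(\"retrieval with a wider context window:\\n\")\n",
    "show_context(wider_sequences)"
   ]
  }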
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}