Mirror of https://github.com/NirDiamant/RAG_Techniques.git (synced 2025-04-07 00:48:52 +03:00)
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import libraries and environment variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\N7\\PycharmProjects\\llm_tasks\\RAG_TECHNIQUES\\.venv\\Lib\\site-packages\\deepeval\\__init__.py:42: UserWarning: You are using deepeval version 0.21.65, however version 0.21.67 is available. You should consider upgrading via the \"pip install --upgrade deepeval\" command.\n",
      " warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "from dotenv import load_dotenv\n",
    "from langchain.docstore.document import Document\n",
    "\n",
    "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # Add the parent directory to the path since we work with notebooks\n",
    "from helper_functions import *\n",
    "from evaluation.evalute_rag import *\n",
    "\n",
    "# Load environment variables from a .env file\n",
    "load_dotenv()\n",
    "\n",
    "# Set the OpenAI API key environment variable\n",
    "os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define path to PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../data/Understanding_Climate_Change.pdf\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read PDF to string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "content = read_pdf_to_string(path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function to split text into chunks, storing each chunk's chronological index in its metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_text_to_chunks_with_indices(text: str, chunk_size: int, chunk_overlap: int) -> List[Document]:\n",
    "    \"\"\"Split text into overlapping chunks, storing each chunk's chronological index in its metadata.\"\"\"\n",
    "    chunks = []\n",
    "    start = 0\n",
    "    while start < len(text):\n",
    "        end = start + chunk_size\n",
    "        chunk = text[start:end]\n",
    "        chunks.append(Document(page_content=chunk, metadata={\"index\": len(chunks), \"text\": text}))\n",
    "        start += chunk_size - chunk_overlap\n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split our document accordingly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks_size = 200\n",
    "chunk_overlap = 100\n",
    "docs = split_text_to_chunks_with_indices(content, chunks_size, chunk_overlap)"
   ]
  },
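  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: sanity-check the chunk overlap\n",
    "\n",
    "A minimal check (a sketch reusing `docs` and `chunk_overlap` from above, and assuming the document yields at least two chunks): consecutive chunks should share exactly `chunk_overlap` characters, which is what the window reconstruction later in this notebook relies on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: consecutive chunks should overlap by exactly `chunk_overlap` characters.\n",
    "# Reuses `docs` and `chunk_overlap` defined above; assumes at least two chunks exist.\n",
    "print(f\"Number of chunks: {len(docs)}\")\n",
    "first, second = docs[0].page_content, docs[1].page_content\n",
    "print(f\"Tail of chunk 0 equals head of chunk 1: {first[-chunk_overlap:] == second[:chunk_overlap]}\")"
   ]
  },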
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create vector store and retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = OpenAIEmbeddings()\n",
    "vectorstore = FAISS.from_documents(docs, embeddings)\n",
    "chunks_query_retriever = vectorstore.as_retriever(search_kwargs={\"k\": 2})"
   ]
  },
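  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: confirm the index size\n",
    "\n",
    "A small check (a sketch reusing `docs` and `vectorstore` from above): the FAISS index should hold one vector per chunk. `get_chunk_by_index` below pulls every stored document back out via `vectorstore.index.ntotal`, so it is worth confirming that count matches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: one vector per chunk. get_chunk_by_index below relies on\n",
    "# vectorstore.index.ntotal to fetch all stored documents.\n",
    "print(f\"Chunks created: {len(docs)}\")\n",
    "print(f\"Vectors indexed: {vectorstore.index.ntotal}\")"
   ]
  },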
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function to draw the k<sup>th</sup> chunk (in the original order) from the vector store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_chunk_by_index(vectorstore, target_index: int):\n",
    "    \"\"\"\n",
    "    Retrieve a chunk from the vectorstore based on its index in the metadata.\n",
    "\n",
    "    Args:\n",
    "        vectorstore (VectorStore): The vectorstore containing the chunks.\n",
    "        target_index (int): The index of the chunk to retrieve.\n",
    "\n",
    "    Returns:\n",
    "        Optional[Document]: The retrieved chunk as a Document object, or None if not found.\n",
    "    \"\"\"\n",
    "    # Retrieve all documents from the vectorstore\n",
    "    all_docs = vectorstore.similarity_search(\"\", k=vectorstore.index.ntotal)\n",
    "\n",
    "    # Search for the document with the matching index\n",
    "    for doc in all_docs:\n",
    "        # Check if 'index' is in metadata and matches the target index\n",
    "        if 'index' in doc.metadata and doc.metadata['index'] == target_index:\n",
    "            return doc\n",
    "\n",
    "    # If we've gone through all documents and haven't found a match, return None\n",
    "    return None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check the function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Understanding Climate Change \n",
      "Chapter 1: Introduction to Climate Change \n",
      "Climate change refers to significant, long-term changes in the global climate. The term \n",
      "\"global climate\" encompasses the plane\n"
     ]
    }
   ],
   "source": [
    "chunk = get_chunk_by_index(vectorstore, 0)\n",
    "print(chunk.page_content)"
   ]
  },
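  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an extra check (a sketch reusing `docs` and `vectorstore`), the same function can fetch any stored chunk by its chronological index, e.g. the last one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extra check: fetch the last chunk by its chronological index (reuses `docs` and `vectorstore`).\n",
    "last_chunk = get_chunk_by_index(vectorstore, len(docs) - 1)\n",
    "print(last_chunk.page_content)"
   ]
  },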
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function that retrieves from the vector store based on semantic similarity, then pads each retrieved chunk with its `num_neighbors` preceding and following chunks, accounting for the chunk overlap, to construct a meaningful wide window around it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def retrieve_with_context_overlap(vectorstore, query: str, k: int = 2, num_neighbors: int = 2, chunk_size: int = 200, chunk_overlap: int = 100) -> List[str]:\n",
    "    \"\"\"\n",
    "    Retrieve chunks based on a query, then fetch neighboring chunks and concatenate them,\n",
    "    accounting for overlap and correct indexing.\n",
    "\n",
    "    Args:\n",
    "        vectorstore (VectorStore): The vectorstore containing the chunks.\n",
    "        query (str): The query to search for relevant chunks.\n",
    "        k (int): The number of relevant chunks to retrieve.\n",
    "        num_neighbors (int): The number of chunks to retrieve before and after each relevant chunk.\n",
    "        chunk_size (int): The size of each chunk when originally split.\n",
    "        chunk_overlap (int): The overlap between chunks when originally split.\n",
    "\n",
    "    Returns:\n",
    "        List[str]: List of concatenated chunk sequences, each centered on a relevant chunk.\n",
    "    \"\"\"\n",
    "    retriever = vectorstore.as_retriever(search_kwargs={\"k\": k})\n",
    "    relevant_chunks = retriever.get_relevant_documents(query)\n",
    "\n",
    "    result_sequences = []\n",
    "\n",
    "    for chunk in relevant_chunks:\n",
    "        current_index = chunk.metadata.get('index')\n",
    "        if current_index is None:\n",
    "            continue\n",
    "\n",
    "        # Collect neighboring chunks\n",
    "        neighbor_chunks = [chunk]  # Include the current chunk\n",
    "        for i in range(1, num_neighbors + 1):\n",
    "            prev_chunk = get_chunk_by_index(vectorstore, current_index - i)\n",
    "            if prev_chunk:\n",
    "                neighbor_chunks.insert(0, prev_chunk)\n",
    "            next_chunk = get_chunk_by_index(vectorstore, current_index + i)\n",
    "            if next_chunk:\n",
    "                neighbor_chunks.append(next_chunk)\n",
    "\n",
    "        # Sort chunks by their index\n",
    "        neighbor_chunks.sort(key=lambda x: x.metadata.get('index', 0))\n",
    "\n",
    "        # Concatenate chunks accounting for overlap: each chunk after the first repeats\n",
    "        # the last `chunk_overlap` characters of its predecessor, so skip that prefix\n",
    "        concatenated_text = neighbor_chunks[0].page_content\n",
    "        for i in range(1, len(neighbor_chunks)):\n",
    "            current_chunk = neighbor_chunks[i].page_content\n",
    "            concatenated_text += current_chunk[chunk_overlap:]\n",
    "\n",
    "        result_sequences.append(concatenated_text)\n",
    "\n",
    "    return result_sequences"
   ]
  },
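  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: expected window width\n",
    "\n",
    "A rough back-of-the-envelope check (a sketch reusing `chunks_size` and `chunk_overlap` from above): beyond the central chunk, each neighbor contributes `chunk_size - chunk_overlap` new characters, so a window with `num_neighbors` chunks on each side should span about `chunk_size + 2 * num_neighbors * (chunk_size - chunk_overlap)` characters, except where the window is clipped at the start or end of the document."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough expected width of a reconstructed window (reuses `chunks_size` and `chunk_overlap` from above).\n",
    "# Each neighbor beyond the central chunk adds (chunk_size - chunk_overlap) new characters,\n",
    "# except near the document edges where the window is clipped.\n",
    "num_neighbors = 2  # the default used by retrieve_with_context_overlap\n",
    "expected_width = chunks_size + 2 * num_neighbors * (chunks_size - chunk_overlap)\n",
    "print(f\"Expected window width with {num_neighbors} neighbors per side: ~{expected_width} characters\")"
   ]
  },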
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Comparing regular retrieval and retrieval with context window"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"climate change\"\n",
    "context = chunks_query_retriever.get_relevant_documents(query)\n",
    "context_pages_content = [doc.page_content for doc in context]\n",
    "\n",
    "print(\"regular retrieval:\\n\")\n",
    "show_context(context_pages_content)\n",
    "\n",
    "sequences = retrieve_with_context_overlap(vectorstore, query)\n",
    "print(\"retrieval with context overlap:\\n\")\n",
    "show_context(sequences)"
   ]
  },
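  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: widen the context window\n",
    "\n",
    "A minimal follow-up sketch (reusing `vectorstore`, `query`, and `show_context` from above): increasing `num_neighbors` trades retrieval precision for more surrounding context, adding roughly `chunks_size - chunk_overlap` characters per extra neighbor on each side, subject to document boundaries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A sketch: same query, wider window (num_neighbors=3 instead of the default 2).\n",
    "wider_sequences = retrieve_with_context_overlap(vectorstore, query, k=2, num_neighbors=3)\n",
    "print(\"retrieval with a wider context window:\\n\")\n",
    "show_context(wider_sequences)"
   ]
  }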
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}