RAG_Techniques/all_rag_techniques/semantic_chunking.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import libraries "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import fitz\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "\n",
    "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..'))) # Add the parent directory to the path sicnce we work with notebooks\n",
    "from helper_functions import *\n",
    "from evaluation.evalute_rag import *\n",
    "\n",
    "from langchain_experimental.text_splitter import SemanticChunker\n",
    "from langchain_openai.embeddings import OpenAIEmbeddings\n",
    "\n",
    "# Load environment variables from a .env file\n",
    "load_dotenv()\n",
    "\n",
    "# Set the OpenAI API key environment variable\n",
    "os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define file path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../data/Understanding_Climate_Change.pdf\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read PDF to string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the PDF document located at the specified path\n",
    "doc = fitz.open(path)\n",
    "\n",
    "content = \"\"\n",
    "\n",
    "# Iterate over each page in the document\n",
    "for page_num in range(len(doc)):\n",
    "    # Get the current page\n",
    "    page = doc[page_num]\n",
    "    # Extract the text content from the current page and append it to the content string\n",
    "    content += page.get_text()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Breakpoint types: \n",
    "* 'percentile': all differences between sentences are calculated, and then any difference greater than the X percentile is split.\n",
    "* 'standard_deviation': any difference greater than X standard deviations is split.\n",
    "* 'interquartile': the interquartile distance is used to split chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type='percentile', breakpoint_threshold_amount=90) # chose which embeddings and breakpoint type and threshold to use"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split original text to semantic chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = text_splitter.create_documents([content])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create vector store and retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = OpenAIEmbeddings()\n",
    "vectorstore = FAISS.from_documents(docs, embeddings)\n",
    "chunks_query_retriever = vectorstore.as_retriever(search_kwargs={\"k\": 2})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test the retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_query = \"What is the main cause of climate change?\"\n",
    "context = retrieve_context_per_question(test_query, chunks_query_retriever)\n",
    "show_context(context)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}