Files
RAG_Techniques/all_rag_techniques/semantic_chunking.ipynb

173 lines
4.1 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import libraries "
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import fitz\n",
"from dotenv import load_dotenv\n",
"\n",
"\n",
"# Add the parent directory to the import path, since this notebook runs from a subdirectory\n",
"sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))\n",
"# NOTE(review): the wildcard imports presumably provide FAISS, retrieve_context_per_question\n",
"# and show_context, which later cells use without importing them explicitly - confirm\n",
"from helper_functions import *\n",
"from evaluation.evalute_rag import *\n",
"\n",
"from langchain_experimental.text_splitter import SemanticChunker\n",
"from langchain_openai.embeddings import OpenAIEmbeddings\n",
"\n",
"# Load environment variables from a .env file\n",
"load_dotenv()\n",
"\n",
"# Make sure the OpenAI API key is available. Assigning None to os.environ raises an\n",
"# opaque TypeError, so fail early with an actionable message when the key is missing.\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"if not openai_api_key:\n",
"    raise RuntimeError('OPENAI_API_KEY is not set; define it in the environment or in a .env file')\n",
"os.environ['OPENAI_API_KEY'] = openai_api_key"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define file path"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Relative location of the PDF that the following cells read and split into chunks\n",
"path = \"../data/Understanding_Climate_Change.pdf\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Read PDF to string"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Open the PDF inside a context manager so the document is closed once its text has been read\n",
"with fitz.open(path) as doc:\n",
"    # Extract the text of every page, in page order, and join it into a single string\n",
"    content = ''.join(page.get_text() for page in doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Breakpoint types: \n",
"* 'percentile': all differences between sentences are calculated, and the text is split wherever a difference is greater than the X-th percentile.\n",
"* 'standard_deviation': the text is split wherever a difference is greater than X standard deviations.\n",
"* 'interquartile': the interquartile distance is used to decide where to split the text into chunks."
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# Choose which embeddings, breakpoint type and threshold the semantic splitter uses\n",
"text_splitter = SemanticChunker(\n",
"    OpenAIEmbeddings(),\n",
"    breakpoint_threshold_type='percentile',\n",
"    breakpoint_threshold_amount=90,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Split original text to semantic chunks"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"# Run the semantic splitter over the extracted PDF text;\n",
"# the whole text is passed as a one-element list\n",
"docs = text_splitter.create_documents([content])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create vector store and retriever"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# Embed the semantic chunks and index them in a FAISS vector store\n",
"embeddings = OpenAIEmbeddings()\n",
"vectorstore = FAISS.from_documents(docs, embeddings)\n",
"\n",
"# Expose the store as a retriever configured with k=2 in its search arguments\n",
"chunks_query_retriever = vectorstore.as_retriever(\n",
"    search_kwargs={\"k\": 2},\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test the retriever"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run a sample question through the retriever and display what comes back\n",
"test_query = \"What is the main cause of climate change?\"\n",
"context = retrieve_context_per_question(test_query, chunks_query_retriever)\n",
"show_context(context)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}