mirror of
https://github.com/NirDiamant/RAG_Techniques.git
synced 2025-04-07 00:48:52 +03:00
173 lines
4.1 KiB
Plaintext
173 lines
4.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Import libraries "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 57,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import sys\n",
|
|
"import fitz\n",
|
|
"from dotenv import load_dotenv\n",
|
|
"\n",
|
|
"\n",
|
|
"sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..'))) # Add the parent directory to the path sicnce we work with notebooks\n",
|
|
"from helper_functions import *\n",
|
|
"from evaluation.evalute_rag import *\n",
|
|
"\n",
|
|
"from langchain_experimental.text_splitter import SemanticChunker\n",
|
|
"from langchain_openai.embeddings import OpenAIEmbeddings\n",
|
|
"\n",
|
|
"# Load environment variables from a .env file\n",
|
|
"load_dotenv()\n",
|
|
"\n",
|
|
"# Set the OpenAI API key environment variable\n",
|
|
"os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Define file path"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"path = \"../data/Understanding_Climate_Change.pdf\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Read PDF to string"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Open the PDF document located at the specified path\n",
|
|
"doc = fitz.open(path)\n",
|
|
"\n",
|
|
"content = \"\"\n",
|
|
"\n",
|
|
"# Iterate over each page in the document\n",
|
|
"for page_num in range(len(doc)):\n",
|
|
" # Get the current page\n",
|
|
" page = doc[page_num]\n",
|
|
" # Extract the text content from the current page and append it to the content string\n",
|
|
" content += page.get_text()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Breakpoint types: \n",
|
|
"* 'percentile': all differences between sentences are calculated, and then any difference greater than the X percentile is split.\n",
|
|
"* 'standard_deviation': any difference greater than X standard deviations is split.\n",
|
|
"* 'interquartile': the interquartile distance is used to split chunks."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 51,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"text_splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type='percentile', breakpoint_threshold_amount=90) # chose which embeddings and breakpoint type and threshold to use"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Split original text to semantic chunks"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 53,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"docs = text_splitter.create_documents([content])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Create vector store and retriever"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 54,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"embeddings = OpenAIEmbeddings()\n",
|
|
"vectorstore = FAISS.from_documents(docs, embeddings)\n",
|
|
"chunks_query_retriever = vectorstore.as_retriever(search_kwargs={\"k\": 2})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Test the retriever"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"test_query = \"What is the main cause of climate change?\"\n",
|
|
"context = retrieve_context_per_question(test_query, chunks_query_retriever)\n",
|
|
"show_context(context)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|