# Mirror of https://github.com/NirDiamant/RAG_Techniques.git
# Synced 2025-04-07 00:48:52 +03:00
from langchain.document_loaders import PyPDFLoader
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_openai import OpenAIEmbeddings
|
|
from langchain.vectorstores import FAISS
|
|
from langchain_core.pydantic_v1 import BaseModel, Field
|
|
from langchain import PromptTemplate
|
|
import fitz
|
|
|
|
|
|
|
|
|
|
def replace_t_with_space(list_of_documents):
    """
    Normalize document text by substituting every tab character with a space.

    Args:
        list_of_documents: Documents exposing a mutable 'page_content' string attribute.

    Returns:
        The same list, with each document's page_content cleaned in place.
    """
    for document in list_of_documents:
        # Mutate in place; callers rely on the same document objects coming back.
        document.page_content = document.page_content.replace('\t', ' ')
    return list_of_documents
|
|
|
|
|
|
|
|
|
|
def encode_pdf(path, chunk_size=1000, chunk_overlap=200):
    """
    Encode a PDF into a FAISS vector store using OpenAI embeddings.

    Args:
        path: Filesystem path to the PDF file.
        chunk_size: Target number of characters per text chunk.
        chunk_overlap: Number of overlapping characters between adjacent chunks.

    Returns:
        A FAISS vector store containing the embedded document chunks.
    """
    # Read every page of the PDF into LangChain document objects.
    documents = PyPDFLoader(path).load()

    # Break the pages into overlapping character-based chunks.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = splitter.split_documents(documents)

    # Tabs interfere with downstream processing; replace them with spaces.
    cleaned_chunks = replace_t_with_space(chunks)

    # Embed each chunk and index the vectors with FAISS.
    return FAISS.from_documents(cleaned_chunks, OpenAIEmbeddings())
|
|
|
|
|
|
def retrieve_context_per_question(question, chunks_query_retriever):
    """
    Retrieve the text of the most relevant document chunks for a question.

    Args:
        question: The question for which to retrieve supporting context.
        chunks_query_retriever: A retriever exposing get_relevant_documents().

    Returns:
        A list of strings, one per retrieved document's page content.
    """
    # Ask the retriever for the documents most relevant to the question.
    docs = chunks_query_retriever.get_relevant_documents(question)

    # Return each chunk's raw text; callers decide how to join or display it.
    return [doc.page_content for doc in docs]
|
|
|
|
class QuestionAnswerFromContext(BaseModel):
    """
    Structured-output schema for answering a query from a given context.

    Intended for use with llm.with_structured_output(...) so the model's
    reply is parsed into a single typed field rather than free text.

    Attributes:
        answer_based_on_content (str): The generated answer based on the context.
    """
    answer_based_on_content: str = Field(description="Generates an answer to a query based on a given context.")
|
|
|
|
def create_question_answer_from_context_chain(llm):
    """
    Build a chain that answers a question strictly from supplied context.

    Args:
        llm: A chat model supporting with_structured_output().

    Returns:
        A runnable chain mapping {"context", "question"} inputs to a
        QuestionAnswerFromContext instance.
    """
    question_answer_from_context_llm = llm

    # Prompt restricting the model to the provided context only.
    question_answer_prompt_template = """
    For the question below, provide a concise but suffice answer based ONLY on the provided context:
    {context}
    Question
    {question}
    """

    # Wrap the raw template with its expected input variables.
    question_answer_from_context_prompt = PromptTemplate(
        template=question_answer_prompt_template,
        input_variables=["context", "question"],
    )

    # Pipe the prompt into the model, parsing replies into the schema.
    return question_answer_from_context_prompt | question_answer_from_context_llm.with_structured_output(
        QuestionAnswerFromContext
    )
|
|
|
|
|
|
|
|
def answer_question_from_context(question, context, question_answer_from_context_chain):
    """
    Answer a question from the given context by invoking the QA chain.

    Args:
        question: The question to be answered.
        context: The context to be used for answering the question.
        question_answer_from_context_chain: Chain (as built by
            create_question_answer_from_context_chain) whose invoke() returns
            an object with an 'answer_based_on_content' attribute.

    Returns:
        A dictionary containing the answer, context, and question.
    """
    input_data = {
        "question": question,
        "context": context,
    }
    print("Answering the question from the retrieved context...")

    # The chain returns a structured object; pull out the answer field.
    output = question_answer_from_context_chain.invoke(input_data)
    answer = output.answer_based_on_content
    return {"answer": answer, "context": context, "question": question}
|
|
|
|
|
|
def show_context(context):
    """
    Print each retrieved context chunk under a 1-based numbered heading.

    Args:
        context: An iterable of context strings.
    """
    for index, chunk in enumerate(context, start=1):
        print(f"Context {index}:")
        print(chunk)
        print("\n")
|
|
|
|
def read_pdf_to_string(path):
    """
    Extract the full text of a PDF as a single string.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        The concatenated text of every page, in page order.
    """
    # fitz.Document supports the context-manager protocol; 'with' guarantees
    # the underlying file handle is closed even if text extraction raises
    # (the original left the document open).
    with fitz.open(path) as doc:
        # join() builds the string in one pass instead of quadratic '+='.
        return "".join(page.get_text() for page in doc)
|