Advanced-LangChain-RAG/ingest.py
# Used to load the PDF files from the source_files directory into langchain documents
from langchain_community.document_loaders import PyPDFLoader
# Used to recursively split each document's page_content into chunks sized for the target LLM's context window,
# splitting on an ordered list of separator characters
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Using ollama for serving llama3 embeddings
from langchain_community.embeddings import OllamaEmbeddings
# Using chroma vectorstore
from langchain_community.vectorstores import Chroma
# For setting up the chat template
from langchain_core.prompts import ChatPromptTemplate
# For using the Groq llama3 API
from langchain_groq import ChatGroq
import os
import subprocess
import json
os.environ['GROQ_API_KEY'] = ''
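# Note (not in the original script): ChatGroq reads GROQ_API_KEY from the environment, so in
# practice you would export the key in your shell (e.g. `export GROQ_API_KEY=...`) rather than
# hard-coding it here; the empty string above is just a placeholder for the stripped secret.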
class MyCustomError(Exception):
    def __init__(self, message):
        super().__init__(message)
class local_pdf_gpt_ingester:
    def __init__(self, path, embedding_model, vectorstore, images=True):
        self.path = path
        # Since we are using Ollama, the model should already have been served; if not, we can
        # either raise an error or pull the model ourselves
        self.embeddings = embedding_model
        # Here we only handle the Chroma case, but this can be extended to other vectorstores
        self.vectorstore = vectorstore
        self.images = images
    def extract_file_metadata(self, context):
        try:
            llm = ChatGroq(model_name='llama3-8b-8192')
            system = ("You are a helpful assistant. From the given text extract the client name, service provider name "
                      "and the date of the contract as a JSON response with keys client, service_provider and "
                      "contract_date (in the format dd-mm-YYYY). Do not return any other additional messages.")
            prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{text}")])
            chain = prompt | llm
            response = chain.invoke({"text": context})
            return response.content
        except Exception as e:
            print(e)
            return False
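    # Illustrative only (assumed shape, not taken from a real run): with the system prompt above,
    # extract_file_metadata is expected to return a string like
    #   '{"client": "Acme Corp", "service_provider": "Globex Ltd", "contract_date": "01-04-2023"}'
    # which parse_meta below turns into a dict.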
    def parse_meta(self, info):
        text = info.strip()
        # The model occasionally drops the surrounding braces; add '{' at the start if missing
        if not text.startswith('{') and '{' not in text:
            text = '{' + text
        # Check and add '}' at the end if missing
        if not text.endswith('}') and '}' not in text:
            text = text + '}'
        dictionary = json.loads(text)
        return dictionary
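    # Sidebar (an assumption, not part of the original flow): if the model wraps the JSON in extra
    # prose, a more tolerant variant could pull out the first {...} block before parsing, e.g.
    #   import re
    #   match = re.search(r"\{.*\}", info, re.S)
    #   dictionary = json.loads(match.group(0)) if match else {}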
    def pdf_to_documents(self, **kwargs):
        # Using the LangChain PDF loader to load the documents and flatten any images to text
        # (only OCR happens, not an image embedding). We cannot use this loader for multimodal
        # content (for that, use the pypdf library to extract images and text separately, or the
        # unstructured.io library)
        files = os.listdir(self.path)
        files = [f"{self.path}/{f}" for f in files if f.endswith(".pdf")]
        return_chunks = []
        for f in files:
            loader = PyPDFLoader(f, extract_images=self.images)
            # Setting up the splitter (configured here for llama3; adjust for other models)
            splitter = RecursiveCharacterTextSplitter(chunk_size=kwargs.get('chunk_size', 3800),
                                                      chunk_overlap=kwargs.get('chunk_overlap', 50),
                                                      separators=kwargs.get('separators', ["\n\n", "\n", " ", "."]),
                                                      is_separator_regex=kwargs.get('is_separator_regex', False))
            # Splitting the documents into appropriately sized chunks and collecting the chunked content
            doc_chunks = loader.load_and_split(text_splitter=splitter)
            my_chunk = doc_chunks[0].page_content
            meta_info = self.extract_file_metadata(my_chunk)
            meta_info = self.parse_meta(meta_info)
            return_chunks.append({'doc': [i.page_content for i in doc_chunks], 'metadata': meta_info})
        return return_chunks
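    # Sidebar (an assumption, not part of the original flow): for truly multimodal PDFs the
    # comment above points at unstructured.io; a minimal sketch with LangChain's wrapper
    # (requires the `unstructured` package; sample.pdf is a hypothetical file) would be:
    #   from langchain_community.document_loaders import UnstructuredPDFLoader
    #   elements = UnstructuredPDFLoader("source_files/sample.pdf", mode="elements").load()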
    def pull_model_to_ollama(self, model_name):
        # Construct the command (equivalent to running `ollama pull <model_name>` in a shell)
        command = ["ollama", "pull", model_name]
        try:
            # Execute the command
            result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            # Print the standard output and error
            print("Output:\n", result.stdout)
            print("Error (if any):\n", result.stderr)
            if result.stderr:
                return False
            else:
                return True
        except subprocess.CalledProcessError as e:
            # Handle errors in the command execution
            print(f"An error occurred while pulling the model '{model_name}':")
            print("Return code:", e.returncode)
            print("Output:\n", e.output)
            print("Error:\n", e.stderr)
            return False
    def load_embeddings(self):
        # Check whether the embedding model is already available; if not, validate and pull it into Ollama
        try:
            # Find all the llama3 models here - https://ollama.com/library/llama3:8b
            # We are using the default connection and parameters for Ollama embeddings
            self.embeddings = OllamaEmbeddings(model=self.embeddings)
        except Exception:
            model_check = self.pull_model_to_ollama(self.embeddings)
            if model_check:
                self.embeddings = OllamaEmbeddings(model=self.embeddings)
            else:
                raise MyCustomError(f"The provided model name - {self.embeddings} is invalid. Find the list of supported llama3 models here - https://ollama.com/library/llama3:8b")
    def embed_and_store(self, documents):
        # We are going to use the Ollama embeddings with the Chroma store here
        if self.vectorstore == 'chroma':
            try:
                # The default behaviour is get_or_create collection, so if the collection name
                # already exists this will append to, not overwrite, its content
                self.vectorstore = Chroma(collection_name="MSA_4k_chunks",
                                          embedding_function=self.embeddings,
                                          persist_directory="./db")
            except Exception as e:
                raise MyCustomError(f"Chroma DB error - {e}")
        for doc in documents:
            metas = [doc['metadata'] for i in doc['doc']]
            # Optionally you can also pass IDs. Metadata can be attached like this, or added
            # directly to the document objects themselves; there is no specific reason for doing
            # it this way. Attaching metadata also makes updates and deletions easy, since we can
            # always check whether a document is present or absent based on its metadata.
            self.vectorstore.add_texts(texts=doc['doc'],
                                       metadatas=metas)
        print("The documents have been embedded and stored in the vector database")
if __name__ == '__main__':
    msa_bot = local_pdf_gpt_ingester("source_files/", "llama3", "chroma")
    msa_docs = msa_bot.pdf_to_documents()
    msa_bot.load_embeddings()
    msa_bot.embed_and_store(msa_docs)
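    # Sanity check (an assumption, not in the original script): once ingestion finishes, the
    # persisted collection can be queried directly; the query string here is only an example.
    #   results = msa_bot.vectorstore.similarity_search("termination clause", k=2)
    #   for r in results:
    #       print(r.metadata, r.page_content[:200])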