{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "4767b5c4-536d-44c9-baba-d72efbb27df1",
"metadata": {},
"outputs": [],
"source": [
"import io\n",
"import os\n",
"import time\n",
"import pickle\n",
"from jinja2 import Template\n",
"\n",
"import yt_dlp\n",
"from whisper import load_model\n",
"from loguru import logger\n",
"from dotenv import load_dotenv\n",
"import tiktoken\n",
"\n",
"from llms import LLMService\n",
"import pipeline\n",
"import utils\n",
"\n",
"\n",
"logger.debug(\"loading env vars\")\n",
"load_dotenv(dotenv_path='./env')\n",
"\n",
"logger.debug(\"initializing llm query engines\")\n",
"query_engines = {}\n",
"for model in [\"gpt35turbo\", \"gpt4\"]:\n",
" llm_service = LLMService(provider=\"azure\", model=model)\n",
" query_engine = llm_service.initialize_client_query_engine()\n",
" query_engines[model] = query_engine \n",
" llm_servce = None\n",
"\n",
"logger.debug(\"loading stt model\")\n",
"model = load_model(\"large-v3\", device=\"cpu\")\n",
"\n",
"logger.debug(\"distributing stt model on multi-gpu setup\")\n",
"# multigpu whisper source:\n",
"# https://github.com/openai/whisper/discussions/360\n",
"model.encoder.to(\"cuda:0\")\n",
"model.decoder.to(\"cuda:1\")\n",
"model.decoder.register_forward_pre_hook(lambda _, inputs: tuple([inputs[0].to(\"cuda:1\"), inputs[1].to(\"cuda:1\")] + list(inputs[2:])))\n",
"model.decoder.register_forward_hook(lambda _, inputs, outputs: outputs.to(\"cuda:0\"))\n",
"\n",
"# test:\n",
"# https://www.youtube.com/watch?v=jLVl5V8roMU\n",
"# https://www.youtube.com/watch?v=ihVOcWVKslc\n",
"# https://www.youtube.com/watch?v=CzCcYlH_6_I"
]
},
{
"cell_type": "markdown",
"id": "b5644c5a-6654-4b8d-a7f2-8ac49491cf6f",
"metadata": {},
"source": [
"## SUMMARY"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1c7d8742-ea54-4c73-85a2-09039baa0dce",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-06-09 18:04:50.924\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpipeline\u001b[0m:\u001b[36mdownload_audio\u001b[0m:\u001b[36m9\u001b[0m - \u001b[34m\u001b[1mdownloading audio from: {url}\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[youtube] Extracting URL: https://www.youtube.com/watch?v=mdLBr9IMmgI\n",
"[youtube] mdLBr9IMmgI: Downloading webpage\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[proxychains] Dynamic chain ... 192.168.1.212:8080 ... www.youtube.com:443 ... OK\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[youtube] mdLBr9IMmgI: Downloading ios player API JSON\n",
"[youtube] mdLBr9IMmgI: Downloading m3u8 information\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[proxychains] Dynamic chain ... 192.168.1.212:8080 ... manifest.googlevideo.com:443 ... OK\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[info] mdLBr9IMmgI: Downloading 1 format(s): 251\n",
"[download] Destination: downloads/Marker This Open-Source Tool will make your PDFs LLM Ready.webm\n",
"[download] 9.6% of 10.43MiB at 10.05MiB/s ETA 00:00"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[proxychains] Dynamic chain ... 192.168.1.212:8080 ... rr5---sn-u0g3jxaa-n5fs.googlevideo.com:443 ... OK\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[download] 100% of 10.43MiB in 00:00:01 at 9.38MiB/s \n",
"[ExtractAudio] Destination: downloads/Marker This Open-Source Tool will make your PDFs LLM Ready.mp3\n",
"Deleting original file downloads/Marker This Open-Source Tool will make your PDFs LLM Ready.webm (pass -k to keep)\n"
]
}
],
"source": [
"path_audiofile, name_audiofile = pipeline.download_audio(url=\"https://www.youtube.com/watch?v=mdLBr9IMmgI\")\n"
]
},
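{
"cell_type": "markdown",
"id": "3f2a9c10-5b7d-4e8a-9f01-aa11bb22cc01",
"metadata": {},
"source": [
"`pipeline.download_audio` is not shown in this notebook. Below is a minimal sketch of what it might look like with `yt_dlp` plus an `FFmpegExtractAudio` post-processor; the output directory, option values, and return convention are assumptions inferred from the cell above and its output."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f2a9c10-5b7d-4e8a-9f01-aa11bb22cc02",
"metadata": {},
"outputs": [],
"source": [
"# hypothetical reconstruction of pipeline.download_audio, not the actual source\n",
"import os\n",
"import yt_dlp\n",
"\n",
"def download_audio_sketch(url, out_dir=\"downloads\"):\n",
"    ydl_opts = {\n",
"        \"format\": \"bestaudio/best\",\n",
"        \"outtmpl\": os.path.join(out_dir, \"%(title)s.%(ext)s\"),\n",
"        # extract the audio track and re-encode it to mp3 with ffmpeg\n",
"        \"postprocessors\": [{\"key\": \"FFmpegExtractAudio\", \"preferredcodec\": \"mp3\"}],\n",
"    }\n",
"    with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n",
"        info = ydl.extract_info(url, download=True)\n",
"        # prepare_filename gives the pre-postprocessing name (e.g. .webm);\n",
"        # swap the extension for the mp3 the post-processor wrote\n",
"        path = os.path.splitext(ydl.prepare_filename(info))[0] + \".mp3\"\n",
"    return path, os.path.basename(path)"
]
},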
{
"cell_type": "code",
"execution_count": 5,
"id": "ceca019d-464a-48c7-b7c7-77d72a4db165",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"downloads/Marker This Open-Source Tool will make your PDFs LLM Ready.mp3\n",
"Marker This Open-Source Tool will make your PDFs LLM Ready.mp3\n"
]
}
],
"source": [
"print(path_audiofile)\n",
"\n",
"print(name_audiofile)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "46770909-7059-4f56-90dd-4314eedbfa38",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-06-09 18:05:54.046\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpipeline\u001b[0m:\u001b[36mtranscribe\u001b[0m:\u001b[36m31\u001b[0m - \u001b[34m\u001b[1mtranscribing audio\u001b[0m\n",
"\u001b[32m2024-06-09 18:10:31.316\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpipeline\u001b[0m:\u001b[36msplit_transcript_into_chunks\u001b[0m:\u001b[36m35\u001b[0m - \u001b[34m\u001b[1mspltting transcript into chunks with max_tokens: 2048\u001b[0m\n",
"\u001b[32m2024-06-09 18:10:31.577\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[32m\u001b[1msubtranscripts saved into cache - Marker This Open-Source Tool will make your PDFs LLM Ready.mp3\u001b[0m\n"
]
}
],
"source": [
"transcript = pipeline.transcribe(model=model, audio=path_audiofile)\n",
"subtranscripts = pipeline.divide_transcript(transcript, max_tokens=2048)\n",
"partial_summaries = pipeline.generate_partial_summaries(subtranscripts)\n",
"merged_summary = pipeline.generate_merged_summary(partial_summaries)"
]
},
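{
"cell_type": "markdown",
"id": "3f2a9c10-5b7d-4e8a-9f01-aa11bb22cc03",
"metadata": {},
"source": [
"The two summary helpers above implement a map-reduce pattern: each transcript chunk is summarized on its own, then the partial summaries are merged in a single call. The sketch below is a hypothetical reconstruction, assuming each query engine exposes a `query` method and using the `jinja2` templates imported in the first cell; the prompts and function signatures are assumptions, not the real `pipeline` code."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f2a9c10-5b7d-4e8a-9f01-aa11bb22cc04",
"metadata": {},
"outputs": [],
"source": [
"# hypothetical reconstruction of the map-reduce summarization, not the actual pipeline source\n",
"from jinja2 import Template\n",
"\n",
"PARTIAL_PROMPT = Template(\"Summarize this transcript excerpt:\\n\\n{{ text }}\")\n",
"MERGE_PROMPT = Template(\"Merge these partial summaries into key highlights and a final summary:\\n\\n{{ parts }}\")\n",
"\n",
"def generate_partial_summaries_sketch(subtranscripts, engine):\n",
"    # map step: each chunk is small enough to fit the context window on its own\n",
"    return [str(engine.query(PARTIAL_PROMPT.render(text=t))) for t in subtranscripts]\n",
"\n",
"def generate_merged_summary_sketch(partial_summaries, engine):\n",
"    # reduce step: combine all partial summaries in one request\n",
"    return str(engine.query(MERGE_PROMPT.render(parts=\"\\n\\n\".join(partial_summaries))))"
]
},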
{
"cell_type": "code",
"execution_count": 11,
"id": "db22c301-3555-480e-a79d-2bf8784448c0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"**key highlights**\n",
"- PDFs present challenges for LLM applications due to their complex structure, lack of standard layout, and issues with different encodings, fonts, formatting, tables, and images.\n",
"- To make PDFs LLM ready, common approaches include converting PDFs to plain text for easier parsing and using machine learning models and optical character recognition (OCR) models.\n",
"- Markdowns are easier to work with for LLM as they can be easily converted to plain text and retain the original formatting, including titles, headers, images, and tables.\n",
"- Marker is an open source tool that converts complex PDF files into well-structured markdowns, preserving the structure and content of the original document.\n",
"- Marker supports a wide variety of documents, removes headers, footers, and artifacts, formats tables and code blocks, extracts and saves images, converts most equations to LaTeX, and runs on GPU, CPU, or MPS.\n",
"- The output folder created by Marker contains all the images extracted from the document and a JSON file with all the metadata, including language, file type, number of pages, equations, and tables.\n",
"- Marker accurately extracts equations, preserves tables, and maintains the relative location of text, images, and equations.\n",
"- Despite its capabilities, Marker has limitations with converting equations to LaTeX, formatting tables correctly, respecting white spaces, and joining line spans properly. Post-processing may be required to ensure accuracy of images and tables.\n",
"- Marker can be used in commercial projects for organizations that meet certain revenue and funding criteria, otherwise a license is required.\n",
"\n",
"**summary**\n",
"The transcript discusses the challenges associated with using PDFs in LLM applications and presents solutions, primarily focusing on the use of Marker, an open source tool that converts complex PDF files into markdowns. Marker preserves the structure and content of the original document, supports various document types, and runs on different processing units. It effectively extracts equations, preserves tables, and maintains the relative location of text, images, and equations. The output folder contains all extracted images and a JSON file with metadata. While Marker has certain limitations and may require post-processing for accuracy, it is a valuable tool for converting PDFs into structured markdowns.\n"
]
}
],
"source": [
"print(merged_summary)"
]
},
{
"cell_type": "markdown",
"id": "9391ecd2-f9e8-4fdd-b49d-6a6bbd336d97",
"metadata": {},
"source": [
"## RAG"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a98f174f-0ae8-4caf-bcbe-9de8c5ba1629",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-06-09 18:17:35.830\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36msplit_into_chunks\u001b[0m:\u001b[36m5\u001b[0m - \u001b[34m\u001b[1msplitting transcript into chunks for rag\u001b[0m\n",
"\u001b[32m2024-06-09 18:17:35.833\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36msplit_into_chunks\u001b[0m:\u001b[36m12\u001b[0m - \u001b[34m\u001b[1mTotal chunk count: 18\u001b[0m\n",
"\u001b[32m2024-06-09 18:17:35.835\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36msplit_into_chunks\u001b[0m:\u001b[36m23\u001b[0m - \u001b[34m\u001b[1mAverage chunk len: 982.1666666666666\u001b[0m\n",
"\u001b[32m2024-06-09 18:17:35.836\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36msplit_into_chunks\u001b[0m:\u001b[36m24\u001b[0m - \u001b[34m\u001b[1mAverage chunk token count: 216.16666666666666\u001b[0m\n"
]
}
],
"source": [
"from chromadb.config import Settings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"from langchain_chroma import Chroma\n",
"from langchain.chains import RetrievalQA\n",
"from langchain import hub\n",
"\n",
"# disable telemetry source:\n",
"# https://github.com/langchain-ai/langchain/issues/7804\n",
"\n",
"# azure embedding model deployment:\n",
"# https://oai.azure.com/portal/80c8b24f4ccd499591d1de3d70dd5e7a/deployment/embedding-ada-002?tenantid=cd67ec44-7e33-4c2f-9c8f-69ff48c2172b\n",
"\n",
"# huggingface sentence transformers embedding:\n",
"# https://python.langchain.com/v0.2/docs/integrations/text_embedding/sentence_transformers/\n",
"\n",
"# vector store retrieval source: \n",
"# https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/\n",
"\n",
"\n",
"transcript_chunks = pipeline.split_into_chunks(transcript)\n",
"qa_chain = pipeline.instantiate_qa_chain(knowledge_chunks=transcript_chunks)\n"
]
},
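{
"cell_type": "markdown",
"id": "3f2a9c10-5b7d-4e8a-9f01-aa11bb22cc05",
"metadata": {},
"source": [
"`pipeline.split_into_chunks` and `pipeline.instantiate_qa_chain` are also not shown. A minimal sketch using the imports above follows; the chunk sizes, embedding model, and `llm` argument are assumptions (the logged average chunk length of ~982 characters suggests a chunk_size near 1000)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f2a9c10-5b7d-4e8a-9f01-aa11bb22cc06",
"metadata": {},
"outputs": [],
"source": [
"# hypothetical reconstruction of the RAG setup, not the actual pipeline source\n",
"def split_into_chunks_sketch(text, chunk_size=1000, chunk_overlap=100):\n",
"    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
"    return splitter.split_text(text)\n",
"\n",
"def instantiate_qa_chain_sketch(knowledge_chunks, llm):\n",
"    # embed the chunks locally with a sentence-transformers model\n",
"    embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"    vectorstore = Chroma.from_texts(\n",
"        texts=knowledge_chunks,\n",
"        embedding=embeddings,\n",
"        client_settings=Settings(anonymized_telemetry=False),  # disable chromadb telemetry\n",
"    )\n",
"    # community RAG prompt from the LangChain hub\n",
"    prompt = hub.pull(\"rlm/rag-prompt\")\n",
"    return RetrievalQA.from_chain_type(\n",
"        llm=llm,\n",
"        retriever=vectorstore.as_retriever(),\n",
"        chain_type_kwargs={\"prompt\": prompt},\n",
"    )"
]
},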
{
"cell_type": "code",
"execution_count": 19,
"id": "859fd0d0-e938-4082-b12b-4fb7f87000e7",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "b75a40f4-a406-4a78-b87c-8002e95bd135",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/uad/crawler/video-summarizer/venv/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The method `Chain.__call__` was deprecated in langchain 0.1.0 and will be removed in 0.3.0. Use invoke instead.\n",
" warn_deprecated(\n",
"[proxychains] Dynamic chain ... 192.168.1.212:8080 ... aias-rdm-openai-2.openai.azure.com:443 ... OK\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Marker appears to be quite accurate based on the context provided. It successfully preserved the structure of a book when converting it, unlike the program Nougat. It also has the ability to remove headers, footers, and other artifacts, format tables and code blocks, extract images, and convert equations to latex. However, it may require additional post-processing steps to ensure absolute accuracy, especially with images and tables.\n"
]
}
],
"source": [
"query = \"How accurate is Marker?\"\n",
"result = qa_chain({\"query\": query})\n",
"print(result[\"result\"])"
]
},
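{
"cell_type": "markdown",
"id": "3f2a9c10-5b7d-4e8a-9f01-aa11bb22cc07",
"metadata": {},
"source": [
"The `LangChainDeprecationWarning` above notes that `Chain.__call__` will be removed in langchain 0.3.0; the supported equivalent is `invoke`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f2a9c10-5b7d-4e8a-9f01-aa11bb22cc08",
"metadata": {},
"outputs": [],
"source": [
"# same query through the non-deprecated invoke API\n",
"result = qa_chain.invoke({\"query\": \"How accurate is Marker?\"})\n",
"print(result[\"result\"])"
]
},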
{
"cell_type": "code",
"execution_count": null,
"id": "061c07c0-406c-433b-a184-e70ab4d72107",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"query = \"How accurate is Marker?\"\n",
"response = qa_chain({\"query\": query})"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "e4c716da-54c7-4447-93ad-990798a7fe75",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The Marker is described as being quite accurate. It is able to preserve the structure of documents, including tables of content, headers, footers, and images. It is noted to be more accurate than Nougat, and it can also convert most equations to latex. However, some post-processing might be required to ensure the accuracy of images and tables it extracts.\n"
]
}
],
"source": [
"print(response[\"result\"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "vidsumm",
"language": "python",
"name": "vidsumm"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}