Files
nlm-ingestor-llmsherpa/notebooks/test_llmsherpa_api.ipynb
2024-01-26 16:14:34 -05:00

347 lines
12 KiB
Plaintext
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "81bab6ee-23ad-48bc-a363-1ad7d5bef433",
"metadata": {},
"source": [
"This notebook helps test a local or pip-installed copy of llmsherpa with the ingestor core code."
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "903bcef2-c45f-44a3-8f57-d5114d4b45c0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: llmsherpa in /Users/ambikasukla/projects/venvs/ingestor-test/lib/python3.11/site-packages (0.1.3)\n",
"Requirement already satisfied: urllib3 in /Users/ambikasukla/projects/venvs/ingestor-test/lib/python3.11/site-packages (from llmsherpa) (1.26.18)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"# Use the %pip magic (not !pip) so the package is installed into the running kernel's environment.\n",
"%pip install llmsherpa"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "ad949323-cf42-498c-8199-690e77137548",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_7973/2245081773.py:6: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
" from IPython.core.display import display, HTML\n"
]
}
],
"source": [
"import os, sys\n",
"\n",
"# Put the local llmsherpa checkout first on sys.path so it is found ahead of any pip-installed copy.\n",
"# NOTE(review): machine-specific absolute path - adjust to your own checkout before running.\n",
"directory_path = \"/Users/ambikasukla/projects/llmsherpa\"\n",
"sys.path.insert(0, directory_path)\n",
"%load_ext autoreload\n",
"from llmsherpa.readers import LayoutPDFReader\n",
"# Importing these names from IPython.core.display is deprecated since IPython 7.14; use IPython.display.\n",
"from IPython.display import display, HTML\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "d765b72f-5d58-4343-9f48-432acb31b7d6",
"metadata": {},
"outputs": [],
"source": [
"# Parser endpoint. The hosted URL is kept for reference; the active one points at a server on localhost:5001.\n",
"# llmsherpa_api_url = \"https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all\"\n",
"llmsherpa_api_url = \"http://localhost:5001/api/parseDocument?renderFormat=all&useNewIndentParser=true\"\n",
"\n",
"# Input document: keep exactly one pdf_url uncommented.\n",
"# A URL or a local file path (e.g. /home/downloads/xyz.pdf) is allowed.\n",
"# pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\"\n",
"# pdf_url = \"https://www.apache.org/licenses/LICENSE-2.0.txt\"\n",
"# pdf_url = \"https://microsoft.gcs-web.com/static-files/931d7780-ccfc-47e3-97ad-09d87e12b795\"\n",
"# pdf_url = \"https://podcasts.ceu.edu/sites/podcasts.ceu.edu/files/sample.doc\"\n",
"# pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n",
"# pdf_url = \"https://en.wikipedia.org/wiki/Language_model\"\n",
"# pdf_url = \"/Users/ambikasukla/Downloads/test.pdf\"\n",
"pdf_url = \"https://github.com/nlmatics/nlm-ingestor\"\n",
"# pdf_url = \"https://raw.githubusercontent.com/nlmatics/llmsherpa/main/README.md\"\n",
"# pdf_url = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=35362092&retmode=xml\"\n",
"# pdf_url = \"https://solutions.weblite.ca/pdfocrx/scansmpl.pdf\"\n",
"\n",
"# When do_ocr is set, the applyOcr=yes query parameter is appended to the API URL.\n",
"do_ocr = True\n",
"if do_ocr:\n",
"    llmsherpa_api_url = llmsherpa_api_url + \"&applyOcr=yes\"\n",
"\n",
"# Send the document to the configured endpoint; the parsed result is kept in doc.\n",
"pdf_reader = LayoutPDFReader(llmsherpa_api_url)\n",
"doc = pdf_reader.read_pdf(pdf_url)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "274fc39e-a574-4312-9d44-53b7758fa961",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<html><h1>Search code, repositories, users, issues, pull requests...</h1><p>Clear</p><h1>\n",
" Provide feedback\n",
" </h1><p>We read every piece of feedback, and take your input very seriously.</p><h1>\n",
" Saved searches\n",
" </h1><h2>Use saved searches to filter your results more quickly</h2><p> \n",
" \n",
" \n",
" To see all available qualifiers, see our documentation.\n",
" \n",
" \n",
" \n",
" </p><p>You signed in with another tab or window.\n",
"Reload to refresh your session.</p><p>You signed out in another tab or window.\n",
"Reload to refresh your session.</p><p>You switched accounts on another tab or window.\n",
"Reload to refresh your session.</p><p> \n",
" \n",
" \n",
" nlmatics</p><p>/</p><p>Public<ul><li>Notifications</li><li>Fork\n",
"20</li><li> \n",
" \n",
" \n",
" Star\n",
"542</li></ul></p><p> \n",
" \n",
" \n",
" This repo provides the server side code for llmsherpa API to connect.\n",
"It includes parsers for various file formats.\n",
" \n",
" \n",
" </p><p>www.nlmatics.com</p><h3>License</h3><p>542</p><p>20</p><p>Branches</p><p>Tags</p><p>Activity</p><p> \n",
" \n",
" \n",
" Star<ul><li>Code</li><li>Issues\n",
"7</li><li>Pull requests\n",
"0</li><li>Actions</li><li>Projects\n",
"0</li><li>Security</li><li>Insights</li><li> \n",
" \n",
" \n",
" Code</li><li> \n",
" \n",
" \n",
" Issues</li><li> \n",
" \n",
" \n",
" Pull requests</li><li> \n",
" \n",
" \n",
" Actions</li><li> \n",
" \n",
" \n",
" Projects</li><li> \n",
" \n",
" \n",
" Security</li><li> \n",
" \n",
" \n",
" Insights</li></ul></p><h2>Use saved searches to filter your results more quickly</h2><p> \n",
" \n",
" \n",
" To see all available qualifiers, see our documentation.\n",
" \n",
" \n",
" \n",
" </p><p>You signed in with another tab or window.\n",
"Reload to refresh your session.</p><p>You signed out in another tab or window.\n",
"Reload to refresh your session.</p><p>You switched accounts on another tab or window.\n",
"Reload to refresh your session.</p><p> \n",
" \n",
" \n",
" nlmatics</p><p>/</p><p>Public<ul><li>Notifications</li><li>Fork\n",
"20</li><li> \n",
" \n",
" \n",
" Star\n",
"542</li></ul></p><p> \n",
" \n",
" \n",
" This repo provides the server side code for llmsherpa API to connect.\n",
"It includes parsers for various file formats.\n",
" \n",
" \n",
" </p><p>www.nlmatics.com</p><h3>License</h3><p>542</p><p>20</p><p>Branches</p><p>Tags</p><p>Activity</p><p> \n",
" \n",
" \n",
" Star<ul><li>Code</li><li>Issues\n",
"7</li><li>Pull requests\n",
"0</li><li>Actions</li><li>Projects\n",
"0</li><li>Security</li><li>Insights</li><li> \n",
" \n",
" \n",
" Code</li><li> \n",
" \n",
" \n",
" Issues</li><li> \n",
" \n",
" \n",
" Pull requests</li><li> \n",
" \n",
" \n",
" Actions</li><li> \n",
" \n",
" \n",
" Projects</li><li> \n",
" \n",
" \n",
" Security</li><li> \n",
" \n",
" \n",
" Insights</li></ul></p><h3>License</h3><p>542</p><p>20</p><p>Branches</p><p>Tags</p><p>Activity</p><p> \n",
" \n",
" \n",
" Star<ul><li>Code</li><li>Issues\n",
"7</li><li>Pull requests\n",
"0</li><li>Actions</li><li>Projects\n",
"0</li><li>Security</li><li>Insights</li><li> \n",
" \n",
" \n",
" Code</li><li> \n",
" \n",
" \n",
" Issues</li><li> \n",
" \n",
" \n",
" Pull requests</li><li> \n",
" \n",
" \n",
" Actions</li><li> \n",
" \n",
" \n",
" Projects</li><li> \n",
" \n",
" \n",
" Security</li><li> \n",
" \n",
" \n",
" Insights</li></ul></p><h1>nlmatics/nlm-ingestor</h1><h2>About</h2><p> \n",
" \n",
" \n",
" This repo provides the server side code for llmsherpa API to connect.\n",
"It includes parsers for various file formats.\n",
" \n",
" \n",
" </p><p>www.nlmatics.com</p><h3>Resources</h3><h3>License</h3><p>Activity</p><p>Custom properties</p><h3>Stars</h3><h3>Watchers</h3><h3>Forks</h3><h2>\n",
"\n",
" Releases\n",
" 6\n",
"</h2><p>First production ready release with Apache 2.0 license</p><p> \n",
" \n",
" \n",
" Latest</p><h2>\n",
"\n",
" Packages\n",
" 1\n",
"</h2><h2>Languages</h2><li>Python\n",
"50.2%</li><li>Jupyter Notebook\n",
"49.7%</li><li>Other\n",
"0.1%</li><h2>Footer</h2><p> \n",
" \n",
" \n",
" © 2024 GitHub, Inc.\n",
" \n",
" \n",
" </p><h3>Footer navigation</h3><li>Terms</li><li>Privacy</li><li>Security</li><li>Status</li><li>Docs</li><li>Contact</li><li> \n",
" \n",
" Manage cookies</li><li> \n",
" \n",
" Do not share my personal information</li><h2>About</h2><p> \n",
" \n",
" \n",
" This repo provides the server side code for llmsherpa API to connect.\n",
"It includes parsers for various file formats.\n",
" \n",
" \n",
" </p><p>www.nlmatics.com</p><h3>Resources</h3><h3>License</h3><p>Activity</p><p>Custom properties</p><h3>Stars</h3><h3>Watchers</h3><h3>Forks</h3><h3>Resources</h3><h3>License</h3><p>Activity</p><p>Custom properties</p><h3>Stars</h3><h3>Watchers</h3><h3>Forks</h3><h2>\n",
"\n",
" Releases\n",
" 6\n",
"</h2><p>First production ready release with Apache 2.0 license</p><p> \n",
" \n",
" \n",
" Latest</p><h2>\n",
"\n",
" Packages\n",
" 1\n",
"</h2><h2>Languages</h2><li>Python\n",
"50.2%</li><li>Jupyter Notebook\n",
"49.7%</li><li>Other\n",
"0.1%</li><h2>Footer</h2><p> \n",
" \n",
" \n",
" © 2024 GitHub, Inc.\n",
" \n",
" \n",
" </p><h3>Footer navigation</h3><li>Terms</li><li>Privacy</li><li>Security</li><li>Status</li><li>Docs</li><li>Contact</li><li> \n",
" \n",
" Manage cookies</li><li> \n",
" \n",
" Do not share my personal information</li><h3>Footer navigation</h3><li>Terms</li><li>Privacy</li><li>Security</li><li>Status</li><li>Docs</li><li>Contact</li><li> \n",
" \n",
" Manage cookies</li><li> \n",
" \n",
" Do not share my personal information</li></html>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Render the whole parsed document as HTML; the HTML object is the cell's displayed result.\n",
"doc_html = doc.to_html()\n",
"HTML(doc_html)\n",
"\n",
"# Other views kept for ad-hoc inspection (uncomment one and make it the last expression):\n",
"# HTML(doc.sections()[0].to_html(include_children=True, recurse=True))\n",
"# doc.sections()[1].block_json\n",
"# doc.sections()[0].to_text()\n",
"# doc.sections()[1].bbox\n",
"# llmsherpa.readers.Layout\n",
"# print(doc.to_html())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}