mirror of
https://github.com/nlmatics/nlm-ingestor.git
synced 2024-08-02 20:58:47 +03:00
347 lines
12 KiB
Plaintext
347 lines
12 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "81bab6ee-23ad-48bc-a363-1ad7d5bef433",
|
||
"metadata": {},
|
||
"source": [
|
||
"This notebook helps test a local or pip-installed copy of llmsherpa against the ingestor core code."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "903bcef2-c45f-44a3-8f57-d5114d4b45c0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Requirement already satisfied: llmsherpa in /Users/ambikasukla/projects/venvs/ingestor-test/lib/python3.11/site-packages (0.1.3)\n",
|
||
"Requirement already satisfied: urllib3 in /Users/ambikasukla/projects/venvs/ingestor-test/lib/python3.11/site-packages (from llmsherpa) (1.26.18)\n",
|
||
"\n",
|
||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n",
|
||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Use the %pip magic so the package is installed into the environment of the running kernel.\n",
|
||
"%pip install llmsherpa"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "ad949323-cf42-498c-8199-690e77137548",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"The autoreload extension is already loaded. To reload it, use:\n",
|
||
" %reload_ext autoreload\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_7973/2245081773.py:6: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
|
||
" from IPython.core.display import display, HTML\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import os, sys\n",
|
||
"# Path to a local llmsherpa checkout; override it with the LLMSHERPA_PATH environment variable.\n",
|
||
"directory_path = os.environ.get(\"LLMSHERPA_PATH\", \"/Users/ambikasukla/projects/llmsherpa\")\n",
|
||
"# Put the local checkout first on sys.path so it is found before any pip-installed copy.\n",
|
||
"sys.path.insert(0, directory_path)\n",
|
||
"%load_ext autoreload\n",
|
||
"from llmsherpa.readers import LayoutPDFReader\n",
|
||
"# Importing these names from IPython.core.display is deprecated since IPython 7.14; use IPython.display.\n",
|
||
"from IPython.display import display, HTML\n",
|
||
"%autoreload 2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"id": "d765b72f-5d58-4343-9f48-432acb31b7d6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# llmsherpa_api_url = \"https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all\"\n",
|
||
"llmsherpa_api_url = \"http://localhost:5001/api/parseDocument?renderFormat=all&useNewIndentParser=true\"\n",
|
||
"# Candidate inputs: keep exactly one pdf_url assignment uncommented.\n",
|
||
"# pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n",
|
||
"# pdf_url = \"https://www.apache.org/licenses/LICENSE-2.0.txt\"\n",
|
||
"# pdf_url = \"https://microsoft.gcs-web.com/static-files/931d7780-ccfc-47e3-97ad-09d87e12b795\"\n",
|
||
"# pdf_url = \"https://podcasts.ceu.edu/sites/podcasts.ceu.edu/files/sample.doc\"\n",
|
||
"# pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n",
|
||
"# pdf_url = \"https://en.wikipedia.org/wiki/Language_model\"\n",
|
||
"# pdf_url = \"/Users/ambikasukla/Downloads/test.pdf\"\n",
|
||
"pdf_url = \"https://github.com/nlmatics/nlm-ingestor\"\n",
|
||
"# pdf_url = \"https://raw.githubusercontent.com/nlmatics/llmsherpa/main/README.md\"\n",
|
||
"# pdf_url = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=35362092&retmode=xml\"\n",
|
||
"# pdf_url = \"https://solutions.weblite.ca/pdfocrx/scansmpl.pdf\"\n",
|
||
"# When do_ocr is set, the applyOcr flag is appended to the request URL.\n",
|
||
"do_ocr = True\n",
|
||
"if do_ocr:\n",
|
||
"    llmsherpa_api_url = llmsherpa_api_url + \"&applyOcr=yes\"\n",
|
||
"pdf_reader = LayoutPDFReader(llmsherpa_api_url)\n",
|
||
"doc = pdf_reader.read_pdf(pdf_url)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"id": "274fc39e-a574-4312-9d44-53b7758fa961",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<html><h1>Search code, repositories, users, issues, pull requests...</h1><p>Clear</p><h1>\n",
|
||
" Provide feedback\n",
|
||
" </h1><p>We read every piece of feedback, and take your input very seriously.</p><h1>\n",
|
||
" Saved searches\n",
|
||
" </h1><h2>Use saved searches to filter your results more quickly</h2><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" To see all available qualifiers, see our documentation.\n",
|
||
" \n",
|
||
" \n",
|
||
" \n",
|
||
" </p><p>You signed in with another tab or window.\n",
|
||
"Reload to refresh your session.</p><p>You signed out in another tab or window.\n",
|
||
"Reload to refresh your session.</p><p>You switched accounts on another tab or window.\n",
|
||
"Reload to refresh your session.</p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" nlmatics</p><p>/</p><p>Public<ul><li>Notifications</li><li>Fork\n",
|
||
"20</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Star\n",
|
||
"542</li></ul></p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" This repo provides the server side code for llmsherpa API to connect.\n",
|
||
"It includes parsers for various file formats.\n",
|
||
" \n",
|
||
" \n",
|
||
" </p><p>www.nlmatics.com</p><h3>License</h3><p>542</p><p>20</p><p>Branches</p><p>Tags</p><p>Activity</p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" Star<ul><li>Code</li><li>Issues\n",
|
||
"7</li><li>Pull requests\n",
|
||
"0</li><li>Actions</li><li>Projects\n",
|
||
"0</li><li>Security</li><li>Insights</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Code</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Issues</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Pull requests</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Actions</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Projects</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Security</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Insights</li></ul></p><h2>Use saved searches to filter your results more quickly</h2><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" To see all available qualifiers, see our documentation.\n",
|
||
" \n",
|
||
" \n",
|
||
" \n",
|
||
" </p><p>You signed in with another tab or window.\n",
|
||
"Reload to refresh your session.</p><p>You signed out in another tab or window.\n",
|
||
"Reload to refresh your session.</p><p>You switched accounts on another tab or window.\n",
|
||
"Reload to refresh your session.</p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" nlmatics</p><p>/</p><p>Public<ul><li>Notifications</li><li>Fork\n",
|
||
"20</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Star\n",
|
||
"542</li></ul></p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" This repo provides the server side code for llmsherpa API to connect.\n",
|
||
"It includes parsers for various file formats.\n",
|
||
" \n",
|
||
" \n",
|
||
" </p><p>www.nlmatics.com</p><h3>License</h3><p>542</p><p>20</p><p>Branches</p><p>Tags</p><p>Activity</p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" Star<ul><li>Code</li><li>Issues\n",
|
||
"7</li><li>Pull requests\n",
|
||
"0</li><li>Actions</li><li>Projects\n",
|
||
"0</li><li>Security</li><li>Insights</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Code</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Issues</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Pull requests</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Actions</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Projects</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Security</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Insights</li></ul></p><h3>License</h3><p>542</p><p>20</p><p>Branches</p><p>Tags</p><p>Activity</p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" Star<ul><li>Code</li><li>Issues\n",
|
||
"7</li><li>Pull requests\n",
|
||
"0</li><li>Actions</li><li>Projects\n",
|
||
"0</li><li>Security</li><li>Insights</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Code</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Issues</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Pull requests</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Actions</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Projects</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Security</li><li> \n",
|
||
" \n",
|
||
" \n",
|
||
" Insights</li></ul></p><h1>nlmatics/nlm-ingestor</h1><h2>About</h2><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" This repo provides the server side code for llmsherpa API to connect.\n",
|
||
"It includes parsers for various file formats.\n",
|
||
" \n",
|
||
" \n",
|
||
" </p><p>www.nlmatics.com</p><h3>Resources</h3><h3>License</h3><p>Activity</p><p>Custom properties</p><h3>Stars</h3><h3>Watchers</h3><h3>Forks</h3><h2>\n",
|
||
"\n",
|
||
" Releases\n",
|
||
" 6\n",
|
||
"</h2><p>First production ready release with Apache 2.0 license</p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" Latest</p><h2>\n",
|
||
"\n",
|
||
" Packages\n",
|
||
" 1\n",
|
||
"</h2><h2>Languages</h2><li>Python\n",
|
||
"50.2%</li><li>Jupyter Notebook\n",
|
||
"49.7%</li><li>Other\n",
|
||
"0.1%</li><h2>Footer</h2><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" © 2024 GitHub, Inc.\n",
|
||
" \n",
|
||
" \n",
|
||
" </p><h3>Footer navigation</h3><li>Terms</li><li>Privacy</li><li>Security</li><li>Status</li><li>Docs</li><li>Contact</li><li> \n",
|
||
" \n",
|
||
" Manage cookies</li><li> \n",
|
||
" \n",
|
||
" Do not share my personal information</li><h2>About</h2><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" This repo provides the server side code for llmsherpa API to connect.\n",
|
||
"It includes parsers for various file formats.\n",
|
||
" \n",
|
||
" \n",
|
||
" </p><p>www.nlmatics.com</p><h3>Resources</h3><h3>License</h3><p>Activity</p><p>Custom properties</p><h3>Stars</h3><h3>Watchers</h3><h3>Forks</h3><h3>Resources</h3><h3>License</h3><p>Activity</p><p>Custom properties</p><h3>Stars</h3><h3>Watchers</h3><h3>Forks</h3><h2>\n",
|
||
"\n",
|
||
" Releases\n",
|
||
" 6\n",
|
||
"</h2><p>First production ready release with Apache 2.0 license</p><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" Latest</p><h2>\n",
|
||
"\n",
|
||
" Packages\n",
|
||
" 1\n",
|
||
"</h2><h2>Languages</h2><li>Python\n",
|
||
"50.2%</li><li>Jupyter Notebook\n",
|
||
"49.7%</li><li>Other\n",
|
||
"0.1%</li><h2>Footer</h2><p> \n",
|
||
" \n",
|
||
" \n",
|
||
" © 2024 GitHub, Inc.\n",
|
||
" \n",
|
||
" \n",
|
||
" </p><h3>Footer navigation</h3><li>Terms</li><li>Privacy</li><li>Security</li><li>Status</li><li>Docs</li><li>Contact</li><li> \n",
|
||
" \n",
|
||
" Manage cookies</li><li> \n",
|
||
" \n",
|
||
" Do not share my personal information</li><h3>Footer navigation</h3><li>Terms</li><li>Privacy</li><li>Security</li><li>Status</li><li>Docs</li><li>Contact</li><li> \n",
|
||
" \n",
|
||
" Manage cookies</li><li> \n",
|
||
" \n",
|
||
" Do not share my personal information</li></html>"
|
||
],
|
||
"text/plain": [
|
||
"<IPython.core.display.HTML object>"
|
||
]
|
||
},
|
||
"execution_count": 39,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# HTML(doc.sections()[0].to_html(include_children=True, recurse=True))\n",
|
||
"# doc.sections()[1].block_json\n",
|
||
"# doc.sections()[0].to_text()\n",
|
||
"# doc.sections()[1].bbox\n",
|
||
"# llmsherpa.readers.Layout\n",
|
||
"HTML(doc.to_html())\n",
|
||
"# print(doc.to_html())"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|