added bbox, fixed imports and bumped version

This commit is contained in:
Ambika Sukla
2024-01-23 17:55:27 -05:00
parent 62ff19008e
commit e59831bac5
9 changed files with 53 additions and 26 deletions

View File

@@ -8,6 +8,7 @@ The key maintainers of this codebase are:
- For small changes or bug fixes, go ahead and create a PR
- For large changes, create an issue on GitHub with the proposal and @-mention one of the key maintainers for discussion before working on it
- Run existing test cases and add test cases for changes to line_parser
- Note that the ingestor tests are not yet runnable and have database dependencies (work in progress), so the likelihood of breaking things is very high
## Contribution areas
- Make the new_indent_parser more accurate

View File

@@ -64,7 +64,7 @@ def parse_document(
return make_response(jsonify({"status": status, "reason": msg}), rc)
def main():
logger.info("Starting parser service..")
logger.info("Starting ingestor service..")
app.run(host="0.0.0.0", port=5001, debug=False)

View File

@@ -255,6 +255,12 @@ class BlockRenderer:
"page_idx": block["page_idx"],
"block_class": block["block_class"],
"sentences": [block_text],
"bbox": [
block["box_style"][1],
block["box_style"][0],
block["box_style"][1] + block["box_style"][3],
block["box_style"][0] + block["box_style"][4],
]
}
elif block_type == "list_item" and not is_rendering_table:
block_dict = self.render_nested_block_as_dict(block, "list_item")
@@ -266,6 +272,12 @@ class BlockRenderer:
"page_idx": block["page_idx"],
"block_class": block["block_class"],
"sentences": [block_text],
"bbox": [
block["box_style"][1],
block["box_style"][0],
block["box_style"][1] + block["box_style"][3],
block["box_style"][0] + block["box_style"][4],
]
}
if block_dict:
@@ -329,7 +341,14 @@ class BlockRenderer:
if 'is_table_end' in block:
is_rendering_table = False
render_dict["blocks"][-1]["table_rows"] = table_rows
table_block = render_dict["blocks"][-1]
table_block["table_rows"] = table_rows
table_block["bbox"] = [
table_block["left"],
table_block["top"],
table_block["left"] + block["box_style"][3],
table_block["top"] + block["box_style"][4],
]
table_rows = []
return render_dict
@@ -349,6 +368,12 @@ class BlockRenderer:
"block_class": block["block_class"],
"sentences": [sent for sent in block["block_sents"]],
"block_idx": block["block_idx"],
"bbox": [
block["box_style"][1],
block["box_style"][0],
block["box_style"][1] + block["box_style"][3],
block["box_style"][0] + block["box_style"][4],
]
}
return block_dict

View File

@@ -131,7 +131,7 @@ def format_to_tr_block(prev_block, gap_threshold):
child_x1 = child['box_style'][1]
if gap_threshold <= round(child_x1 - prev_child_x2):
new_child_block = ingestor.visual_ingestor.Doc.merge_vls(block_buff)
new_child_block = nlm_ingestor.ingestor.visual_ingestor.Doc.merge_vls(block_buff)
new_visual_lines.append(new_child_block)
new_block_children.append({"text": block_text,
"centroid": get_centroid(block_buff[0]['box_style'][1],
@@ -152,7 +152,7 @@ def format_to_tr_block(prev_block, gap_threshold):
block_buff[-1]['box_style'][2]),
"span": (prev_child['box_style'][1], child['box_style'][2])
})
new_child_block = ingestor.visual_ingestor.Doc.merge_vls(block_buff)
new_child_block = nlm_ingestor.ingestor.visual_ingestor.Doc.merge_vls(block_buff)
new_visual_lines.append(new_child_block)
return new_block_children, new_visual_lines

View File

@@ -40,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 61,
"id": "d765b72f-5d58-4343-9f48-432acb31b7d6",
"metadata": {},
"outputs": [
@@ -48,7 +48,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/2792954116.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
"/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/3984827310.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
" from IPython.core.display import display, HTML\n"
]
}
@@ -58,11 +58,11 @@
"from IPython.core.display import display, HTML\n",
"# llmsherpa_api_url = \"https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all\"\n",
"llmsherpa_api_url = \"http://localhost:5001/api/parseDocument?renderFormat=all\"\n",
"# pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n",
"pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n",
"# pdf_url = \"https://www.apache.org/licenses/LICENSE-2.0.txt\"\n",
"# pdf_url = \"https://microsoft.gcs-web.com/static-files/931d7780-ccfc-47e3-97ad-09d87e12b795\"\n",
"# pdf_url = \"https://podcasts.ceu.edu/sites/podcasts.ceu.edu/files/sample.doc\"\n",
"pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n",
"# pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n",
"# pdf_url = \"https://en.wikipedia.org/wiki/Language_model\"\n",
"# pdf_url = \"https://raw.githubusercontent.com/nlmatics/llmsherpa/main/README.md\"\n",
"# pdf_url = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=35362092&retmode=xml\"\n",
@@ -72,29 +72,30 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 74,
"id": "274fc39e-a574-4312-9d44-53b7758fa961",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<h1>NLMatics Kubernetes Cluster</h1><h2>1. Connect to the cluster.</h2><p>e.g.\n",
"to connect to LLMSherpa AKS.</p><p>2. List the pods</p><p>3. Log tailing for a specific pod</p><h2>4. Certificate Renewal</h2><p>· Delete the certificate.</p><p>· Delete the secret associated with the certificate.</p><p>· Restart the ingress-nginx-controller.</p><p>$ kubectl delete certificate tls-secret; kubectl delete secret tls-secret</p><p>Find the ingress-nginx-controller deployment file.</p><p>Edit the deployment file.</p><p>Modify the “replicas” to “0”.\n",
"Save and quit.</p><p>After 5 seconds, re-edit the file and modify the “replicas” back to 1.\n",
"Wait for ~30 seconds and the new certificate will be applied.</p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
"{'bbox': [179.52, 165.33, 421.01, 177.29000000000002],\n",
" 'block_class': 'cls_5',\n",
" 'block_idx': 2,\n",
" 'level': 1,\n",
" 'page_idx': 0,\n",
" 'sentences': ['{mikelewis,yinhanliu,naman}@fb.com'],\n",
" 'tag': 'header'}"
]
},
"execution_count": 28,
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HTML(doc.sections()[0].to_html(include_children=True, recurse=True))"
"# HTML(doc.sections()[0].to_html(include_children=True, recurse=True))\n",
"doc.sections()[1].block_json"
]
}
],

View File

@@ -1,7 +1,7 @@
from setuptools import setup, find_packages
setup(
name='nlm-ingestor',
version='0.1.1',
version='0.1.2',
description='Parsers and ingestors for different file types and formats',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
@@ -11,7 +11,7 @@ setup(
license='Apache License 2.0',
packages=find_packages(),
include_package_data=True,
package_data={'': ['ingestor_utils/*.txt']},
package_data={'': ['ingestor_utils/*.txt', ]},
install_requires=[
"flask",
"flask_restful",

View File

@@ -9,8 +9,8 @@ from nlm_utils.storage import file_storage
from pymongo import MongoClient
from tika import parser
from ingestor import table_parser
from ingestor import visual_ingestor
from nlm_ingestor.ingestor import table_parser
from nlm_ingestor.ingestor import visual_ingestor
db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]

View File

@@ -9,8 +9,8 @@ from nlm_utils.storage import file_storage
from pymongo import MongoClient
from tika import parser
from ingestor import table_parser
from ingestor import visual_ingestor
from nlm_ingestor.ingestor import table_parser
from nlm_ingestor.ingestor import visual_ingestor
db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]

View File

@@ -10,8 +10,8 @@ from nlm_utils.storage import file_storage
from pymongo import MongoClient
from tika import parser
from ingestor import table_parser
from ingestor import visual_ingestor
from nlm_ingestor.ingestor import table_parser
from nlm_ingestor.ingestor import visual_ingestor
db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]