mirror of
https://github.com/nlmatics/nlm-ingestor.git
synced 2024-08-02 20:58:47 +03:00
added bbox, fixed imports and bumped version
This commit is contained in:
@@ -8,6 +8,7 @@ The key maintainers of this codebase are:
|
||||
- For small changes or bug fixes, go ahead and create a PR
|
||||
- For large changes, create an issue on GitHub with the proposal and @-mention one of the key maintainers for discussion before working on it
|
||||
- Run existing test cases and add test cases for changes to line_parser
|
||||
- Note that the ingestor tests are not yet runnable and have database dependencies (work in progress), so the likelihood of breaking things is very high
|
||||
|
||||
## Contribution areas
|
||||
- Make the new_indent_parser more accurate
|
||||
|
||||
@@ -64,7 +64,7 @@ def parse_document(
|
||||
return make_response(jsonify({"status": status, "reason": msg}), rc)
|
||||
|
||||
def main():
|
||||
logger.info("Starting parser service..")
|
||||
logger.info("Starting ingestor service..")
|
||||
app.run(host="0.0.0.0", port=5001, debug=False)
|
||||
|
||||
|
||||
|
||||
@@ -255,6 +255,12 @@ class BlockRenderer:
|
||||
"page_idx": block["page_idx"],
|
||||
"block_class": block["block_class"],
|
||||
"sentences": [block_text],
|
||||
"bbox": [
|
||||
block["box_style"][1],
|
||||
block["box_style"][0],
|
||||
block["box_style"][1] + block["box_style"][3],
|
||||
block["box_style"][0] + block["box_style"][4],
|
||||
]
|
||||
}
|
||||
elif block_type == "list_item" and not is_rendering_table:
|
||||
block_dict = self.render_nested_block_as_dict(block, "list_item")
|
||||
@@ -266,6 +272,12 @@ class BlockRenderer:
|
||||
"page_idx": block["page_idx"],
|
||||
"block_class": block["block_class"],
|
||||
"sentences": [block_text],
|
||||
"bbox": [
|
||||
block["box_style"][1],
|
||||
block["box_style"][0],
|
||||
block["box_style"][1] + block["box_style"][3],
|
||||
block["box_style"][0] + block["box_style"][4],
|
||||
]
|
||||
}
|
||||
|
||||
if block_dict:
|
||||
@@ -329,7 +341,14 @@ class BlockRenderer:
|
||||
|
||||
if 'is_table_end' in block:
|
||||
is_rendering_table = False
|
||||
render_dict["blocks"][-1]["table_rows"] = table_rows
|
||||
table_block = render_dict["blocks"][-1]
|
||||
table_block["table_rows"] = table_rows
|
||||
table_block["bbox"] = [
|
||||
table_block["left"],
|
||||
table_block["top"],
|
||||
table_block["left"] + block["box_style"][3],
|
||||
table_block["top"] + block["box_style"][4],
|
||||
]
|
||||
table_rows = []
|
||||
|
||||
return render_dict
|
||||
@@ -349,6 +368,12 @@ class BlockRenderer:
|
||||
"block_class": block["block_class"],
|
||||
"sentences": [sent for sent in block["block_sents"]],
|
||||
"block_idx": block["block_idx"],
|
||||
"bbox": [
|
||||
block["box_style"][1],
|
||||
block["box_style"][0],
|
||||
block["box_style"][1] + block["box_style"][3],
|
||||
block["box_style"][0] + block["box_style"][4],
|
||||
]
|
||||
}
|
||||
return block_dict
|
||||
|
||||
|
||||
@@ -131,7 +131,7 @@ def format_to_tr_block(prev_block, gap_threshold):
|
||||
child_x1 = child['box_style'][1]
|
||||
|
||||
if gap_threshold <= round(child_x1 - prev_child_x2):
|
||||
new_child_block = ingestor.visual_ingestor.Doc.merge_vls(block_buff)
|
||||
new_child_block = nlm_ingestor.ingestor.visual_ingestor.Doc.merge_vls(block_buff)
|
||||
new_visual_lines.append(new_child_block)
|
||||
new_block_children.append({"text": block_text,
|
||||
"centroid": get_centroid(block_buff[0]['box_style'][1],
|
||||
@@ -152,7 +152,7 @@ def format_to_tr_block(prev_block, gap_threshold):
|
||||
block_buff[-1]['box_style'][2]),
|
||||
"span": (prev_child['box_style'][1], child['box_style'][2])
|
||||
})
|
||||
new_child_block = ingestor.visual_ingestor.Doc.merge_vls(block_buff)
|
||||
new_child_block = nlm_ingestor.ingestor.visual_ingestor.Doc.merge_vls(block_buff)
|
||||
new_visual_lines.append(new_child_block)
|
||||
return new_block_children, new_visual_lines
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 61,
|
||||
"id": "d765b72f-5d58-4343-9f48-432acb31b7d6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -48,7 +48,7 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/2792954116.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
|
||||
"/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/3984827310.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
|
||||
" from IPython.core.display import display, HTML\n"
|
||||
]
|
||||
}
|
||||
@@ -58,11 +58,11 @@
|
||||
"from IPython.core.display import display, HTML\n",
|
||||
"# llmsherpa_api_url = \"https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all\"\n",
|
||||
"llmsherpa_api_url = \"http://localhost:5001/api/parseDocument?renderFormat=all\"\n",
|
||||
"# pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n",
|
||||
"pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n",
|
||||
"# pdf_url = \"https://www.apache.org/licenses/LICENSE-2.0.txt\"\n",
|
||||
"# pdf_url = \"https://microsoft.gcs-web.com/static-files/931d7780-ccfc-47e3-97ad-09d87e12b795\"\n",
|
||||
"# pdf_url = \"https://podcasts.ceu.edu/sites/podcasts.ceu.edu/files/sample.doc\"\n",
|
||||
"pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n",
|
||||
"# pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n",
|
||||
"# pdf_url = \"https://en.wikipedia.org/wiki/Language_model\"\n",
|
||||
"# pdf_url = \"https://raw.githubusercontent.com/nlmatics/llmsherpa/main/README.md\"\n",
|
||||
"# pdf_url = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=35362092&retmode=xml\"\n",
|
||||
@@ -72,29 +72,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"execution_count": 74,
|
||||
"id": "274fc39e-a574-4312-9d44-53b7758fa961",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<h1>NLMatics Kubernetes Cluster</h1><h2>1. Connect to the cluster.</h2><p>e.g.\n",
|
||||
"to connect to LLMSherpa AKS.</p><p>2. List the pods</p><p>3. Log tailing for a specific pod</p><h2>4. Certificate Renewal</h2><p>· Delete the certificate.</p><p>· Delete the secret associated with the certificate.</p><p>· Restart the ingress-nginx-controller.</p><p>$ kubectl delete certificate tls-secret; kubectl delete secret tls-secret</p><p>Find the ingress-nginx-controller deployment file.</p><p>Edit the deployment file.</p><p>Modify the “replicas” to “0”.\n",
|
||||
"Save and quit.</p><p>After 5 seconds, re-edit the file and modify the “replicas” back to 1.\n",
|
||||
"Wait for ~30 seconds and the new certificate will be applied.</p>"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
"{'bbox': [179.52, 165.33, 421.01, 177.29000000000002],\n",
|
||||
" 'block_class': 'cls_5',\n",
|
||||
" 'block_idx': 2,\n",
|
||||
" 'level': 1,\n",
|
||||
" 'page_idx': 0,\n",
|
||||
" 'sentences': ['{mikelewis,yinhanliu,naman}@fb.com'],\n",
|
||||
" 'tag': 'header'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"execution_count": 74,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"HTML(doc.sections()[0].to_html(include_children=True, recurse=True))"
|
||||
"# HTML(doc.sections()[0].to_html(include_children=True, recurse=True))\n",
|
||||
"doc.sections()[1].block_json"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
4
setup.py
4
setup.py
@@ -1,7 +1,7 @@
|
||||
from setuptools import setup, find_packages
|
||||
setup(
|
||||
name='nlm-ingestor',
|
||||
version='0.1.1',
|
||||
version='0.1.2',
|
||||
description='Parsers and ingestors for different file types and formats',
|
||||
long_description=open('README.md').read(),
|
||||
long_description_content_type='text/markdown',
|
||||
@@ -11,7 +11,7 @@ setup(
|
||||
license='Apache License 2.0',
|
||||
packages=find_packages(),
|
||||
include_package_data=True,
|
||||
package_data={'': ['ingestor_utils/*.txt']},
|
||||
package_data={'': ['ingestor_utils/*.txt', ]},
|
||||
install_requires=[
|
||||
"flask",
|
||||
"flask_restful",
|
||||
|
||||
@@ -9,8 +9,8 @@ from nlm_utils.storage import file_storage
|
||||
from pymongo import MongoClient
|
||||
from tika import parser
|
||||
|
||||
from ingestor import table_parser
|
||||
from ingestor import visual_ingestor
|
||||
from nlm_ingestor.ingestor import table_parser
|
||||
from nlm_ingestor.ingestor import visual_ingestor
|
||||
|
||||
db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
|
||||
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]
|
||||
|
||||
@@ -9,8 +9,8 @@ from nlm_utils.storage import file_storage
|
||||
from pymongo import MongoClient
|
||||
from tika import parser
|
||||
|
||||
from ingestor import table_parser
|
||||
from ingestor import visual_ingestor
|
||||
from nlm_ingestor.ingestor import table_parser
|
||||
from nlm_ingestor.ingestor import visual_ingestor
|
||||
|
||||
db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
|
||||
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]
|
||||
|
||||
@@ -10,8 +10,8 @@ from nlm_utils.storage import file_storage
|
||||
from pymongo import MongoClient
|
||||
from tika import parser
|
||||
|
||||
from ingestor import table_parser
|
||||
from ingestor import visual_ingestor
|
||||
from nlm_ingestor.ingestor import table_parser
|
||||
from nlm_ingestor.ingestor import visual_ingestor
|
||||
|
||||
db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
|
||||
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]
|
||||
|
||||
Reference in New Issue
Block a user