Files
2024-03-29 02:01:56 -04:00

143 lines
5.1 KiB
Python

import json
import logging
from collections import namedtuple
from nlm_ingestor.ingestor_utils.utils import NpEncoder
from nlm_ingestor.ingestor_utils import utils
from nlm_ingestor.ingestor.visual_ingestor import block_renderer
from nlm_ingestor.ingestor_utils.ing_named_tuples import LineStyle
from . import processors
class TextIngestor:
    """Ingest a plain-text file into blocks and render it as HTML and JSON.

    After construction the instance exposes:

    * ``blocks`` - the flat block list returned by ``parse_blocks``.
    * ``line_style_classes`` / ``class_levels`` - style tables filled in by
      ``add_styles``.
    * ``return_dict`` - ``page_dim`` and ``num_pages`` plus, for the
      ``"json"`` and ``"all"`` render formats, a ``"result"`` entry holding
      the rendered document.
    * ``html_str`` / ``json_dict`` - output of ``BlockRenderer`` run on this
      instance.
    """

    def __init__(self, doc_location, parse_options):
        """Read ``doc_location``, parse it and render the result.

        Args:
            doc_location: Path of the text file to ingest.
            parse_options: Optional mapping; only ``"render_format"`` is read
                here (defaults to ``"all"`` when the mapping is empty/None or
                the key is missing).
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.INFO)

        render_format = "all"
        if parse_options:
            render_format = parse_options.get("render_format", "all")

        # NOTE: the file is opened with the platform's default text encoding.
        with open(doc_location) as f:
            raw_lines = f.readlines()

        (
            blocks,
            _block_texts,
            _sents,
            _file_data,
            result,
            page_dim,
            num_pages,
        ) = parse_blocks(raw_lines=raw_lines)

        self.blocks = blocks
        self.line_style_classes = {}
        self.class_levels = {}
        self.add_styles()

        return_dict = {
            "page_dim": page_dim,
            "num_pages": num_pages,
        }
        # parse_blocks returns [html_entry, json_entry]; only the second entry
        # carries a "document" key. Reading result[0] for the "json" format
        # therefore always produced an empty dict, so both formats now read
        # the JSON entry. Any other render format leaves "result" unset.
        if render_format in ("json", "all"):
            return_dict["result"] = result[1].get("document", {})
        self.return_dict = return_dict

        # The renderer is handed this instance; presumably it reads blocks,
        # line_style_classes and class_levels - confirm against BlockRenderer.
        br = block_renderer.BlockRenderer(self)
        self.html_str = br.render_html()
        self.json_dict = br.render_json()

    def add_styles(self):
        """Register the title, header and body line styles.

        Each style is stored in ``line_style_classes`` (style -> CSS class
        name) and its class name in ``class_levels`` (class name -> level,
        0 for title, 1 for header, 2 for body).
        """
        font_family = "Roboto, Georgia, serif"
        # (css class, level, 2nd LineStyle field, 3rd field, 4th field).
        # Field meanings follow LineStyle's declaration, which is not visible
        # in this file; the values are passed positionally, unchanged.
        style_specs = (
            ("nlm-text-title", 0, "bold", 14.0, "500"),
            ("nlm-text-header", 1, "normal", 12.0, "600"),
            ("nlm-text-body", 2, "normal", 10.0, "400"),
        )
        for css_class, level, style_word, size, weight in style_specs:
            line_style = LineStyle(
                font_family,
                style_word,
                size,
                weight,
                "left",
                0,  # TODO: Decide what font_space_width needs to be added
                "left",
            )
            self.line_style_classes[line_style] = css_class
            self.class_levels[css_class] = level
def parse_blocks(raw_lines):
    """Turn raw text lines into blocks and rendered HTML/JSON payloads.

    Args:
        raw_lines: Lines of the source text, as returned by ``readlines()``.

    Returns:
        A 7-tuple ``(blocks, block_texts, sents, file_data, result,
        page_dim, num_pages)`` where ``result`` is a two-element list (an
        HTML entry with a ``"text"`` key and a JSON entry with a
        ``"document"`` key), ``file_data`` holds the JSON-serialised form of
        each ``result`` entry, ``page_dim`` is always ``[1, 1]`` and
        ``num_pages`` is always ``0``.
    """
    # The whole input is handled as one page.
    cleaned = processors.clean_lines(raw_lines)
    pages_as_json = blocks_to_json([cleaned])
    blocks = [entry for page in pages_as_json for entry in page]

    # Title is the first block's text, cut to 50 characters plus an ellipsis.
    title = blocks[0]["block_text"] if blocks else ""
    if len(title) > 50:
        title = title[:50] + "..."

    sents, _ = utils.blocks_to_sents(blocks)
    block_texts, _ = utils.get_block_texts(blocks)

    # this code needs a more complete rework
    # Minimal stand-in object exposing the three attributes set on it here,
    # so the renderer can be run without a full ingestor instance.
    doc_type = namedtuple(
        "ObjectName", ["blocks", "line_style_classes", "class_levels"]
    )
    doc = doc_type(blocks, {}, {})

    renderer = block_renderer.BlockRenderer(doc)
    html_str = renderer.render_html()
    json_dict = renderer.render_json()

    html_entry = {
        "title": title,
        "text": html_str,
        "title_page_fonts": {"first_level": [title]},
    }
    # JSON not enabled now.
    json_entry = {
        "title": title,
        "document": json_dict,
        "title_page_fonts": {"first_level": [title]},
    }
    result = [html_entry, json_entry]

    file_data = []
    for entry in result:
        file_data.append(json.dumps(entry, cls=NpEncoder))

    return blocks, block_texts, sents, file_data, result, [1, 1], 0
def blocks_to_json(page_blocks):
    """Convert per-page block lists into per-page lists of JSON-ready dicts.

    Blocks whose ``block_text`` is falsy are dropped. Each kept block is
    tagged with the index and text of the most recent ``"header"`` block
    seen on the same page (``-1`` / ``""`` before any header).

    Args:
        page_blocks: Iterable of pages, each an iterable of block dicts.

    Returns:
        A list with one list of output dicts per input page.
    """
    # NOTE(review): the source indentation was lost; the nesting below is
    # reconstructed on the basis that the sentence list is only computed for
    # non-empty blocks, so the append and the counter increment belong to
    # that same branch - confirm against the upstream file.
    pages_out = []
    emitted_total = 0
    for page_idx, page in enumerate(page_blocks):
        page_out = []
        # Number of blocks emitted by earlier pages; used as an index offset.
        page_offset = emitted_total
        current_header_idx = -1
        current_header_text = ""
        for pos_in_page, block in enumerate(page):
            text = block["block_text"]
            if not text:
                continue
            sentences = utils.sent_tokenize(text)
            # header_block_idx = block["header_block_idx"]
            if block["block_type"] == "header":
                current_header_idx = block["block_idx"]
                current_header_text = text
            record = {
                "block_text": text,
                "block_idx": block["block_idx"],
                "block_sents": sentences,
                "block_type": block["block_type"],
                "header_block_idx": page_offset + current_header_idx,
                "page_idx": page_idx,
                "block_idx_in_page": page_offset + pos_in_page,
                "header_text": current_header_text,
                "text_group_start_idx": block["text_group_start_idx"],
                "block_list": block["block_list"],
                "level": 0,
                "block_class": block.get("block_class", {}),
            }
            page_out.append(record)
            emitted_total += 1
        pages_out.append(page_out)
    return pages_out