mirror of
https://github.com/nlmatics/nlm-ingestor.git
synced 2024-08-02 20:58:47 +03:00
158 lines
6.4 KiB
Python
158 lines
6.4 KiB
Python
import xml.etree.ElementTree as ET
|
|
import re
|
|
|
|
from nlm_ingestor.ingestor import processors
|
|
from nlm_ingestor.ingestor.visual_ingestor import block_renderer
|
|
from nlm_ingestor.ingestor_utils.utils import sent_tokenize
|
|
from nlm_ingestor.ingestor_utils.ing_named_tuples import LineStyle
|
|
|
|
# from nltk import sent_tokenize
|
|
|
|
|
|
class XMLIngestor:
    """Flatten an XML file into header/paragraph blocks and render them.

    After construction the instance exposes ``title``, ``blocks``,
    ``line_style_classes``, ``class_levels``, ``html_str`` and ``json_dict``.
    """

    def __init__(self, file_name):
        """Parse ``file_name``, build the block list and render HTML and JSON.

        Args:
            file_name: Handed unchanged to ``xml.etree.ElementTree.parse``.
        """
        self.file_name = file_name
        # NOTE(review): ElementTree is not hardened against maliciously
        # constructed XML - confirm that inputs are trusted.
        tree = ET.parse(file_name)
        self.tree = tree
        self.title = None
        self.blocks = []
        self.parse_blocks(tree)
        self.line_style_classes = {}
        self.class_levels = {}
        self.add_styles()
        # The renderer is handed this instance after blocks and styles are set.
        br = block_renderer.BlockRenderer(self)
        self.html_str = br.render_html()
        self.json_dict = br.render_json()

    @staticmethod
    def _header_block(element, block_idx, level):
        """Return the header block dict describing ``element``.

        The header text is the prettified tag name followed by the element's
        attribute values (space separated) when it has any.
        """
        text = XMLIngestor.make_header(element.tag)
        attribute_text = " ".join(element.attrib.values())
        if attribute_text:
            text += " " + attribute_text
        return {
            "block_idx": block_idx,
            "page_idx": 0,
            "block_text": text,
            "block_type": "header",
            "block_class": "nlm-text-header",
            "header_block_idx": 0,
            "level": level,
        }

    def parse_blocks(self, tree):
        """Walk ``tree`` depth-first and store the resulting blocks.

        Elements with children become a header block and are recursed into
        one level deeper. Leaf elements become a header block (unless their
        tag is ``textblock``) followed by the blocks that
        ``processors.clean_lines`` produces from their text. The text of the
        first leaf whose tag contains "title" is stored in ``self.title``.

        Args:
            tree: A parsed ``ElementTree``.
        """
        root = tree.getroot()
        all_blocks = []
        title = None

        def traverse(parent, level, blocks):
            # ``title`` lives in the enclosing scope so the first title found
            # is remembered for the rest of the walk.
            nonlocal title
            for child in parent:
                has_children = len(child) > 0
                # Skip empty leaves such as a bare <country /> tag. Containers
                # are kept even when their own text is None, which happens
                # when the XML has no whitespace between tags.
                if not child.text and not has_children:
                    continue

                if has_children:
                    blocks.append(
                        XMLIngestor._header_block(child, len(blocks), level)
                    )
                    traverse(child, level + 1, blocks)
                    continue

                # Only the first title-like leaf sets the document title.
                if not title and "title" in child.tag.lower():
                    title = child.text
                    self.title = title

                if child.tag != "textblock":
                    blocks.append(
                        XMLIngestor._header_block(child, len(blocks), level)
                    )
                    body_level = level
                else:
                    # A textblock has no header of its own, so its content
                    # sits one level higher. A per-child value is used so the
                    # shift does not leak into later siblings.
                    body_level = level - 1

                col_blocks = processors.clean_lines(
                    child.text.split("\n"), xml=True
                )
                # The governing header is whatever block was emitted last;
                # there is none when a textblock is the very first element.
                header_text = blocks[-1]["block_text"] if blocks else ""
                only_block = len(col_blocks) == 1
                has_header = False
                para_header = None
                for block in col_blocks:
                    # Paragraphs following a header found inside the text are
                    # nested under that inline header.
                    inline_header = has_header and block["block_type"] == "para"
                    block["header_text"] = (
                        para_header if inline_header else header_text
                    )
                    block["level"] = body_level + (2 if inline_header else 1)
                    block["block_idx"] = len(blocks)
                    block["page_idx"] = 0
                    block["block_sents"] = sent_tokenize(block["block_text"])
                    block["block_class"] = "nlm-text-body"
                    block["level_chain"] = (
                        [title, header_text] if title else [header_text]
                    )
                    # A lone block is always treated as a paragraph.
                    if only_block:
                        block["block_type"] = "para"
                    blocks.append(block)
                    if block["block_type"] == "header":
                        has_header = True
                        para_header = block["block_text"]

        traverse(root, 0, all_blocks)
        self.blocks = all_blocks

    def add_styles(self):
        """Register the title, header and body styles and their levels.

        Fills ``self.line_style_classes`` (``LineStyle`` -> class name) and
        ``self.class_levels`` (class name -> 0, 1 or 2).
        """
        family = "Roboto, Georgia, serif"
        # The sixth positional value is font_space_width.
        # TODO: Decide what font_space_width needs to be added
        styles = (
            ("nlm-text-title", 0, (family, "bold", 14.0, "500", "left", 0, "left")),
            ("nlm-text-header", 1, (family, "normal", 12.0, "600", "left", 0, "left")),
            ("nlm-text-body", 2, (family, "normal", 10.0, "400", "left", 0, "left")),
        )
        for class_name, class_level, fields in styles:
            self.line_style_classes[LineStyle(*fields)] = class_name
            self.class_levels[class_name] = class_level

    @staticmethod
    def camel_case_split(str):
        """Split a camelCase or PascalCase identifier into its words.

        A leading lowercase run ("first" in "firstName") is kept as its own
        word, and a run of capitals is kept together ("XMLParser" gives
        ["XML", "Parser"]). Characters that are not ASCII letters are dropped.

        Args:
            str: The identifier to split. The name shadows the builtin but is
                kept so keyword callers continue to work.

        Returns:
            The list of words, in order.
        """
        return re.findall(r'^[a-z]+|[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', str)

    @staticmethod
    def make_header(str):
        """Convert an XML tag name into human-readable header text.

        snake_case tags are split on "_" and title-cased, all-lowercase tags
        are capitalized, and anything else is split as camelCase and
        title-cased.

        Args:
            str: The tag name. The name shadows the builtin but is kept so
                keyword callers continue to work.

        Returns:
            The header text. The tag is returned unchanged when no words can
            be extracted from it, so the header is never empty for a
            non-empty tag.
        """
        if "_" in str:
            return " ".join(str.split("_")).title()
        if str.islower():
            return str.capitalize()
        words = XMLIngestor.camel_case_split(str)
        if not words:
            return str
        return " ".join(words).title()
|