# nlm-ingestor-llmsherpa/nlm_ingestor/ingestor/html_ingestor.py

import logging

from bs4 import BeautifulSoup
from nlm_ingestor.ingestor_utils.ing_named_tuples import LineStyle
from nlm_ingestor.ingestor.visual_ingestor import block_renderer
from nlm_ingestor.ingestor_utils.utils import sent_tokenize
from nlm_ingestor.ingestor import line_parser
import codecs


class HTMLIngestor:
    def __init__(self, file_name, sec=False):
        """Parse an HTML document into blocks and render it to HTML and JSON.

        Args:
            file_name: Either a path to an HTML file, or an already parsed
                ``bs4`` ``Tag`` (this includes ``BeautifulSoup`` documents,
                which subclass ``Tag``). A tag is ingested as given; a path
                is read, parsed with the ``lxml`` parser and narrowed to its
                ``<body>`` element.
            sec: Stored as ``self.sec``; when true, ``parse_blocks`` applies
                additional style-based heuristics to detect headers and
                paragraph-like containers.

        After construction the instance exposes ``blocks``, ``html_str`` and
        ``json_dict``.
        """
        # Imported locally so the isinstance check below does not rely on the
        # module-level import block.
        from bs4.element import Tag

        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.INFO)

        if isinstance(file_name, Tag):
            self.html = file_name
        else:
            # The context manager guarantees the handle is closed even when
            # reading or parsing fails.
            with codecs.open(file_name, 'r') as f:
                soup = BeautifulSoup(f.read(), features="lxml")
            body = soup.find("body")
            # Documents without a <body> (e.g. an empty file) would otherwise
            # leave self.html as None and crash in parse_blocks.
            self.html = body if body is not None else soup
        self.sec = sec
        self.blocks = []
        self.parse_blocks()

        self.line_style_classes = {}
        self.class_levels = {}
        self.add_styles()

        # The renderer receives this ingestor itself, so blocks and styles
        # must be populated before it is created.
        br = block_renderer.BlockRenderer(self)
        self.html_str = br.render_html()
        self.json_dict = br.render_json()

    def parse_blocks(self):
        """Convert the descendants of ``self.html`` into block dicts.

        Every descendant element with non-blank text is inspected in the
        order returned by ``findChildren(recursive=True)``. Depending on its
        tag (and, when ``self.sec`` is set, on its inline style and direct
        children) it is appended to ``self.blocks`` as a ``header``, ``para``,
        ``list_item`` or a run of ``table_row`` blocks. Elements that match
        none of these are skipped and their descendants are visited next.

        ``level_stack`` holds the tag names of the currently open header
        levels and ``header_stack`` the matching header texts; both are used
        to fill ``level``, ``header_text`` and ``level_chain`` of each block.
        """
        self.logger.info("parsing html file")

        header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
        para_tags = ["p", "span"]
        # list: li
        # bold: b, em, strong
        i = 0
        children = self.html.findChildren(recursive=True)
        level_stack = []
        header_stack = []

        while i < len(children):
            child = children[i]
            if not child.text.strip():
                i += 1
                continue

            tag = child.name
            if self.sec:
                # some containers are actually p: an element counts as a
                # paragraph when it has no direct child elements, or when all
                # of them are <font>
                div_is_para = all(
                    c.name == "font" for c in child.findChildren(recursive=False)
                )
                # use styles to determine headers
                style = self.parse_style(child.get("style"))
                # strip() tolerates the common "font-weight: bold" spelling
                if style.get("font-weight", "").strip() == "bold":
                    line = line_parser.Line(child.text)
                    if line.is_header:
                        tag = "h3"
                        if child.text.isupper():
                            tag = "h2"

            else:
                div_is_para = False

            # Text nodes that are direct children of the element (sec mode only)
            div_text = ""
            if self.sec:
                for c in child.findAll(text=True, recursive=False):
                    if c.strip():
                        div_text += c

            if tag in header_tags:
                if len(level_stack) == 0:
                    # first header seen: it becomes level 0
                    level_stack = [tag]
                    header_stack = [child.text]
                    level = 0
                elif tag in level_stack:
                    # same tag already open: return to that level and replace
                    # its header text
                    level = level_stack.index(tag)
                    level_stack = level_stack[:level+1]
                    header_stack = header_stack[:level]
                    header_stack.append(child.text)
                else:
                    # new tag: keep the leading run of open h-tags that rank
                    # above it and nest the new header beneath them
                    idx = 0
                    while idx < len(level_stack) and level_stack[idx] in header_tags and \
                            header_tags.index(level_stack[idx]) < header_tags.index(tag):
                        idx += 1
                    level_stack = level_stack[:idx]
                    level_stack.append(tag)
                    header_stack = header_stack[:idx]
                    header_stack.append(child.text)
                    level = idx

                header_block = {
                    "block_idx": len(self.blocks),
                    "page_idx": 0,
                    "block_text": child.text,
                    "block_type": "header",
                    "block_class": "nlm-text-header",
                    "header_block_idx": 0,
                    "level": level,
                    "header_text": header_stack[-1] if header_stack else "",
                    "level_chain": header_stack[::-1],
                }
                self.blocks.append(header_block)
                # the element's text already covers its descendants, so skip them
                i += len(child.findChildren(recursive=True))

            elif tag in para_tags or div_text or div_is_para:
                is_header = False
                line = line_parser.Line(child.text)
                para_child_tag = None
                if line.is_header:
                    is_header = True
                if child.name == "p":
                    # A <p> with nested markup gets a synthetic level key such
                    # as "p_b", built from its first descendant's tag name
                    para_child = child.findChildren(recursive=True)
                    if len(para_child) > 0:
                        para_child_tag = child.name + "_" + para_child[0].name
                if is_header and para_child_tag:
                    if len(level_stack) == 0:
                        level_stack = [para_child_tag]
                        header_stack = [child.text]
                        level = 0
                    elif para_child_tag in level_stack:
                        level = level_stack.index(para_child_tag)
                        level_stack = level_stack[:level+1]
                        header_stack = header_stack[:level]
                        header_stack.append(child.text)
                    else:
                        # unseen synthetic key: always nest at the deepest level
                        idx = len(level_stack)
                        level_stack = level_stack[:idx]
                        level_stack.append(para_child_tag)
                        header_stack = header_stack[:idx]
                        header_stack.append(child.text)
                        level = idx

                    header_block = {
                        "block_idx": len(self.blocks),
                        "page_idx": 0,
                        "block_text": child.text,
                        "block_type": "header",
                        "block_class": "nlm-text-header",
                        "header_block_idx": 0,
                        "level": level,
                        "header_text": header_stack[-1] if header_stack else "",
                        "level_chain": header_stack[::-1],
                    }
                    self.blocks.append(header_block)
                else:
                    para_block = {
                        "block_idx": len(self.blocks),
                        "page_idx": 0,
                        "block_text": child.text,
                        "block_type": "para",
                        "block_class": "nlm-text-body",
                        "header_block_idx": 0,
                        "block_sents": sent_tokenize(child.text),
                        "level": len(level_stack),
                        "header_text": header_stack[-1] if header_stack else "",
                        "level_chain": header_stack[::-1],
                    }
                    self.blocks.append(para_block)

                i += len(child.findChildren(recursive=True))

            elif tag == "li":
                list_block = {
                    "block_idx": len(self.blocks),
                    "page_idx": 0,
                    "block_text": child.text,
                    "block_type": "list_item",
                    "list_type": "",
                    "block_class": "nlm-list-item",
                    "header_block_idx": 0,
                    "block_sents": sent_tokenize(child.text),
                    "level": len(level_stack),
                    "header_text": header_stack[-1] if header_stack else "",
                    "level_chain": header_stack[::-1],
                }
                self.blocks.append(list_block)
                i += len(child.findChildren(recursive=True))

            elif tag == "table":
                rows = child.find_all('tr')
                table_start_idx = len(self.blocks)
                # per row: indices of the cells whose text is empty
                empty_cols = []
                for row in rows:
                    cols = row.find_all(['th', 'td'])
                    col_text = []
                    col_spans = []
                    empty_col = []
                    header_group_flag = False
                    all_th = True
                    for col_idx, col in enumerate(cols):
                        text = col.text.replace(u'\xa0', '')
                        text = text.strip()
                        col_text.append(text)
                        if not text:
                            empty_col.append(col_idx)
                        # a non-empty cell that is neither <th> nor contains
                        # <b> means the row is not a header row
                        if not col.name == "th" and text and not col.find('b'):
                            all_th = False
                        if col.get("colspan"):
                            header_group_flag = True
                        col_spans.append(int(col.get("colspan")) if col.get("colspan") else 1)
                    empty_cols.append(empty_col)

                    if not ''.join(col_text).strip():
                        # Empty Row
                        continue

                    row_text = ' '.join(col_text)
                    if len(rows) > 1:
                        table_row = {
                            "block_idx": len(self.blocks),
                            "page_idx": 0,
                            "block_text": row_text,
                            "block_type": "table_row",
                            "block_class": "nlm-table-row",
                            "header_block_idx": 0,
                            "block_sents": sent_tokenize(row_text),
                            "level": len(level_stack),
                            "header_text": header_stack[-1] if header_stack else "",
                            "level_chain": header_stack[::-1],
                            "cell_values": col_text,
                            "col_spans": col_spans,
                        }
                        if header_group_flag:
                            table_row["is_header_group"] = True
                        if all_th:
                            table_row["is_header"] = True
                        self.blocks.append(table_row)
                    else:
                        # A single-row table is emitted as a paragraph or a
                        # list item rather than as a table
                        blk_text = row_text
                        line = line_parser.Line(child.text)
                        is_list_item = False
                        if line.is_list_item:
                            is_list_item = True
                        t_block = {
                            "block_idx": len(self.blocks),
                            "page_idx": 0,
                            "block_text": blk_text,
                            "block_type": "para",
                            "block_class": "nlm-text-body",
                            "header_block_idx": 0,
                            "block_sents": sent_tokenize(blk_text),
                            "level": len(level_stack),
                            "header_text": header_stack[-1] if header_stack else "",
                            "level_chain": header_stack[::-1],
                        }
                        if is_list_item:
                            t_block["block_type"] = "list_item"
                            t_block["block_class"] = "nlm-list-item"
                            t_block["list_type"] = ""
                        self.blocks.append(t_block)

                # Only mark boundaries when this table actually produced rows;
                # a table whose rows are all blank would otherwise index past
                # the end of self.blocks.
                if len(rows) > 1 and len(self.blocks) > table_start_idx:
                    self.blocks[table_start_idx]['is_table_start'] = True
                    self.blocks[-1]["is_table_end"] = True
                    # Remove any empty columns if there are intersection
                    empty_col_intersection = set.intersection(*map(set, empty_cols))
                    if empty_col_intersection:
                        # Pop from the highest index down so that removing a
                        # column never shifts one still waiting to be removed.
                        # Set iteration order is not sorted, hence sorted().
                        for inter in sorted(empty_col_intersection, reverse=True):
                            for blk in self.blocks[table_start_idx:]:
                                blk["col_spans"].pop(inter)
                                blk["cell_values"].pop(inter)

                i += len(child.findChildren(recursive=True))

            i += 1

    def add_styles(self):
        """Register the three fixed text styles used for HTML documents.

        For each of title, header and body text a ``LineStyle`` is built and
        mapped to its CSS class in ``self.line_style_classes``; the class is
        in turn mapped to its level (0, 1, 2) in ``self.class_levels``.
        """
        font_family = "Roboto, Georgia, serif"
        # One row per style: CSS class, level, then the 2nd-4th positional
        # LineStyle arguments (presumably font style, size and weight --
        # confirm against the LineStyle definition).
        style_table = (
            ("nlm-text-title", 0, "bold", 14.0, "500"),
            ("nlm-text-header", 1, "normal", 12.0, "600"),
            ('nlm-text-body', 2, "normal", 10.0, "400"),
        )
        for css_class, level, variant, size, weight in style_table:
            line_style = LineStyle(
                font_family,
                variant,
                size,
                weight,
                "left",
                0,  # TODO: Decide what font_space_width needs to be added
                "left"
            )
            self.line_style_classes[line_style] = css_class
            self.class_levels[css_class] = level

    def parse_style(self, style_str):
        """Parse an inline CSS ``style`` attribute into a dict.

        Args:
            style_str: The raw attribute value, e.g. ``"font-weight: bold;
                color: red"``. ``None`` or an empty string is accepted.

        Returns:
            A dict mapping each property name to its value, both with
            surrounding whitespace removed. Declarations without a ``:`` are
            ignored; if a property is repeated, the last one wins.
        """
        d = {}
        if not style_str:
            return d
        for style in style_str.split(";"):
            style = style.strip()
            if ":" in style:
                # Split on the first colon only: values such as
                # url(http://...) legitimately contain further colons.
                key, value = style.split(":", 1)
                d[key.strip()] = value.strip()
        return d