Fix bolding, update deps

2024-05-07 13:28:21 -07:00
parent 01c18b8715
commit 7f18bb9a8f
16 changed files with 1049 additions and 1654 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -3,9 +3,8 @@ name: Integration test with benchmark
 on: [push]

 env:
-  TESSDATA_PREFIX: "/usr/share/tesseract-ocr/4.00/tessdata"
  TORCH_DEVICE: "cpu"
-  OCR_ENGINE: "tesseract" # So we don't have to install ghostscript, which takes a long time
+  OCR_ENGINE: "surya"

 jobs:
  build:
@@ -16,12 +15,6 @@ jobs:
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
-      - name: Install system dependencies
-        run: |
-          sudo apt-get update
-          cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
-      - name: Show tessdata folders
-        run: ls /usr/share/tesseract-ocr/
      - name: Install python dependencies
        run: |
          pip install poetry
--- a/README.md
+++ b/README.md
@@ -1,11 +1,11 @@
 # Marker

-Marker converts PDF, EPUB, and MOBI to markdown.  It's 10x faster than nougat, more accurate on most documents, and has low hallucination risk.
+Marker converts PDF to markdown.  It's 10x faster than nougat, more accurate on most documents, and has low hallucination risk.

- Support for a range of PDF documents (optimized for books and scientific papers)
+- Support for a range of documents (optimized for books and scientific papers)
 - Removes headers/footers/other artifacts
 - Converts most equations to latex
- Formats code blocks and tables
+- Formats tables and code blocks
 - Support for all languages (although most testing is done in English).
 - Works on GPU, CPU, or MPS

@@ -73,9 +73,9 @@ First, clone the repo:

 Only needed if using `ocrmypdf` as the ocr backend.

- Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
+- Run `pip install ocrmypdf`
 - Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
+- Install other requirements with `cat scripts/install/tess-apt-requirements.txt | xargs sudo apt-get install -y`
 - Set the tesseract data folder path
  - Find the tesseract data folder `tessdata` with `find / -name tessdata`.  Make sure to use the one corresponding to the latest tesseract version if you have multiple.
  - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
@@ -90,7 +90,8 @@ Only needed if using `ocrmypdf` as the ocr backend.

 Only needed if using `ocrmypdf` as the ocr backend.

- Install system requirements from `scripts/install/brew-requirements.txt`
+- Run `pip install ocrmypdf`
+- Install system requirements from `scripts/install/tess-brew-requirements.txt`
 - Set the tesseract data folder path
  - Find the tesseract data folder `tessdata` with `brew list tesseract`
  - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
@@ -210,7 +211,7 @@ All models were trained from scratch, so they're okay for commercial usage.  The

 If you want to remove the GPL license requirements for inference or use the weights commercially over the revenue limit, please contact me at marker@vikas.sh for dual licensing.

-Note that the `ocrmypdf` OCR option will use ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via CLI, so it does not trigger the license provisions.  If you want to avoid this completely, just use `surya` as the OCR option.  (ocrmypdf is faster on CPU, but less accurate)
+Note that the `ocrmypdf` OCR option will use ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via CLI, so it does not trigger the license provisions.  `ocrmypdf` is disabled by default and will not be installed automatically.

 # Thanks

--- a/convert_single.py
+++ b/convert_single.py
@@ -4,7 +4,6 @@ import os
 from marker.convert import convert_single_pdf
 from marker.logger import configure_logging
 from marker.models import load_all_models
-import json

 from marker.output import save_markdown

--- a/marker/cleaners/fontstyle.py
+++ b/marker/cleaners/fontstyle.py
@@ -1,11 +1,9 @@
 from typing import List
-from statistics import mean
-import numpy as np

 from marker.schema.page import Page


-def find_bold_italic(pages: List[Page], bold_min_weight=550):
+def find_bold_italic(pages: List[Page], bold_min_weight=600):
    font_weights = []
    for page in pages:
        for block in page.blocks:
@@ -24,17 +22,9 @@ def find_bold_italic(pages: List[Page], bold_min_weight=550):
    if len(font_weights) == 0:
        return

-    font_weights = np.array(font_weights)
-    bold_thresh = np.percentile(font_weights, 90)
-    bold_thresh_lower = np.percentile(font_weights, 75)
-
-    # If a lot of the text on the page is bold, don't bold it all
-    if bold_thresh == bold_thresh_lower or bold_thresh < bold_min_weight:
-        return
-
    for page in pages:
        for block in page.blocks:
            for line in block.lines:
                for span in line.spans:
-                    if span.font_weight >= bold_thresh:
+                    if span.font_weight >= bold_min_weight:
                        span.bold = True
--- a/marker/cleaners/headers.py
+++ b/marker/cleaners/headers.py
@@ -1,13 +1,8 @@
 import re
-from collections import Counter, defaultdict
-from itertools import chain
+from collections import Counter
 from rapidfuzz import fuzz

-from sklearn.cluster import DBSCAN
-import numpy as np
-
 from marker.schema.merged import FullyMergedBlock
-from marker.schema.page import Page
 from typing import List, Tuple


--- a/marker/images/extract.py
+++ b/marker/images/extract.py
@@ -1,7 +1,7 @@
 from marker.images.save import get_image_filename
 from marker.pdf.images import render_bbox_image
 from marker.schema.bbox import rescale_bbox
-from marker.schema.block import find_insert_block, Span
+from marker.schema.block import find_insert_block, Span, Line
 from marker.settings import settings


@@ -53,7 +53,16 @@ def extract_page_images(page_obj, page):
            image=True,
            span_id=f"image_{image_idx}"
        )
-        block.lines[line_idx].spans.append(image_span)
+
+        # Sometimes, the block has zero lines
+        if len(block.lines) > line_idx:
+            block.lines[line_idx].spans.append(image_span)
+        else:
+            line = Line(
+                bbox=bbox,
+                spans=[image_span]
+            )
+            block.lines.append(line)
        page.images.append(image)


--- a/marker/ocr/recognition.py
+++ b/marker/ocr/recognition.py
@@ -1,7 +1,6 @@
 from itertools import repeat
 from typing import List, Optional, Dict

-import ocrmypdf
 import pypdfium2 as pdfium
 import io
 from concurrent.futures import ThreadPoolExecutor
@@ -113,6 +112,7 @@ def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:


 def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
+    import ocrmypdf
    out_pdf = io.BytesIO()

    ocrmypdf.ocr(
--- a/marker/pdf/extract_text.py
+++ b/marker/pdf/extract_text.py
@@ -4,8 +4,7 @@ from typing import List, Optional, Dict
 import pypdfium2 as pdfium
 import pypdfium2.internal as pdfium_i

-from marker.pdf.utils import find_filetype, font_flags_decomposer, sort_block_group
-from marker.ocr.heuristics import detect_bad_ocr
+from marker.pdf.utils import font_flags_decomposer
 from marker.settings import settings
 from marker.schema.block import Span, Line, Block
 from marker.schema.page import Page
--- a/marker/postprocessors/editor.py
+++ b/marker/postprocessors/editor.py
@@ -1,8 +1,7 @@
-from collections import defaultdict, Counter
+from collections import defaultdict
 from itertools import chain
 from typing import Optional

-from transformers import AutoTokenizer
 from marker.settings import settings
 import torch
 import torch.nn.functional as F
--- a/marker/postprocessors/t5.py
+++ b/marker/postprocessors/t5.py
@@ -2,7 +2,7 @@ from transformers import T5Config, T5PreTrainedModel
 import torch
 from torch import nn
 from copy import deepcopy
-from typing import Optional, Tuple, Union, List
+from typing import Optional, Tuple, Union
 from itertools import chain

 from transformers.modeling_outputs import TokenClassifierOutput
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -1,4 +1,3 @@
-import os
 from typing import Optional, List, Dict

 from dotenv import find_dotenv
--- a/marker/tables/table.py
+++ b/marker/tables/table.py
@@ -1,5 +1,3 @@
-from collections import defaultdict
-
 from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox
 from marker.schema.block import Line, Span, Block
 from marker.schema.page import Page
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,13 +32,12 @@ ray = "^2.20.0"
 tqdm = "^4.66.1"
 tabulate = "^0.9.0"
 ftfy = "^6.1.1"
-ocrmypdf = "^15.4.0"
 texify = "^0.1.8"
 rapidfuzz = "^3.8.1"
 surya-ocr = "^0.4.0"
 filetype = "^1.2.0"
-pdftext = "^0.3.6"
 regex = "^2024.4.28"
+pdftext = "^0.3.7"

 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"
--- a/scripts/install/tess-apt-requirements.txt
+++ b/scripts/install/tess-apt-requirements.txt
--- a/scripts/install/tess-brew-requirements.txt
+++ b/scripts/install/tess-brew-requirements.txt