Fix bolding, update deps

This commit is contained in:
Vik Paruchuri
2024-05-07 13:28:21 -07:00
parent 01c18b8715
commit 7f18bb9a8f
16 changed files with 1049 additions and 1654 deletions

View File

@@ -3,9 +3,8 @@ name: Integration test with benchmark
on: [push]
env:
TESSDATA_PREFIX: "/usr/share/tesseract-ocr/4.00/tessdata"
TORCH_DEVICE: "cpu"
OCR_ENGINE: "tesseract" # So we don't have to install ghostscript, which takes a long time
OCR_ENGINE: "surya"
jobs:
build:
@@ -16,12 +15,6 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install system dependencies
run: |
sudo apt-get update
cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
- name: Show tessdata folders
run: ls /usr/share/tesseract-ocr/
- name: Install python dependencies
run: |
pip install poetry

View File

@@ -1,11 +1,11 @@
# Marker
Marker converts PDF, EPUB, and MOBI to markdown. It's 10x faster than nougat, more accurate on most documents, and has low hallucination risk.
Marker converts PDF to markdown. It's 10x faster than nougat, more accurate on most documents, and has low hallucination risk.
- Support for a range of PDF documents (optimized for books and scientific papers)
- Support for a range of documents (optimized for books and scientific papers)
- Removes headers/footers/other artifacts
- Converts most equations to LaTeX
- Formats code blocks and tables
- Formats tables and code blocks
- Support for all languages (although most testing is done in English).
- Works on GPU, CPU, or MPS
@@ -73,9 +73,9 @@ First, clone the repo:
Only needed if using `ocrmypdf` as the ocr backend.
- Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
- Run `pip install ocrmypdf`
- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
- Install other requirements with `cat scripts/install/tess-apt-requirements.txt | xargs sudo apt-get install -y`
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple.
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
@@ -90,7 +90,8 @@ Only needed if using `ocrmypdf` as the ocr backend.
Only needed if using `ocrmypdf` as the ocr backend.
- Install system requirements from `scripts/install/brew-requirements.txt`
- Run `pip install ocrmypdf`
- Install system requirements from `scripts/install/tess-brew-requirements.txt`
- Set the tesseract data folder path
- Find the tesseract data folder `tessdata` with `brew list tesseract`
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
@@ -210,7 +211,7 @@ All models were trained from scratch, so they're okay for commercial usage. The
If you want to remove the GPL license requirements for inference or use the weights commercially over the revenue limit, please contact me at marker@vikas.sh for dual licensing.
Note that the `ocrmypdf` OCR option will use ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via CLI, so it does not trigger the license provisions. If you want to avoid this completely, just use `surya` as the OCR option. (ocrmypdf is faster on CPU, but less accurate)
Note that the `ocrmypdf` OCR option uses ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via the CLI, so it does not trigger the license provisions. `ocrmypdf` is disabled by default and will not be installed automatically.
# Thanks

View File

@@ -4,7 +4,6 @@ import os
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
import json
from marker.output import save_markdown

View File

@@ -1,11 +1,9 @@
from typing import List
from statistics import mean
import numpy as np
from marker.schema.page import Page
def find_bold_italic(pages: List[Page], bold_min_weight=550):
def find_bold_italic(pages: List[Page], bold_min_weight=600):
font_weights = []
for page in pages:
for block in page.blocks:
@@ -24,17 +22,9 @@ def find_bold_italic(pages: List[Page], bold_min_weight=550):
if len(font_weights) == 0:
return
font_weights = np.array(font_weights)
bold_thresh = np.percentile(font_weights, 90)
bold_thresh_lower = np.percentile(font_weights, 75)
# If a lot of the text on the page is bold, don't bold it all
if bold_thresh == bold_thresh_lower or bold_thresh < bold_min_weight:
return
for page in pages:
for block in page.blocks:
for line in block.lines:
for span in line.spans:
if span.font_weight >= bold_thresh:
if span.font_weight >= bold_min_weight:
span.bold = True

View File

@@ -1,13 +1,8 @@
import re
from collections import Counter, defaultdict
from itertools import chain
from collections import Counter
from rapidfuzz import fuzz
from sklearn.cluster import DBSCAN
import numpy as np
from marker.schema.merged import FullyMergedBlock
from marker.schema.page import Page
from typing import List, Tuple

View File

@@ -1,7 +1,7 @@
from marker.images.save import get_image_filename
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.block import find_insert_block, Span
from marker.schema.block import find_insert_block, Span, Line
from marker.settings import settings
@@ -53,7 +53,16 @@ def extract_page_images(page_obj, page):
image=True,
span_id=f"image_{image_idx}"
)
block.lines[line_idx].spans.append(image_span)
# Sometimes, the block has zero lines
if len(block.lines) > line_idx:
block.lines[line_idx].spans.append(image_span)
else:
line = Line(
bbox=bbox,
spans=[image_span]
)
block.lines.append(line)
page.images.append(image)

View File

@@ -1,7 +1,6 @@
from itertools import repeat
from typing import List, Optional, Dict
import ocrmypdf
import pypdfium2 as pdfium
import io
from concurrent.futures import ThreadPoolExecutor
@@ -113,6 +112,7 @@ def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
import ocrmypdf
out_pdf = io.BytesIO()
ocrmypdf.ocr(

View File

@@ -4,8 +4,7 @@ from typing import List, Optional, Dict
import pypdfium2 as pdfium
import pypdfium2.internal as pdfium_i
from marker.pdf.utils import find_filetype, font_flags_decomposer, sort_block_group
from marker.ocr.heuristics import detect_bad_ocr
from marker.pdf.utils import font_flags_decomposer
from marker.settings import settings
from marker.schema.block import Span, Line, Block
from marker.schema.page import Page

View File

@@ -1,8 +1,7 @@
from collections import defaultdict, Counter
from collections import defaultdict
from itertools import chain
from typing import Optional
from transformers import AutoTokenizer
from marker.settings import settings
import torch
import torch.nn.functional as F

View File

@@ -2,7 +2,7 @@ from transformers import T5Config, T5PreTrainedModel
import torch
from torch import nn
from copy import deepcopy
from typing import Optional, Tuple, Union, List
from typing import Optional, Tuple, Union
from itertools import chain
from transformers.modeling_outputs import TokenClassifierOutput

View File

@@ -1,4 +1,3 @@
import os
from typing import Optional, List, Dict
from dotenv import find_dotenv

View File

@@ -1,5 +1,3 @@
from collections import defaultdict
from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox
from marker.schema.block import Line, Span, Block
from marker.schema.page import Page

2628
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -32,13 +32,12 @@ ray = "^2.20.0"
tqdm = "^4.66.1"
tabulate = "^0.9.0"
ftfy = "^6.1.1"
ocrmypdf = "^15.4.0"
texify = "^0.1.8"
rapidfuzz = "^3.8.1"
surya-ocr = "^0.4.0"
filetype = "^1.2.0"
pdftext = "^0.3.6"
regex = "^2024.4.28"
pdftext = "^0.3.7"
[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"