Fix bolding, update deps
This commit is contained in:
9
.github/workflows/tests.yml
vendored
9
.github/workflows/tests.yml
vendored
@@ -3,9 +3,8 @@ name: Integration test with benchmark
|
||||
on: [push]
|
||||
|
||||
env:
|
||||
TESSDATA_PREFIX: "/usr/share/tesseract-ocr/4.00/tessdata"
|
||||
TORCH_DEVICE: "cpu"
|
||||
OCR_ENGINE: "tesseract" # So we don't have to install ghostscript, which takes a long time
|
||||
OCR_ENGINE: "surya"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
@@ -16,12 +15,6 @@ jobs:
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.11
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
|
||||
- name: Show tessdata folders
|
||||
run: ls /usr/share/tesseract-ocr/
|
||||
- name: Install python dependencies
|
||||
run: |
|
||||
pip install poetry
|
||||
|
||||
15
README.md
15
README.md
@@ -1,11 +1,11 @@
|
||||
# Marker
|
||||
|
||||
Marker converts PDF, EPUB, and MOBI to markdown. It's 10x faster than nougat, more accurate on most documents, and has low hallucination risk.
|
||||
Marker converts PDF to markdown. It's 10x faster than nougat, more accurate on most documents, and has low hallucination risk.
|
||||
|
||||
- Support for a range of PDF documents (optimized for books and scientific papers)
|
||||
- Support for a range of documents (optimized for books and scientific papers)
|
||||
- Removes headers/footers/other artifacts
|
||||
- Converts most equations to latex
|
||||
- Formats code blocks and tables
|
||||
- Formats tables and code blocks
|
||||
- Support for all languages (although most testing is done in English).
|
||||
- Works on GPU, CPU, or MPS
|
||||
|
||||
@@ -73,9 +73,9 @@ First, clone the repo:
|
||||
|
||||
Only needed if using `ocrmypdf` as the ocr backend.
|
||||
|
||||
- Install tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/) or running `scripts/install/tesseract_5_install.sh`.
|
||||
- Run `pip install ocrmypdf`
|
||||
- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
|
||||
- Install other requirements with `cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y`
|
||||
- Install other requirements with `cat scripts/install/tess-apt-requirements.txt | xargs sudo apt-get install -y`
|
||||
- Set the tesseract data folder path
|
||||
- Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple.
|
||||
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
|
||||
@@ -90,7 +90,8 @@ Only needed if using `ocrmypdf` as the ocr backend.
|
||||
|
||||
Only needed if using `ocrmypdf` as the ocr backend.
|
||||
|
||||
- Install system requirements from `scripts/install/brew-requirements.txt`
|
||||
- Run `pip install ocrmypdf`
|
||||
- Install system requirements from `scripts/install/tess-brew-requirements.txt`
|
||||
- Set the tesseract data folder path
|
||||
- Find the tesseract data folder `tessdata` with `brew list tesseract`
|
||||
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
|
||||
@@ -210,7 +211,7 @@ All models were trained from scratch, so they're okay for commercial usage. The
|
||||
|
||||
If you want to remove the GPL license requirements for inference or use the weights commercially over the revenue limit, please contact me at marker@vikas.sh for dual licensing.
|
||||
|
||||
Note that the `ocrmypdf` OCR option will use ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via CLI, so it does not trigger the license provisions. If you want to avoid this completely, just use `surya` as the OCR option. (ocrmypdf is faster on CPU, but less accurate)
|
||||
Note that the `ocrmypdf` OCR option will use ocrmypdf, which includes Ghostscript, an AGPL dependency, but calls it via CLI, so it does not trigger the license provisions. Ocrmypdf is disabled by default, and will not be installed automatically.
|
||||
|
||||
# Thanks
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ import os
|
||||
from marker.convert import convert_single_pdf
|
||||
from marker.logger import configure_logging
|
||||
from marker.models import load_all_models
|
||||
import json
|
||||
|
||||
from marker.output import save_markdown
|
||||
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
from typing import List
|
||||
from statistics import mean
|
||||
import numpy as np
|
||||
|
||||
from marker.schema.page import Page
|
||||
|
||||
|
||||
def find_bold_italic(pages: List[Page], bold_min_weight=550):
|
||||
def find_bold_italic(pages: List[Page], bold_min_weight=600):
|
||||
font_weights = []
|
||||
for page in pages:
|
||||
for block in page.blocks:
|
||||
@@ -24,17 +22,9 @@ def find_bold_italic(pages: List[Page], bold_min_weight=550):
|
||||
if len(font_weights) == 0:
|
||||
return
|
||||
|
||||
font_weights = np.array(font_weights)
|
||||
bold_thresh = np.percentile(font_weights, 90)
|
||||
bold_thresh_lower = np.percentile(font_weights, 75)
|
||||
|
||||
# If a lot of the text on the page is bold, don't bold it all
|
||||
if bold_thresh == bold_thresh_lower or bold_thresh < bold_min_weight:
|
||||
return
|
||||
|
||||
for page in pages:
|
||||
for block in page.blocks:
|
||||
for line in block.lines:
|
||||
for span in line.spans:
|
||||
if span.font_weight >= bold_thresh:
|
||||
if span.font_weight >= bold_min_weight:
|
||||
span.bold = True
|
||||
@@ -1,13 +1,8 @@
|
||||
import re
|
||||
from collections import Counter, defaultdict
|
||||
from itertools import chain
|
||||
from collections import Counter
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
from sklearn.cluster import DBSCAN
|
||||
import numpy as np
|
||||
|
||||
from marker.schema.merged import FullyMergedBlock
|
||||
from marker.schema.page import Page
|
||||
from typing import List, Tuple
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from marker.images.save import get_image_filename
|
||||
from marker.pdf.images import render_bbox_image
|
||||
from marker.schema.bbox import rescale_bbox
|
||||
from marker.schema.block import find_insert_block, Span
|
||||
from marker.schema.block import find_insert_block, Span, Line
|
||||
from marker.settings import settings
|
||||
|
||||
|
||||
@@ -53,7 +53,16 @@ def extract_page_images(page_obj, page):
|
||||
image=True,
|
||||
span_id=f"image_{image_idx}"
|
||||
)
|
||||
block.lines[line_idx].spans.append(image_span)
|
||||
|
||||
# Sometimes, the block has zero lines
|
||||
if len(block.lines) > line_idx:
|
||||
block.lines[line_idx].spans.append(image_span)
|
||||
else:
|
||||
line = Line(
|
||||
bbox=bbox,
|
||||
spans=[image_span]
|
||||
)
|
||||
block.lines.append(line)
|
||||
page.images.append(image)
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from itertools import repeat
|
||||
from typing import List, Optional, Dict
|
||||
|
||||
import ocrmypdf
|
||||
import pypdfium2 as pdfium
|
||||
import io
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
@@ -113,6 +112,7 @@ def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
|
||||
|
||||
|
||||
def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
|
||||
import ocrmypdf
|
||||
out_pdf = io.BytesIO()
|
||||
|
||||
ocrmypdf.ocr(
|
||||
|
||||
@@ -4,8 +4,7 @@ from typing import List, Optional, Dict
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.internal as pdfium_i
|
||||
|
||||
from marker.pdf.utils import find_filetype, font_flags_decomposer, sort_block_group
|
||||
from marker.ocr.heuristics import detect_bad_ocr
|
||||
from marker.pdf.utils import font_flags_decomposer
|
||||
from marker.settings import settings
|
||||
from marker.schema.block import Span, Line, Block
|
||||
from marker.schema.page import Page
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
from collections import defaultdict, Counter
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
from typing import Optional
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from marker.settings import settings
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
@@ -2,7 +2,7 @@ from transformers import T5Config, T5PreTrainedModel
|
||||
import torch
|
||||
from torch import nn
|
||||
from copy import deepcopy
|
||||
from typing import Optional, Tuple, Union, List
|
||||
from typing import Optional, Tuple, Union
|
||||
from itertools import chain
|
||||
|
||||
from transformers.modeling_outputs import TokenClassifierOutput
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
from typing import Optional, List, Dict
|
||||
|
||||
from dotenv import find_dotenv
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
from collections import defaultdict
|
||||
|
||||
from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox
|
||||
from marker.schema.block import Line, Span, Block
|
||||
from marker.schema.page import Page
|
||||
|
||||
2628
poetry.lock
generated
2628
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -32,13 +32,12 @@ ray = "^2.20.0"
|
||||
tqdm = "^4.66.1"
|
||||
tabulate = "^0.9.0"
|
||||
ftfy = "^6.1.1"
|
||||
ocrmypdf = "^15.4.0"
|
||||
texify = "^0.1.8"
|
||||
rapidfuzz = "^3.8.1"
|
||||
surya-ocr = "^0.4.0"
|
||||
filetype = "^1.2.0"
|
||||
pdftext = "^0.3.6"
|
||||
regex = "^2024.4.28"
|
||||
pdftext = "^0.3.7"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
jupyter = "^1.0.0"
|
||||
|
||||
Reference in New Issue
Block a user