Mirror of https://github.com/dantetemplar/pdf-extraction-agenda.git (synced 2025-03-17 21:12:24 +03:00).
Commit: "Add pdf-extraction-agenda package". This commit contains the following files:
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
# Virtual environment created by uv / venv
.venv
# Pinned interpreter version (uv / pyenv)
.python-version
# Downloaded datasets and extracted PDFs
data/
# Build artifacts
dist/
# JetBrains IDE settings
.idea/
# Packaging metadata
*.egg-info
|
||||||
31
pyproject.toml
Normal file
31
pyproject.toml
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
[project]
name = "pdf-extraction-agenda"
version = "0.1.0"
description = "Overview of pipelines related to PDF document processing"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "colorlog>=6.9.0",
    "datasets>=3.3.2",
    "huggingface-hub[hf-transfer]>=0.29.2",
    "pandas>=2.2.3",
    "pydantic>=2.10.6",
    "rapidfuzz>=3.12.2",
    "tabulate>=0.9.0",
    "tqdm>=4.67.1",
]

[tool.ruff]
line-length = 120
# PLR (pylint refactor) rules are too opinionated for this project
lint.ignore = ["PLR"]
# I = isort, UP = pyupgrade, PL = pylint
lint.extend-select = ["I", "UP", "PL"]
target-version = "py312"

# Optional dependency groups (PEP 735); install with `uv sync --group docling`
[dependency-groups]
docling = [
    "docling>=2.25.2",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
|
||||||
0
src/pdf_extraction_agenda/__init__.py
Normal file
0
src/pdf_extraction_agenda/__init__.py
Normal file
85
src/pdf_extraction_agenda/datasets_.py
Normal file
85
src/pdf_extraction_agenda/datasets_.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import tarfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Protocol
|
||||||
|
|
||||||
|
from datasets import Dataset, load_dataset
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
from logging_ import logger
|
||||||
|
from pydantic import BaseModel, ValidationError
|
||||||
|
|
||||||
|
|
||||||
|
class OlmoOCRResponse(BaseModel):
    """OCRed Page Information"""

    # Dominant language detected on the page
    primary_language: str
    # Whether the page orientation was judged correct
    is_rotation_valid: bool
    # Rotation (degrees) needed to correct the page orientation
    rotation_correction: int
    # Page is predominantly a table
    is_table: bool
    # Page is predominantly a diagram/figure
    is_diagram: bool
    natural_text: str  # Extracted text from PDF
|
||||||
|
|
||||||
|
|
||||||
|
def parse_response(example: dict, warn: bool = True) -> tuple[bool, OlmoOCRResponse | None]:
    """Validate the raw JSON ``response`` field of a dataset example.

    Returns:
        ``(malformed, parsed)`` — ``malformed`` is True when validation failed,
        in which case ``parsed`` is None.
    """
    try:
        parsed = OlmoOCRResponse.model_validate_json(example["response"])
    except ValidationError as e:
        if warn:
            logger.warning(f"Malformed response for {example.get('id')}\n{e}")
        return True, None
    return False, parsed
|
||||||
|
|
||||||
|
|
||||||
|
def extract_tarballs(source_dir: str | os.PathLike, destination_dir: str | os.PathLike) -> None:
|
||||||
|
"""Extracts all tarball files from the source directory into the destination directory."""
|
||||||
|
os.makedirs(destination_dir, exist_ok=True)
|
||||||
|
|
||||||
|
tarballs = glob.glob(os.path.join(source_dir, "*.tar*")) # Matches .tar, .tar.gz, .tar.bz2, etc.
|
||||||
|
for tarball in tarballs:
|
||||||
|
try:
|
||||||
|
with tarfile.open(tarball, "r:*") as tar:
|
||||||
|
tar.extractall(path=destination_dir, filter="fully_trusted")
|
||||||
|
except Exception as e:
|
||||||
|
logger.info(f"Failed to extract {tarball}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
class IdToPathProto(Protocol):
|
||||||
|
def __call__(self, id: str, warn: bool = False) -> Path | None:
|
||||||
|
"""Converts an ID to a file path."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_olmocr_dataset() -> tuple[Dataset, IdToPathProto]:
    """Load the olmOCR eval split and extract its PDFs locally.

    Returns:
        The HF dataset together with a resolver mapping a sample id to the
        extracted PDF path (None when the PDF is missing).
    """
    dataset = load_dataset("allenai/olmOCR-mix-0225", "00_documents", split="eval_s2pdf")
    snapshot_path = snapshot_download(
        repo_id="dantetemplar/pdf-extraction-agenda", repo_type="dataset", allow_patterns=["*.tar.gz"]
    )
    tarball_dir = os.path.join(snapshot_path, "data", "olmOCR-mix-0225")
    extracted_dir = Path("data/olmOCR-mix-0225-extracted")
    extract_tarballs(tarball_dir, extracted_dir)

    def id_to_path(id: str, warn: bool = False) -> Path | None:
        path = extracted_dir / f"{id}.pdf"
        if not path.exists():
            if warn:
                logger.warning(f"File {path} not found")
            return None
        return path

    return dataset, id_to_path
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Sanity-check the dataset: resolve each PDF path and parse each response."""
    dataset, id_to_path = prepare_olmocr_dataset()

    for sample in dataset:
        id_to_path(sample["id"], warn=True)  # emits a warning for missing PDFs
        malformed, _response = parse_response(sample, warn=True)
        if malformed:
            continue
|
||||||
|
|
||||||
|
|
||||||
|
# Run the dataset sanity check when executed as a script.
if __name__ == "__main__":
    main()
|
||||||
24
src/pdf_extraction_agenda/logging.yaml
Normal file
24
src/pdf_extraction_agenda/logging.yaml
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# dictConfig schema (logging.config) with colorlog-based colored formatters.
version: 1
disable_existing_loggers: False
formatters:
  # "src" formatter shows the file path + line number; %(relativePath)s is
  # injected by RelativePathFilter in logging_.py
  src:
    "()": colorlog.ColoredFormatter
    format: '[%(asctime)s] [%(log_color)s%(levelname)s%(reset)s] [%(cyan)sFile "%(relativePath)s", line %(lineno)d%(reset)s] %(message)s'
  default:
    "()": colorlog.ColoredFormatter
    format: '[%(asctime)s] [%(log_color)s%(levelname)s%(reset)s] [%(name)s] %(message)s'
handlers:
  src:
    formatter: src
    class: logging.StreamHandler
    stream: ext://sys.stdout
  default:
    formatter: default
    class: logging.StreamHandler
    stream: ext://sys.stdout
loggers:
  # Project logger used throughout the package (logging.getLogger("src"))
  src:
    level: INFO
    handlers:
      - src
    # Do not also emit through the root logger's handlers
    propagate: no
|
||||||
22
src/pdf_extraction_agenda/logging_.py
Normal file
22
src/pdf_extraction_agenda/logging_.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
__all__ = ["logger"]
|
||||||
|
|
||||||
|
import logging.config
|
||||||
|
import os
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
class RelativePathFilter(logging.Filter):
    """Attach ``relativePath`` (pathname relative to CWD) to every record.

    The ``src`` formatter in logging.yaml references ``%(relativePath)s``.
    """

    def filter(self, record: logging.LogRecord) -> bool:
        # Enrich the record; always keep it (never filter anything out).
        record.relativePath = os.path.relpath(record.pathname)
        return True
|
||||||
|
|
||||||
|
|
||||||
|
# Locate logging.yaml next to this module so configuration works regardless of CWD.
logging_yaml = os.path.join(os.path.dirname(__file__), "logging.yaml")

# Configure the logging tree once at import time from the YAML dictConfig schema.
with open(logging_yaml) as f:
    config = yaml.safe_load(f)
logging.config.dictConfig(config)

# Shared project logger ("src" matches the logger name in logging.yaml);
# the filter supplies %(relativePath)s for the "src" formatter.
logger = logging.getLogger("src")
logger.addFilter(RelativePathFilter())
|
||||||
52
src/pdf_extraction_agenda/main.py
Normal file
52
src/pdf_extraction_agenda/main.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
from os import PathLike
|
||||||
|
from typing import Literal, NewType, Protocol, assert_never
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from datasets_ import parse_response, prepare_olmocr_dataset
|
||||||
|
from metrics import calc_nid
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineProto(Protocol):
|
||||||
|
def __call__(self, path: str | PathLike) -> str:
|
||||||
|
"""Runs the pipeline on the given path and returns the md result."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# Distinct alias marking a DataFrame that holds per-sample evaluation metrics.
EvaluationResult = NewType("EvaluationResult", pd.DataFrame)
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_pipeline(run_pipeline: PipelineProto) -> EvaluationResult:
    """Run *run_pipeline* over the olmOCR eval set and collect per-sample metrics.

    Args:
        run_pipeline: Callable converting a PDF path into markdown text.

    Returns:
        EvaluationResult: DataFrame with one row per successfully processed
        sample (currently only the ``nid`` similarity score).
    """
    dataset, id_to_path = prepare_olmocr_dataset()

    metrics_raw = []

    for s in tqdm(dataset):
        path = id_to_path(s["id"], warn=True)
        malformed, response = parse_response(s, warn=True)
        # BUG FIX: also skip samples whose PDF is missing — the original passed
        # path=None straight into run_pipeline, which would crash downstream.
        if malformed or path is None:
            continue

        md_result = run_pipeline(path)
        nid = calc_nid(response.natural_text, md_result)
        metrics_raw.append({"nid": nid})

    metrics_df = pd.DataFrame(metrics_raw)
    return EvaluationResult(metrics_df)
|
||||||
|
|
||||||
|
|
||||||
|
def main(pipeline: Literal["docling"]):
    """Evaluate the selected extraction pipeline and print its metrics table."""
    if pipeline != "docling":
        assert_never(pipeline)
    # Imported lazily: docling lives in an optional dependency group.
    from pipeline_docling import run_docling_pipeline

    results = evaluate_pipeline(run_docling_pipeline)

    print(results)
|
||||||
|
|
||||||
|
|
||||||
|
# Evaluate the docling pipeline when executed as a script.
if __name__ == "__main__":
    main("docling")
|
||||||
31
src/pdf_extraction_agenda/metrics.py
Normal file
31
src/pdf_extraction_agenda/metrics.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from rapidfuzz import fuzz
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_text(text: str) -> str:
|
||||||
|
"""Normalize text for comparison."""
|
||||||
|
if text is None:
|
||||||
|
return ""
|
||||||
|
return text.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
def calc_nid(gt_text: str, pred_text: str) -> float:
    """Calculate the Normalized Indel score between the gt and pred text.

    Args:
        gt_text (str): The string of gt text to compare.
        pred_text (str): The string of pred text to compare.

    Returns:
        float: The nid score between gt and pred text. [0., 1.]
    """
    gt_text = _normalize_text(gt_text)
    pred_text = _normalize_text(pred_text)

    # if gt and pred are both empty, they match perfectly
    if len(gt_text) == 0 and len(pred_text) == 0:
        score = 1.0
    # if pred is empty while gt is not, there is no overlap at all
    elif len(gt_text) > 0 and len(pred_text) == 0:
        score = 0.0
    else:
        # BUG FIX: fuzz.ratio returns a percentage in [0, 100]; divide by 100 so
        # the result matches the documented [0., 1.] range and is consistent with
        # the empty-input constants above.
        score = fuzz.ratio(gt_text, pred_text) / 100.0

    return score
|
||||||
10
src/pdf_extraction_agenda/pipeline_docling.py
Normal file
10
src/pdf_extraction_agenda/pipeline_docling.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
from os import PathLike
|
||||||
|
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
# Module-level converter so docling's (expensive) initialization happens once
# per process instead of once per document.
document_converter = DocumentConverter()


def run_docling_pipeline(path: str | PathLike) -> str:
    """Convert the PDF at *path* with docling and return the markdown export."""
    result = document_converter.convert(path)
    return result.document.export_to_markdown()
|
||||||
Reference in New Issue
Block a user