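"""Benchmark PDF-to-markdown conversion.

Converts every PDF in an input folder with marker (and optionally nougat),
scores each result against a reference markdown file with the same stem, and
writes per-file and aggregate timing/score statistics to a JSON report along
with a printed summary table.

Example invocation (the script filename benchmark.py and the paths are
illustrative assumptions):

    python benchmark.py pdfs/ references/ report.json --nougat --md_out_path out/
"""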
import argparse
import json
import logging
import os
import shutil
import subprocess
import tempfile
import time
from collections import defaultdict

import pypdfium2 as pdfium
import torch
from tabulate import tabulate
from tqdm import tqdm

from marker.benchmark.scoring import score_text
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.pdf.extract_text import naive_get_text

configure_logging()
# Module-level logger, used when a memory snapshot cannot be written.
logger = logging.getLogger(__name__)


def start_memory_profiling():
    # Begin recording CUDA allocation history for later snapshotting.
    torch.cuda.memory._record_memory_history(
        max_entries=100000
    )


def stop_memory_profiling(memory_file):
    # Dump the recorded CUDA allocation history to a pickle file. Snapshots can
    # be inspected with PyTorch's memory visualizer (https://pytorch.org/memory_viz).
    try:
        torch.cuda.memory._dump_snapshot(memory_file)
    except Exception as e:
        logger.error(f"Failed to capture memory snapshot: {e}")

    # Stop recording memory snapshot history.
    torch.cuda.memory._record_memory_history(enabled=None)
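

# nougat_prediction shells out to the external `nougat` CLI (Meta's academic
# document OCR model), which must be installed separately and be on PATH.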
def nougat_prediction(pdf_filename, batch_size=1):
    out_dir = tempfile.mkdtemp()
    subprocess.run(
        ["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)],
        check=True,
    )
    md_file = os.listdir(out_dir)[0]
    with open(os.path.join(out_dir, md_file), "r") as f:
        data = f.read()
    shutil.rmtree(out_dir)
    return data


def main():
    parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source PDFs and a reference folder with the correct markdown.")
    parser.add_argument("in_folder", help="Input folder with PDF files")
    parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
    parser.add_argument("out_file", help="Output filename")
    parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)
    # Nougat batch size 1 uses about as much VRAM as default marker settings
    parser.add_argument("--marker_batch_multiplier", type=int, default=1, help="Batch size multiplier to use for marker when making predictions.")
    parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
    parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
    parser.add_argument("--profile_memory", action="store_true", help="Profile memory usage", default=False)

    args = parser.parse_args()

    methods = ["marker"]
    if args.nougat:
        methods.append("nougat")

    if args.profile_memory:
        start_memory_profiling()

    model_lst = load_all_models()

    if args.profile_memory:
        stop_memory_profiling("model_load.pickle")

    scores = defaultdict(dict)
    benchmark_files = os.listdir(args.in_folder)
    benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
    times = defaultdict(dict)
    pages = defaultdict(int)

    for idx, fname in tqdm(enumerate(benchmark_files), total=len(benchmark_files)):
        md_filename = fname.rsplit(".", 1)[0] + ".md"

        reference_filename = os.path.join(args.reference_folder, md_filename)
        with open(reference_filename, "r", encoding="utf-8") as f:
            reference = f.read()

        pdf_filename = os.path.join(args.in_folder, fname)
        doc = pdfium.PdfDocument(pdf_filename)
        pages[fname] = len(doc)

        for method in methods:
            start = time.time()
            if method == "marker":
                if args.profile_memory:
                    start_memory_profiling()
                full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, batch_multiplier=args.marker_batch_multiplier)
                if args.profile_memory:
                    stop_memory_profiling(f"marker_memory_{idx}.pickle")
            elif method == "nougat":
                full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
            elif method == "naive":
                full_text = naive_get_text(doc)
            else:
                raise ValueError(f"Unknown method {method}")

            times[method][fname] = time.time() - start

            score = score_text(full_text, reference)
            scores[method][fname] = score

            if args.md_out_path:
                md_out_filename = f"{method}_{md_filename}"
                with open(os.path.join(args.md_out_path, md_out_filename), "w+") as f:
                    f.write(full_text)
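
    # Aggregate results: the JSON report maps each method to per-file stats
    # ("time", "score", "pages") plus avg_score, time_per_page and time_per_doc.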
    total_pages = sum(pages.values())
    with open(args.out_file, "w+") as f:
        write_data = defaultdict(dict)
        for method in methods:
            total_time = sum(times[method].values())
            file_stats = {
                fname: {
                    "time": times[method][fname],
                    "score": scores[method][fname],
                    "pages": pages[fname]
                }
                for fname in benchmark_files
            }
            write_data[method] = {
                "files": file_stats,
                "avg_score": sum(scores[method].values()) / len(scores[method]),
                "time_per_page": total_time / total_pages,
                "time_per_doc": total_time / len(scores[method])
            }

        json.dump(write_data, f, indent=4)

    summary_table = []
    score_table = []
    score_headers = benchmark_files
    for method in methods:
        summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
        score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

    print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
    print("")
    print("Scores by file")
    print(tabulate(score_table, headers=["Method", *score_headers]))


if __name__ == "__main__":
    main()