#!/usr/bin/env python3 """ Submit handwritten PDFs to the Mathpix v3/pdf API and download structured LaTeX output as a tex.zip bundle. Usage: python scripts/mathpix_extract.py --pdf ../lec1-4.pdf --pdf ../lecs5-13.pdf --out raw Reads MATHPIX_APP_ID and MATHPIX_APP_KEY from environment or course-notes/.env. The v3/pdf endpoint is the right Mathpix API for this job: it handles multi-page handwritten PDFs in one shot, returns Markdown + LaTeX with preserved structure (sections, equations, tables), and is asynchronous so we just submit, poll, and pull the result. """ from __future__ import annotations import argparse import io import json import os import pathlib import sys import time import zipfile try: import requests except ImportError: sys.exit( "ERROR: this script needs `requests`. Install it with:\n" " python3 -m pip install --user requests" ) ROOT = pathlib.Path(__file__).resolve().parent.parent API_BASE = "https://api.mathpix.com/v3/pdf" def load_env(path: pathlib.Path) -> None: """Load KEY=VALUE lines from a .env file into os.environ (no overwrite).""" if not path.exists(): return for raw in path.read_text().splitlines(): line = raw.strip() if not line or line.startswith("#") or "=" not in line: continue key, value = line.split("=", 1) os.environ.setdefault(key.strip(), value.strip().strip('"').strip("'")) def headers() -> dict[str, str]: return { "app_id": os.environ["MATHPIX_APP_ID"], "app_key": os.environ["MATHPIX_APP_KEY"], } def submit(pdf_path: pathlib.Path) -> str: """POST a PDF, return the assigned pdf_id.""" options = { # tex.zip = full LaTeX bundle; md = standard Markdown. # Mathpix Markdown (.mmd) is always available via the /{id}.mmd endpoint # and does not need to be listed here. "conversion_formats": {"tex.zip": True, "md": True}, "math_inline_delimiters": ["$", "$"], "math_display_delimiters": ["$$", "$$"], "rm_spaces": True, "enable_tables_fallback": True, "auto_number_sections": False, # handwriting recognition is on by default for v3/pdf } with pdf_path.open("rb") as fh: response = requests.post( API_BASE, headers=headers(), files={"file": (pdf_path.name, fh, "application/pdf")}, data={"options_json": json.dumps(options)}, timeout=120, ) response.raise_for_status() payload = response.json() pdf_id = payload.get("pdf_id") if not pdf_id: sys.exit(f" no pdf_id in response: {payload}") print(f" submitted: pdf_id={pdf_id}") return pdf_id def poll(pdf_id: str, timeout: int = 1200, interval: int = 5, require_formats: tuple[str, ...] = ("tex.zip",)) -> None: """Block until both the main conversion AND each requested format finish.""" deadline = time.time() + timeout last_print = "" while time.time() < deadline: r = requests.get(f"{API_BASE}/{pdf_id}", headers=headers(), timeout=30) r.raise_for_status() info = r.json() status = info.get("status") pct = info.get("percent_done", 0) cstatus = info.get("conversion_status", {}) or {} fmt_states = {f: (cstatus.get(f) or {}).get("status", "?") for f in require_formats} msg = f" status={status} ({pct}%) | formats=" + ", ".join( f"{f}:{s}" for f, s in fmt_states.items() ) if msg != last_print: print(msg); last_print = msg if status in {"error", "failed"}: sys.exit(f" conversion failed: {info}") if status == "completed" and all(s == "completed" for s in fmt_states.values()): return time.sleep(interval) sys.exit(" TIMEOUT waiting for Mathpix conversion") def fetch(pdf_id: str, fmt: str, dest: pathlib.Path) -> None: """Download a single output format (tex, mmd, md, html, lines.json...).""" r = requests.get(f"{API_BASE}/{pdf_id}.{fmt}", headers=headers(), timeout=120) r.raise_for_status() dest.parent.mkdir(parents=True, exist_ok=True) dest.write_bytes(r.content) print(f" wrote {dest} ({len(r.content):,} bytes)") def fetch_tex_bundle(pdf_id: str, dest_dir: pathlib.Path) -> None: """Pull the tex.zip and unpack into dest_dir/. The download endpoint extension must match the conversion_formats key *exactly*, so for `tex.zip` we hit `/v3/pdf/{id}.tex.zip` (not `.tex`). """ r = requests.get(f"{API_BASE}/{pdf_id}.tex.zip", headers=headers(), timeout=120) r.raise_for_status() dest_dir.mkdir(parents=True, exist_ok=True) with zipfile.ZipFile(io.BytesIO(r.content)) as z: z.extractall(dest_dir) print(f" unpacked tex.zip into {dest_dir}/ ({len(z.namelist())} files)") def process(pdf_path: pathlib.Path, out_root: pathlib.Path) -> None: name = pdf_path.stem target = out_root / name print(f"\n>>> {pdf_path}") pdf_id = submit(pdf_path) poll(pdf_id) fetch_tex_bundle(pdf_id, target) fetch(pdf_id, "mmd", target / f"{name}.mmd") # save the pdf_id for later re-fetches without re-uploading (target / "pdf_id.txt").write_text(pdf_id + "\n") def main() -> None: load_env(ROOT / ".env") if not (os.environ.get("MATHPIX_APP_ID") and os.environ.get("MATHPIX_APP_KEY")): sys.exit( "ERROR: set MATHPIX_APP_ID and MATHPIX_APP_KEY in course-notes/.env" ) ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--pdf", action="append", required=True, help="PDF to submit (repeatable)") ap.add_argument("--out", default=str(ROOT / "raw"), help="Output directory") args = ap.parse_args() out_root = pathlib.Path(args.out) for pdf in args.pdf: process(pathlib.Path(pdf).resolve(), out_root) print("\nAll done.") if __name__ == "__main__": main()