#!/usr/bin/env python3
"""
Split Mathpix raw output into per-lecture chunks.

Looks at every .mmd / .tex file under raw/ and splits each on lines that
match a 'Lecture N' header (case-insensitive, tolerant of OCR noise).
Writes the chunks to raw/per-lecture/lecture01.{mmd,tex} ...

Usage:
    python scripts/split_lectures.py --raw raw --out raw/per-lecture

If a file fails to detect any lecture markers, it is copied wholesale into
raw/per-lecture/UNSPLIT-<filename> so you can split it manually.
"""
from __future__ import annotations

import argparse
import pathlib
import re
import shutil

# Tolerant of "Lecture 1", "Lecture I", "lec 1", "L1", optional colon/dash
HEADER_RE = re.compile(
    r"""^\s*
    (?:\#+\s*)?                 # optional markdown header
    (?:\\section\*?\{)?         # optional latex section
    (?:lecture|lec\.?|l)\s*     # word "lecture" / abbrev
    (?P<num>[ivx]+|\d+)         # number (arabic or roman)
    \b                          # number must end here, so that ordinary words
                                # such as "Lines" or "Lecture introduction"
                                # are not read as "L" + roman numeral "i"
    \s*[:\.\)\-\}]?             # optional punctuation
    """,
    re.IGNORECASE | re.VERBOSE,
)

ROMAN = {
    "i": 1,
    "ii": 2,
    "iii": 3,
    "iv": 4,
    "v": 5,
    "vi": 6,
    "vii": 7,
    "viii": 8,
    "ix": 9,
    "x": 10,
    "xi": 11,
    "xii": 12,
    "xiii": 13,
}

# Highest lecture number accepted as a header; kept in sync with ROMAN so that
# arabic and roman numbering cover the same range.
MAX_LECTURE = max(ROMAN.values())


def to_int(num: str) -> int:
    """Convert a lecture number written in roman or arabic digits to an int.

    Args:
        num: The number text captured from a header, in any letter case.

    Returns:
        The integer value, or -1 when *num* is neither a roman numeral listed
        in ROMAN nor a string of decimal digits.
    """
    num = num.lower()
    if num in ROMAN:
        return ROMAN[num]
    # isdecimal() (unlike isdigit()) is true only for characters int() can
    # actually parse, so stray OCR glyphs such as a superscript digit yield -1
    # instead of raising ValueError.
    return int(num) if num.isdecimal() else -1


def split_file(path: pathlib.Path, out_dir: pathlib.Path, ext: str) -> int:
    """Split one raw file into per-lecture chunks inside *out_dir*.

    Each line matching HEADER_RE with a lecture number in 1..MAX_LECTURE
    starts a new chunk that runs up to the next header (or the end of the
    file). Chunks are written to ``lectureNN.<ext>``; when that file already
    exists the chunk is appended after a continuation marker. Text that
    precedes the first detected header is not written anywhere.

    If no header is found, the file is copied unchanged to
    ``UNSPLIT-<original name>`` in *out_dir*.

    Args:
        path: Source file to split; read as UTF-8 with undecodable bytes
            replaced.
        out_dir: Directory that receives the chunks (created if missing).
        ext: Extension, without the dot, used for the output file names.

    Returns:
        The number of chunks written; 0 when the file was copied unsplit.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    lines = text.splitlines(keepends=True)

    # find header line indices and their lecture numbers
    hits: list[tuple[int, int]] = []  # (line_index, lecture_n)
    for i, line in enumerate(lines):
        m = HEADER_RE.match(line)
        if not m:
            continue
        n = to_int(m.group("num"))
        if 1 <= n <= MAX_LECTURE:
            hits.append((i, n))

    if not hits:
        target = out_dir / f"UNSPLIT-{path.name}"
        out_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy(path, target)
        print(f" no lecture markers in {path.name} -> {target.name}")
        return 0

    # de-dup runs of the same lecture number (OCR can pick up the heading twice)
    deduped: list[tuple[int, int]] = []
    for h in hits:
        if not deduped or deduped[-1][1] != h[1]:
            deduped.append(h)

    # add sentinel end
    bounded = deduped + [(len(lines), -1)]

    out_dir.mkdir(parents=True, exist_ok=True)
    written = 0
    for (start, n), (end, _) in zip(bounded, bounded[1:]):
        chunk = "".join(lines[start:end])
        out = out_dir / f"lecture{n:02d}.{ext}"
        # if multiple source files contribute to the same lecture, append
        # (this also happens when a lecture number recurs later in one file)
        mode = "a" if out.exists() else "w"
        with out.open(mode, encoding="utf-8") as fh:
            if mode == "a":
                fh.write("\n\n%-- continued from another source --\n\n")
            fh.write(chunk)
        written += 1
        print(f" {path.name} -> {out.name} ({end - start} lines)")
    return written


def main() -> None:
    """Command-line entry point: split every .mmd / .tex file under --raw.

    The output directory is deleted and rebuilt on every run.

    Raises:
        SystemExit: If the raw directory does not exist, or if the output
            directory is the raw directory itself or one of its ancestors
            (wiping it would destroy the input).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--raw", default="raw", help="Mathpix raw output root")
    ap.add_argument("--out", default="raw/per-lecture", help="Per-lecture output dir")
    args = ap.parse_args()

    raw = pathlib.Path(args.raw)
    out = pathlib.Path(args.out)
    if not raw.exists():
        raise SystemExit(f"raw directory {raw} does not exist; run `make mathpix` first")

    # The output dir is removed below, so it must never contain the input.
    raw_resolved = raw.resolve()
    out_resolved = out.resolve()
    if out_resolved == raw_resolved or out_resolved in raw_resolved.parents:
        raise SystemExit(
            f"refusing to use {out} as output dir: clearing it would delete "
            f"the raw input under {raw}"
        )

    if out.exists():
        shutil.rmtree(out)

    total = 0
    for ext in ("mmd", "tex"):
        for path in sorted(raw.rglob(f"*.{ext}")):
            # never re-ingest previously generated per-lecture output
            if "per-lecture" in path.parts:
                continue
            total += split_file(path, out, ext)
    print(f"\nWrote {total} per-lecture chunks under {out}/")


if __name__ == "__main__":
    main()