#!/usr/bin/env python3
"""
Split Mathpix raw output into per-lecture chunks.

Looks at every .mmd / .tex file under raw/ and splits each on lines that
match a 'Lecture N' header (case-insensitive, tolerant of OCR noise).
Writes the chunks to raw/per-lecture/lecture01.{mmd,tex} ...

Usage:
    python scripts/split_lectures.py --raw raw --out raw/per-lecture

If a file fails to detect any lecture markers, it is copied wholesale
into raw/per-lecture/UNSPLIT-<name> so you can split it manually.
"""

from __future__ import annotations

import argparse
import pathlib
import re
import shutil

# Tolerant of "Lecture 1", "Lecture I", "lec 1", "L1", optional colon/dash.
# Two guards keep ordinary prose from being mistaken for a header:
#   * the number must be a whole token (\b), so "Lecture videos" is not read
#     as lecture VI and "Limit" is not read as lecture I;
#   * the bare "L" abbreviation only accepts arabic digits, so a line starting
#     with a word such as "Li" is not read as lecture I.
HEADER_RE = re.compile(
    r"""^\s*
        (?:\#+\s*)?                                     # optional markdown header
        (?:\\section\*?\{)?                             # optional latex section
        (?:lecture|lec\.?|l(?=\s*\d))\s*                # "lecture" / "lec" / bare "l" (digits only)
        (?P<num>[ivx]+|\d+)\b                           # number (arabic or roman), whole token
        \s*[:\.\)\-\}]?                                 # optional punctuation
    """,
    re.IGNORECASE | re.VERBOSE,
)

# Lower-case roman numerals recognised by to_int(), covering 1 through 13.
ROMAN = {
    "i": 1, "ii": 2, "iii": 3, "iv": 4, "v": 5, "vi": 6, "vii": 7,
    "viii": 8, "ix": 9, "x": 10, "xi": 11, "xii": 12, "xiii": 13,
}


def to_int(num: str) -> int:
    """Convert a lecture number written in roman or arabic form to an int.

    Args:
        num: The number text, e.g. "IV" or "12". Case is ignored.

    Returns:
        The integer value, or -1 if *num* is neither a roman numeral listed
        in ROMAN nor a string of decimal digits.
    """
    key = num.lower()
    if key in ROMAN:
        return ROMAN[key]
    # isdecimal(), unlike isdigit(), is false for characters such as "²"
    # that int() cannot parse, so unparseable input yields -1 instead of
    # raising ValueError.
    return int(key) if key.isdecimal() else -1


def split_file(path: pathlib.Path, out_dir: pathlib.Path, ext: str) -> int:
    """Split one raw file into per-lecture chunks under *out_dir*.

    The file is read as UTF-8 (undecodable bytes are replaced). A line counts
    as a lecture header when HEADER_RE matches it and its number is in 1..13.
    Each chunk runs from a header up to the next header, or to end of file,
    and is written to ``lectureNN.<ext>``. If that file already exists, the
    chunk is appended after a separator comment. Lines that precede the first
    header are not written to any output file.

    Args:
        path: Source file to split.
        out_dir: Directory that receives the chunks; created if missing.
        ext: Extension (without dot) used for the output file names.

    Returns:
        The number of chunks written, or 0 when no header was found, in which
        case the file is copied unchanged to ``UNSPLIT-<name>`` in *out_dir*.
    """
    source_lines = path.read_text(encoding="utf-8", errors="replace").splitlines(
        keepends=True
    )

    # (line_index, lecture_number) of every header that opens a new chunk.
    # A header repeating the number of the chunk currently open is ignored,
    # since OCR can pick up the same heading twice.
    starts: list[tuple[int, int]] = []
    for idx, line in enumerate(source_lines):
        match = HEADER_RE.match(line)
        if match is None:
            continue
        lecture = to_int(match.group("num"))
        if lecture < 1 or lecture > 13:
            continue
        if starts and starts[-1][1] == lecture:
            continue
        starts.append((idx, lecture))

    out_dir.mkdir(parents=True, exist_ok=True)

    if not starts:
        fallback = out_dir / f"UNSPLIT-{path.name}"
        shutil.copy(path, fallback)
        print(f"  no lecture markers in {path.name} -> {fallback.name}")
        return 0

    # Each chunk stops where the next one begins; the last one runs to EOF.
    stops = [idx for idx, _ in starts[1:]] + [len(source_lines)]
    for (begin, lecture), stop in zip(starts, stops):
        dest = out_dir / f"lecture{lecture:02d}.{ext}"
        # An existing file means an earlier chunk already fed this lecture,
        # so append rather than overwrite.
        appending = dest.exists()
        with dest.open("a" if appending else "w", encoding="utf-8") as fh:
            if appending:
                fh.write("\n\n%-- continued from another source --\n\n")
            fh.writelines(source_lines[begin:stop])
        print(f"  {path.name} -> {dest.name} ({stop - begin} lines)")
    return len(starts)


def main() -> None:
    """Command-line entry point: split every .mmd/.tex file under the raw root.

    The output directory is deleted and rebuilt on every run, then each
    ``*.mmd`` and ``*.tex`` file found recursively under ``--raw`` is passed
    to split_file().

    Raises:
        SystemExit: If the raw directory does not exist, or if the output
            directory is the raw directory or one of its ancestors (wiping
            it would delete the inputs).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--raw", default="raw", help="Mathpix raw output root")
    ap.add_argument("--out", default="raw/per-lecture", help="Per-lecture output dir")
    args = ap.parse_args()

    raw = pathlib.Path(args.raw)
    out = pathlib.Path(args.out)

    if not raw.exists():
        raise SystemExit(f"raw directory {raw} does not exist; run `make mathpix` first")

    # The output directory is wiped below, so never let it be the raw root
    # or a directory containing it: that would destroy the source files.
    raw_abs = raw.resolve()
    out_abs = out.resolve()
    if out_abs == raw_abs or out_abs in raw_abs.parents:
        raise SystemExit(
            f"refusing to delete output directory {out}: "
            f"it is or contains the raw directory {raw}"
        )

    if out.exists():
        shutil.rmtree(out)

    total = 0
    for ext in ("mmd", "tex"):
        # sorted() materialises the listing before any chunk is written.
        for path in sorted(raw.rglob(f"*.{ext}")):
            # Skip earlier per-lecture output. Only components below the raw
            # root are inspected, so a raw root that itself lives under a
            # directory named "per-lecture" is not skipped wholesale.
            if "per-lecture" in path.relative_to(raw).parts:
                continue
            total += split_file(path, out, ext)
    print(f"\nWrote {total} per-lecture chunks under {out}/")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()