#!/usr/bin/env python3
"""
Split Donald Cohn's *Measure Theory (2nd ed.)* into 21 printable parts:
  1) Title page + Preface + Contents + Introduction
  2-11) Chapters 1..10
  12-19) Appendices A..H
  20) References
  21) Index of notation + Index (combined)

Usage:
  python split_cohn_measure_theory.py "/path/to/Measure Theory (2nd ed.) - Cohn, Donald L._5990.pdf" ./cohn_splits

Deps:
  pip install pypdf   # (or PyPDF2 as fallback)
"""

import os
import re
import sys
from typing import Dict, List, Optional, Tuple

# Headings exactly as they appear in the table of contents, in book order.
CHAPTERS = [
    "1 Measures",
    "2 Functions and Integrals",
    "3 Convergence",
    "4 Signed and Complex Measures",
    "5 Product Measures",
    "6 Differentiation",
    "7 Measures on Locally Compact Spaces",
    "8 Polish Spaces and Analytic Sets",
    "9 Haar Measure",
    "10 Probability",
]
APPENDICES = [
    "A Notation and Set Theory",
    "B Algebra and Basic Facts About R and C",
    "C Calculus and Topology in R",
    "D Topological Spaces and Metric Spaces",
    "E The Bochner Integral",
    "F Liftings",
    "G The Banach–Tarski Paradox",
    "H The Henstock–Kurzweil and McShane Integrals",
]
TRAILING = ["References", "Index of notation", "Index"]
ANCHOR_ORDER = ["Introduction"] + CHAPTERS + APPENDICES + TRAILING
APPENDIX_B = "B Algebra and Basic Facts About R and C"


def load_reader(pdf_path: str):
    """Open `pdf_path` with pypdf's PdfReader, falling back to PyPDF2 if pypdf is not installed."""
    try:
        from pypdf import PdfReader  # preferred
    except ImportError:
        from PyPDF2 import PdfReader  # fallback
    return PdfReader(pdf_path)


def extract_texts(reader) -> List[str]:
    """Return the extracted text of every page of `reader`, in page order.

    Pages whose extraction fails or yields nothing contribute an empty string,
    so the result always has exactly one entry per page.
    """
    texts = []
    for i in range(len(reader.pages)):
        try:
            txt = reader.pages[i].extract_text() or ""
        except Exception:
            # Deliberate best-effort: one unreadable page must not abort the
            # whole split; it simply cannot carry a heading.
            txt = ""
        texts.append(txt)
    return texts


def find_heading_after(texts: List[str], heading: str, start_from: int) -> int:
    """Find the first page >= start_from with a line exactly equal to `heading`.

    If no page has such a line, fall back to the first page >= start_from that
    contains `heading` as a substring. Returns the zero-based page index, or
    -1 if the heading is not found at all.
    """
    num_pages = len(texts)
    pattern = re.compile(rf"^{re.escape(heading)}\s*$", re.MULTILINE)
    # The exact-line pass runs over all candidate pages first, so a real
    # heading line wins over an earlier page that merely mentions the heading.
    for i in range(start_from, num_pages):
        if pattern.search(texts[i]):
            return i
    for i in range(start_from, num_pages):
        if heading in texts[i]:
            return i
    return -1


def detect_anchor_pages(texts: List[str]) -> Dict[str, Optional[int]]:
    """Locate the start page of every heading in ANCHOR_ORDER.

    Headings are searched in table-of-contents order; each search starts on
    the page after the previously found heading. Returns a dict mapping every
    heading to its zero-based page index, or None if it was not found.
    """
    anchors: Dict[str, Optional[int]] = {}
    start_from = 0
    for h in ANCHOR_ORDER:
        p = find_heading_after(texts, h, start_from)
        anchors[h] = p if p >= 0 else None
        if p >= 0:
            start_from = p + 1

    # The OCR sometimes mangles "R and C" on Appendix B pages; if not found,
    # scan the window between A and C for a line starting with "B".
    if anchors.get(APPENDIX_B) is None:
        a = anchors.get("A Notation and Set Theory")
        c = anchors.get("C Calculus and Topology in R")
        if a is not None and c is not None:
            # Start after A's own first page: Appendix B cannot begin there,
            # and a body line such as "B is a set" would be a false match.
            for i in range(a + 1, c):
                # Check early lines for a heading that begins with "Appendix B" or just "B"
                lines = [ln.strip() for ln in (texts[i] or "").splitlines()[:20]]
                if any(re.match(r"^(Appendix\s+)?B\b", ln) for ln in lines):
                    anchors[APPENDIX_B] = i
                    break
    return anchors


def build_sections(anchors: Dict[str, Optional[int]], total_pages: int) -> List[Tuple[str, int, int]]:
    """Return list of (output_name, start_page, end_page) inclusive, zero-based pages.

    Output names carry a two-digit ordinal prefix 01..21 matching the module
    docstring: 01 front matter + introduction, 02..11 chapters 1..10,
    12..19 appendices A..H, 20 references, 21 both indexes combined.

    Raises RuntimeError if a chapter, an appendix, 'References' or
    'Index of notation' has no anchor page.
    """
    master = CHAPTERS + APPENDICES + ["References", "Index of notation", "Index"]

    # 1) Frontmatter + Introduction: everything before chapter 1.
    ch1_start = anchors.get("1 Measures")
    if ch1_start is None:
        raise RuntimeError("Could not locate '1 Measures' start page.")
    sections: List[Tuple[str, int, int]] = [("01_frontmatter_introduction", 0, ch1_start - 1)]

    def end_before_next(key: str) -> int:
        """Last page of `key`: the page before the next located anchor, else the final page."""
        for later in master[master.index(key) + 1:]:
            page = anchors.get(later)
            if page is not None:
                return page - 1
        return total_pages - 1

    # Chapters 1..10 -> ordinals 02..11 (front matter already occupies 01).
    for key in CHAPTERS:
        start = anchors.get(key)
        if start is None:
            raise RuntimeError(f"Could not locate start for {key}")
        num = int(key.split()[0])
        sections.append((f"{num + 1:02d}_chapter_{num}", start, end_before_next(key)))

    # Appendices A..H -> ordinals 12..19.
    for key in APPENDICES:
        start = anchors.get(key)
        if start is None:
            raise RuntimeError(f"Could not locate start for {key}")
        letter = key.split()[0]  # 'A'..'H'
        ordinal = 12 + (ord(letter) - ord("A"))
        sections.append((f"{ordinal:02d}_appendix_{letter.lower()}", start, end_before_next(key)))

    # References run up to the page before the index of notation.
    ref_start = anchors.get("References")
    if ref_start is None:
        raise RuntimeError("Could not locate 'References'")
    idx_not = anchors.get("Index of notation")
    if idx_not is None:
        raise RuntimeError("Could not locate 'Index of notation'")
    sections.append(("20_references", ref_start, idx_not - 1))

    # Index of notation + Index combined, through the end of the book.
    sections.append(("21_indexes", idx_not, total_pages - 1))

    assert len(sections) == 21, f"Expected 21 outputs, got {len(sections)}"
    return sections


def write_sections(reader, sections: List[Tuple[str, int, int]], out_dir: str):
    """Write each (name, start, end) section to `out_dir`/<name>.pdf.

    `out_dir` is created if missing. Page ranges are inclusive and zero-based.
    Returns the list of written file paths, in section order.
    """
    os.makedirs(out_dir, exist_ok=True)
    try:
        from pypdf import PdfWriter
    except ImportError:
        from PyPDF2 import PdfWriter

    written = []
    for name, start, end in sections:
        writer = PdfWriter()
        for p in range(start, end + 1):
            writer.add_page(reader.pages[p])
        out_path = os.path.join(out_dir, f"{name}.pdf")
        with open(out_path, "wb") as f:
            writer.write(f)
        written.append(out_path)
    return written


def adjust_anchor_offsets(anchors: dict) -> dict:
    """Return a copy of `anchors` with every located page shifted back by one.

    This compensates for the off-by-one caused by low-text title pages (the
    title leaf precedes the page on which the heading text is detected).
    Anchors at page 0 and unresolved (None) anchors are left untouched, so no
    value ever goes negative. The input dict is not modified.
    """
    min_page = 0
    return {
        key: (page - 1 if isinstance(page, int) and page > min_page else page)
        for key, page in anchors.items()
    }


def heading_present_on_page(text: str, heading: str) -> bool:
    """True if a line equals the heading."""
    if not text:
        return False
    return re.search(rf"(?m)^{re.escape(heading)}\s*$", text) is not None


def nudge_anchor_forward_if_needed(texts, anchors, key):
    """Move anchors[key] forward by one page if the heading is only on the next page.

    If the exact heading line is not on page anchors[key] but is on the page
    after it, the anchor is advanced in place. This fixes anchors that were
    shifted back too early. Unresolved or out-of-range anchors are ignored.
    """
    p = anchors.get(key)
    if not isinstance(p, int) or p < 0 or p >= len(texts):
        return
    if heading_present_on_page(texts[p], key):
        return
    if p + 1 < len(texts) and heading_present_on_page(texts[p + 1], key):
        anchors[key] = p + 1


def main():
    """Command-line entry point: split INPUT.pdf into the 21 parts inside OUTPUT_DIR."""
    if len(sys.argv) < 3:
        print("Usage: python split_cohn_measure_theory.py INPUT.pdf OUTPUT_DIR")
        sys.exit(1)
    pdf_path = sys.argv[1]
    out_dir = sys.argv[2]

    reader = load_reader(pdf_path)
    texts = extract_texts(reader)

    anchors = detect_anchor_pages(texts)
    # Shift back by one page for chapters/appendices (title leaf).
    anchors = adjust_anchor_offsets(anchors)
    # Fix early anchors on trailing sections, which have no title leaf.
    for key in TRAILING:
        nudge_anchor_forward_if_needed(texts, anchors, key)

    # Sanity print, done before building sections so the anchors are visible
    # even when build_sections rejects them.
    print("Detected anchors:")
    for k, v in anchors.items():
        print(f"  {k:50s} -> {v}")

    sections = build_sections(anchors, len(reader.pages))
    written = write_sections(reader, sections, out_dir)
    print("\nWrote files:")
    for w in written:
        print("  ", w)


if __name__ == "__main__":
    main()