% =================================================================
% Lecture 10
% Primary source: handwritten notes (Mathpix mmd)
% Fallback: kashlak.pdf (only for OCR/notation/curriculum clarity)
% =================================================================
\section[Lecture 10 -- Hierarchy of Convergence; Borel--Cantelli]{Lecture 10 \textemdash{} Hierarchy of Convergence; Borel--Cantelli; Prohorov}
\label{sec:lec10}

Lecture~9 set up the four modes of convergence for random variables and the equivalent formulations of weak convergence (Portmanteau).
The job now is twofold: assemble these modes into a single hierarchy of implications, and develop the Borel--Cantelli lemmas\,---\,the standard tool for promoting summable bounds on \(\mu(A_i)\) into almost-sure statements about whether \(A_i\) occurs only finitely often.
We close with a brief look at Prohorov's theorem, which gives a compactness criterion for sequences of probability measures and underlies the classical proof of the central limit theorem.

\subsection{Stronger metrics on the space of probability measures}
Weak convergence \(\P_i\Rightarrow\P\) (\cref{def:weak-conv}) is the weakest of a family of notions of closeness on probability measures, all built from the discrepancies \(|\int f\,d\P_i-\int f\,d\P|\) as \(f\) ranges over some test class \(\Fcal\).
Weak convergence only asks that the discrepancy tend to zero for each fixed \(f\in\Fcal\); the stronger notions are genuine distances of the form \(\sup_{f\in\Fcal}|\int f\,d\P_i-\int f\,d\P|\), which require it to tend to zero uniformly over \(\Fcal\).
Demanding uniformity, or enlarging the test class \(\Fcal\), yields a finer notion of closeness:
\begin{itemize}
\item \(\Fcal=C_b(S)\) (continuous bounded), tested one \(f\) at a time: weak convergence.
\item \(\Fcal=\{f:S\to[-1,1]\text{ continuous}\}\): the \emph{Radon metric}.
\item \(\Fcal=\{f:S\to[-1,1]\text{ measurable}\}\): the \emph{total variation} distance.
\item \(\Fcal=\{f:S\to\R\text{ Lipschitz with constant }1\}\): the \emph{1-Wasserstein distance}, central to optimal transport and machine learning, which quantifies \emph{how quickly} the convergence occurs rather than only that it does.
\end{itemize}
\begin{remark}
All four notions are equivalent on a finite \(S\), and convergence in any of the three distances implies weak convergence on any metric space; the converse fails in general.
We work almost exclusively with weak convergence below.
\end{remark}

\subsection{Hierarchy of modes of convergence}
Recall from Lecture~9 the four modes for random variables \(X_i,X\colon(\Omega,\Fcal,\mu)\to(S,\rho)\): convergence in distribution \(X_i\xrightarrow{d}X\), in probability \(X_i\xrightarrow{\P}X\) (\cref{def:conv-prob}), almost surely \(X_i\xrightarrow{a.s.}X\) (\cref{def:conv-as}), and in \(L^p\) \(X_i\xrightarrow{L^p}X\) (\cref{def:conv-lp}).
The implications between them form a strict hierarchy.
\begin{theorem}{Hierarchy of convergence}{hierarchy}
Let \(X_i,X\) be random variables on \((\Omega,\Fcal,\mu)\) with values in a metric space \((S,\rho)\).
Then
\[
\begin{array}{ccc}
X_i\xrightarrow{a.s.}X & \Longrightarrow & X_i\xrightarrow{\P}X \;\Longrightarrow\; X_i\xrightarrow{d}X,\\[4pt]
X_i\xrightarrow{L^p}X & \Longrightarrow & X_i\xrightarrow{\P}X \quad\text{for every } p\in[1,\infty].
\end{array}
\]
None of the reverse implications holds in general.
For \(p<\infty\), \emph{a.s.} and \(L^p\) convergence are not comparable: a.s.\ convergence yields \(L^p\) convergence only under an additional uniform integrability or domination hypothesis, while \(L^p\) convergence yields a.s.\ convergence only along a subsequence.
The endpoint \(p=\infty\) is the exception: \(L^\infty\) convergence is uniform convergence off a null set and therefore implies a.s.\ convergence.
\end{theorem}
\begin{figure}[htbp]
\centering
\begin{tikzpicture}[
  >=Stealth,
  node distance=14mm and 22mm,
  every node/.style={font=\sffamily\small},
  mode/.style={draw=deepnavy, thick, rounded corners=2pt, fill=defbodybg, inner sep=5pt, minimum width=22mm},
  imp/.style={->, thick, exampleblue}
]
\node[mode] (as) {a.s.};
\node[mode, right=of as] (prob) {in prob.};
\node[mode, right=of prob] (dist) {in dist.};
\node[mode, above=of prob] (lp) {\(L^p\)};
\draw[imp] (as) -- (prob);
\draw[imp] (prob) -- (dist);
\draw[imp] (lp) -- (prob);
\node[font=\sffamily\itshape\small, color=remarkgray] at ($(as)!0.5!(lp) + (-0.6,0.3)$) {\(\not\Leftrightarrow\)};
\end{tikzpicture}
\caption{The four modes of convergence and the implications between them. Almost-sure and \(L^p\) convergence are not comparable.}
\label{fig:conv-hierarchy}
\end{figure}

\subsection{Limsup, liminf, and the Borel--Cantelli setup}
Fix a probability space \((\Omega,\Fcal,\mu)\) and a sequence \(\{A_i\}_{i=1}^{\infty}\subseteq\Fcal\).
The set-theoretic limsup and liminf give a precise meaning to ``\(A_i\) happens infinitely often'' and ``\(A_i\) happens eventually''.
\begin{definition}{$\limsup$ and $\liminf$ of events}{limsup-liminf}
For \(\{A_i\}_{i=1}^{\infty}\subseteq\Fcal\),
\[
\limsup_{i} A_i \;=\; \bigcap_{i=1}^{\infty}\bigcup_{j\ge i} A_j, \qquad \liminf_{i} A_i \;=\; \bigcup_{i=1}^{\infty}\bigcap_{j\ge i} A_j.
\]
The event \(\limsup_i A_i\) is read ``\(A_i\) \emph{infinitely often}'' (\(A_i\) i.o.): \(\omega\in\limsup_i A_i\) iff for every \(N\in\N\) there exists \(n\ge N\) with \(\omega\in A_n\).
The event \(\liminf_i A_i\) is read ``\(A_i\) \emph{eventually}'' (\(A_i\) ev.): \(\omega\in\liminf_i A_i\) iff there exists \(N\in\N\) such that \(\omega\in A_n\) for all \(n\ge N\).
\end{definition}
\begin{remark}
The two are dual under complementation: by De Morgan, \((\limsup_i A_i)^c=\liminf_i A_i^c\) and \((\liminf_i A_i)^c=\limsup_i A_i^c\).
\end{remark}

\subsection{The Borel--Cantelli lemmas}
The two lemmas are partial converses of one another: summability of \(\mu(A_i)\) forces \(A_i\) to occur only finitely often almost surely; under the extra hypothesis of independence, divergence of the same series forces \(A_i\) to occur infinitely often almost surely.
\begin{lemma}{First Borel--Cantelli}{borel-cantelli-1}
Let \(\{A_i\}_{i=1}^{\infty}\subseteq\Fcal\).
If \(\sum_{i=1}^{\infty}\mu(A_i)<\infty\), then
\[
\mu\!\left(\limsup_i A_i\right) \;=\; 0.
\]
Equivalently, with probability one only finitely many \(A_i\) occur.
The proof is a one-line consequence of monotonicity and countable subadditivity: for every \(i\),
\[
\mu\!\left(\limsup_j A_j\right) \;\le\; \mu\!\left(\bigcup_{j\ge i}A_j\right) \;\le\; \sum_{j\ge i}\mu(A_j) \;\xrightarrow{i\to\infty}\; 0,
\]
where the right-hand tail vanishes because the full series converges.
\end{lemma}
\begin{lemma}{Second Borel--Cantelli}{borel-cantelli-2}
Let \(\{A_i\}_{i=1}^{\infty}\subseteq\Fcal\) be \emph{independent}.
If \(\sum_{i=1}^{\infty}\mu(A_i)=\infty\), then
\[
\mu\!\left(\limsup_i A_i\right) \;=\; 1.
\]
The argument is a complement-and-exponentiate trick.
Independence of \(\{A_i\}\) implies independence of \(\{A_i^c\}\).
For any \(i\in\N\) and \(k\ge i\),
\[
\mu\!\left(\bigcap_{j=i}^{k} A_j^c\right) \;=\; \prod_{j=i}^{k}\bigl[1-\mu(A_j)\bigr] \;\le\; \exp\!\left[-\sum_{j=i}^{k}\mu(A_j)\right],
\]
using the elementary bound \(1-t\le e^{-t}\) valid for all \(t\in\R\).
Now let \(k\to\infty\).
The right-hand side vanishes, because every tail \(\sum_{j\ge i}\mu(A_j)\) of a divergent series of nonnegative terms is itself infinite, while the left-hand side decreases to \(\mu(\bigcap_{j\ge i}A_j^c)\) by continuity from above.
Hence \(\mu(\bigcap_{j\ge i} A_j^c)=0\) for every \(i\).
By De Morgan, \((\limsup_i A_i)^c=\bigcup_{i=1}^{\infty}\bigcap_{j\ge i}A_j^c\) is a countable union of null sets, so \(\mu(\limsup_i A_i)=1\).
\end{lemma}
\begin{remark}
Independence cannot be dropped from the second lemma: if \(A_1=A_2=\cdots=A\) with \(\mu(A)=\tfrac12\), then \(\sum\mu(A_i)=\infty\) but \(\limsup_i A_i=A\) has probability \(\tfrac12\), not \(1\).
\end{remark}
\begin{example}[Coin tosses produce every finite pattern]
Toss a fair coin independently, fix a pattern of length \(k\), and let \(A_i\) be the event that positions \(i,i+1,\dots,i+k-1\) spell out that pattern.
Then \(\mu(A_i)=2^{-k}\), and the events \(A_1,A_{k+1},A_{2k+1},\dots\) are independent, being determined by disjoint blocks of tosses, with \(\sum_{n\ge 0}\mu(A_{nk+1})=\infty\).
The second Borel--Cantelli lemma gives \(\mu(A_{nk+1}\text{ i.o.})=1\), so the fixed pattern appears infinitely often almost surely.
Since there are only countably many finite patterns, intersecting these probability-one events shows that almost surely \emph{every} finite pattern appears infinitely often.
\end{example}

\subsection{Prohorov's theorem}
The final piece of the convergence apparatus is a compactness criterion for sequences of probability measures: it is to weak convergence what Bolzano--Weierstrass is to bounded sequences in \(\R^d\).
The right notion of ``boundedness'' is \emph{tightness}, capturing that no mass escapes to infinity.
\begin{definition}{Uniform tightness}{tight}
A collection \(\{\mu_i\}_{i\in I}\) of probability measures on a metric space \((S,\rho)\) is \emph{uniformly tight} if for every \(\varepsilon>0\) there exists a compact set \(K_\varepsilon\subseteq S\) with
\[
\mu_i(K_\varepsilon) \;>\; 1-\varepsilon \qquad\text{for every } i\in I.
\]
\end{definition}
\begin{theorem}{Prohorov}{prohorov}
Let \(\{\mu_i\}_{i=1}^{\infty}\) be a sequence of probability measures on a metric space \(S\).
If \(\{\mu_i\}\) is uniformly tight, then it is \emph{relatively sequentially compact} for weak convergence: every subsequence \(\mu_{i_k}\) admits a further subsequence \(\mu_{i_{k_r}}\Rightarrow\mu\) for some probability measure \(\mu\) (depending on the subsequence).
\end{theorem}
\begin{remark}
A useful sub-subsequence corollary: if every subsequence of \(\{\mu_i\}\) admits a further subsequence converging weakly to the \emph{same} limit \(\mu\), then \(\mu_i\Rightarrow\mu\).
This is the classical route to the central limit theorem\,---\,one shows tightness, uses Prohorov's theorem to extract a weakly convergent further subsequence from any given subsequence, and identifies every such limit as the standard normal via characteristic functions.
\end{remark}