% =================================================================
%  Lecture 10
%  Primary source: handwritten notes (Mathpix mmd)
%  Fallback: kashlak.pdf (only for OCR/notation/curriculum clarity)
% =================================================================
\section[Lecture 10 -- Hierarchy of Convergence; Borel--Cantelli]{Lecture 10 \textemdash{} Hierarchy of Convergence; Borel--Cantelli; Prohorov}
\label{sec:lec10}

Lecture~9 set up the four modes of convergence for random variables and
the equivalent formulations of weak convergence (Portmanteau). The job
now is twofold: assemble these modes into a single hierarchy of
implications, and develop the Borel--Cantelli lemmas\,---\,the standard
tool for promoting summable bounds on \(\mu(A_i)\) into almost-sure
statements about whether \(A_i\) occurs only finitely often. We close
with a brief look at Prohorov's theorem, which gives a compactness
criterion for sequences of probability measures and underlies the
classical proof of the central limit theorem.

\subsection{Stronger metrics on the space of probability measures}

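Weak convergence \(\P_i\Rightarrow\P\) (\cref{def:weak-conv}) is the
weakest of a family of notions of closeness for probability measures,
all built from the quantities \(|\int f\,d\P_i-\int f\,d\P|\) with \(f\)
ranging over some test class \(\Fcal\). Weak convergence asks only that
this quantity tend to \(0\) for each fixed \(f\in C_b(S)\); the
distances below take the supremum
\(\sup_{f\in\Fcal}|\int f\,d\P_i-\int f\,d\P|\) over a normalised test
class and therefore demand convergence \emph{uniformly} in \(f\). The
richer the class over which uniformity is required, the finer the
notion of closeness:

\begin{itemize}
  \item \(\Fcal=C_b(S)\) (continuous bounded), tested one \(f\) at a
        time with no supremum: weak convergence.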
  \item \(\Fcal=\{f:S\to[-1,1]\text{ continuous}\}\): the
        \emph{Radon metric}.
  \item \(\Fcal=\{f:S\to[-1,1]\text{ measurable}\}\): the
        \emph{total variation} distance.
  \item \(\Fcal=\{f:S\to\R\text{ Lipschitz with constant }1\}\): the
        \emph{1-Wasserstein distance}, central to optimal transport
        and machine learning, which quantifies \emph{how quickly} the
        convergence occurs rather than only that it does.
\end{itemize}

\begin{remark}
All four notions are equivalent on a finite \(S\), and convergence in
any of the three distances implies weak convergence on any metric
space; the converse fails in general. We work almost exclusively with
weak convergence below.
\end{remark}

\subsection{Hierarchy of modes of convergence}

Recall from Lecture~9 the four modes for random variables
\(X_i,X\colon(\Omega,\Fcal,\mu)\to(S,\rho)\): convergence in
distribution \(X_i\xrightarrow{d}X\), in probability
\(X_i\xrightarrow{\P}X\) (\cref{def:conv-prob}), almost surely
\(X_i\xrightarrow{\text{a.s.}}X\) (\cref{def:conv-as}), and in \(L^p\)
\(X_i\xrightarrow{L^p}X\) (\cref{def:conv-lp}). The implications
between them form a strict hierarchy.

\begin{theorem}{Hierarchy of convergence}{hierarchy}
Let \(X_i,X\) be random variables on \((\Omega,\Fcal,\mu)\) with values
in a metric space \((S,\rho)\). Then
\[
\begin{array}{ccc}
X_i\xrightarrow{\text{a.s.}}X
   & \Longrightarrow & X_i\xrightarrow{\P}X
        \;\Longrightarrow\; X_i\xrightarrow{d}X,\\[4pt]
X_i\xrightarrow{L^p}X
   & \Longrightarrow & X_i\xrightarrow{\P}X
        \quad\text{for every } p\in[1,\infty].
\end{array}
\]
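None of the reverse implications holds in general. For \(p<\infty\),
\emph{a.s.} and \(L^p\) convergence are not comparable: a.s.\
convergence yields \(L^p\) convergence only under an additional uniform
integrability or domination hypothesis, and \(L^p\) convergence yields
a.s.\ convergence only along a subsequence. (For \(p=\infty\),
\(L^\infty\) convergence does imply a.s.\ convergence.)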
\end{theorem}

\begin{figure}[htbp]
\centering
\begin{tikzpicture}[
   >=Stealth,
   node distance=14mm and 22mm,
   every node/.style={font=\sffamily\small},
   mode/.style={draw=deepnavy, thick, rounded corners=2pt,
                fill=defbodybg, inner sep=5pt, minimum width=22mm},
   imp/.style={->, thick, exampleblue}
]
  \node[mode] (as)   {a.s.};
  \node[mode, right=of as] (prob) {in prob.};
  \node[mode, right=of prob] (dist) {in dist.};
  \node[mode, above=of prob] (lp)   {\(L^p\)};
  \draw[imp] (as)   -- (prob);
  \draw[imp] (prob) -- (dist);
  \draw[imp] (lp)   -- (prob);
  \node[font=\sffamily\itshape\small, color=remarkgray]
       at ($(as)!0.5!(lp) + (-0.6,0.3)$) {\(\not\Leftrightarrow\)};
\end{tikzpicture}
\caption{The four modes of convergence and the implications between
them. Almost-sure and \(L^p\) convergence are not comparable.}
\label{fig:conv-hierarchy}
\end{figure}

\subsection{Limsup, liminf, and the Borel--Cantelli setup}

Fix a probability space \((\Omega,\Fcal,\mu)\) and a sequence
\(\{A_i\}_{i=1}^{\infty}\subseteq\Fcal\). The set-theoretic limsup and
liminf give a precise meaning to ``\(A_i\) happens infinitely often''
and ``\(A_i\) happens eventually''.

\begin{definition}{$\limsup$ and $\liminf$ of events}{limsup-liminf}
For \(\{A_i\}_{i=1}^{\infty}\subseteq\Fcal\),
\[
\limsup_{i} A_i \;=\; \bigcap_{i=1}^{\infty}\bigcup_{j\ge i} A_j,
\qquad
\liminf_{i} A_i  \;=\; \bigcup_{i=1}^{\infty}\bigcap_{j\ge i} A_j.
\]
We say \(A_i\) \emph{infinitely often} (\(A_i\) i.o.) for
\(\limsup_i A_i\): \(\omega\in\limsup_i A_i\) iff for every
\(N\in\N\) there exists \(n\ge N\) with \(\omega\in A_n\). We say
\(A_i\) \emph{eventually} (\(A_i\) ev.) for \(\liminf_i A_i\):
\(\omega\in\liminf_i A_i\) iff there exists \(N\in\N\) such that
\(\omega\in A_n\) for all \(n\ge N\).
\end{definition}

\begin{remark}
The two are dual in the sense
\((\limsup_i A_i)^c=\liminf_i A_i^c\) and conversely, by De Morgan.
\end{remark}

\subsection{The Borel--Cantelli lemmas}

The two lemmas are partial converses of one another: summability of
\(\mu(A_i)\) forces \(A_i\) to occur only finitely often almost surely;
under the extra hypothesis of independence, divergence of the same
series forces \(A_i\) to occur infinitely often almost surely.

\begin{lemma}{First Borel--Cantelli}{borel-cantelli-1}
Let \(\{A_i\}_{i=1}^{\infty}\subseteq\Fcal\). If
\(\sum_{i=1}^{\infty}\mu(A_i)<\infty\), then
\[
\mu\!\left(\limsup_i A_i\right) \;=\; 0.
\]
Equivalently, with probability one only finitely many \(A_i\) occur.
The proof is a one-line consequence of monotonicity and countable
subadditivity: for every \(i\),
\[
\mu\!\left(\limsup_j A_j\right)
   \;\le\; \mu\!\left(\bigcup_{j\ge i}A_j\right)
   \;\le\; \sum_{j\ge i}\mu(A_j) \;\xrightarrow{i\to\infty}\; 0,
\]
where the right-hand tail vanishes because the full series converges.
\end{lemma}

\begin{lemma}{Second Borel--Cantelli}{borel-cantelli-2}
Let \(\{A_i\}_{i=1}^{\infty}\subseteq\Fcal\) be \emph{independent}. If
\(\sum_{i=1}^{\infty}\mu(A_i)=\infty\), then
\[
\mu\!\left(\limsup_i A_i\right) \;=\; 1.
\]
The argument is a complement-and-exponentiate trick. Independence of
\(\{A_i\}\) implies independence of \(\{A_i^c\}\). For any
\(i\in\N\) and \(k\ge i\),
\[
\mu\!\left(\bigcap_{j=i}^{k} A_j^c\right)
   \;=\; \prod_{j=i}^{k}\bigl[1-\mu(A_j)\bigr]
   \;\le\; \exp\!\left[-\sum_{j=i}^{k}\mu(A_j)\right],
\]
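using the elementary bound \(1-t\le e^{-t}\), valid for all \(t\in\R\).
Since \(\sum_{j\ge i}\mu(A_j)=\infty\) for every \(i\), letting
\(k\to\infty\) makes the right-hand side vanish, and continuity from
above of \(\mu\) gives \(\mu(\bigcap_{j\ge i} A_j^c)=0\) for every
\(i\). By De Morgan,
\((\limsup_i A_i)^c=\bigcup_{i\ge 1}\bigcap_{j\ge i}A_j^c\) is a
countable union of null sets, hence null, so
\(\mu(\limsup_i A_i)=1\).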
\end{lemma}

\begin{remark}
Independence cannot be dropped from the second lemma: if
\(A_1=A_2=\cdots=A\) with \(\mu(A)=\tfrac12\), then
\(\sum\mu(A_i)=\infty\) but \(\limsup_i A_i=A\) has probability
\(\tfrac12\), not \(1\).
\end{remark}

\begin{example}[Coin tosses produce every finite pattern]
Toss a fair coin independently and let \(A_i\) be the event that
positions \(i,i+1,\dots,i+k-1\) spell out a fixed pattern of length
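\(k\). Then \(\mu(A_i)=2^{-k}\), and the events
\(A_1,A_{k+1},A_{2k+1},\dots\) depend on disjoint blocks of tosses, so
they are independent, with \(\sum_{n\ge 0}\mu(A_{nk+1})=\infty\). The
second Borel--Cantelli lemma gives
\(\mu(A_{nk+1}\text{ i.o.})=1\), so the fixed pattern appears
infinitely often almost surely. Since there are only countably many
finite patterns, almost surely every finite pattern appears infinitely
often.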
\end{example}

\subsection{Prohorov's theorem}

The final piece of the convergence apparatus is a compactness criterion
for sequences of probability measures: it is to weak convergence what
Bolzano--Weierstrass is to bounded sequences in \(\R^d\). The
right notion of ``boundedness'' is \emph{tightness}, capturing that no
mass escapes to infinity.

\begin{definition}{Uniform tightness}{tight}
A collection \(\{\mu_i\}_{i\in I}\) of probability measures on a metric
space \((S,\rho)\) is \emph{uniformly tight} if for every
\(\varepsilon>0\) there exists a compact set \(K_\varepsilon\subseteq S\)
with
\[
\mu_i(K_\varepsilon) \;>\; 1-\varepsilon
\qquad\text{for every } i\in I.
\]
\end{definition}

\begin{theorem}{Prohorov}{prohorov}
Let \(\{\mu_i\}_{i=1}^{\infty}\) be a sequence of probability measures
on a metric space \(S\). If \(\{\mu_i\}\) is uniformly tight, then it
is \emph{relatively sequentially compact} for weak convergence: every
subsequence \(\mu_{i_k}\) admits a further subsequence
\(\mu_{i_{k_r}}\Rightarrow\mu\) for some probability measure \(\mu\)
(depending on the subsequence).
\end{theorem}

\begin{remark}
A useful subsubsequence corollary: if every subsequence of
\(\{\mu_i\}\) admits a further subsequence converging weakly to the
\emph{same} limit \(\mu\), then \(\mu_i\Rightarrow\mu\). This is the
classical route to the central limit theorem\,---\,one shows tightness,
extracts a weak limit along a subsequence, and identifies the limit as
the standard normal via characteristic functions.
\end{remark}