% =================================================================
%  Lecture 9
%  Primary source: handwritten notes (Mathpix mmd)
%  Fallback: kashlak.pdf (only for OCR/notation/curriculum clarity)
% =================================================================
\section[Lecture 9 -- Convergence in Probability and Measure]{Lecture 9 \textemdash{} Convergence in Probability and Measure}
\label{sec:lec09}

We now study what it means for a sequence of probability measures, or
of random variables, to converge. There are several inequivalent
notions; this lecture introduces the four standard modes for random
variables (almost sure, in probability, in \(L^p\), in distribution),
together with their parent notion at the level of measures
(\emph{weak} convergence). The Portmanteau theorem packages the
equivalent characterisations of weak convergence, and a small Hasse
diagram records the implications between the modes.

\subsection{Weak convergence of probability measures}

Let \((\Omega,\Fcal)\) be a measurable space and let
\(\{\P_i\}_{i=1}^{\infty}\) be a sequence of probability measures on
\((\Omega,\Fcal)\). What should ``\(\P_i\to\P\)'' mean? The naive
choice ``\(\P_i(A)\to\P(A)\) for every \(A\in\Fcal\)'' (\emph{setwise}
convergence) is too strong to be useful in practice: on \(\R\) the
point masses \(\delta_{1/i}\) ought to converge to \(\delta_0\), yet
\(\delta_{1/i}(\{0\})=0\) for every \(i\) while \(\delta_0(\{0\})=1\).
The standard notion instead fixes a topology on \(\Omega\) and tests
against continuous functions.

\begin{definition}{Weak convergence of measures}{weak-conv}
Let \(S\) be a metric space with Borel \(\sigma\)-field
\(\Scal=\Bcal(S)\), and let \(\P,\P_1,\P_2,\dots\) be probability
measures on \((S,\Scal)\). We say \(\P_i\) \emph{converges weakly}
to \(\P\), written \(\P_i\Rightarrow\P\), if
\[
\int_S f\,d\P_i \;\longrightarrow\; \int_S f\,d\P
\qquad\text{for every } f\in \Ccal_B(S),
\]
where \(\Ccal_B(S)\) denotes the bounded continuous real-valued
functions on \(S\).
\end{definition}

\begin{remark}
Weak convergence is convergence in the \emph{weak topology} on the
space of probability measures on \(S\); the metric on \(S\) enters
only through the class \(\Ccal_B(S)\) of test functions. Concretely, a
basic \(\varepsilon\)-neighbourhood of \(\P\) is determined by a finite
collection \(f_1,\ldots,f_n\in\Ccal_B(S)\): the neighbourhood is the
set of all probability measures \(Q\) with
\(\bigl|\int f_k\,d\P-\int f_k\,dQ\bigr|<\varepsilon\) for each
\(k=1,\ldots,n\). Because each \(f\in\Ccal_B(S)\) is tested
separately, with no uniformity over \(f\), this is the weakest of
several useful notions of closeness on the space of probability
measures.
\end{remark}

The next theorem lists the equivalent characterisations of weak
convergence; it is the standard packaging that one cites whenever
weak convergence appears in the wild.

\begin{theorem}{Portmanteau theorem}{portmanteau}
Let \(\P\) and \(\{\P_i\}_{i=1}^{\infty}\) be probability measures on
a metric space \((S,\Scal)\). The following are equivalent.
\begin{enumerate}
\item \(\P_i\Rightarrow\P\), i.e.\ \(\int f\,d\P_i\to\int f\,d\P\) for
every \(f\in\Ccal_B(S)\).
\item \(\int f\,d\P_i\to\int f\,d\P\) for every bounded
\emph{uniformly} continuous \(f\).
\item \(\limsup_i \P_i(C) \le \P(C)\) for every closed \(C\subseteq S\).
\item \(\liminf_i \P_i(U) \ge \P(U)\) for every open \(U\subseteq S\).
\item \(\lim_i \P_i(A) = \P(A)\) for every \(A\in\Scal\) with
\(\P(\partial A)=0\), where \(\partial A=\bar A\cap \overline{A^c}\)
is the topological boundary.
\end{enumerate}
\end{theorem}

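\begin{remark}
The implication (1)\(\Rightarrow\)(2) is immediate, since every
bounded uniformly continuous function lies in \(\Ccal_B(S)\); the
workhorse is (2)\(\Rightarrow\)(3). Fix a closed set \(C\) and
\(\varepsilon>0\), and for \(\delta>0\) put
\(C_\delta=\{x\in S:d(x,C)<\delta\}\). Because \(C\) is closed,
\(C_\delta\downarrow C\) as \(\delta\to0^+\), so by continuity from
above we may choose \(\delta\) with
\(\P(C_\delta)<\P(C)+\varepsilon\). The function
\(f(x)=\max\{0,\,1-d(x,C)/\delta\}\) is \(1/\delta\)-Lipschitz, hence
bounded and uniformly continuous, with \(0\le f\le 1\), \(f=1\) on
\(C\) and \(f=0\) off \(C_\delta\) (an explicit metric-space instance
of Urysohn's lemma). Therefore
\[
\limsup_i \P_i(C)
   \;\le\; \lim_i \int f\,d\P_i
   \;=\; \int f\,d\P
   \;\le\; \P(C_\delta)
   \;<\; \P(C)+\varepsilon,
\]
and \(\varepsilon\downarrow0\) finishes the argument.
\end{remark}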

\begin{remark}
Changing the test class for \(f\) tightens the convergence:
\begin{itemize}
\item \(f\in\Ccal_B(S)\) is weak convergence;
\item \(\sup_f \bigl|\int f\,d\P_i - \int f\,d\P\bigr|\to 0\) over all
continuous \(f\colon S\to[-1,1]\) is the \emph{Radon} metric;
\item the same supremum over all \emph{measurable}
\(f\colon S\to[-1,1]\) gives the \emph{total variation} distance;
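\item the supremum over all \(1\)-Lipschitz \(f\colon S\to\R\) (with no
uniform bound on \(f\)) is the \(1\)-Wasserstein distance, central to
optimal transport (Kantorovich--Rubinstein duality, for measures with
finite first moment); if one also keeps the bound
\(f\colon S\to[-1,1]\), the supremum is instead the
\emph{bounded-Lipschitz} metric, which metrises weak convergence when
\(S\) is separable.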
\end{itemize}
\end{remark}

\subsection{Random variables and their distributions}

We now lift the picture from measures to random variables. Fix a
probability space \((\Omega,\Fcal,\mu)\) and a metric space
\((S,\Scal)\) (with \(\Scal=\Bcal(S)\)). A random variable is a
measurable map \(X\colon\Omega\to S\); its \emph{distribution} is the
pushforward
\[
\P(A) \;=\; \mu\bigl(X^{-1}(A)\bigr), \qquad A\in\Scal.
\]
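With this in place, expectations have the change-of-variables form
\[
\E[g(X)] \;=\; \int_\Omega g\bigl(X(\omega)\bigr)\,d\mu(\omega)
         \;=\; \int_S g(x)\,d\P(x)
\]
for every measurable \(g\colon S\to\R\) such that \(g(X)\) is
integrable (in particular for every bounded measurable \(g\)). When
\(S=\R\) and \(X\) is integrable, taking \(g(x)=x\) gives
\[
\E[X] \;=\; \int_\Omega X(\omega)\,d\mu(\omega)
        \;=\; \int_\R x\,d\P(x).
\]
For a sequence \(\{X_i\}_{i=1}^{\infty}\) we write \(\P_i\) for the
distribution of \(X_i\), so that \(\P_i(A)=\mu(X_i\in A)\); when no
confusion arises we abuse notation and write \(\P(X_i\in A)\) for this
quantity.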

\subsection{Modes of convergence}

We collect the four standard modes; throughout, \(X,X_1,X_2,\dots\)
are random variables on a common probability space taking values in a
metric space \((S,d)\).

\begin{definition}{Convergence in distribution}{conv-dist}
\(X_i\) converges to \(X\) \emph{in distribution}, written
\(X_i \xrightarrow{d} X\), if the laws \(\P_i\) of \(X_i\) converge
weakly to the law \(\P\) of \(X\):
\(\P_i\Rightarrow \P\) (\cref{def:weak-conv}).
\end{definition}

\begin{definition}{Convergence in probability}{conv-prob}
\(X_i\) converges to \(X\) \emph{in probability}, written
\(X_i \xrightarrow{\P} X\), if for every \(\varepsilon>0\),
\[
\mu\bigl(\{\omega\in\Omega: d(X_i(\omega),X(\omega))>\varepsilon\}\bigr)
   \;\longrightarrow\; 0
\qquad\text{as } i\to\infty.
\]
In shorthand, \(\P\bigl(d(X_i,X)>\varepsilon\bigr)\to 0\) for every
\(\varepsilon>0\).
\end{definition}

\begin{definition}{Almost sure convergence}{conv-as}
\(X_i\) converges to \(X\) \emph{almost surely} (or
\emph{\(\mu\)-almost everywhere}), written
\(X_i \xrightarrow{\text{a.s.}} X\), if
\[
\mu\bigl(\{\omega\in\Omega: X_i(\omega)\to X(\omega)\}\bigr) \;=\; 1,
\]
or equivalently
\(\mu(\{\omega: X_i(\omega)\nrightarrow X(\omega)\})=0\). This is
pointwise convergence except on a \(\mu\)-null set; the metric \(d\)
enters only through the meaning of \(X_i(\omega)\to X(\omega)\) in
\(S\), namely \(d\bigl(X_i(\omega),X(\omega)\bigr)\to 0\).
\end{definition}

\begin{definition}{Convergence in $L^p$}{conv-lp}
For \(p\in[1,\infty)\), \(X_i\) converges to \(X\) \emph{in \(L^p\)},
written \(X_i \xrightarrow{L^p} X\), if
\[
\E\bigl[d(X_i,X)^p\bigr]
   \;=\; \int_\Omega d\bigl(X_i(\omega),X(\omega)\bigr)^p\,d\mu(\omega)
   \;\longrightarrow\; 0.
\]
When \(S=\R\) this reduces to
\(\int |X_i-X|^p\,d\mu\to 0\).
\end{definition}

\subsection{Hierarchy of convergence}

The four modes are not equivalent; they are linked by a small lattice
of implications.

\begin{proposition}{Hierarchy of modes}{conv-hierarchy}
For random variables on a probability space \((\Omega,\Fcal,\mu)\):
\begin{enumerate}
\item \(X_i \xrightarrow{\text{a.s.}} X
        \;\Longrightarrow\; X_i\xrightarrow{\P}X\).
\item \(X_i \xrightarrow{\P} X
        \;\Longrightarrow\; X_i\xrightarrow{d}X\).
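\item For any \(p\in[1,\infty]\),
\(X_i\xrightarrow{L^p}X
        \;\Longrightarrow\; X_i\xrightarrow{\P}X\).
\item For \(1\le q<p\le\infty\),
\(X_i\xrightarrow{L^p}X
        \;\Longrightarrow\; X_i\xrightarrow{L^q}X\)
(on a probability space; for \(p<\infty\) this uses Jensen).
\end{enumerate}
Here \(X_i\xrightarrow{L^\infty}X\) means
\(\operatorname*{ess\,sup}_{\omega\in\Omega}
d\bigl(X_i(\omega),X(\omega)\bigr)\to 0\).
None of the converses hold without extra hypotheses; in particular,
for \(p<\infty\), a.s.\ convergence and \(L^p\) convergence are
incomparable (whereas \(L^\infty\) convergence does imply a.s.\
convergence).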
\end{proposition}

\begin{figure}[htbp]
\centering
\begin{tikzpicture}[>=Stealth, node distance=14mm and 22mm,
                    every node/.style={font=\small}]
  \node (lp)   {\(X_i\xrightarrow{L^p}X\)};
  \node (lq) [right=of lp] {\(X_i\xrightarrow{L^q}X\) \footnotesize\((q<p)\)};
  \node (as) [above=of lp] {\(X_i\xrightarrow{\text{a.s.}}X\)};
  \node (prob) [below right=of lp] {\(X_i\xrightarrow{\P}X\)};
  \node (dist) [below=of prob] {\(X_i\xrightarrow{d}X\)};
  \draw[->, thick, deepnavy] (as) -- (prob);
  \draw[->, thick, deepnavy] (lp) -- (prob);
  \draw[->, thick, deepnavy] (lp) -- (lq);
  \draw[->, thick, deepnavy] (lq.south) to[bend left=10] (prob.east);
  \draw[->, thick, deepnavy] (prob) -- (dist);
  \node[remarkgray, right=2pt of as, font=\scriptsize\itshape]
       {strongest pointwise};
  \node[remarkgray, right=2pt of dist, font=\scriptsize\itshape]
       {weakest};
\end{tikzpicture}
\caption{Hasse diagram of the four modes of convergence on a
probability space. Arrows point from stronger to weaker; the
horizontal arrow is the \(L^p\)-monotonicity from Jensen's
inequality. There is no arrow between \(\xrightarrow{\text{a.s.}}\)
and \(\xrightarrow{L^p}\) without extra integrability.}
\label{fig:conv-hasse}
\end{figure}

\begin{remark}
The implication \(\xrightarrow{L^p}\Rightarrow\xrightarrow{\P}\) is a
one-line consequence of \cref{thm:markov}: for any \(\varepsilon>0\),
\[
\mu\bigl(d(X_i,X)>\varepsilon\bigr)
   \;\le\; \frac{\E[d(X_i,X)^p]}{\varepsilon^p}
   \;\longrightarrow\; 0.
\]
The \(L^p\)-monotonicity uses Jensen applied to the convex map
\(t\mapsto t^{p/q}\) on a probability space:
\(\E|Y|^q \le (\E|Y|^p)^{q/p}\). The direction
\(\xrightarrow{\P}\Rightarrow\xrightarrow{d}\) is a corollary of the
Portmanteau theorem (\cref{thm:portmanteau}); the direction
\(\xrightarrow{\text{a.s.}}\Rightarrow\xrightarrow{\P}\) is dominated
convergence applied to the indicator
\(\indic_{\{d(X_i,X)>\varepsilon\}}\).
\end{remark}

\begin{example}[Why a.s.\ and \texorpdfstring{\(L^p\)}{Lp} are incomparable]
On \(([0,1],\Bcal,\lambda)\), set
\(X_n = n\,\indic_{(0,1/n)}\). Then
\(X_n\to 0\) pointwise (so \(X_n\xrightarrow{\text{a.s.}}0\)) but
\(\E[X_n] = 1\) for every \(n\), so \(X_n\) does not converge to
\(0\) in \(L^1\).
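Conversely, the ``typewriter'' sequence of indicators of dyadic
sub-intervals of \([0,1]\), namely
\(X_n=\indic_{[k2^{-m},\,(k+1)2^{-m}]}\) for \(n=2^m+k\) with
\(0\le k<2^m\), satisfies \(X_n\xrightarrow{L^p}0\) for every
\(p\in[1,\infty)\) (since \(\E[X_n^p]=2^{-m}\to0\)) yet fails to
converge at any single \(\omega\): every \(\omega\in[0,1]\) has
\(X_n(\omega)=1\) for infinitely many \(n\) and \(X_n(\omega)=0\) for
infinitely many \(n\).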
\end{example}