% =================================================================
%  Lecture 9
%  Source: handwritten notes (Mathpix-converted) + Kashlak STAT 571
% =================================================================
\section[Lecture 9 -- Convergence in Probability and Measure]{Lecture 9 \textemdash{} Convergence in Probability and Measure}
\label{sec:lec09}

Having developed \(L^p\) spaces and the principal inequalities
(Markov, Chebyshev, Jensen, H\"older, Minkowski) in the preceding
lectures, we now turn to the question: \emph{in what sense can a
sequence of probability measures, or of random variables, be said to
converge?} Several inequivalent answers exist; this lecture catalogues
the standard ones, beginning with measures and then specialising to
random variables. Throughout, all measures are probability measures
unless stated otherwise.

\subsection{Convergence of measure}

Given a measurable space \((\Omega,\Fcal)\) and a sequence
\(\{P_i\}_{i=1}^{\infty}\) of probability measures, what should
\(P_i\to P\) mean? Asking that \(P_i(A)\to P(A)\) for every
\(A\in\Fcal\) (sometimes called \emph{setwise convergence}) is often too
strong. The most useful notion couples the measures to the topology of
the underlying space.

\begin{definition}{Weak convergence of measure}{weak-conv-measure}
Let \(S\) be a metric space and \(\Scal\) the Borel \(\sigma\)-field on
\(S\). For a probability measure \(P\) and a sequence
\(\{P_i\}_{i=1}^{\infty}\) of probability measures on \((S,\Scal)\), we
say that \(P_i\) \emph{converges weakly} to \(P\), written
\(P_i\Rightarrow P\), if
\[
\int f\,dP_i \;\longrightarrow\; \int f\,dP
\]
for every bounded continuous real-valued function \(f:S\to\R\). We
write \(f\in\Ccal_B(S)\) for such functions.
\end{definition}

\begin{remark}
Weak convergence is tied to the topology of \(S\) generated by the
metric \(d\): two measures are ``close'' if their integrals nearly
agree on finitely many test functions in \(\Ccal_B(S)\). Precisely, for
a finite collection \(f_1,\dots,f_n\in\Ccal_B(S)\) and
\(\varepsilon>0\) one obtains an
\(\varepsilon\)-neighbourhood of \(P\) consisting of all \(Q\) with
\(\bigl|\int f_i\,dP-\int f_i\,dQ\bigr|<\varepsilon\) for
\(i=1,\dots,n\); these neighbourhoods generate the topology of weak
convergence.
\end{remark}

The Portmanteau theorem collects the equivalent reformulations.

\begin{theorem}{Portmanteau Theorem}{portmanteau}
For probability measures \(P\) and \(\{P_i\}_{i=1}^{\infty}\) on
\((S,\Scal)\), the following are equivalent:
\begin{enumerate}
  \item \(P_i\Rightarrow P\);
  \item \(\displaystyle\int f\,dP_i\to\int f\,dP\) for every bounded
        \emph{uniformly} continuous \(f:S\to\R\);
  \item \(\limsup_i P_i(C)\le P(C)\) for every closed \(C\subseteq S\);
  \item \(\liminf_i P_i(U)\ge P(U)\) for every open \(U\subseteq S\);
  \item \(\lim_i P_i(A) = P(A)\) for every \(A\in\Scal\) with
        \(P(\partial A)=0\), where
        \(\partial A=\overline{A}\cap\overline{A^c}\) is the topological
        boundary of \(A\).
\end{enumerate}
\end{theorem}

\begin{remark}[Other ways measures converge]
Weak convergence is the statement \(\int f\,dP_i\to\int f\,dP\) for
every \(f\in\Ccal_B(S)\); changing the test class changes the notion.
\begin{itemize}
  \item \emph{Radon metric}:
        \(\displaystyle\sup_{f}\Bigl\{\int f\,dP-\int f\,dP_i\Bigr\}\to 0\)
        with the sup over continuous \(f:S\to[-1,1]\).
  \item \emph{Total variation}: same supremum, but over all measurable
        \(f:S\to[-1,1]\).
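  \item \emph{1-Wasserstein}: same supremum, but over all
        \(1\)-Lipschitz \(f:S\to\R\) (the Kantorovich--Rubinstein dual
        form, for measures with finite first moment); central in optimal
        transport and machine learning. Restricting further to
        \(1\)-Lipschitz \(f:S\to[-1,1]\) gives instead a
        bounded-Lipschitz-type metric.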
\end{itemize}
\end{remark}

\subsection{Convergence of random variables}

Let \((\Omega,\Fcal,\mu)\) be a probability space and \((S,\Scal)\) a
metric space with its Borel \(\sigma\)-field. A random variable
\(X:\Omega\to S\) (i.e.\ a measurable function) induces a probability
measure on \((S,\Scal)\) via the push-forward
\[
P(A) \;=\; \mu\!\bigl(X^{-1}(A)\bigr), \qquad A\in\Scal,
\]
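called the \emph{distribution} (or \emph{law}) of \(X\). Expectations
admit the familiar change-of-variables identity
\(\E[f(X)]=\int_\Omega f(X(\omega))\,d\mu(\omega)=\int_S f(x)\,dP(x)\)
for measurable \(f:S\to\R\) whenever either side is defined; in
particular, when \(S=\R\),
\(\E[X]=\int_\Omega X(\omega)\,d\mu(\omega)=\int_\R x\,dP(x)\). For a
sequence \(\{X_i\}_{i=1}^{\infty}\) of random variables we now record
the four standard modes of convergence.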

\begin{definition}{Convergence in distribution}{conv-dist}
We say \(X_i\) \emph{converges in distribution} to \(X\), written
\(X_i\xrightarrow{d}X\), if the induced laws converge weakly:
\(P_i\Rightarrow P\) (with \(P_i(A)=\mu(X_i\in A)\) and similarly for
\(P\)). Since only weak convergence of the laws matters, the metric
space \((S,\Scal)\) must be fixed but the underlying probability space
\((\Omega,\Fcal,\mu)\) is allowed to vary with \(i\).
\end{definition}

\begin{remark}
The Portmanteau theorem can therefore be rephrased verbatim for random
variables, replacing \(P_i,P\) by the laws of \(X_i,X\).
\end{remark}

\begin{definition}{Convergence in probability}{conv-prob}
We say \(X_i\) \emph{converges in probability} to \(X\), written
\(X_i\xrightarrow{\P}X\), if for every \(\varepsilon>0\),
\[
\mu\!\bigl(\{\omega\in\Omega : d(X_i(\omega),X(\omega))>\varepsilon\}\bigr)
   \;\longrightarrow\; 0
\quad\text{as } i\to\infty,
\]
or in shorthand \(\P\bigl(d(X_i,X)>\varepsilon\bigr)\to 0\). The
metric \(d\) on \(S\) is essential to the definition.
\end{definition}

\begin{definition}{Convergence almost surely}{conv-as}
We say \(X_i\) \emph{converges to \(X\) almost surely}, written
\(X_i\xrightarrow{a.s.}X\), if
\[
\mu\!\bigl(\{\omega\in\Omega : X_i(\omega)\to X(\omega)\}\bigr) \;=\; 1,
\]
equivalently \(\mu(\{\omega:X_i(\omega)\not\to X(\omega)\})=0\). This
is pointwise convergence off a \(\mu\)-null set; the metric \(d\)
enters only implicitly, through the meaning of
\(X_i(\omega)\to X(\omega)\) in \(S\), i.e.\
\(d(X_i(\omega),X(\omega))\to 0\).
\end{definition}

\begin{definition}{Convergence in $L^p$}{conv-lp}
For \(p\in[1,\infty)\), we say \(X_i\) \emph{converges to \(X\) in
\(L^p\)}, written \(X_i\xrightarrow{L^p}X\), if
\[
\E\!\bigl[d(X_i,X)^p\bigr]
   \;=\; \int_\Omega d\bigl(X_i(\omega),X(\omega)\bigr)^p\,d\mu(\omega)
   \;\longrightarrow\; 0.
\]
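When \(S=\R\) this reduces to
\(\displaystyle\int |X_i-X|^p\,d\mu \to 0\). For \(p=\infty\), we say
\(X_i\xrightarrow{L^\infty}X\) if
\(\|d(X_i,X)\|_{\infty}\to 0\), where \(\|\cdot\|_{\infty}\) denotes
the \(\mu\)-essential supremum.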
\end{definition}

\subsection{Hierarchy of convergence types}

The four notions are not equivalent; the implications below summarise
how they are ordered.

\begin{proposition}{Hierarchy}{conv-hierarchy}
For random variables on a probability space \((\Omega,\Fcal,\mu)\) with
values in a metric space \((S,\Scal)\):
\begin{enumerate}
  \item \(X_i\xrightarrow{a.s.}X\) implies
        \(X_i\xrightarrow{\P}X\);
  \item \(X_i\xrightarrow{\P}X\) implies
        \(X_i\xrightarrow{d}X\);
  \item for any \(p\in[1,\infty]\),
        \(X_i\xrightarrow{L^p}X\) implies
        \(X_i\xrightarrow{\P}X\);
  \item for \(1\le q<p\le\infty\),
        \(X_i\xrightarrow{L^p}X\) implies
        \(X_i\xrightarrow{L^q}X\).
\end{enumerate}
In general, neither of \(X_i\xrightarrow{a.s.}X\) and
\(X_i\xrightarrow{L^p}X\) implies the other.
\end{proposition}

\begin{remark}
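One can ask what additional hypotheses bridge the missing arrows:
uniform integrability promotes convergence in probability to
\(L^1\)-convergence, while summable tail probabilities,
\(\sum_i \P\bigl(d(X_i,X)>\varepsilon\bigr)<\infty\) for every
\(\varepsilon>0\) (e.g.\ from a sufficiently strong moment bound),
lift convergence in probability to almost sure convergence via
Borel--Cantelli (see Lecture~10). With no extra hypothesis,
convergence in probability still yields almost sure convergence along
a subsequence.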
\end{remark}