% =================================================================
% Lecture 9
% Primary source: handwritten notes (Mathpix mmd)
% Fallback: kashlak.pdf (only for OCR/notation/curriculum clarity)
% =================================================================
\section[Lecture 9 -- Convergence in Probability and Measure]{Lecture 9 \textemdash{} Convergence in Probability and Measure}
\label{sec:lec09}

We now study what it means for a sequence of probability measures, or of random variables, to converge.
There are several inequivalent notions; this lecture introduces the four standard modes for random variables (almost sure, in probability, in \(L^p\), in distribution), together with their parent notion at the level of measures (\emph{weak} convergence).
The Portmanteau theorem packages the equivalent characterisations of weak convergence, and a small Hasse diagram records the implications between the modes.

\subsection{Weak convergence of probability measures}

Let \((\Omega,\Fcal)\) be a measurable space and let \(\{\P_i\}_{i=1}^{\infty}\) be a sequence of probability measures on \((\Omega,\Fcal)\).
What should ``\(\P_i\to\P\)'' mean?
The naive choice ``\(\P_i(A)\to\P(A)\) for every \(A\in\Fcal\)'' (\emph{setwise} convergence) is too strong to be useful in practice; the standard notion fixes a topology on \(\Omega\) and tests against continuous functions.

\begin{definition}{Weak convergence of measures}{weak-conv}
Let \(S\) be a metric space with Borel \(\sigma\)-field \(\Scal=\Bcal(S)\), and let \(\P,\P_1,\P_2,\dots\) be probability measures on \((S,\Scal)\).
We say \(\P_i\) \emph{converges weakly} to \(\P\), written \(\P_i\Rightarrow\P\), if
\[
\int_S f\,d\P_i \;\longrightarrow\; \int_S f\,d\P \qquad\text{for every } f\in \Ccal_B(S),
\]
where \(\Ccal_B(S)\) denotes the bounded continuous real-valued functions on \(S\).
\end{definition}

\begin{remark}
Weak convergence is the topology of \emph{closeness} between measures generated by the metric on \(S\).
Concretely, an \(\varepsilon\)-neighbourhood of \(\P\) is determined by a finite collection \(f_1,\ldots,f_n\in\Ccal_B(S)\): the neighbourhood is the set of all probability measures \(Q\) with \(\bigl|\int f_i\,d\P-\int f_i\,dQ\bigr|<\varepsilon\) for each \(i\).
Testing only against individual \(f\in \Ccal_B(S)\) gives the weakest of several useful notions of convergence on the space of probability measures; stronger ones arise by demanding uniformity over a class of test functions.
\end{remark}

The next theorem lists the equivalent characterisations of weak convergence; it is the standard packaging that one cites whenever weak convergence appears in the wild.

\begin{theorem}{Portmanteau theorem}{portmanteau}
Let \(\P\) and \(\{\P_i\}_{i=1}^{\infty}\) be probability measures on a metric space \((S,\Scal)\).
The following are equivalent.
\begin{enumerate}
\item \(\P_i\Rightarrow\P\), i.e.\ \(\int f\,d\P_i\to\int f\,d\P\) for every \(f\in\Ccal_B(S)\).
\item \(\int f\,d\P_i\to\int f\,d\P\) for every bounded \emph{uniformly} continuous \(f\).
\item \(\limsup_i \P_i(C) \le \P(C)\) for every closed \(C\subseteq S\).
\item \(\liminf_i \P_i(U) \ge \P(U)\) for every open \(U\subseteq S\).
\item \(\lim_i \P_i(A) = \P(A)\) for every \(A\in\Scal\) with \(\P(\partial A)=0\), where \(\partial A=\bar A\cap \overline{A^c}\) is the topological boundary.
\end{enumerate}
\end{theorem}

\begin{remark}
The implication \((1)\Rightarrow(2)\) is immediate, because every bounded uniformly continuous function belongs to \(\Ccal_B(S)\); the workhorse direction is \((2)\Rightarrow(3)\).
Fix a closed set \(C\) and \(\varepsilon>0\), and let \(C_\delta=\{x\in S:d(x,C)<\delta\}\).
Since \(C\) is closed, \(C_\delta\downarrow C\) as \(\delta\to0^+\), so continuity from above yields a \(\delta>0\) with \(\P(C_\delta)<\P(C)+\varepsilon\).
The function \(f(x)=\max\{0,\,1-d(x,C)/\delta\}\) is bounded and \(\delta^{-1}\)-Lipschitz, hence uniformly continuous, with \(f=1\) on \(C\) and \(f=0\) off \(C_\delta\) (an explicit metric-space form of Urysohn's lemma).
Therefore
\[
\limsup_i \P_i(C) \;\le\; \lim_i\int f\,d\P_i \;=\; \int f\,d\P \;\le\; \P(C_\delta) \;<\; \P(C)+\varepsilon,
\]
and \(\varepsilon\downarrow0\) finishes the argument.
\end{remark}

\begin{remark}
Changing the test class for \(f\) tightens the convergence:
\begin{itemize}
\item convergence of \(\int f\,d\P_i\) for each fixed \(f\in\Ccal_B(S)\) is weak convergence;
\item \(\sup_f \bigl|\int f\,d\P_i - \int f\,d\P\bigr|\to 0\) over all continuous \(f\colon S\to[-1,1]\) is the \emph{Radon} metric;
\item the same supremum over all \emph{measurable} \(f\colon S\to[-1,1]\) gives the \emph{total variation} distance;
\item the supremum restricted to \(1\)-Lipschitz \(f\colon S\to[-1,1]\) is a bounded-Lipschitz-type distance; dropping the bound on \(f\) (all \(1\)-Lipschitz \(f\colon S\to\R\)) gives the \(1\)-Wasserstein distance, central to optimal transport, by Kantorovich--Rubinstein duality, and the two agree when \(S\) has diameter at most \(2\).
\end{itemize}
\end{remark}

\subsection{Random variables and their distributions}

We now lift the picture from measures to random variables.
Fix a probability space \((\Omega,\Fcal,\mu)\) and a metric space \((S,\Scal)\) (with \(\Scal=\Bcal(S)\)).
A random variable is a measurable map \(X\colon\Omega\to S\); its \emph{distribution} is the pushforward
\[
\P(A) \;=\; \mu\bigl(X^{-1}(A)\bigr), \qquad A\in\Scal.
\]
With this in place, for a real-valued integrable \(X\) (that is, \(S=\R\) and \(\E|X|<\infty\)) expectations have the change-of-variables form
\[
\E[X] \;=\; \int_\Omega X(\omega)\,d\mu(\omega) \;=\; \int_S x\,d\P(x).
\]
For a sequence \(\{X_i\}_{i=1}^{\infty}\) we write \(\P_i\) for the distribution of \(X_i\), and when no confusion arises we abuse notation and write \(\P(X_i\in A)\) for \(\mu\bigl(X_i^{-1}(A)\bigr)=\P_i(A)\).

\subsection{Modes of convergence}

We collect the four standard modes; throughout, \(X,X_1,X_2,\dots\) are random variables on a common probability space taking values in a metric space \((S,d)\).

\begin{definition}{Convergence in distribution}{conv-dist}
\(X_i\) converges to \(X\) \emph{in distribution}, written \(X_i \xrightarrow{d} X\), if the laws \(\P_i\) of \(X_i\) converge weakly to the law \(\P\) of \(X\): \(\P_i\Rightarrow \P\) (\cref{def:weak-conv}).
\end{definition}

\begin{definition}{Convergence in probability}{conv-prob}
\(X_i\) converges to \(X\) \emph{in probability}, written \(X_i \xrightarrow{\P} X\), if for every \(\varepsilon>0\),
\[
\mu\bigl(\{\omega\in\Omega: d(X_i(\omega),X(\omega))>\varepsilon\}\bigr) \;\longrightarrow\; 0 \qquad\text{as } i\to\infty.
\]
In shorthand, \(\P\bigl(d(X_i,X)>\varepsilon\bigr)\to 0\) for every \(\varepsilon>0\).
\end{definition}

\begin{definition}{Almost sure convergence}{conv-as}
\(X_i\) converges to \(X\) \emph{almost surely} (or \emph{\(\mu\)-almost everywhere}), written \(X_i \xrightarrow{\text{a.s.}} X\), if
\[
\mu\bigl(\{\omega\in\Omega: X_i(\omega)\to X(\omega)\}\bigr) \;=\; 1,
\]
or equivalently \(\mu(\{\omega: X_i(\omega)\nrightarrow X(\omega)\})=0\).
This is pointwise convergence except on a \(\mu\)-null set; the metric \(d\) enters only through the meaning of \(X_i(\omega)\to X(\omega)\), namely \(d(X_i(\omega),X(\omega))\to 0\).
\end{definition}

\begin{definition}{Convergence in $L^p$}{conv-lp}
For \(p\in[1,\infty)\), \(X_i\) converges to \(X\) \emph{in \(L^p\)}, written \(X_i \xrightarrow{L^p} X\), if
\[
\E\bigl[d(X_i,X)^p\bigr] \;=\; \int_\Omega d\bigl(X_i(\omega),X(\omega)\bigr)^p\,d\mu(\omega) \;\longrightarrow\; 0.
\]
When \(S=\R\) this reduces to \(\int |X_i-X|^p\,d\mu\to 0\).
\end{definition}

\subsection{Hierarchy of convergence}

The four modes are not equivalent; they are linked by a small lattice of implications.

\begin{proposition}{Hierarchy of modes}{conv-hierarchy}
For random variables on a probability space \((\Omega,\Fcal,\mu)\):
\begin{enumerate}
\item \(X_i \xrightarrow{\text{a.s.}} X \;\Longrightarrow\; X_i\xrightarrow{\P}X\).
\item \(X_i \xrightarrow{\P} X \;\Longrightarrow\; X_i\xrightarrow{d}X\).
\item For any \(p\in[1,\infty)\), \(X_i\xrightarrow{L^p}X \;\Longrightarrow\; X_i\xrightarrow{\P}X\).
\item For \(1\le q<p<\infty\), \(X_i\xrightarrow{L^p}X \;\Longrightarrow\; X_i\xrightarrow{L^q}X\).
\end{enumerate}
\end{proposition}

The implications are summarised in \cref{fig:conv-hasse}.

\begin{figure}[htbp]
\centering
\begin{tikzpicture}[>=Stealth, node distance=14mm and 22mm, every node/.style={font=\small}]
  \node (lp) {\(X_i\xrightarrow{L^p}X\)};
  \node (lq) [right=of lp] {\(X_i\xrightarrow{L^q}X\) \footnotesize\((q<p)\)};
  \node (as) [left=30mm of lp] {\(X_i\xrightarrow{\text{a.s.}}X\)};
  \node (prob) [below=of lp] {\(X_i\xrightarrow{\P}X\)};
  \node (dist) [below=of prob] {\(X_i\xrightarrow{d}X\)};
  \draw[->, thick, deepnavy] (as) -- (prob);
  \draw[->, thick, deepnavy] (lp) -- (prob);
  \draw[->, thick, deepnavy] (lp) -- (lq);
  \draw[->, thick, deepnavy] (lq.south) to[bend left=10] (prob.east);
  \draw[->, thick, deepnavy] (prob) -- (dist);
  \node[remarkgray, right=2pt of as, font=\scriptsize\itshape] {strongest pointwise};
  \node[remarkgray, right=2pt of dist, font=\scriptsize\itshape] {weakest};
\end{tikzpicture}
\caption{Hasse diagram of the four modes of convergence on a probability space. Arrows point from stronger to weaker; the horizontal arrow is the \(L^p\)-monotonicity from Jensen's inequality. There is no arrow between \(\xrightarrow{\text{a.s.}}\) and \(\xrightarrow{L^p}\) without extra integrability.}
\label{fig:conv-hasse}
\end{figure}

\begin{remark}
The implication \(\xrightarrow{L^p}\Rightarrow\xrightarrow{\P}\) is a one-line consequence of \cref{thm:markov}: for any \(\varepsilon>0\),
\[
\mu\bigl(d(X_i,X)>\varepsilon\bigr) \;\le\; \frac{\E[d(X_i,X)^p]}{\varepsilon^p} \;\longrightarrow\; 0.
\]
The \(L^p\)-monotonicity (for \(1\le q<p\)) uses Jensen applied to the convex map \(t\mapsto t^{p/q}\) on a probability space: \(\E|Y|^q \le (\E|Y|^p)^{q/p}\).
The direction \(\xrightarrow{\P}\Rightarrow\xrightarrow{d}\) is a corollary of the Portmanteau theorem (\cref{thm:portmanteau}); the direction \(\xrightarrow{\text{a.s.}}\Rightarrow\xrightarrow{\P}\) is dominated convergence applied to the indicator \(\indic_{\{d(X_i,X)>\varepsilon\}}\).
\end{remark}

\begin{example}[Why a.s.\ and \texorpdfstring{\(L^p\)}{Lp} are incomparable]
On \(([0,1],\Bcal,\lambda)\), set \(X_n = n\,\indic_{(0,1/n)}\).
Then \(X_n\to 0\) pointwise (so \(X_n\xrightarrow{\text{a.s.}}0\)) but \(\E[X_n] = 1\) for every \(n\), so \(X_n\) does not converge to \(0\) in \(L^1\).
Conversely, the ``typewriter'' sequence of indicators of dyadic sub-intervals of \([0,1]\) satisfies \(X_n\xrightarrow{L^p}0\) for every \(p\) yet fails to converge at any single \(\omega\).
\end{example}