% =================================================================
%  Lecture 11
%  Source: handwritten notes (Mathpix-converted) + Kashlak STAT 571
% =================================================================
\section[Lecture 11 -- Law of Large Numbers]{Lecture 11 \textemdash{} Law of Large Numbers}
\label{sec:lec11}

The two laws of large numbers are the workhorses of measure-theoretic
probability: they say that the empirical average \(S_n/n\) of a sample
collapses onto its expectation, in two distinct senses (in probability
versus almost surely). Throughout we fix a probability space
\((\Omega,\Fcal,\P)\) and a sequence of random variables
\(\{X_i\}_{i=1}^{\infty}\) from \((\Omega,\Fcal)\) to \((\R,\Bcal)\).
For a random variable \(X\) and a set \(A\in\Bcal\) we write
\[
\P(X\in A) \;:=\; \P\bigl(\{\omega\in\Omega : X(\omega)\in A\}\bigr),
\qquad
\E X \;:=\; \int_{\Omega} X(\omega)\,d\P(\omega),
\]
and define the partial sum
\(S_n=\sum_{i=1}^{n}X_i\), itself a measurable random variable.

\subsection{Independence of random variables}

Before stating the laws of large numbers we need to record what
independence means for random variables, not just for events.

\begin{definition}{Independence of random variables}{indep-rv}
Let \(X\) and \(Y\) be random variables on the same probability space
\((\Omega,\Fcal,\P)\) with possibly different codomains
\((\mathbb{X},\mathcal{X})\) and \((\mathbb{Y},\mathcal{Y})\). Then
\(X\) and \(Y\) are \emph{independent} if
\[
\P\bigl(\{X\in A\}\cap\{Y\in B\}\bigr) \;=\; \P(X\in A)\,\P(Y\in B)
\qquad \forall\,A\in\mathcal{X},\ B\in\mathcal{Y}.
\]
This extends to a finite collection \(\{X_i\}_{i=1}^{n}\), with \(X_i\)
taking values in \((\mathbb{X}_i,\mathcal{X}_i)\), by requiring
\[
\P\!\left(\bigcap_{i=1}^{n}\{X_i\in A_i\}\right)
   \;=\; \prod_{i=1}^{n}\P(X_i\in A_i)
\qquad \forall\,A_i\in\mathcal{X}_i.
\]
An infinite collection \(\{X_i\}_{i=1}^{\infty}\) is independent if every
finite subcollection is independent.
\end{definition}

\begin{remark}
Since \(\{X\in A\}=X^{-1}(A)\), the random variables \(X\) and \(Y\) are
independent if and only if the \(\sigma\)-fields \(\sigma(X)\) and
\(\sigma(Y)\) are independent in the sense of
\cref{def:indep-sigma-fields} from Lecture~4.
\end{remark}

\begin{definition}{Identically distributed}{iid}
The \emph{law} (or \emph{distribution}) of a random variable
\(X:\Omega\to\R\) is the pushforward measure \(\P\circ X^{-1}\) on
\((\R,\Bcal)\). Two random variables \(X,Y\) are \emph{identically
distributed} if \(\P\circ X^{-1}=\P\circ Y^{-1}\); a sequence
\(\{X_i\}_{i=1}^{\infty}\) is \emph{i.i.d.}\ if it is independent and
its members are pairwise identically distributed.
\end{definition}

\subsection{The weak law}

The first law trades sharp conclusions for cheap hypotheses: only
uncorrelatedness and a uniform second moment are needed.

\begin{theorem}{Weak Law of Large Numbers}{wlln}
Let \((\Omega,\Fcal,\P)\) be a probability space and let
\(\{X_i\}_{i=1}^{\infty}\) be random variables from \(\Omega\) to \(\R\)
satisfying
\begin{enumerate}
  \item \(\E X_i = c\in\R\) for all \(i\);
  \item \(\E X_i^{2} = 1\) for all \(i\);
  \item \(\E\bigl[(X_i-c)(X_j-c)\bigr] = 0\) for all \(i\neq j\)
        (i.e.\ the \(X_i\) are pairwise uncorrelated).
\end{enumerate}
Then
\[
\frac{S_n}{n} \;\xrightarrow{\ \P\ }\; c
\qquad \text{as } n\to\infty.
\]
\end{theorem}

\begin{remark}
The proof uses only Chebyshev's inequality applied to \(S_n/n\); in
particular only \emph{uncorrelatedness} (not full independence) is
required. The next theorem strengthens convergence in probability to
almost-sure convergence at the price of (i) genuine independence and
(ii) identical distribution, but in compensation needs only a finite
first moment in place of the second-moment hypothesis.
\end{remark}

\subsection{The strong law}

For the strong law we keep the variance only as notation:
\[
\Var(X) \;=\; \int_{\Omega} \bigl(X(\omega)-\E X\bigr)^{2}\,d\P(\omega).
\]

\begin{theorem}{Strong Law of Large Numbers}{slln}
Let \(\{X_i\}_{i=1}^{\infty}\) be i.i.d.\ random variables from
\(\Omega\) to \(\R\).
\begin{enumerate}
  \item If \(\E|X_i|<\infty\), then
        \[
            \frac{S_n}{n} \;\xrightarrow{\text{a.s.}}\; c
            \qquad \text{for } c=\E X_1.
        \]
  \item If \(\E|X_i|=\infty\), then almost surely \(S_n/n\) does
        \emph{not} converge to any finite limit.
\end{enumerate}
\end{theorem}

\begin{remark}
The contrapositive of part~(2) is informative: existence of an
almost-sure finite limit for \(S_n/n\) forces \(\E|X_1|<\infty\). So the
hypothesis in part~(1) is sharp.
\end{remark}

\begin{remark}
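The proof of (2) is short: suppose \(n^{-1}S_n\to c\in\R\) on an event
of positive probability. On that event
\[
\frac{X_n}{n} \;=\; \frac{S_n}{n}-\frac{n-1}{n}\cdot\frac{S_{n-1}}{n-1}
   \;\longrightarrow\; c-c \;=\; 0.
\]
But \(\E|X_1|=\infty\) together with identical distribution implies
\(\sum_{n=1}^{\infty}\P(|X_n|>n)=\sum_{n=1}^{\infty}\P(|X_1|>n)=\infty\),
and the events \(\{|X_n|>n\}\) are independent, so by the second
Borel--Cantelli lemma \(|X_n|>n\) infinitely often almost surely,
contradicting \(n^{-1}X_n\to 0\).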
\end{remark}

\begin{remark}
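The proof of (1) is the heart of the lecture. Writing
\(X_i=X_i^{+}-X_i^{-}\) and treating the two parts separately, we may
assume without loss of generality that \(X_i\ge 0\). The strategy is to
truncate \(Y_i=X_i\indic_{X_i\le i}\) so that the variances become
finite; consider \(T_n=\sum_{i=1}^{n}Y_i\) along the geometric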
subsequence \(k_n=\lfloor\delta^n\rfloor\) for \(\delta>1\); apply
Chebyshev plus the first Borel--Cantelli lemma to obtain
\(k_n^{-1}|T_{k_n}-\E T_{k_n}|\to 0\) a.s.; observe that
\(\E Y_n\uparrow\E X_1\); transfer the conclusion from \(T_{k_n}\) to
\(S_{k_n}\) via \(\sum_i\P(X_i\neq Y_i)<\infty\); and finally
interpolate between subsequence indices using
\[
\delta^{-2}\,\E X_1
   \;\le\; \liminf_{i\to\infty}\frac{S_i}{i}
   \;\le\; \limsup_{i\to\infty}\frac{S_i}{i}
   \;\le\; \delta^{2}\,\E X_1,
\]
then send \(\delta\downarrow 1\).
\end{remark}

\begin{example}[Empirical mean of i.i.d.\ Bernoullis]
Let \(\{X_i\}_{i=1}^{\infty}\) be i.i.d.\ \(\mathrm{Bernoulli}(p)\),
\(p\in(0,1)\). Then \(\E|X_1|=p<\infty\), so by \cref{thm:slln}
\(S_n/n\to p\) almost surely. This is the rigorous statement underlying
the frequentist interpretation: the long-run frequency of successes
equals the success probability.
\end{example}

\begin{example}[Cauchy distribution: failure of the SLLN]
If \(\{X_i\}_{i=1}^{\infty}\) are i.i.d.\ standard Cauchy, then
\(\E|X_1|=\infty\). By part~(2) of \cref{thm:slln}, \(S_n/n\) does not
converge to any finite limit; in fact \(S_n/n\) is itself standard
Cauchy for every~\(n\), so the empirical mean never settles down.
\end{example}