% =================================================================
%  Lecture 11
%  Primary source: handwritten notes (Mathpix mmd)
%  Fallback: kashlak.pdf (only for OCR/notation/curriculum clarity)
% =================================================================
\section[Lecture 11 -- Law of Large Numbers]{Lecture 11 \textemdash{} Law of Large Numbers}
\label{sec:lec11}

The Borel--Cantelli machinery of Lecture~10 finally pays off. We fix a
sequence \(\{X_i\}_{i=1}^{\infty}\) of random variables on a common
probability space \((\Omega,\Fcal,\P)\), valued in \((\R,\Bcal)\), and
ask in what sense the sample averages
\(n^{-1}S_n=n^{-1}\sum_{i=1}^{n}X_i\) approach the common mean. Two
answers --- one in probability under uncorrelation plus a second moment,
one almost sure under independence and only a first moment --- are the
content of this lecture.

\subsection{Setup: independence and identical distribution}

Throughout, \(X\colon\Omega\to\R\) is a random variable with law
\(\P(X\in A)=\P(\{\omega\in\Omega:X(\omega)\in A\})\) for
\(A\in\Bcal\), expectation \(\E X=\int X(\omega)\,d\P\), and partial
sums \(S_n=\sum_{i=1}^{n}X_i\).

\begin{definition}{Independence of random variables}{indep-rvs}
Random variables \(X\) and \(Y\) on \((\Omega,\Fcal,\P)\), valued in
measurable spaces \((\Xset,\Xcal)\) and \((\Yset,\Ycal)\)
respectively, are \emph{independent} if
\[
\P\bigl(\{X\in A\}\cap\{Y\in B\}\bigr)
   \;=\; \P(X\in A)\,\P(Y\in B)
\qquad\text{for all }A\in\Xcal,\;B\in\Ycal.
\]
The definition extends to a finite collection \(\{X_i\}_{i=1}^{n}\) by
requiring
\(\P\!\left(\bigcap_{i=1}^{n}\{X_i\in A_i\}\right)=\prod_{i=1}^{n}\P(X_i\in A_i)\)
for all measurable sets \(A_1,\dots,A_n\) in the respective target
\(\sigma\)-fields.
An infinite collection \(\{X_i\}_{i=1}^{\infty}\) is independent if
every finite subcollection is.
\end{definition}

\begin{remark}
Since \(\{X\in A\}=X^{-1}(A)\), independence of the random variables
\(X\) and \(Y\) is equivalent to independence of the generated
\(\sigma\)-fields \(\sigma(X)\) and \(\sigma(Y)\) in the sense of
\cref{def:indep-sigma-fields}.
\end{remark}

\begin{definition}{Identically distributed; i.i.d.}{iid}
Random variables \(X\) and \(Y\) are \emph{identically distributed} if
the pushforward laws \(\P\circ X^{-1}\) and \(\P\circ Y^{-1}\) coincide
on \(\Bcal\). A sequence \(\{X_i\}_{i=1}^{\infty}\) is i.i.d.\
(\emph{independent and identically distributed}) if it is independent
and the \(X_i\) share a common law.
\end{definition}

\subsection{Weak law of large numbers}

The weak law trades a strong moment hypothesis for a very mild
dependence hypothesis: not full independence, only \hblue{pairwise
uncorrelation} (a strictly weaker condition).

\begin{theorem}{Weak law of large numbers}{wlln}
Let \((\Omega,\Fcal,\P)\) be a probability space and
\(\{X_i\}_{i=1}^{\infty}\) random variables with
\[
\E X_i \;=\; c\in\R,\qquad \E X_i^{2} \;=\; 1
\quad\text{for all } i,
\]
and \(\E\!\left[(X_i-c)(X_j-c)\right]=0\) for all \(i\neq j\). Then
\[
\frac{S_n}{n} \;\xrightarrow{\;\P\;}\; c,
\]
i.e.\ for every \(\varepsilon>0\),
\(\P\!\left(\bigl|n^{-1}S_n-c\bigr|\geq\varepsilon\right)\to 0\) as
\(n\to\infty\).
\end{theorem}

\begin{remark}
The proof reduces to \(c=0\) by replacing \(X_i\) with \(X_i-c\); the
centred variables have second moment
\(\sigma^{2}=\E(X_i-c)^{2}=1-c^{2}\leq 1\). Chebyshev then gives, for
any \(t>0\),
\[
\P\!\left(\frac{|S_n|}{n}\geq t\right)
   \;\le\; \frac{\E S_n^{2}}{t^{2}n^{2}}
   \;=\; \frac{1}{t^{2}n^{2}}\sum_{i,j=1}^{n}\E[X_i X_j]
   \;=\; \frac{\sigma^{2}}{n t^{2}}
   \;\le\; \frac{1}{n t^{2}} \;\to\; 0,
\]
where the cross terms vanish by uncorrelation and the diagonal sums to
\(n\sigma^{2}\leq n\) by the second-moment hypothesis.
\end{remark}

\begin{remark}
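Uncorrelation is genuinely weaker than independence: independence of
\((X,Y)\) implies independence of \((f(X),g(Y))\) for any measurable
\(f,g\), hence \(\Cov(f(X),g(Y))=0\) whenever this covariance is
defined (e.g.\ for bounded \(f,g\)); uncorrelation asks this only for
\(f=g=\mathrm{id}\). The converse fails: if \(X\) is uniform on
\(\{-1,0,1\}\) and \(Y=X^{2}\), then \(\Cov(X,Y)=\E X^{3}=0\), yet
\(\P(X=0,\,Y=1)=0\neq\P(X=0)\,\P(Y=1)=\tfrac{2}{9}\).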
\end{remark}

\subsection{Strong law of large numbers}

The strong law promotes ``in probability'' to ``almost surely'',
removes the second moment hypothesis, but pays for it with full
independence and identical distribution. Recall the variance
\[
\Var(X) \;=\; \int (X-\E X)^{2}\,d\P(\omega).
\]

\begin{theorem}{Strong law of large numbers}{slln}
Let \(\{X_i\}_{i=1}^{\infty}\) be i.i.d.\ random variables from
\((\Omega,\Fcal,\P)\) to \((\R,\Bcal)\). Then:
\begin{enumerate}
  \item If \(\E|X_1|<\infty\), then
  \(\displaystyle \frac{S_n}{n}\xrightarrow{\text{a.s.}} c\) where
  \(c=\E X_1\).
  \item If \(\E|X_1|=\infty\), then \(S_n/n\) does not converge to any
  finite limit (almost surely).
\end{enumerate}
\end{theorem}

\begin{remark}
Compared with \cref{thm:wlln}, no second-moment assumption is made on
the \(X_i\); only \(L^{1}\) is needed. The trade-off is full
independence (not just uncorrelation) and identical distribution. The
``a.s.'' qualifier means convergence holds outside a \(\P\)-null set
\(N\subset\Omega\).
\end{remark}

\begin{remark}
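The divergence half (part~2) is the easier direction. The heuristic:
since the \(X_n\) are identically distributed,
\(\sum_{n}\P(|X_n|>n)=\sum_{n}\P(|X_1|>n)\geq\E|X_1|-1=\infty\), and
the events \(\{|X_n|>n\}\) are independent, so by the second
Borel--Cantelli lemma \(|X_n|>n\) infinitely often almost surely. But
on the event that \(n^{-1}S_n\) converges to some finite \(c\) one has
\(n^{-1}X_n=n^{-1}S_n-\tfrac{n-1}{n}\,(n-1)^{-1}S_{n-1}\to c-c=0\),
which is incompatible with \(|X_n|/n>1\) i.o.; hence that event is
\(\P\)-null.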
\end{remark}

\begin{remark}
The forward direction (part~1) is far more delicate. The standard
route: reduce to \(X_i\geq 0\) by writing \(X_i=X_i^{+}-X_i^{-}\)
(independence of \(X,Y\) passes to \(X^{+},Y^{+}\)), truncate
\(Y_i=X_i\indic_{\{X_i\leq i\}}\) so that variances are finite, and note
that \(\sum_i\P(X_i\neq Y_i)=\sum_i\P(X_1>i)\leq\E X_1<\infty\), so by
the first Borel--Cantelli lemma \(X_i=Y_i\) for all but finitely many
\(i\) almost surely. Then control the truncated partial sums
\(T_n=\sum_{i=1}^{n}Y_i\) along a geometric subsequence
\(k_n=\lfloor\delta^{n}\rfloor\), \(\delta>1\), using Chebyshev plus
the first Borel--Cantelli lemma, and finally sandwich the full sums
\(S_m\) for \(k_n\leq m\leq k_{n+1}\) (using monotonicity of
\(m\mapsto S_m\) when \(X_i\geq 0\)) and let \(\delta\downarrow 1\).
\end{remark}

\begin{example}[i.i.d.\ Bernoulli sample mean]
Let \(X_i\overset{\text{i.i.d.}}{\sim}\mathrm{Bernoulli}(p)\), so
\(\E X_i=\E X_i^{2}=p\) and \(\Var(X_i)=p(1-p)\). The \(X_i\) are
integrable, so \cref{thm:slln} gives \(n^{-1}S_n\to p\) almost surely.
For the weak law, the normalisation \(\E X_i^{2}=1\) of
\cref{thm:wlln} is met after rescaling: for \(p>0\) the variables
\(X_i/\sqrt{p}\) are uncorrelated with mean \(\sqrt{p}\) and unit
second moment, so \(n^{-1}S_n/\sqrt{p}\to\sqrt{p}\), i.e.\
\(n^{-1}S_n\to p\), in probability (the case \(p=0\) is trivial).
In particular, the empirical
frequency of successes in \(n\) Bernoulli trials converges almost
surely to the true success probability~\(p\) --- the formal statement
behind the everyday claim ``the average converges to the mean''.
\end{example}