
Commit f8538e3

CO432/notes: june 12
1 parent 3445dfc commit f8538e3

4 files changed: +132, -8 lines changed

CO432/notes.pdf

14.9 KB
Binary file not shown.

CO432/notes.tex

Lines changed: 129 additions & 8 deletions
@@ -15,7 +15,7 @@
\listoflecture
\end{multicols}

-\chapter{Introduction}
+\chapter{Entropy}

\begin{notation}
I will be using my usual \LaTeX{} typesetting conventions:
@@ -31,7 +31,7 @@ \chapter{Introduction}
\end{itemize}
\end{notation}

-\section{Entropy}
+\section{Definition}
\lecture{May 6}

\textrule{$\downarrow$ Lecture 1 adapted from Arthur $\downarrow$}
@@ -1006,7 +1006,8 @@ \section{Definition and chain rules}

\begin{notation}
Although relative entropy is defined only on \emph{distributions},
-write $\D\X\Y$ to be $\D{f_{\X}}{f_{\Y}}$.
+write $\D\X\Y$ to be $\D{f_{\X}}{f_{\Y}}$
+where $\X \sim f_{\X}$ and $\Y \sim f_{\Y}$.
\end{notation}

\begin{theorem}[chain rule for relative entropy]\label{thm:chainD}
@@ -1081,7 +1082,7 @@ \section{Definition and chain rules}
\[ I((\X_1,\dotsc,\X_n) : \Y) = \sum_{i=1}^n I(\X_i : \Y \mid \X_1,\dotsc,\X_{i-1}) \]
\end{theorem}

-\section{Markov chains, data processing, and sufficient statistics}
+\section{Markov chains and data processing}
\begin{defn}
The random variables $\X$, $\Y$, and $\rv Z$ form a \term{Markov chain}
if the conditional distribution of $\rv Z$ depends only on $\Y$
@@ -1090,7 +1091,7 @@ \section{Markov chains, data processing, and sufficient statistics}
Then, we write $\X \to \Y \to \rv Z$.
\end{defn}

-\begin{example}[Legend of the Drunken Master]
+\begin{example}[\textit{\href{https://en.wikipedia.org/wiki/Drunken_Master_II}{Legend of the Drunken Master}}]
In $\Omega = \R^2$, Jackie Chan is drunk and takes steps in random directions.
He starts at $\rv J_0 = (0,0)$.
Then, $\rv J_1 = \rv J_0 + d_1$ where $d_1$ is an independent random unit vector in $\R^2$,
@@ -1103,7 +1104,7 @@ \section{Markov chains, data processing, and sufficient statistics}
In fact, they are uniformly distributed random points on the circle of radius 1
centred at $j_2$.

-\begin{prop}
+\begin{prop}[Markov chain characterization]\label{prop:markov}
Let $\X$, $\Y$, and $\rv Z$ be random variables. \TFAE:
\begin{enumerate}
\item $\X \to \Y \to \rv Z$
@@ -1134,6 +1135,7 @@ \section{Markov chains, data processing, and sufficient statistics}
\[ I(\X : \Y) = I(\X : \rv Z) + I(\X : \Y \mid \rv Z) \]
One may show that the mutual information is always non-negative,
so we have $I(\X : \Y) \geq I(\X : \rv Z)$ as desired.
+We defer the proof of the equality case to \cref{sec:ss}.
\end{prf}

\section{Communication complexity}
@@ -1208,7 +1210,7 @@ \section{Communication complexity}
where the last step is by Jensen's inequality.

Now, let $\Pi$ be any protocol.
-\marginnote{\normalsize{\textit{Lecture 11\\June 10\\cont.}}}
+\marginnote{\normalsize{\textit{Lecture 11\\June 10\\(cont'd)}}}
We will apply the \nameref{thm:dpi}.

Notice that $\X \to (M(\X,\rv R),\rv R) \to \Y$ if and only if $\Pi$
@@ -1228,7 +1230,126 @@ \section{Communication complexity}
completing the proof.
\end{prf}

-\section{Parameter estimation}
+\section{Sufficient statistics}\label{sec:ss}
+\skipto[lecture]{12}
+\lecture{June 12}
+We will develop the idea of sufficient statistics and data processing
+towards the asymptotic equipartition property.
+This is a warmup for the joint asymptotic equipartition property,
+which we will use to prove one direction of Shannon's channel-coding theorem.
+
+\begin{problem}
+Suppose $\X = (\X_1,\dotsc,\X_n)$ are sampled \iid according to $\Bern(\theta)$
+for some fixed parameter $\theta \in [0,1]$.
+
+If we have a sample $x = (x_1,\dotsc,x_n)$, how can we recover $\theta$?
+\end{problem}
+
+The classical solution (recall from STAT 230) is the maximum likelihood estimator
+$\hat\theta = \frac1n\sum_{i=1}^n x_i$, which satisfies
+$\Pr[\abs{\hat\theta-\theta} > \varepsilon] \leq 2^{-\Omega(\varepsilon^2n)}$.
+In essence, we reduce the number of bits needed to send $\theta$ from $n$
+to roughly the $\log(1/\varepsilon)$ bits needed to describe $\hat\theta$ to the desired accuracy $\varepsilon$.
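+
+(One way to see the concentration claim: Hoeffding's inequality for \iid random variables
+taking values in $[0,1]$ with mean $\theta$ gives
+$\Pr[\abs{\hat\theta-\theta} > \varepsilon] \leq 2e^{-2\varepsilon^2 n} = 2^{-\Omega(\varepsilon^2 n)}$.)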
+
+\begin{defn}
+A function $T(\X)$ is a \term{sufficient statistic}
+relative to a family $\{f_\theta(x)\}$ if $\theta \to T(\X) \to \X$.
+\end{defn}
+
+We are considering the case where $f_\theta$ is $\Bern(\theta)$.
+Clearly, $\theta \to \X \to T(\X)$ is a Markov chain
+because $\X$ is distributed based on $\theta$ and $T$ is a function of
+$\X$ which is not influenced by $\theta$.
+
+\begin{example}
+$T(\X) = \frac1n\sum_{i=1}^n \X_i$ is a sufficient statistic
+relative to the family $\{\Bern(\theta)\}$.
+\end{example}
+\begin{prf}
+We must show $\theta \to T(\X) \to \X$ is a Markov chain.
+
+Fix $x = (x_1,\dotsc,x_n)$. Notice that
+\[ \Pr\qty[\X_1=0,\dotsc,\X_n=0 \mid \frac1n\sum \X_i = \frac12] = 0 \]
+and
+\[ \Pr\qty[\X_1=1,\dotsc,\X_n=1 \mid \frac1n\sum \X_i = \frac12] = 0 \]
+since we obviously cannot have half the $\X_i$'s be 1 if they are all 0s or all 1s.
+
+But if we set exactly half of the $\X_i$'s to be 1, the distribution is uniform
+\[ \Pr\qty[\X_1=1,\dotsc,\X_{\frac{n}{2}}=1,\X_{\frac{n}{2}+1}=0,\dotsc,\X_n=0 \mid \frac1n\sum\X_i = \frac12] = \Pr[\X=x \mid \frac1n\sum\X_i = \frac12] = \frac{1}{\binom{n}{n/2}} \]
+for all $x\in\bits{n}$ such that $\frac{n}{2}$ entries are 1.
+
+More generally, suppose $x$ has exactly $k$ ones where $k = n\bar\theta$. Then,
+\[
+\Pr[\X=x \mid \frac1n\sum\X_i = \bar\theta] = \begin{cases}
+1/\binom{n}{n\bar\theta} & \frac1n\sum x_i = \bar\theta \\
+0 & \text{otherwise}
+\end{cases}
+\]
+so we have that $\X \mid \frac1n\sum\X_i = \bar\theta$ is independent of $\theta$.
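+
+(To spell out where $1/\binom{n}{n\bar\theta}$ comes from, this is just Bayes' rule
+under the $\Bern(\theta)^n$ model: if $\frac1n\sum x_i = \bar\theta$, then
+\[ \Pr[\X=x \mid \frac1n\sum\X_i = \bar\theta]
+   = \frac{\theta^{n\bar\theta}(1-\theta)^{n-n\bar\theta}}{\binom{n}{n\bar\theta}\theta^{n\bar\theta}(1-\theta)^{n-n\bar\theta}}
+   = \frac{1}{\binom{n}{n\bar\theta}} \]
+so the dependence on $\theta$ cancels.)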
+
+We can also see this by saying that $\X \sim \Bern(\theta)^n$
+can be equivalently sampled as:
+\begin{enumerate}
+\item first sampling $\rv K = k$ with probability $\Pr[\sum_i\X_i=k]$,
+\item then sampling a uniform random point that has exactly $\rv K$ ones,
+\end{enumerate}
+which clearly shows that $\X$ can be sampled as $f(\frac1n\sum\X_i,\rv R)$
+for some new randomness $\rv R$ (the uniform randomness) independent of $\theta$.
+\end{prf}
+
+\begin{example}[``mostly unrelated \textit{\href{https://en.wikipedia.org/wiki/Drunken_Master_III}{Drunken Master III}}'']
+A public domain generic drunkard legally distinct from Jackie Chan begins at $(0,0)$
+and takes steps in random directions $d_i$
+of lengths $\ell_i \sim \abs{\normal(0,\theta^2)}$.
+\end{example}
+
+Let $\X_n$ be the position at time $n$.
+We can show that
+\[ \norm{\X_n}_2 = c(1\pm o(1))\theta\sqrt{n} \]
+with probability very close to 1. To be more precise,
+\[ \Pr[\text{length from origin} > (1+o(1))(\text{expected length from origin})] \]
+is exponentially small in $n$.
+That is, after $n$ steps, the randomness cancels out,
+and we have a pretty good idea of where we end up.
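+
+(A quick sanity check on the $\theta\sqrt{n}$ scale, via second moments rather than the
+full concentration argument: the steps $\ell_i d_i$ are independent with mean-zero directions,
+so the cross terms vanish and
+\[ \mathbb{E}\qty[\norm{\X_n}_2^2] = \sum_{i=1}^n \mathbb{E}[\ell_i^2] = n\theta^2 \]
+since $\ell_i \sim \abs{\normal(0,\theta^2)}$ gives $\mathbb{E}[\ell_i^2] = \theta^2$.)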
+
+The whole point of this exercise is to notice that if we have a sufficient statistic,
+the probability measure is extremely concentrated around some constant,
+and we can almost just treat the statistic as a constant itself.
+
+\begin{example}
+Consider \iid Gaussians $\X_1,\dotsc,\X_n \sim \normal(0,1)$.
+Then, what is the probability $\Pr[\X_1+\dotsb+\X_n > t\sqrt{n}]$
+that the sum overshoots its standard deviation $\sqrt{n}$ by a factor of $t$?
+\end{example}
+\begin{sol}
+Apply simple properties of Gaussians from STAT 230:
+\begin{align*}
+\Pr[\X_1+\dotsb+\X_n > t\sqrt{n}] = \Pr[\sqrt{n}\,\normal(0,1) > t\sqrt{n}] = \Pr[\normal(0,1) > t] = 1 - \Phi(t) \approx e^{-t^2/2}
+\end{align*}
+\end{sol}
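+
+(The final approximation is the standard Gaussian tail bound: by Markov's inequality
+applied to $e^{\lambda\normal(0,1)}$,
+\[ \Pr[\normal(0,1) > t] \leq e^{-\lambda t}\,\mathbb{E}\qty[e^{\lambda\normal(0,1)}] = e^{-\lambda t + \lambda^2/2} \]
+for every $\lambda > 0$, and taking $\lambda = t$ gives $\Pr[\normal(0,1) > t] \leq e^{-t^2/2}$.)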
+
+\begin{lemma}[rotation invariance of the Gaussian]
+Let $\X \sim \normal(0,1)^n$ be a standard Gaussian vector and $O$ be an orthogonal matrix.
+Then, $O\X$ is distributed identically to $\X$.
+\end{lemma}
+\begin{prf}[super sketchy]
+Consider \iid $\X_1,\dotsc,\X_n \sim \normal(0,1)$.
+Then, since $p(x_i) = \frac1{\sqrt{2\pi}}\exp(-\frac{x_i^2}{2})$, we have
+\[ p(x_1,\dotsc,x_n) = \frac{1}{(2\pi)^{n/2}}\exp(-\frac{\norm{x}_2^2}{2}) \]
+Notice that this only depends on the length of $x$:
+the density is constant on every sphere of radius $\norm{x}_2$ centred at the origin,
+so it is unchanged by rotations.
+\end{prf}
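+
+(To make the sketch slightly more precise: for orthogonal $O$, the change-of-variables formula
+gives the density of $O\X$ at a point $y$ as
+\[ p(O^{-1}y)\abs{\det O^{-1}}
+   = \frac{1}{(2\pi)^{n/2}}\exp(-\frac{\norm{O^{-1}y}_2^2}{2})
+   = \frac{1}{(2\pi)^{n/2}}\exp(-\frac{\norm{y}_2^2}{2}) = p(y) \]
+using $\abs{\det O} = 1$ and $\norm{O^{-1}y}_2 = \norm{y}_2$.)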
+
+Now consider what's going on with a summation.
+Notice that $\sum \X_i = \ev{\X, \bb 1}$.
+There exists some rotation $O$ such that $O\bb 1 =\sqrt{n}e_1$ (the first basis vector).
+Rotations preserve inner products, so $\sum \X_i = \ev{O\X,O\bb 1} = \sqrt{n}\ev{O\X,e_1} = \sqrt{n}(O\X)_1$.
+But by rotation invariance, this has the same distribution as $\sqrt{n}\X_1$,
+which is just a Gaussian, namely $\normal(0,n)$.
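+
+(In the language of the Bernoulli discussion above: $\frac1n\sum\X_i \sim \normal(0,\frac1n)$,
+so the sample mean of $n$ standard Gaussians concentrates within $O(1/\sqrt{n})$ of its mean,
+mirroring the behaviour of $\hat\theta$.)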
+
+\chapter{Coding theory}
+
+\chapter{Parallel repetition}

\pagebreak
\phantomsection\addcontentsline{toc}{chapter}{Back Matter}

latex/agony-co432.tex

Lines changed: 2 additions & 0 deletions
@@ -7,7 +7,9 @@
\newcommand{\Y}{\rv{Y}}
\newcommand{\XX}{\sv{X}}
\newcommand{\YY}{\sv{Y}}
+\newcommand{\normal}{\mathcal{N}}
\newcommand{\D}[2]{D(#1 \parallel #2)}
+
\DeclareMathOperator{\Bern}{Bernoulli}
\newcommand{\iid}{\textsc{iid}\xspace}

latex/agony.cls

Lines changed: 1 addition & 0 deletions
@@ -201,6 +201,7 @@
\newcommand{\dilim}[1]{\dlim{#1}{\infty}} % infinite limits
\newcommand{\ilim}[1]{\lim_{#1\to\infty}}
\newcommand{\at}[2]{\left.#1\right|_{#2}}
+\newcommand{\mmid}{\mathrel{\middle|}}
\newcommand{\br}{\\} % non-aligning line break

% Fonts
