1515 \listoflecture
1616\end {multicols }
1717
18- \chapter {Introduction }
18+ \chapter {Entropy }
1919
2020\begin {notation }
2121 I will be using my usual \LaTeX {} typesetting conventions:
@@ -31,7 +31,7 @@ \chapter{Introduction}
3131 \end {itemize }
3232\end {notation }
3333
34- \section {Entropy }
34+ \section {Definition }
3535\lecture {May 6}
3636
3737\textrule {$ \downarrow $ Lecture 1 adapted from Arthur $ \downarrow $ }
@@ -1006,7 +1006,8 @@ \section{Definition and chain rules}
10061006
10071007\begin {notation }
10081008 Although relative entropy is defined only on \emph {distributions },
1009- write $ \D\X\Y $ to be $ \D {f_{\X }}{f_{\Y }}$ .
1009+ write $ \D\X\Y $ to be $ \D {f_{\X }}{f_{\Y }}$
1010+ where $ \X \sim f_{\X }$ and $ \Y \sim f_{\Y }$ .
10101011\end {notation }
10111012
10121013\begin {theorem }[chain rule for relative entropy]\label {thm:chainD }
@@ -1081,7 +1082,7 @@ \section{Definition and chain rules}
10811082 \[ I((\X_1,\dotsc,\X_n) : \Y) = \sum_{i=1}^n I(\X_i : \Y \mid (\X_1,\dotsc,\X_{i-1})) \]
10821083\end {theorem }
10831084
1084- \section {Markov chains, data processing, and sufficient statistics }
1085+ \section {Markov chains and data processing }
10851086\begin {defn }
10861087 The random variables $ \X $ , $ \Y $ , and $ \rv Z$ form a \term {Markov chain}
10871088 if the conditional distribution of $ \rv Z$ depends only on $ \Y $
@@ -1090,7 +1091,7 @@ \section{Markov chains, data processing, and sufficient statistics}
10901091 Then, we write $ \X \to \Y \to \rv Z$ .
10911092\end {defn }
10921093
1093- \begin {example }[Legend of the Drunken Master]
1094+ \begin {example }[\textit { \href {https://en.wikipedia.org/wiki/Drunken_Master_II}{ Legend of the Drunken Master} } ]
10941095 In $ \Omega = \R ^2 $ , Jackie Chan is drunk and takes steps in random directions.
10951096 He starts at $ \rv J_0 = (0 ,0 )$ .
10961097 Then, $ \rv J_1 = \rv J_0 + d_1 $ where $ d_1 $ is an independent random unit vector in $ \R ^2 $ ,
@@ -1103,7 +1104,7 @@ \section{Markov chains, data processing, and sufficient statistics}
11031104In fact, they are uniformly distributed random points on the circle of radius 1
11041105centred at $ j_2 $ .
11051106
1106- \begin {prop }
1107+ \begin {prop }[Markov chain characterization] \label { prop:markov }
11071108 Let $ \X $ , $ \Y $ , and $ \rv Z$ be random variables. \TFAE :
11081109 \begin {enumerate }
11091110 \item $ \X \to \Y \to \rv Z$
@@ -1134,6 +1135,7 @@ \section{Markov chains, data processing, and sufficient statistics}
11341135 \[ I(\X : \Y ) = I(\X : \rv Z) + I(\X : \Y \mid \rv Z) \]
11351136 One may show that the mutual information is always non-negative,
11361137 so we have $ I(\X : \Y ) \geq I(\X : \rv Z)$ as desired.
1138+ We defer the proof of the equality case to \cref{sec:ss}.
11371139\end {prf }
11381140
11391141\section {Communication complexity }
@@ -1208,7 +1210,7 @@ \section{Communication complexity}
12081210 where the last step is by Jensen's inequality.
12091211
12101212 Now, let $ \Pi $ be any protocol.
1211- \marginnote {\normalsize {\textit {Lecture 11\\ June 10\\ cont. }}}
1213+ \marginnote {\normalsize {\textit {Lecture 11\\ June 10\\ (cont'd) }}}
12121214 We will apply the \nameref {thm:dpi }.
12131215
12141216 Notice that $ \X \to (M(\X ,\rv R),\rv R) \to \Y $ if and only if $ \Pi $
@@ -1228,7 +1230,126 @@ \section{Communication complexity}
12281230 completing the proof.
12291231\end {prf }
12301232
1231- \section {Parameter estimation }
1233+ \section {Sufficient statistics }\label {sec:ss }
1234+ \skipto [lecture]{12}
1235+ \lecture {June 12}
1236+ We will develop the idea of sufficient statistics and data processing
1237+ towards the asymptotic equipartition property.
1238+ This is a warmup for the joint asymptotic equipartition property
1239+ which we will use to prove one direction of Shannon's channel-coding theorem.
1240+
1241+ \begin {problem }
1242+ Suppose $ \X = (\X _1 ,\dotsc ,\X _n)$ are \iid sampled according to $ \Bern (\theta )$
1243+ for some fixed parameter $ \theta \in [0 ,1 ]$ .
1244+
1245+ If we have a sample $ x = (x_1 ,\dotsc ,x_n)$ , how can we recover $ \theta $ ?
1246+ \end {problem }
1247+
1248+ The classical solution (recall from STAT 230) is the maximum likelihood estimator
1249+ $\hat\theta = \frac1n\sum_{i=1}^n x_i$, which satisfies
1250+ $\Pr[\abs{\hat\theta-\theta} > \varepsilon] \leq 2^{-\Omega(\varepsilon^2 n)}$.
1251+ In essence, we are reducing the number of bits needed to convey $\theta$ from the $n$ bits of the sample
1252+ down to the roughly $\log\frac{1}{\varepsilon}$ bits needed to write down $\hat\theta$ to the desired accuracy $\varepsilon$.
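Concretely, one standard way to pin down the constant in the exponent is Hoeffding's inequality for \iid samples in $[0,1]$:
\[ \Pr\qty[\abs{\hat\theta - \theta} > \varepsilon]
   = \Pr\qty[\abs{\frac1n\sum_{i=1}^n (\X_i - \theta)} > \varepsilon]
   \leq 2\exp(-2\varepsilon^2 n), \]
which is indeed of the form $2^{-\Omega(\varepsilon^2 n)}$.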
1253+
1254+ \begin {defn }
1255+ A function $ T(\X )$ is a \term {sufficient statistic}
1256+ relative to a family $ \{ f_\theta (x)\} $ if $ \theta \to T(\X ) \to \X $ .
1257+ \end {defn }
1258+
1259+ We are considering the case where $ f_\theta $ is $ \Bern (\theta )$ .
1260+ Clearly, $ \theta \to \X \to T(\X )$ is a Markov chain
1261+ because $\X$ is distributed based on $\theta$, while $T(\X)$ is a function of
1262+ $\X$ alone and hence, given $\X$, is not influenced by $\theta$.
1263+
1264+ \begin {example }
1265+ $ T(\X ) = \frac 1 n\sum _{i=1}^n \X _i$ is a sufficient statistic
1266+ relative to the family $ \{ \Bern (\theta )\} $ .
1267+ \end {example }
1268+ \begin {prf }
1269+ We must show $ \theta \to T(\X ) \to \X $ is a Markov chain.
1270+
1271+ Fix $ x = (x_1 ,\dotsc ,x_n)$ . Notice that
1272+ \[ \Pr\qty [\X _1=0,\dotsc ,\X _n=0 \mid \frac 1n\sum \X _i = \frac 12] = 0 \]
1273+ and
1274+ \[ \Pr\qty [\X _1=1,\dotsc ,\X _n=1 \mid \frac 1n\sum \X _i = \frac 12] = 0 \]
1275+ since we obviously cannot have half the $ \X _i$ 's be 1 if they are all 0s or all 1s.
1276+
1277+ But if we set exactly half of the $ \X _i$ 's to be 1, the distribution is uniform
1278+ \[ \Pr\qty [\X _1=1,\dotsc ,\X _{\frac {n}{2}}=1,\X _{\frac {n}{2}+1}=0,\dotsc ,\X _n=0 \mid \frac 1n\sum\X _i = \frac 12] = \Pr [\X =x \mid \frac 1n\sum\X _i = \frac 12] = \frac {1}{\binom {n}{n/2}} \]
1279+ for all $ x\in \bits {n}$ such that $ \frac {n}{2}$ entries are 1.
1280+
1281+ More generally, fix $\bar\theta$ such that $k = n\bar\theta$ is an integer. Then, for any $x \in \bits{n}$,
1282+ \[
1283+ \Pr [\X =x \mid \frac 1n\sum\X _i = \bar\theta ] = \begin {cases }
1284+ 1/\binom {n}{n\bar\theta } & \frac 1n\sum x_i = \bar\theta \\
1285+ 0 & \text {otherwise}
1286+ \end {cases }
1287+ \]
1288+ so we have that $ \X \mid \frac 1 n\sum \X _i = \bar \theta $ is independent of $ \theta $ .
1289+
1290+ We can also see this by saying that $ \X \sim \Bern (\theta )^n$
1291+ can be equivalently sampled as:
1292+ \begin {enumerate }
1293+ \item first sampling $ \rv K = k$ with probability $ \Pr [\frac 1 n\sum \X _i=k]$ ,
1294+ \item then sampling a uniformly random point of $\bits{n}$ with exactly $n\rv K$ ones.
1295+ \end {enumerate }
1296+ This two-step procedure shows that $\X$ can be sampled as $f(\frac1n\sum\X_i,\rv R)$
1297+ for some new randomness $ \rv R$ (the uniform randomness) independent of $ \theta $ .
1298+ \end {prf }
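For a tiny worked case, take $n = 2$ and condition on seeing exactly one 1:
\[ \Pr\qty[\X = (1,0) \mid \X_1 + \X_2 = 1]
   = \frac{\theta(1-\theta)}{2\theta(1-\theta)} = \frac12 = \frac{1}{\binom{2}{1}}, \]
and symmetrically for $(0,1)$, so $\theta$ cancels exactly as the general formula predicts.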
1299+
1300+ \begin {example }[`` mostly unrelated \textit {\href {https://en.wikipedia.org/wiki/Drunken_Master_III}{Drunken Master III} }'' ]
1301+ A public domain generic drunkard legally distinct from Jackie Chan begins at $ (0 ,0 )$
1302+ and takes independent steps $d_i$ in uniformly random directions,
1303+ each of length $\ell_i \sim \abs{\normal(0,\theta^2)}$.
1304+ \end {example }
1305+
1306+ Let $ \X _n$ be the position at time $ n$ .
1307+ We can show that
1308+ \[ \norm {\X _n}_2 = c(1\pm o(1))\theta\sqrt {n} \]
1309+ with probability very close to 1. To be more precise, for any fixed $\varepsilon > 0$,
1310+ \[ \Pr[\text{length from origin} > (1+\varepsilon)(\text{expected length from origin})] \]
1311+ is exponentially small in $n$.
1312+ That is, after $n$ steps, the randomness cancels out,
1313+ and we have a pretty good idea of how far from the origin we end up (though not in which direction).
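Where the $\theta\sqrt{n}$ scale comes from (a second-moment sketch; the only facts used are that the steps are independent with mean zero): the cross terms vanish in expectation, so
\[ \mathbb{E}\qty[\norm{\X_n}_2^2]
   = \sum_{i=1}^n \mathbb{E}\qty[\norm{d_i}_2^2] + \sum_{i \neq j} \mathbb{E}\qty[\ev{d_i, d_j}]
   = n\,\mathbb{E}\qty[\ell^2] = n\theta^2, \]
using $\mathbb{E}\qty[\ell^2] = \theta^2$ for $\ell \sim \abs{\normal(0,\theta^2)}$.
The constant $c$ absorbs the gap between $\sqrt{\mathbb{E}\norm{\X_n}_2^2}$ and the expected length itself.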
1314+
1315+ The whole point of this exercise is to notice that the sufficient statistics we care about,
1316+ being averages of many independent samples, are extremely concentrated around some constant,
1317+ and we can almost just treat the statistic as a constant itself.
1318+
1319+ \begin {example }
1320+ Consider \iid Gaussians $ \X _1 ,\dotsc ,\X _n \sim \normal (0 ,1 )$ .
1321+ Then, what is the probability $\Pr[\X_1 + \dotsb + \X_n > t\sqrt{n}]$
1322+ that the sum overshoots its mean by $t$ standard deviations?
1323+ \end {example }
1324+ \begin {sol }
1325+ Apply simple properties of Gaussians from STAT 230:
1326+ \begin {align* }
1327+ \Pr[\X_1 + \dotsb + \X_n > t\sqrt{n}] = \Pr[\sqrt{n}\,\normal(0,1) > t\sqrt{n}] = \Pr[\normal(0,1) > t] = 1 - \Phi(t) \approx e^{-t^2/2}
1328+ \end {align* }
1329+ \end {sol }
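One standard way to see the tail bound $\Pr[\normal(0,1) > t] \leq e^{-t^2/2}$ used above is the Chernoff/moment-generating-function argument: for any $\lambda > 0$,
\[ \Pr[\normal(0,1) > t]
   \leq e^{-\lambda t}\,\mathbb{E}\qty[e^{\lambda\normal(0,1)}]
   = e^{\lambda^2/2 - \lambda t}, \]
and choosing $\lambda = t$ gives exactly $e^{-t^2/2}$.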
1330+
1331+ \begin {lemma }[rotation invariance of the Gaussian]
1332+ Let $\X = (\X_1,\dotsc,\X_n)$ be a standard Gaussian vector (\iid $\normal(0,1)$ coordinates) and $O$ be an orthogonal matrix.
1333+ Then, $O\X$ is distributed identically to $\X$.
1334+ \end {lemma }
1335+ \begin {prf }[super sketchy]
1336+ Consider \iid $ \X _1 ,\dotsc ,\X _n \sim \normal (0 ,1 )$ .
1337+ Then, since $ p(x_i) = \frac 1 {\sqrt {2\pi }}\exp (-\frac {x_i^2}{2})$ , we have
1338+ \[ p(x_1,\dotsc ,x_n) = \frac {1}{\sqrt {2\pi }^n}\exp (-\frac {\norm {x}_2^2}{2}) \]
1339+ Notice that this only depends on the length of $x$, so the density is rotation-invariant:
1340+ conditioned on $\norm{\X}_2 = r$, the vector $\X$ is uniform on the sphere of radius $r$ in $\R^n$.
1341+ \end {prf }
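To fill in the sketch: by the change-of-variables formula, and since $O$ is orthogonal ($\abs{\det O} = 1$ and $\norm{O^{-1}x}_2 = \norm{x}_2$),
\[ p_{O\X}(x) = p_{\X}(O^{-1}x)\,\abs{\det O^{-1}}
   = \frac{1}{\sqrt{2\pi}^n}\exp(-\frac{\norm{x}_2^2}{2}) = p_{\X}(x), \]
which is exactly the statement of the lemma.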
1342+
1343+ Now consider what's going on with a summation.
1344+ Notice that $ \sum \X _i = \ev {\X , \bb 1}$ .
1345+ There exists some rotation $ O$ such that $ O\bb 1 =\sqrt {n}e_1 $ (the first basis vector).
1346+ Rotations preserve inner products, so $\sum\X_i = \ev{O\X, O\bb 1} = \sqrt{n}\ev{O\X, e_1} = \sqrt{n}(O\X)_1$.
1347+ But by rotation invariance, $(O\X)_1$ has the same distribution as $\X_1$,
1348+ so $\sum\X_i$ is distributed as $\sqrt{n}\X_1$, i.e., as $\normal(0,n)$, which is exactly the property used in the solution above.
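As a sanity check on the rotation argument, the second moment comes out the same way by direct computation, using independence and $\mathbb{E}[\X_i] = 0$, $\mathbb{E}[\X_i^2] = 1$:
\[ \mathbb{E}\qty[\qty(\sum_{i=1}^n \X_i)^2]
   = \sum_{i=1}^n \mathbb{E}\qty[\X_i^2] + \sum_{i \neq j} \mathbb{E}[\X_i]\,\mathbb{E}[\X_j]
   = n = \mathbb{E}\qty[\qty(\sqrt{n}\,\X_1)^2]. \]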
1349+
1350+ \chapter {Coding theory }
1351+
1352+ \chapter {Parallel repetition }
12321353
12331354\pagebreak
12341355\phantomsection\addcontentsline {toc}{chapter}{Back Matter}