
Commit 3445dfc

CO432/notes: jun 12
1 parent 3f7148c commit 3445dfc

File tree: 2 files changed (+104, -22 lines)


CO432/notes.pdf: 9.05 KB (binary file not shown)

CO432/notes.tex: 104 additions & 22 deletions
@@ -297,6 +297,7 @@ \section{Entropy as optimal lossless data compression}
 Taking a limit of some sort, we can say that we need $H(\Y) + o(1)$ bits.
 \end{sol}

+\chapter{Relative entropy}
 \begin{defn*}[relative entropy]
 Given two discrete distributions $p = (p_i)$ and $q = (q_i)$,
 the \term[entropy!relative]{relative entropy}
@@ -447,7 +448,6 @@ \section{Entropy as optimal lossless data compression}
 which tends to 1 as $k \to \infty$, as desired.
 \end{prf}

-\chapter{Applications of KL divergence}
 \lecture{May 20}

 \begin{notation}
@@ -942,8 +942,9 @@ \section{Rejection sampling}
 as desired.
 \end{prf}

-\section{Mutual information and communication complexity}
+\chapter{Mutual information}

+\section{Definition and chain rules}
 \lecture{June 3}
 \begin{notation}
 Given two jointly distributed random variables $(\X,\Y)$ over sample space
@@ -957,7 +958,7 @@ \section{Mutual information and communication complexity}
 & = H(\X) - H(\X \mid \Y) \\
 & = H(\Y) - H(\Y \mid \X)
 \end{align*}
-where the \term{conditional entropy} $H(\X \mid \Y)$ is
+where the \term[entropy!conditional]{conditional entropy} $H(\X \mid \Y)$ is
 \[ \sum_{y \in \YY} p_y \cdot H((\X|_{\Y=y})) \]
 \end{defn}
 This is entirely analogous to saying that
@@ -1009,13 +1010,23 @@ \section{Mutual information and communication complexity}
 \end{notation}

 \begin{theorem}[chain rule for relative entropy]\label{thm:chainD}
-Let $p(x,y)$ and $q(x,y)$ be distributions. Then,
-\[ \D{p(x,y)}{q(x,y)} = \D{p(x)}{q(x)} + \D{p(y \mid x)}{q(y \mid x)} \]
+Let $p$ and $q : \XX\times\YY \to [0,1]$ be distributions.
+Let $p(x) \coloneqq \sum_{y\in\YY} p(x,y)$ denote marginals of $p$
+and $p(y|x) \coloneqq \frac{p(x,y)}{p(x)}$ denote conditionals of $p$.
+Then,
+\begin{align*}
+\D{p(x,y)}{q(x,y)}
+& = \D{p(x)}{q(x)} + \D{p(y|x)}{q(y|x)} \\
+& = \D{p(x)}{q(x)} + \sum_{x\in\XX} p(x) \cdot \D{(p(y|x))_{y\in\YY}}{(q(y|x))_{y\in\YY}}
+\end{align*}
+where $\D{p(y|x)}{q(y|x)}$
+is the \term[entropy!relative!conditional]{conditional relative entropy}.
+
 Equivalently, let $(\X_1,\Y_1)$ and $(\X_2,\Y_2)$ be two joint random variables.
 Then,
 \[
 \D{(\X_1,\Y_1)}{(\X_2,\Y_2)}
-= \D{\X_1}{\X_2} + \D{\Y_1|_{(\X_1,\X_2)}}{\Y_2|_{(\X_1,\X_2)}}
+= \D{\X_1}{\X_2} + \sum_{x\in\XX}\Pr[\X_1=x]\cdot\D{\Y_1|_{\X_1=x}}{\Y_2|_{\X_2=x}}
 \]
 \end{theorem}
 \begin{prf}[for distributions]
@@ -1061,6 +1072,72 @@ \section{Mutual information and communication complexity}
 \end{align*}
 \end{prf}
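As a sanity check on the chain rule for relative entropy stated above, here is a small numerical sketch in Python (illustrative only; the kl helper and the 2x3 distributions are arbitrary choices made here, not part of the notes). It confirms that D(p(x,y) || q(x,y)) = D(p(x) || q(x)) + sum_x p(x) * D(p(.|x) || q(.|x)).

    import numpy as np

    def kl(p, q):
        # Relative entropy D(p || q) in bits, assuming supp(p) is contained in supp(q).
        p, q = np.asarray(p, float), np.asarray(q, float)
        m = p > 0
        return float(np.sum(p[m] * np.log2(p[m] / q[m])))

    # Two arbitrary joint distributions p(x,y), q(x,y) on a 2x3 grid.
    p = np.array([[0.10, 0.20, 0.10],
                  [0.25, 0.15, 0.20]])
    q = np.array([[0.15, 0.15, 0.20],
                  [0.20, 0.10, 0.20]])

    lhs = kl(p.ravel(), q.ravel())                      # D(p(x,y) || q(x,y))
    px, qx = p.sum(axis=1), q.sum(axis=1)               # marginals p(x), q(x)
    cond = sum(px[x] * kl(p[x] / px[x], q[x] / qx[x])   # sum_x p(x) D(p(.|x) || q(.|x))
               for x in range(len(px)))

    print(lhs, kl(px, qx) + cond)                       # the two sides agree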

+\skipto[lecture]{11}
+\lecture{June 10}
+\begin{theorem}[chain rule for mutual information]
+Let $\X_1$, $\X_2$, and $\Y$ be random variables. Then,
+\[ I((\X_1,\X_2) : \Y) = I(\X_1 : \Y) + I(\X_2 : (\Y \mid \X_1)) \]
+and in general
+\[ I((\X_1,\dotsc,\X_n) : \Y) = \sum_{i=1}^n I(\X_i : (\Y \mid (\X_1,\dotsc,\X_{i-1}))) \]
+\end{theorem}
+
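The two-variable case of this chain rule can be checked the same way. A minimal sketch, assuming a random joint distribution over (X1, X2, Y) and computing each term from entropies (the H helper and the shape (2, 3, 4) are arbitrary choices, not from the notes):

    import numpy as np

    def H(p):
        # Shannon entropy in bits of a probability array of any shape.
        p = np.asarray(p, float).ravel()
        p = p[p > 0]
        return float(-np.sum(p * np.log2(p)))

    rng = np.random.default_rng(0)
    p = rng.random((2, 3, 4))          # p[x1, x2, y]: joint distribution of (X1, X2, Y)
    p /= p.sum()

    p_x1, p_y = p.sum(axis=(1, 2)), p.sum(axis=(0, 1))
    p_x1y, p_x1x2 = p.sum(axis=1), p.sum(axis=2)

    lhs = H(p_x1x2) + H(p_y) - H(p)                          # I((X1,X2) : Y)
    i_x1_y = H(p_x1) + H(p_y) - H(p_x1y)                     # I(X1 : Y)
    i_x2_y_given_x1 = H(p_x1x2) + H(p_x1y) - H(p) - H(p_x1)  # I(X2 : Y | X1)

    print(lhs, i_x1_y + i_x2_y_given_x1)                     # equal up to rounding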
+\section{Markov chains, data processing, and sufficient statistics}
+\begin{defn}
+The random variables $\X$, $\Y$, and $\rv Z$ form a \term{Markov chain}
+if the conditional distribution of $\rv Z$ depends only on $\Y$
+and is conditionally independent of $\X$ given $\Y$. Equivalently,
+\[ \Pr[\X=x,\Y=y,\rv Z=z] = \Pr[\X=x]\cdot\Pr[\Y=y\mid\X=x]\cdot\Pr[\rv Z=z\mid\Y=y] \]
+Then, we write $\X \to \Y \to \rv Z$.
+\end{defn}
+
+\begin{example}[Legend of the Drunken Master]
+In $\Omega = \R^2$, Jackie Chan is drunk and takes steps in random directions.
+He starts at $\rv J_0 = (0,0)$.
+Then, $\rv J_1 = \rv J_0 + d_1$ where $d_1$ is an independent random unit vector in $\R^2$,
+and $\rv J_2 = \rv J_1 + d_2$, and so on.
+\end{example}
+
+First, $\rv J_3$ and $\rv J_1$ are not independent.
+But if we fix $\rv J_2 = j_2 \in \R^2$, then $\rv J_1 \mid \rv J_2=j_2$
+and $\rv J_3 \mid \rv J_2=j_2$ are independent.
+In fact, they are uniformly distributed random points on the circle of radius 1
+centred at $j_2$.
+
+\begin{prop}
+Let $\X$, $\Y$, and $\rv Z$ be random variables. \TFAE:
+\begin{enumerate}
+\item $\X \to \Y \to \rv Z$
+\item $\X$ and $\rv Z$ are conditionally independent given $\Y$.
+That is,
+\[
+\Pr[\X=x,\rv Z=z \mid \Y=y] = \Pr[\X=x \mid \Y=y] \cdot \Pr[\rv Z=z \mid \Y=y]
+\]
+\item $\rv Z$ is distributed according to $f(\Y,\rv R)$ for some function $f$
+and some $\rv R$ independent of $\X$ and $\Y$.
+\end{enumerate}
+\end{prop}
+\begin{xca}
+Prove that these three conditions are equivalent.
+\end{xca}
+
+\begin{theorem}[data-processing inequality]\label{thm:dpi}
+If $\X \to \Y \to \rv Z$, then $I(\X : \rv Z) \leq I(\X : \Y)$.
+
+Equality holds if and only if $\X \to \rv Z \to \Y$.
+\end{theorem}
+\begin{prf}
+By the chain rule,
+\[
+I(\X : (\Y, \rv Z)) = I(\X : \Y) + \cancelto{0}{I(\X : \rv Z \mid \Y)}
+= I(\X : \rv Z) + I(\X : \Y \mid \rv Z)
+\]
+so that
+\[ I(\X : \Y) = I(\X : \rv Z) + I(\X : \Y \mid \rv Z) \]
+One may show that mutual information, and hence conditional mutual information,
+is always non-negative, so we have $I(\X : \Y) \geq I(\X : \rv Z)$, as desired.
+\end{prf}
+
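To illustrate the Markov chain definition and the data-processing inequality together, here is a hedged sketch: it builds X -> Y -> Z by making Z depend on Y alone through a transition matrix (a special case of Z = f(Y, R)), then checks I(X : Z) <= I(X : Y). The specific distributions are arbitrary choices, not from the notes.

    import numpy as np

    def H(p):
        p = np.asarray(p, float).ravel()
        p = p[p > 0]
        return float(-np.sum(p * np.log2(p)))

    def I(pab):
        # Mutual information in bits from a 2-D joint distribution p[a, b].
        return H(pab.sum(axis=1)) + H(pab.sum(axis=0)) - H(pab)

    px = np.array([0.3, 0.7])                 # distribution of X
    py_given_x = np.array([[0.9, 0.1, 0.0],   # rows: Pr[Y = . | X = x]
                           [0.2, 0.5, 0.3]])
    pz_given_y = np.array([[0.8, 0.2],        # rows: Pr[Z = . | Y = y];
                           [0.5, 0.5],        # Z depends on Y only, so X -> Y -> Z
                           [0.1, 0.9]])

    pxy = px[:, None] * py_given_x            # Pr[X=x, Y=y]
    pxz = pxy @ pz_given_y                    # Pr[X=x, Z=z] = sum_y Pr[X,Y] Pr[Z|Y]

    print(I(pxy), I(pxz))                     # I(X:Y) >= I(X:Z), as the theorem requires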
+\section{Communication complexity}
+\skipto[lecture]{10}
 \lecture{June 5}
 \begin{problem}
 Suppose there is a joint distribution $(\X,\Y)$ that Alice and Bob wish to
@@ -1123,31 +1200,36 @@ \section{Mutual information and communication complexity}
 The performance is:
 \begin{align*}
 \E_{\mathclap{\X,\rv R}} \abs{M(\X,\rv R)}
-& = \sum_{x\in\XX} p_x \E_{i^*,\rv Y_1,\rv Y_2,\dotsc}[\ell(i^*)] \\
-& \leq \sum_{x\in\XX} p_x \qty(\D{\Y|_{\X=x}}{\Y} + \order{\log \D{\Y_{\X=x}}{\Y}}) \\
-& = I(\X:\Y) + \sum_{x\in\XX}p_x\order{\log \D{\Y_{\X=x}}{\Y}} \\
+& = \sum_{x\in\XX} p_x \E_{i^*,\rv Y_1,\rv Y_2,\dotsc}[\ell(i^*)] \\
+& \leq \sum_{x\in\XX} p_x \qty(\D{\Y|_{\X=x}}{\Y} + \order{\log \D{\Y|_{\X=x}}{\Y}}) \\
+& = I(\X:\Y) + \sum_{x\in\XX}p_x\order{\log \D{\Y|_{\X=x}}{\Y}} \\
 & \leq I(\X:\Y) + \order{\log I(\X:\Y)}
 \end{align*}
 where the last step is by Jensen's inequality.
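The third line of the derivation above uses the identity I(X:Y) = sum_x p_x * D(Y|_{X=x} || Y), which is the chain rule for relative entropy applied with q(x,y) = p(x)p(y). A quick numerical check (the joint distribution and kl helper below are arbitrary, illustrative choices, not from the notes):

    import numpy as np

    def kl(p, q):
        p, q = np.asarray(p, float), np.asarray(q, float)
        m = p > 0
        return float(np.sum(p[m] * np.log2(p[m] / q[m])))

    # Arbitrary joint distribution p[x, y] of (X, Y).
    pxy = np.array([[0.05, 0.25, 0.10],
                    [0.30, 0.10, 0.20]])
    px, py = pxy.sum(axis=1), pxy.sum(axis=0)

    mi = kl(pxy.ravel(), np.outer(px, py).ravel())     # I(X:Y) = D(p(x,y) || p(x)p(y))
    mi_as_sum = sum(px[x] * kl(pxy[x] / px[x], py)     # sum_x p_x D(Y|X=x || Y)
                    for x in range(len(px)))

    print(mi, mi_as_sum)                               # equal up to rounding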

 Now, let $\Pi$ be any protocol.
-By \nameref{thm:kraft}, $\E_{\X,\rv R} \abs{M(\X,\rv R)} \geq H(M(\X,\rv R))$.
-
-We will also show the data-processing inequality:
+\marginnote{\normalsize{\textit{Lecture 11\\June 10\\cont.}}}
+We will apply the \nameref{thm:dpi}.
+
+Notice that $\X \to (M(\X,\rv R),\rv R) \to \Y$ if and only if $\Pi$
+is a valid protocol.
+If we sample $x\sim\X$ and Alice sends $M(x,\rv R)$,
+then Bob outputs something distributed according to $\Y \mid \X=x$;
+since this holds for every $x$, the (input, output) pair is jointly distributed as $(\X,\Y)$. Then,
 \begin{align*}
-I(\X : (M(\X,\rv R), \rv R))
-& = I(X : M(\X,\rv R)) + \sum_{r\in\Omega} p_r I(\X|_{\rv R=r} : M(\X,\rv R)|_{\rv R=r}) \tag{chain rule} \\
-& = 0 + \sum_{r\in\Omega} p_r I(\X|_{\rv R=r} : M(\X,\rv R)|_{\rv R=r}) \tag{since $\X$ and $\rv R$ are independent} \\
-& \leq \sum_{r\in\Omega} p_r H(M(\X,\rv R)|_{\rv R=r}) \tag{venn diagram logic} \\
-& = H(M(\X,\rv R) \mid \rv R) \\
-& \leq H(M(\X,\rv R)) \tag{conditioning only reduces entropy}
+I(\X : \Y)
+& \leq I(\X : (M(\X,\rv R),\rv R)) \tag{data processing inequality} \\
+& = I(\X : \rv R) + \sum_{r\in\Omega_{\rv R}} p_r I(\X|_{\rv R=r} : M(\X,\rv R)|_{\rv R=r}) \tag{chain rule} \\
+& = 0 + I(\X : M(\X,\rv R) \mid \rv R) \tag{independence} \\
+& \leq H(M(\X,\rv R) \mid \rv R) \tag{$I(\rv A : \rv B) \leq \min\{H(\rv A),H(\rv B)\}$} \\
+& \leq H(M(\X,\rv R)) \tag{$H(\rv A \mid \rv B) \leq H(\rv A)$} \\
+& \leq \E\abs{M(\X,\rv R)} \tag{Kraft inequality}
 \end{align*}
-In general,
-\[ I(\rv A : f(\rv B)) \leq I(\rv A : \rv B) \]
-and we are doing the case where $\rv A = \X$, $\rv B = (M(\X,\rv R))$,
-and $f = y$.
+completing the proof.
 \end{prf}
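For contrast with the chain of inequalities above: assuming, as the proof indicates, that Bob must output a sample distributed according to Y |_{X=x}, the naive protocol where Alice samples y herself and sends it with a prefix-free code for the marginal of Y is valid but costs roughly H(Y) >= I(X:Y) bits. A rough sketch of that comparison (arbitrary distribution and helpers of my own, not from the notes):

    import numpy as np

    def H(p):
        p = np.asarray(p, float).ravel()
        p = p[p > 0]
        return float(-np.sum(p * np.log2(p)))

    # Arbitrary joint distribution p[x, y] of (X, Y).
    pxy = np.array([[0.05, 0.25, 0.10],
                    [0.30, 0.10, 0.20]])
    px, py = pxy.sum(axis=1), pxy.sum(axis=0)

    mutual_info = H(px) + H(py) - H(pxy)      # I(X : Y), the lower bound from the proof

    # Naive protocol: Alice samples y ~ Y|X=x and sends it with a prefix-free code
    # for the marginal of Y (lengths ceil(log2 1/p(y)) satisfy the Kraft inequality).
    lengths = np.ceil(np.log2(1.0 / py))
    expected_msg_len = float(py @ lengths)    # averaged over x, the sent y ~ marginal of Y

    print(mutual_info, "<=", expected_msg_len)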

+\section{Parameter estimation}
+
 \pagebreak
 \phantomsection\addcontentsline{toc}{chapter}{Back Matter}
 \renewcommand{\listtheoremname}{List of Named Results}
