@@ -297,6 +297,7 @@ \section{Entropy as optimal lossless data compression}
297297 Taking a limit of some sort, we can say that we need $ H(\Y ) + o(1 )$ bits.
298298\end {sol }
299299
300+ \chapter {Relative entropy }
300301\begin {defn* }[relative entropy]
301302 Given two discrete distributions $ p = (p_i)$ and $ q = (q_i)$ ,
302303 the \term [entropy!relative]{relative entropy}
@@ -447,7 +448,6 @@ \section{Entropy as optimal lossless data compression}
447448 which tends to 1 as $ k \to \infty $ , as desired.
448449\end {prf }
449450
450- \chapter {Applications of KL divergence }
451451\lecture {May 20}
452452
453453\begin {notation }
@@ -942,8 +942,9 @@ \section{Rejection sampling}
942942 as desired.
943943\end {prf }
944944
945- \section {Mutual information and communication complexity }
945+ \chapter {Mutual information }
946946
947+ \section {Definition and chain rules }
947948\lecture {June 3}
948949\begin {notation }
949950 Given two jointly distributed random variables $ (\X ,\Y )$ over sample space
@@ -957,7 +958,7 @@ \section{Mutual information and communication complexity}
957958 & = H(\X ) - H(\X \mid \Y ) \\
958959 & = H(\Y ) - H(\Y \mid \X )
959960 \end {align* }
960- where the \term {conditional entropy} $ H(\X \mid \Y )$ is
961+ where the \term [entropy!conditional] {conditional entropy} $ H(\X \mid \Y )$ is
961962 \[ \sum _{y \in \YY } p_y \cdot H((\X |_{\Y =y})) \]
962963\end {defn }
963964This is entirely analogous to saying that
@@ -1009,13 +1010,23 @@ \section{Mutual information and communication complexity}
10091010\end {notation }
10101011
10111012\begin {theorem }[chain rule for relative entropy]\label {thm:chainD }
1012- Let $ p(x,y)$ and $ q(x,y)$ be distributions. Then,
1013- \[ \D {p(x,y)}{q(x,y)} = \D {p(x)}{q(x)} + \D {p(y \mid x)}{q(y \mid x)} \]
1013+ Let $ p$ and $ q : \XX \times \YY \to [0 ,1 ]$ be distributions.
1014+ Let $ p(x) \coloneqq \sum _{y\in\YY } p(x,y)$ denote marginals of $ p$
1015+ and $ p(y|x) \coloneqq \frac {p(x,y)}{p(x)}$ denote conditionals of $ p$ .
1016+ Then,
1017+ \begin {align* }
1018+ \D {p(x,y)}{q(x,y)}
1019+ & = \D {p(x)}{q(x)} + \D {p(y|x)}{q(y|x)} \\
1020+ & = \D {p(x)}{q(x)} + \sum _{x\in\XX } p(x) \cdot \D {(p(y|x))_{y\in\YY }}{(q(y|x))_{y\in\YY }}
1021+ \end {align* }
1022+ where $ \D {p(y|x)}{q(y|x)}$
1023+ is the \term [entropy!relative!conditional]{conditional relative entropy}.
1024+
10141025 Equivalently, let $ (\X _1 ,\Y _1 )$ and $ (\X _2 ,\Y _2 )$ be two joint random variables.
10151026 Then,
10161027 \[
10171028 \D {(\X _1,\Y _1)}{(\X _2,\Y _2)}
1018- = \D {\X _1}{\X _2} + \D {\Y _1|_{( \X _1, \X _2) }}{\Y _2|_{( \X _1, \ X _2) }}
1029+ = \D {\X _1}{\X _2} + \sum _{x\in\XX } \Pr [\X _1=x] \cdot \D {\Y _1|_{\X _1=x}}{\Y _2|_{\X _2=x}}
10191030 \]
10201031\end {theorem }
10211032\begin {prf }[for distributions]
@@ -1061,6 +1072,72 @@ \section{Mutual information and communication complexity}
10611072 \end {align* }
10621073\end {prf }
10631074
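For instance, specializing the chain rule to product distributions:
if $p(x,y) = p(x)\,p(y)$ and $q(x,y) = q(x)\,q(y)$,
then $p(y \mid x) = p(y)$ for every $x$, so the conditional term is just $\D{p(y)}{q(y)}$ and
\[ \D{p(x,y)}{q(x,y)} = \D{p(x)}{q(x)} + \D{p(y)}{q(y)} \]
That is, relative entropy is additive over independent coordinates.
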
1075+ \skipto [lecture]{11}
1076+ \lecture {June 10}
1077+ \begin {theorem }[chain rule for mutual information]
1078+ Let $ \X _1 $ , $ \X _2 $ , and $ \Y $ be random variables. Then,
1079+ \[ I((\X _1,\X _2) : \Y ) = I(\X _1 : \Y ) + I(\X _2 : \Y \mid \X _1) \]
1080+ and in general
1081+ \[ I((\X _1,\dotsc ,\X _n) : \Y ) = \sum _{i=1}^n I(\X _i : \Y \mid (\X _1,\dotsc ,\X _{i-1})) \]
1082+ \end {theorem }
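
One way to see the two-variable case is to expand both sides using the chain rule for entropy,
$H(\rv A,\rv B) = H(\rv A) + H(\rv B \mid \rv A)$:
\begin{align*}
    I((\X _1,\X _2) : \Y )
    & = H(\X _1,\X _2) - H(\X _1,\X _2 \mid \Y ) \\
    & = \qty (H(\X _1) + H(\X _2 \mid \X _1)) - \qty (H(\X _1 \mid \Y ) + H(\X _2 \mid \X _1,\Y )) \\
    & = I(\X _1 : \Y ) + I(\X _2 : \Y \mid \X _1)
\end{align*}
and the general form follows by induction on $n$.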
1083+
1084+ \section {Markov chains, data processing, and sufficient statistics }
1085+ \begin {defn }
1086+ The random variables $ \X $ , $ \Y $ , and $ \rv Z$ form a \term {Markov chain}
1087+ if the conditional distribution of $ \rv Z$ depends only on $ \Y $
1088+ and is conditionally independent of $ \X $ . Equivalently,
1089+ \[ \Pr [\X =x,\Y =y,\rv Z=z] = \Pr [\X =x]\cdot\Pr [\Y =y\mid\X =x]\cdot\Pr [\rv Z=z\mid\Y =y] \]
1090+ Then, we write $ \X \to \Y \to \rv Z$ .
1091+ \end {defn }
1092+
1093+ \begin {example }[Legend of the Drunken Master]
1094+ In $ \Omega = \R ^2 $ , Jackie Chan is drunk and takes steps in random directions.
1095+ He starts at $ \rv J_0 = (0 ,0 )$ .
1096+ Then, $ \rv J_1 = \rv J_0 + d_1 $ where $ d_1 $ is an independent random unit vector in $ \R ^2 $ ,
1097+ and $ \rv J_2 = \rv J_1 + d_2 $ and so on.
1098+ \end {example }
1099+
1100+ First, $ \rv J_3 $ and $ \rv J_1 $ are not independent.
1101+ But if we fix $ \rv J_2 = j_2 \in \R ^2 $ , then $ \rv J_1 \mid \rv J_2 =j_2 $
1102+ and $ \rv J_3 \mid \rv J_2 =j_2 $ are independent.
1103+ In fact, $ \rv J_3 \mid \rv J_2 =j_2 $ is a uniformly random point on the circle of radius 1 centred at $ j_2 $ ,
1104+ and $ \rv J_1 \mid \rv J_2 =j_2 $ likewise lies at distance exactly 1 from $ j_2 $ . In particular, $ \rv J_1 \to \rv J_2 \to \rv J_3 $ .
1105+
1106+ \begin {prop }
1107+ Let $ \X $ , $ \Y $ , and $ \rv Z$ be random variables. \TFAE :
1108+ \begin {enumerate }
1109+ \item $ \X \to \Y \to \rv Z$
1110+ \item $ \X $ and $ \rv Z$ are conditionally independent given $ \Y $ .
1111+ That is,
1112+ \[
1113+ \Pr [\X =x,\rv Z=z \mid \Y =y] = \Pr [\X =x \mid \Y =y] \cdot \Pr [\rv Z=z \mid \Y =y]
1114+ \]
1115+ \item $ \rv Z$ is distributed according to $ f(\Y ,\rv R)$ for some function $ f$ and some random variable $ \rv R$ independent of $ \X $ and $ \Y $ .
1116+ \end {enumerate }
1117+ \end {prop }
1118+ \begin {xca }
1119+ Prove that these three conditions are equivalent.
1120+ \end {xca }
1121+
1122+ \begin {theorem }[data-processing inequality]\label {thm:dpi }
1123+ If $ \X \to \Y \to \rv Z$ , then $ I(\X : \rv Z) \leq I(\X : \Y )$ .
1124+
1125+ Equality holds if and only if $ \X \to \rv Z \to \Y $ as well.
1126+ \end {theorem }
1127+ \begin {prf }
1128+ By the chain rule,
1129+ \[
1130+ I(\X : (\Y , \rv Z)) = I(\X : \Y ) + \cancelto {0}{I(\X : \rv Z \mid \Y )}
1131+ = I(\X : \rv Z) + I(\X : \Y \mid \rv Z)
1132+ \]
1133+ so that
1134+ \[ I(\X : \Y ) = I(\X : \rv Z) + I(\X : \Y \mid \rv Z) \]
1135+ One may show that conditional mutual information is always non-negative,
1136+ so $ I(\X : \Y ) \geq I(\X : \rv Z)$ , with equality if and only if $ I(\X : \Y \mid \rv Z) = 0 $ , that is, if and only if $ \X \to \rv Z \to \Y $ .
1137+ \end {prf }
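
For a concrete illustration, consider passing a random bit through two rounds of independent noise
(the noise parameters $p$ and $q$ below are arbitrary):
\begin{example}
    Let $\X$ be a uniformly random bit, $\Y = \X \oplus \rv N_1$, and $\rv Z = \Y \oplus \rv N_2$,
    where $\rv N_1$ and $\rv N_2$ are independent bits with $\Pr[\rv N_1 = 1] = p$ and
    $\Pr[\rv N_2 = 1] = q$ for some $p, q \leq \frac12$.
    Then $\X \to \Y \to \rv Z$, and
    \[ I(\X : \Y ) = 1 - h(p) \geq 1 - h(p(1-q) + (1-p)q) = I(\X : \rv Z) \]
    where $h(t) = t\log\frac1t + (1-t)\log\frac1{1-t}$ denotes the binary entropy (in bits),
    since $p \leq p(1-q)+(1-p)q \leq \frac12$ and $h$ is increasing on $[0,\frac12]$.
    Each round of noise can only destroy information about $\X$.
\end{example}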
1138+
1139+ \section {Communication complexity }
1140+ \skipto [lecture]{10}
10641141\lecture {June 5}
10651142\begin {problem }
10661143 Suppose there is a joint distribution $ (\X ,\Y )$ that Alice and Bob wish to
@@ -1123,31 +1200,36 @@ \section{Mutual information and communication complexity}
11231200 The performance is:
11241201 \begin {align* }
11251202 \E _{\mathclap {\X ,\rv R}} \abs {M(\X ,\rv R)}
1126- & = \sum _{x\in\XX } p_x \E _{i^*,\rv Y_1,\rv Y_2,\dotsc }[\ell (i^*)] \\
1127- & \leq \sum _{x\in\XX } p_x \qty (\D {\Y |_{\X =x}}{\Y } + \order {\log \D {\Y _ {\X =x}}{\Y }}) \\
1128- & = I(\X :\Y ) + \sum _{x\in\XX }p_x\order {\log \D {\Y _ {\X =x}}{\Y }} \\
1203+ & = \sum _{x\in\XX } p_x \E _{i^*,\rv Y_1,\rv Y_2,\dotsc }[\ell (i^*)] \\
1204+ & \leq \sum _{x\in\XX } p_x \qty (\D {\Y |_{\X =x}}{\Y } + \order {\log \D {\Y |_ {\X =x}}{\Y }}) \\
1205+ & = I(\X :\Y ) + \sum _{x\in\XX }p_x\order {\log \D {\Y |_ {\X =x}}{\Y }} \\
11291206 & \leq I(\X :\Y ) + \order {\log I(\X :\Y )}
11301207 \end {align* }
11311208 where the last step is by Jensen's inequality.
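Concretely, since $\log$ is concave,
\[ \sum _{x\in\XX } p_x \log \D {\Y |_{\X =x}}{\Y } \leq \log \sum _{x\in\XX } p_x \D {\Y |_{\X =x}}{\Y } = \log I(\X :\Y ) \]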
11321209
11331210 Now, let $ \Pi $ be any protocol.
1134- By \nameref {thm:kraft }, $ \E _{\X ,\rv R} \abs {M(\X ,\rv R)} \geq H(M(\X ,\rv R))$ .
1135-
1136- We will also show the data-processing inequality:
1211+ \marginnote {\normalsize {\textit {Lecture 11\\ June 10\\ cont. }}}
1212+ We will apply the \nameref {thm:dpi }.
1213+
1214+ Notice that if $ \Pi $ is a valid protocol, then $ \X \to (M(\X ,\rv R),\rv R) \to \Y $ .
1215+ Indeed, if we sample $ x\sim \X $ and Alice sends $ M(x,\rv R)$ ,
1216+ then Bob's output is a function of $ (M(x,\rv R),\rv R)$ and of randomness independent of $ \X $ ,
1217+ and validity of $ \Pi $ means this output is distributed according to $ \Y \mid \X =x$ .
1218+ Then,
11371219 \begin {align* }
1138- I(\X : (M(\X ,\rv R), \rv R))
1139- & = I(X : M(\X ,\rv R)) + \sum _{r\in\Omega } p_r I(\X |_{\rv R=r} : M(\X ,\rv R)|_{\rv R=r}) \tag {chain rule} \\
1140- & = 0 + \sum _{r\in\Omega } p_r I(\X |_{\rv R=r} : M(\X ,\rv R)|_{\rv R=r}) \tag {since $ \X $ and $ \rv R$ are independent} \\
1141- & \leq \sum _{r\in\Omega } p_r H(M(\X ,\rv R)|_{\rv R=r}) \tag {venn diagram logic} \\
1142- & = H(M(\X ,\rv R) \mid \rv R) \\
1143- & \leq H(M(\X ,\rv R)) \tag {conditioning only reduces entropy}
1220+ I(\X : \Y )
1221+ & \leq I(\X : (M(\X ,\rv R),\rv R)) \tag {data processing inequality} \\
1222+ & = I(\X : \rv R) + \sum _{r\in\Omega _{\rv R}} p_r I(\X |_{\rv R=r} : M(\X ,\rv R)|_{\rv R=r}) \tag {chain rule} \\
1223+ & = 0 + I(\X : M(\X ,\rv R) \mid \rv R) \tag {independence} \\
1224+ & \leq H(M(\X ,\rv R) \mid \rv R) \tag {$ I(\rv A : \rv B) \leq \min \{ H(\rv A),H(\rv B)\} $ } \\
1225+ & \leq H(M(\X ,\rv R)) \tag {$ H(\rv A \mid \rv B) \leq H(\rv A)$ } \\
1226+ & \leq \E\abs {M(\X ,\rv R)} \tag {Kraft inequality}
11441227 \end {align* }
1145- In general,
1146- \[ I(\rv A : f(\rv B)) \leq I(\rv A : \rv B) \]
1147- and we are doing the case where $ \rv A = \X $ , $ \rv B = (M(\X ,\rv R))$ ,
1148- and $ f = y$ .
1228+ completing the proof.
11491229\end {prf }
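
In other words, with shared randomness,
the expected number of bits Alice must send for Bob to output a sample of $\Y$
correctly correlated with her input $\X$ is essentially $I(\X :\Y )$:
the protocol above uses $I(\X :\Y ) + \order {\log I(\X :\Y )}$ bits in expectation,
and no valid protocol can use fewer than $I(\X :\Y )$ bits in expectation.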
11501230
1231+ \section {Parameter estimation }
1232+
11511233\pagebreak
11521234\phantomsection\addcontentsline {toc}{chapter}{Back Matter}
11531235\renewcommand {\listtheoremname }{List of Named Results}