@@ -272,31 +272,31 @@ \section{Entropy as optimal lossless data compression}
 by amortizing over longer batches of the string.
 
 \begin{sol}[batching]
-    For $\rv Y$ defined on $[n]$ equal to $i$ with probability $q_i$,
-    define the random variable $\rv Y^{(k)}$ on $[n]^k$
+    For $\Y$ defined on $[n]$ equal to $i$ with probability $q_i$,
+    define the random variable $\Y^{(k)}$ on $[n]^k$
     equal to the string $i_1 \cdots i_k$ with probability $q_{i_1} \cdots q_{i_k}$.
-    That is, $\rv Y^{(k)}$ models $k$ independent samples of $\rv Y$.
+    That is, $\Y^{(k)}$ models $k$ independent samples of $\Y$.
 
-    Apply the Shannon--Fano code to $\rv Y^{(k)}$
+    Apply the Shannon--Fano code to $\Y^{(k)}$
     to get an encoding of $[n]^k$ as bitstrings of expected length $\ell$
-    satisfying $H(\rv Y^{(k)}) \leq \ell < H(\rv Y^{(k)}) + 1$.
+    satisfying $H(\Y^{(k)}) \leq \ell < H(\Y^{(k)}) + 1$.
     \begin{align*}
-        H(\rv Y^{(k)}) &= \E_{i_1 \cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_1} \cdots q_{i_k}}] \tag{by def'n} \\
-        &= \E_{i_1 \cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_1}} + \dotsb + \log_2 \frac{1}{q_{i_k}}] \tag{log rules} \\
-        &= \sum_{j=1}^k \E_{i_1 \cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_j}}] \tag{linearity of expectation} \\
-        &= \sum_{j=1}^k \E_{i \sim \rv Y}\qty[\log_2 \frac{1}{q_{i}}] \tag{$q_{i_j}$ only depends on one character} \\
-        &= kH(\rv Y) \tag{by def'n, no $j$-dependence in sum}
+        H(\Y^{(k)}) &= \E_{i_1 \cdots i_k \sim \Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_1} \cdots q_{i_k}}] \tag{by def'n} \\
+        &= \E_{i_1 \cdots i_k \sim \Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_1}} + \dotsb + \log_2 \frac{1}{q_{i_k}}] \tag{log rules} \\
+        &= \sum_{j=1}^k \E_{i_1 \cdots i_k \sim \Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_j}}] \tag{linearity of expectation} \\
+        &= \sum_{j=1}^k \E_{i \sim \Y}\qty[\log_2 \frac{1}{q_{i}}] \tag{$q_{i_j}$ only depends on one character} \\
+        &= kH(\Y) \tag{by def'n, no $j$-dependence in sum}
     \end{align*}
     For every $k$ symbols, we use $\ell$ bits, i.e., $\frac{\ell}{k}$ bits per symbol.
     From the Shannon--Fano bound, we have
     \begin{align*}
-        \frac{H(\rv Y^{(k)})}{k} &\leq \frac{\ell}{k} < \frac{H(\rv Y^{(k)})}{k} + \frac{1}{k} \\
-        H(\rv Y) &\leq \frac{\ell}{k} < H(\rv Y) + \frac{1}{k}
+        \frac{H(\Y^{(k)})}{k} &\leq \frac{\ell}{k} < \frac{H(\Y^{(k)})}{k} + \frac{1}{k} \\
+        H(\Y) &\leq \frac{\ell}{k} < H(\Y) + \frac{1}{k}
     \end{align*}
-    Then, we have a code for $\rv Y$ whose expected length per symbol lies in
-    $[H(\rv Y), H(\rv Y) + \frac{1}{k})$.
+    Then, we have a code for $\Y$ whose expected length per symbol lies in
+    $[H(\Y), H(\Y) + \frac{1}{k})$.
 
-    Taking $k \to \infty$, we can say that we need $H(\rv Y) + o(1)$ bits per symbol.
+    Taking $k \to \infty$, we can say that we need $H(\Y) + o(1)$ bits per symbol.
 \end{sol}
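To make the amortization concrete, here is a small illustrative calculation in the spirit of the solution above (the specific numbers are an added example, not taken from the notes). Take a biased source $\Y \sim \Bern(0.1)$. Then
\[
    H(\Y) = 0.1 \log_2 10 + 0.9 \log_2 \frac{1}{0.9} \approx 0.469 \text{ bits},
\]
yet any binary code that encodes one symbol at a time must spend at least $1$ bit per symbol. Batching with $k = 100$ already guarantees an expected rate below $H(\Y) + \frac{1}{100} \approx 0.479$ bits per symbol.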
 
 \begin{defn*}[relative entropy]
@@ -457,8 +457,8 @@ \chapter{Applications of KL divergence}
 
     That is, $H(p) = p \log_2 \frac{1}{p} + (1-p) \log_2 \frac{1}{1-p}$.
 
-    Likewise, write $\D{q}{p}$ to be $\D{\rv Y}{\X}$
-    where $\rv Y \sim \Bern(q)$.
+    Likewise, write $\D{q}{p}$ to be $\D{\Y}{\X}$
+    where $\Y \sim \Bern(q)$.
 \end{notation}
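Unpacking this shorthand for reference: if $\X \sim \Bern(p)$ and $\Y \sim \Bern(q)$, the definition of relative entropy specializes to
\[
    \D{q}{p} = q \log_2 \frac{q}{p} + (1-q) \log_2 \frac{1-q}{1-p},
\]
which equals $0$ exactly when $q = p$, mirroring the binary entropy shorthand $H(p)$ above.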
 
 Recall Stirling's approximation (which we have used before):
@@ -601,12 +601,12 @@ \section{Rejection sampling}
     Suppose $\X = \begin{cases}
         0 & p = \frac{1}{2} \\
         1 & p = \frac{1}{2}
-    \end{cases}$ and $\rv Y = \begin{cases}
+    \end{cases}$ and $\Y = \begin{cases}
         0 & p = \frac{1}{4} \\
         1 & p = \frac{3}{4}
     \end{cases}$.
 
-    How can we sample $\rv Y$ using $\X$?
+    How can we sample $\Y$ using $\X$?
 \end{example}
 \begin{sol}[naive]
     Take \iid $\X_1$ and $\X_2$.
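One natural way to finish this construction from two fair bits (a sketch of the idea; the notes' own continuation lies outside this hunk): map the uniform pair $(\X_1, \X_2)$ onto $\Y$ by
\[
    \Y = \begin{cases}
        0 & (\X_1, \X_2) = (0, 0) \\
        1 & \text{otherwise}
    \end{cases}
\]
Exactly one of the four equally likely outcomes maps to $0$, so $\Pr[\Y = 0] = \frac{1}{4}$ and $\Pr[\Y = 1] = \frac{3}{4}$, at a cost of two fair bits per sample.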