\documentclass[class=co432, notes, tikz]{agony}

\DeclareMathOperator*{\E}{\mathbb{E}}


\chapter{Introduction}

\begin{notation}
    I will be using my usual \LaTeX{} typesetting conventions:
    \begin{itemize}[nosep]
        \item $[n]$ means the set $\{1, 2, \dotsc, n\}$
        \item $\bits*$ means the set of bitstrings of arbitrary length (i.e., the Kleene star)
        \item $\rv A, \rv B, \dotsc, \rv Z$ are random variables (in sans-serif)
        \item $\rv X = (p_1, p_2, \dotsc, p_k)$ means $\rv X$ is a discrete random variable
            such that $\Pr[\rv X = 1] = p_1$, $\Pr[\rv X = 2] = p_2$, etc.
            (abbreviate further as $\rv X = (p_i)$)
    \end{itemize}
\end{notation}

\section{Entropy}
\lecture{May 6}

\textrule{$\downarrow$ Lecture 1 adapted from Arthur $\downarrow$}

\begin{defn}[entropy]
    For a random variable $\rv X = (p_i)$,
    the \term*{entropy} $H(\rv X)$ is
    \[ H(\rv X) = -\sum_i p_i \log p_i = \sum_i p_i \log \frac{1}{p_i}. \]
\end{defn}

\begin{convention}
    By convention, we usually use $\log_2$.
    Also, we adopt the convention that $0 \log_2 0 = 0$
    (equivalently, $0 \log_2 \frac{1}{0} = 0$), so that
    impossible values do not break the formula.
\end{convention}

\begin{example}
    If $\rv X$ takes on the values $a$, $b$, $c$, $d$
    with probabilities 1, 0, 0, 0, respectively, then $H(\rv X) = 1 \log 1 = 0$.

    If $\rv X$ takes on those values instead with probabilities
    $\frac12$, $\frac14$, $\frac18$, $\frac18$, respectively,
    then $H(\rv X) = \frac74$.
\end{example}
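
To spell out the second computation (with $\log = \log_2$, per the convention above):
\[ H(\rv X) = \tfrac12 \log_2 2 + \tfrac14 \log_2 4 + \tfrac18 \log_2 8 + \tfrac18 \log_2 8
    = \tfrac12 + \tfrac12 + \tfrac38 + \tfrac38 = \tfrac74. \]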

\begin{fact}
    $H(\rv X) = 0$ if and only if $\rv X$ is a constant.
\end{fact}
\begin{prf}
    Suppose $\rv X$ is constant. Then, $H(\rv X) = 1 \log 1 = 0$.

    Suppose $H(\rv X) = 0$.
    Probabilities are in $[0,1]$, so $p_i \log \frac{1}{p_i} \geq 0$.
    Since $H(\rv X) = \sum_i p_i \log \frac{1}{p_i} = 0$
    and each term is non-negative, each term must be zero.
    Thus, each $p_i$ is either 0 or 1.
    Since $\sum_i p_i = 1$, exactly one $p_i = 1$ and the rest are zero.
    That is, $\rv X$ is constant.
\end{prf}

\begin{theorem}[Jensen's inequality]\label{thm:jensen}
    Let $f : \R \to \R$ be concave. That is,
    for any $a$ and $b$ in the domain of $f$ and $\lambda \in [0,1]$,
    $f(\lambda a + (1-\lambda)b) \geq \lambda f(a) + (1-\lambda)f(b)$.
    For any discrete random variable $\rv X$,
    \[ \E[f(\rv X)] \leq f(\E[\rv X]). \]
\end{theorem}
\begin{prf}
    Consider a random variable $\rv X$ with two values $a$ and $b$,
    with probabilities $\lambda$ and $1-\lambda$, respectively.
    Then, notice that
    \[ \E[f(\rv X)] = \lambda f(a) + (1-\lambda) f(b) \leq f(\lambda a + (1-\lambda)b) = f(\E[\rv X]) \]
    by concavity of $f$.

    The general case, where $\rv X$ takes any finite number of values,
    follows by induction on the number of values.
\end{prf}
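
As a concrete check, take $f = \log_2$ and let $\rv X$ be 2 or 8 with probability $\frac12$ each:
\[ \E[\log_2 \rv X] = \tfrac12(1) + \tfrac12(3) = 2 \leq \log_2 5 = \log_2 \E[\rv X]. \]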

\begin{fact}
    Assume $\rv X$ is supported on $[n]$. Then, $0 \leq H(\rv X) \leq \log n$.
\end{fact}
\begin{prf}
    Start by claiming without proof that $\log$ is a concave function, so we can apply
    \nameref{thm:jensen}.

    Let $\rv X' = \frac{1}{p_i}$ with probability $p_i$. Then,
    \begin{align*}
        H(\rv X) & = \sum_i p_i \log \frac{1}{p_i}     \\
                 & = \E\qty[\log(\rv X')]              \\
                 & \leq \log(\E[\rv X'])               \\
                 & = \log\qty(\sum_i p_i \frac{1}{p_i}) \\
                 & = \log n \qedhere
    \end{align*}
\end{prf}
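
For instance, the example above with probabilities $(\frac12, \frac14, \frac18, \frac18)$
has $n = 4$, and indeed $H(\rv X) = \frac74 \leq \log_2 4 = 2$.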

It is not a coincidence that $\ceil*{\log_2 n}$ is the minimum number of bits needed to
encode an element of $[n]$ with a fixed-length binary code.
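
The upper bound on entropy is attained by the uniform distribution:
if $\rv X = (\frac1n, \dotsc, \frac1n)$, then
\[ H(\rv X) = \sum_{i=1}^n \frac1n \log_2 n = \log_2 n. \]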

\section{Entropy as expected surprise}

We want a function $S : [0,1] \to [0,\infty)$ where $S(p)$ captures how ``surprised''
we are that an event with probability $p$ happens.
We require the following axioms:
\begin{enumerate}
    \item $S(1) = 0$: an event that is certain to happen is not surprising at all.
    \item $S$ is continuous in $p$.
    \item $S$ is decreasing: the rarer an event, the more surprising it is.
    \item $S(p \cdot q) = S(p) + S(q)$: surprise is additive over independent events.
        That is, if I see something twice, I should be twice as surprised.
\end{enumerate}
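
Note that $S(p) = \log_2\frac{1}{p}$ has exactly this behaviour:
seeing an independent probability-$p$ event twice has probability $p^2$, and
\[ \log_2\frac{1}{p^2} = 2\log_2\frac{1}{p}. \]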

\textrule{$\uparrow$ Lecture 1 adapted from Arthur $\uparrow$}
\lecture{May 8}

\begin{prop}
    If $S(p)$ satisfies these 4 axioms, then $S(p) = c \cdot \log_2(1/p)$ for some $c > 0$.
\end{prop}

Taking $c = 1$, entropy is exactly the expected surprise:
\[ \sum_i p_i \log_2\frac{1}{p_i} = \E_{x \sim \rv X}\qty[S(p_x)] \]
for a random variable $\rv X$ equal to $i$ with probability $p_i$.

\section{Entropy as optimal lossless data compression}

Suppose we are trying to compress a string consisting of $n$
symbols drawn from some distribution.

\begin{restatable}{problem}{bitproblem}
    What is the expected number of bits you need to store the results of $n$ independent samples
    of a random variable $\rv X$?
\end{restatable}

We will show this is $nH(\rv X)$.


\begin{defn}
    Let $C : \Sigma \to (\Sigma')^*$ be a code.
    We say $C$ is \term[code!uniquely decodable]{uniquely decodable} if there does not exist
    a collision, i.e., distinct $x, y \in \Sigma^*$
    with identical encodings $C(x_1)C(x_2)\cdots C(x_k) = C(y_1)C(y_2)\cdots C(y_{k'})$.

    Also, $C$ is \term[code!prefix-free]{prefix-free} (sometimes called \term*{instantaneous})
    if for any distinct $x, y \in \Sigma$, $C(x)$ is not a prefix of $C(y)$.
\end{defn}
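
\begin{example}
    The code $C(1) = 0$, $C(2) = 10$, $C(3) = 110$, $C(4) = 111$ is prefix-free:
    no codeword is a prefix of another.
    On the other hand, a code with $C'(1) = 0$, $C'(2) = 01$ is not prefix-free,
    since $0$ is a prefix of $01$.
\end{example}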
Recall from CS 240 that a prefix-free code is equivalent to a trie,
and we can decode it by traversing the trie in linear time.

\begin{theorem}[Kraft's inequality]\label{thm:kraft}
    A prefix-free binary code $C : \{1, \dotsc, n\} \to \{0,1\}^*$
    with codeword lengths $\ell_i = \abs{C(i)}$ exists if and only if
    \[ \sum_{i=1}^n \frac{1}{2^{\ell_i}} \leq 1. \]
\end{theorem}
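
For example, lengths $\ell = (1, 2, 3, 3)$ satisfy $\frac12 + \frac14 + \frac18 + \frac18 = 1 \leq 1$,
and the prefix-free code $0, 10, 110, 111$ from the example above realizes them.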

\lecture{May 13}
Recall the problem we are trying to solve:
\bitproblem*
\begin{sol}[Shannon \& Fano]
    Consider the case where $\rv X$ is symbol $i$ with probability $p_i$.
    We want to encode independent samples $x_i \sim \rv X$
    as $C(x_i)$ for some code $C : [n] \to \bits*$.

    Suppose for simplicity that $p_i = \frac{1}{2^{\ell_i}}$
    for some integers $\ell_i$.
    Since $\sum p_i = 1$, we must have $\sum \frac{1}{2^{\ell_i}} = 1$.
    Then, by \nameref{thm:kraft}, there exists a prefix-free binary code
    $C : [n] \to \bits*$ with codeword lengths $\abs{C(i)} = \ell_i$.
    Now, each sample costs
    \[
        \E_{x \sim \rv X}\qty[\abs{C(x)}] = \sum_i p_i \ell_i = \sum_i p_i \log_2\frac{1}{p_i} = H(\rv X)
    \]
    bits in expectation, so the $n$ samples cost $nH(\rv X)$ bits in expectation by linearity.
    Proceed to the general case.
    Suppose the $\log_2 \frac{1}{p_i}$ are non-integral.
    Instead, use $\ell'_i = \ceil*{\log_2\frac{1}{p_i}}$.
    We still satisfy Kraft since $\sum_i \frac{1}{2^{\ell'_i}} \leq \sum_i p_i = 1$.
    Then, the expected number of bits per sample is
    \[
        \E_{x \sim \rv X}\qty[\abs{C(x)}] = \sum_i p_i \ell'_i = \sum_i p_i \ceil*{\log_2\frac{1}{p_i}}
    \]
    which is bounded by
    \[ H(\rv X) = \sum_i p_i \log_2\frac{1}{p_i} \leq \sum_i p_i \ceil*{\log_2\frac{1}{p_i}} < \sum_i p_i \qty(1 + \log_2\frac{1}{p_i}) = H(\rv X) + 1. \]
    We call the code $C$ generated by this process the \term[code!Shannon--Fano]{Shannon--Fano code}.
\end{sol}
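
For example, for the distribution $(\frac12, \frac14, \frac18, \frac18)$ from before,
the lengths are $\ell_i = 1, 2, 3, 3$, realized by the prefix-free code $0, 10, 110, 111$,
and the expected length is
$\frac12(1) + \frac14(2) + \frac18(3) + \frac18(3) = \frac74 = H(\rv X)$ exactly,
since each $\log_2\frac{1}{p_i}$ is an integer.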

We can improve on this bound $[H(\rv X), H(\rv X) + 1)$
by amortizing over longer batches of the string.

\begin{sol}[batching]
    For $\rv Y$ defined on $[n]$ equal to $i$ with probability $q_i$,
    define the random variable $\rv Y^{(k)}$ on $[n]^k$
    equal to the string $i_1 \cdots i_k$ with probability $q_{i_1} \cdots q_{i_k}$.
    That is, $\rv Y^{(k)}$ models $k$ independent samples of $\rv Y$.

    Apply the Shannon--Fano code to $\rv Y^{(k)}$
    to get an encoding of $[n]^k$ as bitstrings of expected length $\ell$
    satisfying $H(\rv Y^{(k)}) \leq \ell < H(\rv Y^{(k)}) + 1$.
    \begin{align*}
        H(\rv Y^{(k)}) & = \E_{i_1 \cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_1} \cdots q_{i_k}}] \tag{by def'n}                      \\
                       & = \E_{i_1 \cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_1}} + \dotsb + \log_2\frac{1}{q_{i_k}}] \tag{log rules} \\
                       & = \sum_{j=1}^k \E_{i_1 \cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_j}}] \tag{linearity of expectation}        \\
                       & = \sum_{j=1}^k \E_{i \sim \rv Y}\qty[\log_2 \frac{1}{q_{i}}] \tag{$q_{i_j}$ only depends on one character}              \\
                       & = kH(\rv Y) \tag{by def'n, no $j$-dependence in sum}
    \end{align*}
    For every $k$ symbols, we use $\ell$ bits in expectation, i.e., $\frac{\ell}{k}$ bits per symbol.
    From the Shannon--Fano bound, we have
    \begin{align*}
        \frac{H(\rv Y^{(k)})}{k} & \leq \frac{\ell}{k} < \frac{H(\rv Y^{(k)})}{k} + \frac{1}{k} \\
        H(\rv Y)                 & \leq \frac{\ell}{k} < H(\rv Y) + \frac{1}{k}
    \end{align*}
    Then, we have a code for $\rv Y$ whose expected length per symbol lies in
    $[H(\rv Y), H(\rv Y) + \frac{1}{k})$.

    Letting $k \to \infty$, we need only $H(\rv Y) + o(1)$ bits per symbol.
\end{sol}
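
To see why batching helps, consider a biased coin $\rv Y = (0.9, 0.1)$,
so $H(\rv Y) = 0.9\log_2\frac{1}{0.9} + 0.1\log_2 10 \approx 0.47$.
Any code for single symbols uses at least 1 bit per symbol, since every codeword is nonempty.
Batching with $k = 2$ gives probabilities $0.81, 0.09, 0.09, 0.01$
and Shannon--Fano lengths $1, 4, 4, 7$, for an expected
$0.81(1) + 0.09(4) + 0.09(4) + 0.01(7) = 1.6$ bits per pair, i.e., $0.8$ bits per symbol.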

\begin{defn*}[relative entropy]
    Given two discrete distributions $p = (p_i)$ and $q = (q_i)$,
    the \term[entropy!relative]{relative entropy} is
    \[ D(p \parallel q) :=
        \sum_i p_i \log_2\frac{1}{q_i} - \sum_i p_i \log_2 \frac{1}{p_i}
        = \sum_i p_i \log_2 \frac{p_i}{q_i}. \]
    This is also known as the \term{KL divergence} (Kullback--Leibler divergence).
\end{defn*}
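
A quick computation shows $D$ is not symmetric: for $p = (\frac12, \frac12)$ and $q = (\frac14, \frac34)$,
\begin{align*}
    D(p \parallel q) & = \tfrac12\log_2 2 + \tfrac12\log_2\tfrac23 = 1 - \tfrac12\log_2 3 \approx 0.21  \\
    D(q \parallel p) & = \tfrac14\log_2\tfrac12 + \tfrac34\log_2\tfrac32 = \tfrac34\log_2 3 - 1 \approx 0.19
\end{align*}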

\begin{fact}
    $D(p \parallel q) \geq 0$ with equality exactly when $p = q$.
\end{fact}
\begin{prf}
    Define $\rv X' = \frac{q_i}{p_i}$ with probability $p_i$.
    Then,
    \[ D(p \parallel q) = \E[-\log_2 \rv X'] \geq -\log_2 \E[\rv X'] \]
    by Jensen's inequality (as $f(x) = -\log_2 x$ is convex), and then
    \[ D(p \parallel q) \geq -\log_2 \sum_i p_i \frac{q_i}{p_i} = -\log_2 1 = 0 \qedhere \]
\end{prf}

\begin{prop}
    Any prefix-free code for $\rv X$ has expected codeword length at least $H(\rv X)$.
\end{prop}
\begin{prf}
    We can show this by interpreting the gap between the expected length and $H(\rv X)$
    as $D(p \parallel q)$ for some $q$.

    We will take $q$ to be the random walk distribution corresponding to the binary tree
    associated to the candidate prefix-free code.
\end{prf}
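
One way to fill in this sketch: write $\ell_i = \abs{C(i)}$ and let $q_i = 2^{-\ell_i}$,
the probability that a uniform random bitstring begins with $C(i)$.
By \nameref{thm:kraft}, $\sum_i q_i \leq 1$, so the same Jensen argument as above gives
$\sum_i p_i \log_2\frac{p_i}{q_i} \geq -\log_2\sum_i q_i \geq 0$. Then the expected length is
\[ \sum_i p_i \ell_i = \sum_i p_i \log_2\frac{1}{q_i}
    = H(\rv X) + \sum_i p_i \log_2\frac{p_i}{q_i} \geq H(\rv X). \]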

\pagebreak
\phantomsection\addcontentsline{toc}{chapter}{Back Matter}
\renewcommand{\listtheoremname}{List of Named Results}
\phantomsection\addcontentsline{toc}{section}{\listtheoremname}
\listoftheorems[ignoreall,numwidth=4em,onlynamed={theorem,lemma,corollary,prop}]
\printindex

\end{document}