
Commit 9ae8a31

committed
CO432/notes: may 13
1 parent af6dbf7 commit 9ae8a31

File tree

4 files changed

+195
-13
lines changed


CO432/notes.pdf

25.4 KB
Binary file not shown.

CO432/notes.tex

Lines changed: 193 additions & 12 deletions
@@ -1,4 +1,4 @@
1-
\documentclass[notes,tikz]{agony}
1+
\documentclass[class=co432,notes,tikz]{agony}
22

33
\DeclareMathOperator*{\E}{\mathbb{E}}
44

@@ -19,17 +19,96 @@
1919

2020
\chapter{Introduction}
2121

22+
\begin{notation}
23+
I will be using my usual \LaTeX{} typesetting conventions:
24+
\begin{itemize}[nosep]
25+
\item $[n]$ means the set $\{1,2,\dotsc,n\}$
26+
\item $\bits*$ means the set of bitstrings of arbitrary length (i.e., the Kleene star)
27+
\item $\rv A, \rv B, \dotsc, \rv Z$ are random variables (in sans-serif)
28+
\item $\rv X = (p_1,p_2,\dotsc,p_k)$ means $\rv X$ is a discrete random variable
29+
such that $\Pr[\rv X = 1] = p_1$, $\Pr[\rv X=2] = p_2$, etc.
30+
(abbreviate further as $\rv X = (p_i)$)
31+
\end{itemize}
32+
\end{notation}
33+
2234
\section{Entropy}
2335
\lecture{May 6}
24-
TODO
36+
37+
\textrule{$\downarrow$ Lecture 1 adapted from Arthur $\downarrow$}
2538

2639
\begin{defn}[entropy]
27-
For a random variable $\rv X$ which is equal to $i$ with probability $p_i$,
28-
the \term{entropy} $H(\rv X) := \sum_i p_i \log \frac{1}{p_i}$.
40+
For a random variable $\rv X = (p_i)$,
41+
the \term*{entropy} $H(\rv X)$ is
42+
\[ H(\rv X) = -\sum_i p_i \log p_i = \sum_i p_i \log \frac{1}{p_i}. \]
2943
\end{defn}
3044

31-
\subsection{Axiomatic view of entropy}
32-
\lecture{May 8}
45+
\begin{convention}
46+
By convention, we usually use $\log_2$.
47+
Also, we adopt the convention that $0 \log_2 \frac{1}{0} = 0$ so that
48+
impossible values do not break the formula.
49+
\end{convention}
50+
51+
\begin{example}
52+
If $\rv X$ takes on the values $a$, $b$, $c$, $d$
53+
with probabilities 1, 0, 0, 0, respectively, then $H(\rv X) = 1 \log 1 = 0$.
54+
55+
If $\rv X$ takes on those values instead with probabilities
56+
$\frac12$, $\frac14$, $\frac18$, $\frac18$, respectively,
57+
then $H(\rv X) = \frac74$.
58+
\end{example}
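As a quick check of the arithmetic in the second case:
\[ H(\rv X) = \tfrac12\log_2 2 + \tfrac14\log_2 4 + \tfrac18\log_2 8 + \tfrac18\log_2 8 = \tfrac12 + \tfrac12 + \tfrac38 + \tfrac38 = \tfrac74. \]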
59+
60+
\begin{fact}
61+
$H(\rv X) = 0$ if and only if $\rv X$ is a constant.
62+
\end{fact}
63+
\begin{prf}
64+
Suppose $\rv X$ is constant. Then, $H(\rv X) = 1 \log 1 = 0$.
65+
66+
Suppose $H(\rv X) = 0$.
67+
Probabilities are in $[0,1]$, so $p_i \log \frac{1}{p_i} \geq 0$.
68+
Since $H(\rv X) = \sum_i p_i \log \frac{1}{p_i} = 0$
69+
and each term is non-negative, each term must be zero.
70+
Thus, each $p_i$ is either 0 or 1.
71+
Since $\sum_i p_i = 1$, exactly one $p_i = 1$ and the rest are zero.
72+
That is, $\rv X$ is constant.
73+
\end{prf}
74+
75+
\begin{theorem}[Jensen's inequality]\label{thm:jensen}
76+
Let $f : \R \to \R$ be concave. That is,
77+
for any $a$ and $b$ in the domain of $f$ and $\lambda \in [0,1]$,
78+
$f(\lambda a + (1-\lambda)b) \geq \lambda f(a) + (1-\lambda)f(b)$.
79+
For any discrete random variable $\rv X$,
80+
\[ \E[f(\rv X)] \leq f(\E[\rv X]) \]
81+
\end{theorem}
82+
\begin{prf}
83+
Consider a random variable $\rv X$ with two values $a$ and $b$,
84+
with probabilities $\lambda$ and $1-\lambda$, respectively.
85+
Then, notice that
86+
\[ \E[f(\rv X)] = \lambda f(a) + (1-\lambda) f(b) \leq f(\lambda a + (1-\lambda)b) = f(\E[\rv X]) \]
87+
by concavity of $f$.
88+
89+
This can be generalized to finitely many values by induction; a sketch follows this proof.
90+
\end{prf}
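One possible induction step (a sketch of my own, not spelled out in lecture): for $\rv X$ taking
values $a_1,\dotsc,a_k$ with probabilities $p_1,\dotsc,p_k$ and $p_k < 1$,
\begin{align*}
\E[f(\rv X)] &= (1-p_k)\sum_{i<k} \frac{p_i}{1-p_k} f(a_i) + p_k f(a_k) \\
&\leq (1-p_k) f\qty(\sum_{i<k} \frac{p_i}{1-p_k} a_i) + p_k f(a_k) \tag{induction hypothesis} \\
&\leq f\qty((1-p_k)\sum_{i<k} \frac{p_i}{1-p_k} a_i + p_k a_k) = f(\E[\rv X]) \tag{two-point case}
\end{align*}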
91+
92+
\begin{fact}
93+
Assume $\rv X$ is supported on $[n]$. Then, $0 \leq H(\rv X) \leq \log n$.
94+
\end{fact}
95+
\begin{prf}
96+
Start by claiming without proof that $\log$ is concave, so we can apply
97+
\nameref{thm:jensen}.
98+
99+
Let $\rv X' = \frac{1}{p_i}$ with probability $p_i$. Then,
100+
\begin{align*}
101+
H(\rv X) & = \sum_i p_i \log \frac{1}{p_i} \\
102+
& = \E\qty[\log(\rv X')] \\
103+
& \leq \log(\E[\rv X']) \\
104+
& = \log\qty(\sum p_i \frac{1}{p_i}) \\
105+
& = \log n \qedhere
106+
\end{align*}
107+
\end{prf}
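For instance, the upper bound is attained by the uniform distribution on $[n]$:
\[ \rv X = \qty(\tfrac1n, \dotsc, \tfrac1n) \implies H(\rv X) = \sum_{i=1}^n \tfrac1n \log_2 n = \log_2 n. \]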
108+
109+
It is not a coincidence that $\log_2 n$ is the minimum number of bits to encode $[n]$.
110+
111+
\section{Entropy as expected surprise}
33112

34113
We want $S : [0,1] \to [0,\infty)$ to capture how ``surprised''
35114
we are, $S(p)$, when an event with probability $p$ happens.
@@ -45,6 +124,9 @@ \subsection{Axiomatic view of entropy}
45124
That is, if I see something twice, I should be twice as surprised.
46125
\end{enumerate}
47126

127+
\textrule{$\uparrow$ Lecture 1 adapted from Arthur $\uparrow$}
128+
\lecture{May 8}
129+
48130
\begin{prop}
49131
If $S(p)$ satisfies these 4 axioms, then $S(p)=c\cdot \log_2(1/p)$ for some $c > 0$.
50132
\end{prop}
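As a quick consistency check against the last axiom above, the candidate $S(p) = c\log_2(1/p)$
does make seeing an event of probability $p$ twice (probability $p^2$) twice as surprising:
\[ S(p^2) = c\log_2\frac{1}{p^2} = 2c\log_2\frac{1}{p} = 2S(p). \]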
@@ -71,15 +153,15 @@ \subsection{Axiomatic view of entropy}
71153
\[ \sum_i p_i \log_2\frac{1}{p_i} = \E_{x \sim \rv X}\qty[S(p_x)] \]
72154
for a random variable $\rv X = i$ with probability $p_i$.
73155

74-
\subsection{Entropy as optimal lossless data compression}
156+
\section{Entropy as optimal lossless data compression}
75157

76158
Suppose we are trying to compress a string consisting of $n$
77159
symbols drawn from some distribution.
78160

79-
\begin{problem}
161+
\begin{restatable}{problem}{bitproblem}
80162
What is the expected number of bits you need to store the results of $n$ independent samples
81163
of a random variable $\rv X$?
82-
\end{problem}
164+
\end{restatable}
83165

84166
We will show this is $nH(\rv X)$.
85167

@@ -88,11 +170,11 @@ \subsection{Entropy as optimal lossless data compression}
88170

89171
\begin{defn}
90172
Let $C : \Sigma \to (\Sigma')^*$ be a code.
91-
We say $C$ is \term{uniquely decodable} if there does not exist
173+
We say $C$ is \term[code!uniquely decodable]{uniquely decodable} if there does not exist
92174
a collision, i.e., distinct $x, y \in \Sigma^*$,
93175
with identical encoding $C(x_1)C(x_2)\cdots C(x_k) = C(y_1)C(y_2)\cdots C(y_{k'})$.
94176

95-
Also, $C$ is \term{prefix-free} (sometimes called \term*{instantaneous})
177+
Also, $C$ is \term[code!prefix-free]{prefix-free} (sometimes called \term*{instantaneous})
96178
if for any distinct $x,y \in \Sigma$, $C(x)$ is not a prefix of $C(y)$.
97179
\end{defn}
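A couple of small binary codes (illustrative examples, not from lecture) help separate these notions:
\begin{example}
The code $1 \mapsto 0$, $2 \mapsto 10$, $3 \mapsto 11$ is prefix-free.
The code $1 \mapsto 0$, $2 \mapsto 01$ is not prefix-free ($0$ is a prefix of $01$),
but it is still uniquely decodable, since no codeword is a suffix of another.
The code $1 \mapsto 0$, $2 \mapsto 1$, $3 \mapsto 01$ is not uniquely decodable:
$01$ decodes as $3$ or as $12$.
\end{example}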
98180

@@ -112,7 +194,7 @@ \subsection{Entropy as optimal lossless data compression}
112194
Recall from CS 240 that a prefix-free code is equivalent to a trie,
113195
and we can decode it by traversing the trie in linear time.
114196

115-
\begin{theorem}[Kraft's inequality]
197+
\begin{theorem}[Kraft's inequality]\label{thm:kraft}
116198
A prefix-free binary code $C : \{1,\dotsc,n\} \to \{0,1\}^*$
117199
with codeword lengths $\ell_i = \abs{C(i)}$ exists if and only if
118200
\[ \sum_{i=1}^n \frac{1}{2^{\ell_i}} \leq 1. \]
@@ -154,4 +236,103 @@ \subsection{Entropy as optimal lossless data compression}
154236
by the inequality.
155237
\end{prf}
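For a quick numeric check of \nameref{thm:kraft}: lengths $(2,2,2,3,3)$ give
$3 \cdot \frac14 + 2 \cdot \frac18 = 1 \leq 1$, so a prefix-free code with these lengths exists
(e.g., $00$, $01$, $10$, $110$, $111$), while lengths $(1,1,2)$ give
$\frac12 + \frac12 + \frac14 > 1$, so no prefix-free binary code has those lengths.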
156238

239+
\lecture{May 13}
240+
Recall the problem we are trying to solve:
241+
\bitproblem*
242+
\begin{sol}[Shannon \& Fano]
243+
Consider the case where $\rv X$ is symbol $i$ with probability $p_i$.
244+
We want to encode independent samples $x_i \sim \rv X$
245+
as $C(x_i)$ for some code $C : [n] \to \bits*$.
246+
247+
Suppose for simplicity that $p_i = \frac{1}{2^{\ell_i}}$
248+
for some integers $\ell_i$.
249+
Since $\sum p_i = 1$, we must have $\sum \frac{1}{2^{\ell_i}} = 1$.
250+
Then, by \nameref{thm:kraft}, there exists a prefix-free binary code
251+
$C : [n] \to \bits*$ with codeword lengths $\abs{C(i)} = \ell_i$.
252+
Now,
253+
\[
254+
\E_{x \sim \rv X}\qty[\abs{C(x)}] = \sum_i p_i\ell_i = \sum_i p_i\log_2\frac{1}{p_i} = H(\rv X)
255+
\]
256+
so $n$ independent samples cost $nH(\rv X)$ bits in expectation. Proceed to the general case.
257+
Suppose $\log_2\frac{1}{p_i}$ are non-integral.
258+
Instead, use $\ell'_i = \ceil*{\log_2\frac{1}{p_i}}$.
259+
We still satisfy Kraft since $\sum_i \frac{1}{2^{\ell'_i}} \leq \sum_i p_i = 1$.
260+
Then,
261+
\[
262+
\E_{x \sim \rv X}\qty[\abs{C(x)}] = \sum_i p_i\ell'_i = \sum_i p_i\ceil*{\log_2\frac{1}{p_i}}
263+
\]
264+
which is bounded by
265+
\[ H(\rv X) = \sum_i p_i\log_2\frac{1}{p_i} \leq \sum_i p_i\ceil*{\log_2\frac{1}{p_i}} < \sum_i p_i\qty(1+\log_2\frac{1}{p_i}) = H(\rv X) + 1 \]
266+
We call the code $C$ generated by this process the \term[code!Shannon--Fano]{Shannon--Fano code}.
267+
\end{sol}
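For a concrete instance (the codewords below are one valid choice):
if $\rv X = \qty(\frac12, \frac14, \frac18, \frac18)$, then $\ell_i = \log_2\frac{1}{p_i} = (1,2,3,3)$,
and we can take $C(1) = 0$, $C(2) = 10$, $C(3) = 110$, $C(4) = 111$, giving expected length
\[ \tfrac12(1) + \tfrac14(2) + \tfrac18(3) + \tfrac18(3) = \tfrac74 = H(\rv X). \]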
268+
269+
We can improve on this bound $[H(\rv X), H(\rv X) + 1)$
270+
by amortizing over longer batches of the string.
271+
272+
\begin{sol}[batching]
273+
For $\rv Y$ defined on $[n]$ equal to $i$ with probability $q_i$,
274+
define the random variable $\rv Y^{(k)}$ on $[n]^k$
275+
equal to the string $i_1\cdots i_k$ with probability $q_{i_1}\cdots q_{i_k}$.
276+
That is, $\rv Y^{(k)}$ models $k$ independent samples of $\rv Y$.
277+
278+
Apply the Shannon--Fano code to $\rv Y^{(k)}$
279+
to get an encoding of $[n]^k$ as bitstrings of expected length $\ell$
280+
satisfying $H(\rv Y^{(k)}) \leq \ell \leq H(\rv Y^{(k)}) + 1$.
281+
\begin{align*}
282+
H(\rv Y^{(k)}) & = \E_{i_1\cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_1}\cdots q_{i_k}}] \tag{by def'n} \\
283+
& = \E_{i_1\cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_1}} + \dotsb + \log_2\frac{1}{q_{i_k}}] \tag{log rules} \\
284+
& = \sum_{j=1}^k \E_{i_1\cdots i_k \sim \rv Y^{(k)}}\qty[\log_2 \frac{1}{q_{i_j}}] \tag{linearity of expectation} \\
285+
& = \sum_{j=1}^k \E_{i \sim \rv Y}\qty[\log_2 \frac{1}{q_{i}}] \tag{$q_{i_j}$ only depends on one character} \\
286+
& = kH(\rv Y) \tag{by def'n, no $j$-dependence in sum}
287+
\end{align*}
288+
For every $k$ symbols, we use $\ell$ bits in expectation, i.e., $\frac{\ell}{k}$ bits per symbol.
289+
From the Shannon--Fano bound, we have
290+
\begin{align*}
291+
\frac{H(\rv Y^{(k)})}{k} & \leq \frac{\ell}{k} < \frac{H(\rv Y^{(k)})}{k} + \frac{1}{k} \\
292+
H(\rv Y) & \leq \frac{\ell}{k} < H(\rv Y) + \frac{1}{k}
293+
\end{align*}
294+
Then, we have a code for $\rv Y$ whose expected length per symbol lies in
295+
$[H(\rv Y), H(\rv Y) + \frac{1}{k})$.
296+
297+
Letting $k \to \infty$, we need only $H(\rv Y) + o(1)$ bits per symbol.
298+
\end{sol}
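As a rough numeric illustration with a made-up distribution (values rounded):
for $\rv Y = (0.9, 0.1)$ we have $H(\rv Y) \approx 0.47$,
but the per-symbol Shannon--Fano code uses lengths $\ceil*{\log_2\frac{1}{0.9}} = 1$ and $\ceil*{\log_2 10} = 4$,
i.e., about $0.9(1) + 0.1(4) = 1.3$ bits per symbol.
Batching with $k = 2$ uses lengths $(1,4,4,7)$ on the pair probabilities $(0.81, 0.09, 0.09, 0.01)$,
i.e., about $1.6$ bits per pair, or $0.8$ bits per symbol.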
299+
300+
\begin{defn*}[relative entropy]
301+
Given two discrete distributions $p = (p_i)$ and $q = (q_i)$,
302+
the \term[entropy!relative]{relative entropy}
303+
\[ D(p \parallel q) :=
304+
\sum p_i \log_2\frac{1}{q_i} - \sum_i p_i \log_2 \frac{1}{p_i}
305+
= \sum p_i \log_2 \frac{p_i}{q_i} \]
306+
This is also known as the \term{KL divergence}.
307+
\end{defn*}
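For example, with the pair of distributions $p = \qty(\frac12, \frac12)$ and $q = \qty(\frac14, \frac34)$:
\[ D(p \parallel q) = \tfrac12\log_2 2 + \tfrac12\log_2\tfrac23 = 1 - \tfrac12\log_2 3 \approx 0.21,
\qquad D(q \parallel p) = \tfrac14\log_2\tfrac12 + \tfrac34\log_2\tfrac32 = \tfrac34\log_2 3 - 1 \approx 0.19, \]
so relative entropy is not symmetric.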
308+
309+
\begin{fact}
310+
$D(p \parallel q) \geq 0$ with equality exactly when $p = q$.
311+
\end{fact}
312+
\begin{prf}
313+
Define $\rv X' = \frac{q_i}{p_i}$ with probability $p_i$.
314+
Then,
315+
\[ D(p \parallel q) = \E[-\log_2 \rv X'] \geq -\log_2 \E[\rv X'] \]
316+
by Jensen's inequality (as $f(x) = -\log_2 x$ is convex), and then
317+
\[ D(p \parallel q) \geq -\log_2 \sum_i p_i \frac{q_i}{p_i} = -\log_2 1 = 0. \]
Since $-\log_2$ is strictly convex, equality in Jensen's inequality requires $\rv X'$ to be constant,
i.e., $q_i = p_i$ for all $i$, so $D(p \parallel q) = 0$ exactly when $p = q$.
318+
\end{prf}
319+
320+
\begin{prop}
321+
Any prefix-free code has an expected length at least $H(\rv X)$.
322+
\end{prop}
323+
\begin{prf}
324+
We can show this by interpreting the gap between the expected codeword length and $H(\rv X)$
325+
as $D(p \parallel q)$ for some $q$.
326+
327+
We will take $q$ to be the random walk distribution corresponding to the binary tree
328+
associated to the candidate prefix-free code.
329+
\end{prf}
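One way to fill in this sketch (a completion of my own, using \nameref{thm:kraft} and the previous fact):
given codeword lengths $\ell_i$, let $c = \sum_j 2^{-\ell_j} \leq 1$ and $q_i = 2^{-\ell_i}/c$
(the normalized random-walk weights). Then
\begin{align*}
\E_{x \sim \rv X}\qty[\abs{C(x)}] - H(\rv X) &= \sum_i p_i\ell_i - \sum_i p_i\log_2\frac{1}{p_i} \\
&= \sum_i p_i\log_2\frac{p_i}{c\,q_i} \tag{$2^{\ell_i} = \frac{1}{c q_i}$} \\
&= D(p \parallel q) + \log_2\frac{1}{c} \geq 0
\end{align*}
so the expected length is at least $H(\rv X)$.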
330+
331+
\pagebreak
332+
\phantomsection\addcontentsline{toc}{chapter}{Back Matter}
333+
\renewcommand{\listtheoremname}{List of Named Results}
334+
\phantomsection\addcontentsline{toc}{section}{\listtheoremname}
335+
\listoftheorems[ignoreall,numwidth=4em,onlynamed={theorem,lemma,corollary,prop}]
336+
\printindex
337+
157338
\end{document}

latex/agony-co432.tex

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
1+
\newcommand{\bits}[1]{\ensuremath{\{0,1\}^{#1}}}

latex/agony.cls

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@
162162

163163
% Question/Problem theorem styles
164164
\usepackage{mdframed}
165-
\usepackage{amsthm,thmtools}
165+
\usepackage{amsthm,thmtools,thm-restate}
166166

167167
\newcounter{question}[subsection]
168168
\renewcommand{\thequestion}{Q\ifnum\value{question}<10 0\fi\arabic{question}}
