\documentclass[notes,tikz]{agony}

\DeclareMathOperator*{\E}{\mathbb{E}}

\title{CO 432 Spring 2025: Lecture Notes}

\begin{document}
\renewcommand{\contentsname}{CO 432 Spring 2025:\\{\huge Lecture Notes}}
\thispagestyle{firstpage}
\tableofcontents

Lecture notes taken, unless otherwise specified,
by myself during the Spring 2025 offering of CO 432,
taught by Vijay Bhattiprolu.

\begin{multicols}{2}
  \listoflecture
\end{multicols}

\chapter{Introduction}

\section{Entropy}
\lecture{May 6}
TODO

\begin{defn}[entropy]
  For a random variable $\rv X$ which is equal to $i$ with probability $p_i$,
  the \term{entropy} $H(\rv X) := \sum_i p_i \log \frac{1}{p_i}$.
\end{defn}
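
As a quick sanity check of the definition:

\begin{example}
  With logarithms taken base 2,
  a fair coin has entropy $\frac12 \log_2 2 + \frac12 \log_2 2 = 1$ bit,
  while a coin that lands heads with probability $\frac14$ has entropy
  \[ \frac14 \log_2 4 + \frac34 \log_2 \frac43 \approx 0.811 \text{ bits}, \]
  so the biased coin is ``less random'' than the fair one.
\end{example}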

\subsection{Axiomatic view of entropy}
\lecture{May 8}

We want a function $S : [0,1] \to [0,\infty)$ where $S(p)$ captures
how ``surprised'' we are that an event with probability $p$ happens.
We will show that, under some natural assumptions,
there is essentially only one such function,
which justifies defining entropy the way we did.
In particular, we require:

\begin{enumerate}
  \item $S(1) = 0$, a certainty should not be surprising
  \item $S(q) > S(p)$ if $p > q$, less probable should be more surprising
  \item $S(p)$ is continuous in $p$
  \item $S(pq) = S(p) + S(q)$, surprise should add for independent events.
        That is, if I see something twice, I should be twice as surprised.
\end{enumerate}

\begin{prop}
  If $S(p)$ satisfies these 4 axioms, then $S(p) = c \cdot \log_2(1/p)$ for some $c > 0$.
\end{prop}
\begin{prf}
  Suppose a function $S : [0,1] \to [0,\infty)$ exists satisfying the axioms.
  Let $c := S(\frac12)$, which is positive since $S(\frac12) > S(1) = 0$
  by axioms 1 and 2.

  By axiom 4 (addition), $S(\frac{1}{2^k}) = kS(\frac12)$.
  Likewise, since the $k$-fold product $\frac{1}{2^{1/k}}\cdots\frac{1}{2^{1/k}}$ equals $\frac12$,
  we get $c = S(\frac12) = S(\frac{1}{2^{1/k}}) + \dotsb + S(\frac{1}{2^{1/k}}) = kS(\frac{1}{2^{1/k}})$,
  so $S(\frac{1}{2^{1/k}}) = \frac{c}{k}$.

  Combining the two, $S(\frac{1}{2^{m/n}}) = \frac{m}{n}S(\frac12) = \frac{m}{n}\cdot c$
  for any rational $m/n \geq 0$.

  By axiom 3 (continuity), $S(\frac{1}{2^z}) = c \cdot z$ for all $z \in [0,\infty)$
  because the rationals are dense in the reals.
  In particular, for any $p \in (0,1]$,
  we can write $p = \frac{1}{2^z}$ for $z = \log_2(1/p)$
  and we get \[ S\qty(p) = S\qty(\frac{1}{2^z}) = c \cdot z = c \cdot \log_2(1/p) \]
  as desired.
\end{prf}

We can now view entropy as expected surprise. In particular,
\[ \sum_i p_i \log_2\frac{1}{p_i} = \E_{x \sim \rv X}\qty[S(p_x)] \]
for a random variable $\rv X = i$ with probability $p_i$.

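To make the expected-surprise reading concrete, here is a small Python sketch
(my own illustration, not from lecture) that computes $H(\rv X)$ directly from a
distribution given as a list of probabilities:

\begin{verbatim}
import math

def entropy(p):
    """Entropy in bits: the expected surprise, sum of p_i * log2(1/p_i)."""
    # Outcomes with probability 0 contribute nothing, so skip them.
    return sum(p_i * math.log2(1 / p_i) for p_i in p if p_i > 0)

print(entropy([0.5, 0.5]))      # fair coin: 1.0 bit
print(entropy([0.25, 0.75]))    # biased coin: ~0.811 bits
print(entropy([0.25] * 4))      # uniform over 4 symbols: 2.0 bits
\end{verbatim}
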
\subsection{Entropy as optimal lossless data compression}

Suppose we are trying to compress a string consisting of $n$
symbols drawn from some distribution.

\begin{problem}
  What is the expected number of bits you need to store the results of $n$ independent samples
  of a random variable $\rv X$?
\end{problem}

We will show this is $nH(\rv X)$.

Notice that we assume that the symbols are drawn \uline{independently},
which is violated by almost all data we actually care about.

\begin{defn}
  Let $C : \Sigma \to (\Sigma')^*$ be a code.
  We say $C$ is \term{uniquely decodable} if there do not exist
  distinct strings $x \neq y \in \Sigma^*$
  with identical encodings $C(x_1)C(x_2)\cdots C(x_k) = C(y_1)C(y_2)\cdots C(y_{k'})$.

  Also, $C$ is \term{prefix-free} (sometimes called \term*{instantaneous})
  if for any distinct $x,y \in \Sigma$, $C(x)$ is not a prefix of $C(y)$.
\end{defn}

\begin{prop}
  Prefix-freeness is sufficient for unique decodability.
\end{prop}

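The converse does not hold: unique decodability is strictly weaker.

\begin{example}
  The code $C(A) = 1$, $C(B) = 10$, $C(C) = 100$ is not prefix-free,
  since $C(A)$ is a prefix of $C(B)$.
  It is still uniquely decodable:
  every codeword starts with (and contains only one) 1,
  so each 1 in an encoded string marks the start of a new codeword.
\end{example}
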
\begin{example}
  Let $C : \{A,B,C,D\} \to \{0,1\}^*$ where
  $C(A) = 11$, $C(B) = 101$, $C(C) = 100$, and $C(D) = 00$.
  Then, $C$ is prefix-free and uniquely decodable.

  We can easily parse $1011100001100$ unambiguously as $101.11.00.00.11.00$
  ($BADDAD$).
\end{example}

Recall from CS 240 that a prefix-free code is equivalent to a trie,
and we can decode it by traversing the trie in linear time.

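As an illustration (again my own, not from lecture), here is a small Python
sketch of trie-based decoding for the prefix-free code from the example above:

\begin{verbatim}
def build_trie(code):
    """Build a binary trie (nested dicts) mapping codewords to symbols."""
    root = {}
    for symbol, word in code.items():
        node = root
        for bit in word:
            node = node.setdefault(bit, {})
        node["symbol"] = symbol  # leaf of the trie
    return root

def decode(bits, trie):
    """Walk the trie bit by bit, emitting a symbol at each leaf."""
    out, node = [], trie
    for bit in bits:
        node = node[bit]
        if "symbol" in node:
            out.append(node["symbol"])
            node = trie  # restart at the root for the next codeword
    return "".join(out)

code = {"A": "11", "B": "101", "C": "100", "D": "00"}
print(decode("1011100001100", build_trie(code)))  # BADDAD
\end{verbatim}

Each bit is consumed exactly once, so decoding is linear in the length of the
encoded string. For this code,
$\sum_i 2^{-\ell_i} = \frac14 + \frac18 + \frac18 + \frac14 = \frac34 \leq 1$,
consistent with Kraft's inequality below.
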
\begin{theorem}[Kraft's inequality]
  A prefix-free binary code $C : \{1,\dotsc,n\} \to \{0,1\}^*$
  with codeword lengths $\ell_i = \abs{C(i)}$ exists if and only if
  \[ \sum_{i=1}^n \frac{1}{2^{\ell_i}} \leq 1. \]
\end{theorem}
\begin{prf}
  Suppose $C : \{1,\dotsc,n\} \to \{0,1\}^*$ is prefix-free
  with codeword lengths $\ell_i$.
  Let $T$ be its associated binary tree
  and let $W$ be a random walk on $T$ where 0 and 1 have equal weight
  (stopping at either a leaf or undefined branch).

  Define $E_i$ as the event where $W$ reaches $i$ and
  $E_\varnothing$ where $W$ falls off. Then,
  \begin{align*}
    1 & = \Pr(E_\varnothing) + \sum_i \Pr(E_i) \\
      & = \Pr(E_\varnothing) + \sum_i \frac{1}{2^{\ell_i}} \tag{by independence} \\
      & \geq \sum_i \frac{1}{2^{\ell_i}} \tag{probabilities are non-negative}
  \end{align*}

  Conversely, suppose the inequality holds for some $\ell_i$.
  \WLOG, suppose $\ell_1 \leq \ell_2 \leq \dotsb \leq \ell_n$.

  Start with a complete binary tree $T$ of depth $\ell_n$.
  For each $i = 1,\dotsc,n$, find any unassigned node in $T$ of depth $\ell_i$,
  delete its descendants, and assign it a symbol.

  Now, it remains to show that this process will not fail.
  That is, for any loop step $i$, there is still some unassigned node at depth $\ell_i$.

  Let $P \gets 2^{\ell_n}$ be the number of unassigned leaves
  of the complete binary tree of depth $\ell_n$.
  After the $i$\xth step, we decrease $P$ by $2^{\ell_n - \ell_i}$.
  That is, just before step $i$,
  \begin{align*}
    P & = 2^{\ell_n} - \sum_{j=1}^{i-1} \frac{2^{\ell_n}}{2^{\ell_j}} \\
      & = 2^{\ell_n} \qty(1 - \sum_{j=1}^{i-1} \frac{1}{2^{\ell_j}}) \\
      & \geq 2^{\ell_n} \sum_{j=i}^{n} \frac{1}{2^{\ell_j}} \tag{by the inequality} \\
      & \geq 2^{\ell_n - \ell_i} > 0,
  \end{align*}
  so some unassigned leaf at depth $\ell_n$ survives.
  Its ancestor at depth $\ell_i$ is then still present and unassigned
  (assigned nodes have no surviving descendants),
  so step $i$ can proceed.
\end{prf}
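
The converse direction is effectively an algorithm. As a small Python sketch
(my own, not from lecture): check the Kraft sum, then greedily assign codewords
of the given lengths, shortest first, mirroring the tree construction above.

\begin{verbatim}
from fractions import Fraction

def kraft_sum(lengths):
    """Sum of 1/2^l over the codeword lengths, computed exactly."""
    return sum(Fraction(1, 2**l) for l in lengths)

def prefix_free_code(lengths):
    """Greedily assign binary codewords with the given lengths.

    Lengths are handled shortest first; each codeword uses up 2^(-l)
    of the unit budget, so the process cannot run out of room
    as long as Kraft's inequality holds.
    """
    assert kraft_sum(lengths) <= 1, "Kraft's inequality must hold"
    codewords = {}
    c, prev = 0, 0
    for i, l in sorted(enumerate(lengths), key=lambda t: t[1]):
        c <<= l - prev               # descend to depth l in the implicit tree
        codewords[i] = format(c, f"0{l}b")
        c, prev = c + 1, l           # move to the next free node at this depth
    return codewords

print(prefix_free_code([2, 2, 3, 3]))
# {0: '00', 1: '01', 2: '100', 3: '101'}
\end{verbatim}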

\end{document}