|
1 | 1 | import Mathlib.Algebra.Order.Ring.Rat |
2 | 2 | import Mathlib.Analysis.SpecialFunctions.Log.NegMulLog |
| 3 | +import Mathlib.Analysis.SpecialFunctions.Sqrt |
3 | 4 | import Mathlib.Algebra.BigOperators.Field |
4 | 5 | import Mathlib.Data.Fintype.BigOperators |
| 6 | +import Mathlib.InformationTheory.KullbackLeibler.KLFun |
5 | 7 |
|
6 | 8 | /-! |
7 | 9 | # Information-Theoretic Primitives |
@@ -39,6 +41,8 @@ return `(Finset α, α → ℝ)` directly). |
39 | 41 | - `jsdOf s p q`: Jensen-Shannon divergence |
40 | 42 | - `deltaP`: ΔP directional association measure (ℚ-valued, no log) |
41 | 43 | - `deltaPCounts`: ΔP from a 2×2 contingency table (ℚ-valued, no log) |
| 44 | +- `klFinite p q`: discrete KL divergence `Σᵢ pᵢ · log(pᵢ/qᵢ)` (with `0·log(0/q)=0` guard) |
| 45 | +- `bhattacharyyaCoeff`, `hellingerDistSq`, `hellingerDist`: Hellinger family |
42 | 46 | -/ |
43 | 47 |
|
44 | 48 | namespace Core.InformationTheory |
@@ -176,4 +180,183 @@ theorem deltaP_eq_zero_of_independent (pX pY : ℚ) |
176 | 180 | rw [mul_sub, mul_one, mul_comm pY pX]] |
177 | 181 | rw [mul_div_cancel_right₀ pY hne1, sub_self] |
178 | 182 |
|
| 183 | +/-! ## Kullback–Leibler divergence (finite, ℝ-valued) |
| 184 | +
|
| 185 | +Discrete-finite specialization of mathlib's `InformationTheory.klDiv`, |
| 186 | +routed through mathlib's `klFun (x) = x · log x + 1 − x`. -/ |
| 187 | + |
| 188 | +section KLDivergence |
| 189 | + |
| 190 | +variable {ι : Type*} [Fintype ι] |
| 191 | + |
| 192 | +/-- Finite KL divergence: `KL(p ‖ q) = Σᵢ pᵢ · log(pᵢ / qᵢ)`. |
| 193 | + Convention: `0 · log(0/q) = 0` (via the `if`-guard). -/ |
| 194 | +noncomputable def klFinite (p q : ι → ℝ) : ℝ := |
| 195 | + ∑ i, if p i = 0 then 0 else p i * Real.log (p i / q i) |
| 196 | + |
| 197 | +/-- When `q > 0`, each KL term can be written via `klFun`: |
| 198 | + `p · log(p/q) = q · klFun(p/q) + (p − q)`. -/ |
| 199 | +private theorem kl_term_eq_klFun {p_i q_i : ℝ} (hq : 0 < q_i) (_hp : 0 ≤ p_i) : |
| 200 | + (if p_i = 0 then (0 : ℝ) else p_i * log (p_i / q_i)) = |
| 201 | + q_i * _root_.InformationTheory.klFun (p_i / q_i) + (p_i - q_i) := by |
| 202 | + by_cases hp0 : p_i = 0 |
| 203 | + · simp only [hp0, ↓reduceIte, zero_div, _root_.InformationTheory.klFun_zero, mul_one, zero_sub, |
| 204 | + add_neg_cancel] |
| 205 | + · simp only [hp0, ↓reduceIte] |
| 206 | + unfold _root_.InformationTheory.klFun |
| 207 | + have hq_ne : q_i ≠ 0 := ne_of_gt hq |
| 208 | + field_simp |
| 209 | + ring |
| 210 | + |
| 211 | +/-- Finite KL divergence equals `Σᵢ qᵢ · klFun(pᵢ/qᵢ)` when `Σpᵢ = Σqᵢ`. -/ |
| 212 | +theorem kl_eq_sum_klFun (p q : ι → ℝ) (hq : ∀ i, 0 < q i) (hp : ∀ i, 0 ≤ p i) |
| 213 | + (hsum : ∑ i, p i = ∑ i, q i) : |
| 214 | + klFinite p q = ∑ i, q i * _root_.InformationTheory.klFun (p i / q i) := by |
| 215 | + unfold klFinite |
| 216 | + have hterm : ∀ i, (if p i = 0 then (0 : ℝ) else p i * log (p i / q i)) = |
| 217 | + q i * _root_.InformationTheory.klFun (p i / q i) + (p i - q i) := |
| 218 | + λ i => kl_term_eq_klFun (hq i) (hp i) |
| 219 | + simp_rw [hterm] |
| 220 | + rw [Finset.sum_add_distrib] |
| 221 | + have hcancel : ∑ i, (p i - q i) = 0 := by |
| 222 | + rw [Finset.sum_sub_distrib, hsum, sub_self] |
| 223 | + linarith |
| 224 | + |
| 225 | +/-- **Gibbs' inequality (finite form)**: `KL(p ‖ q) ≥ 0`. |
| 226 | +
|
| 227 | + For distributions `p, q` with `qᵢ > 0`, `pᵢ ≥ 0`, and `Σpᵢ = Σqᵢ`: |
| 228 | + `Σᵢ pᵢ · log(pᵢ/qᵢ) ≥ 0`. Proof via mathlib's `klFun_nonneg`. -/ |
| 229 | +theorem kl_nonneg (p q : ι → ℝ) (hq : ∀ i, 0 < q i) (hp : ∀ i, 0 ≤ p i) |
| 230 | + (hsum : ∑ i, p i = ∑ i, q i) : |
| 231 | + 0 ≤ klFinite p q := by |
| 232 | + rw [kl_eq_sum_klFun p q hq hp hsum] |
| 233 | + apply Finset.sum_nonneg |
| 234 | + intro i _ |
| 235 | + apply mul_nonneg (le_of_lt (hq i)) |
| 236 | + exact _root_.InformationTheory.klFun_nonneg (div_nonneg (hp i) (le_of_lt (hq i))) |
| 237 | + |
| 238 | +/-- Alternative KL non-negativity with distribution hypotheses. -/ |
| 239 | +theorem kl_nonneg' [Nonempty ι] {p q : ι → ℝ} |
| 240 | + (hp_nonneg : ∀ i, 0 ≤ p i) (hq_pos : ∀ i, 0 < q i) |
| 241 | + (hp_sum : ∑ i, p i = 1) (hq_sum : ∑ i, q i = 1) : |
| 242 | + 0 ≤ klFinite p q := |
| 243 | + kl_nonneg p q hq_pos hp_nonneg (by rw [hp_sum, hq_sum]) |
| 244 | + |
| 245 | +/-- Cross-entropy decomposition: |
| 246 | + `KL(p ‖ q) = (Σ pᵢ log pᵢ) − (Σ pᵢ log qᵢ)` |
| 247 | +
|
| 248 | + The hypothesis is **absolute continuity** `p ≪ q`: wherever `p` puts |
| 249 | + mass, `q` does too. Strictly weaker than `∀ i, 0 < q i`. -/ |
| 250 | +theorem klFinite_eq_negEntropy_sub_crossEntropy (p q : ι → ℝ) |
| 251 | + (hAC : ∀ i, p i ≠ 0 → q i ≠ 0) : |
| 252 | + klFinite p q = (∑ i, p i * log (p i)) - (∑ i, p i * log (q i)) := by |
| 253 | + unfold klFinite |
| 254 | + rw [← Finset.sum_sub_distrib] |
| 255 | + refine Finset.sum_congr rfl fun i _ => ?_ |
| 256 | + by_cases hP : p i = 0 |
| 257 | + · simp [hP] |
| 258 | + · rw [if_neg hP, log_div hP (hAC i hP), mul_sub] |
| 259 | + |
| 260 | +/-- KL with a Dirac point mass reduces to negative log-probability (= surprisal): |
| 261 | + `KL(δₛ ‖ Q) = −log Q(s)`. |
| 262 | +
|
| 263 | + Foundation of standard RSA speaker utility `U(u; s) = log L₀(s | u)` |
| 264 | + (@cite{frank-goodman-2012}, @cite{goodman-stuhlmuller-2013}). -/ |
| 265 | +theorem klFinite_pi_single_eq_neg_log [DecidableEq ι] |
| 266 | + (s : ι) (q : ι → ℝ) (hQ : q s ≠ 0) : |
| 267 | + klFinite (Pi.single s 1) q = -log (q s) := by |
| 268 | + unfold klFinite |
| 269 | + rw [Finset.sum_eq_single s] |
| 270 | + · have h1 : Pi.single (M := fun _ => ℝ) s 1 s = 1 := Pi.single_eq_same s 1 |
| 271 | + rw [if_neg (by rw [h1]; exact one_ne_zero), h1, one_mul, one_div, log_inv] |
| 272 | + · intro j _ hj |
| 273 | + have h0 : Pi.single (M := fun _ => ℝ) s 1 j = 0 := Pi.single_eq_of_ne hj 1 |
| 274 | + rw [h0, if_pos rfl] |
| 275 | + · intro h; exact (h (Finset.mem_univ s)).elim |
| 276 | + |
| 277 | +/-- Expected log-likelihood under uncertain beliefs equals negative KL plus |
| 278 | + speaker entropy: `E_p[log q] = −KL(p ‖ q) + Σ p log p`. |
| 279 | +
|
| 280 | + Since `Σ p log p` is independent of `q`, softmax over utterances cancels |
| 281 | + it, yielding the equivalence between Frank-Goodman surprisal `log L₀(s|u)` |
| 282 | + and Goodman-Stuhlmüller belief-weighted utility. -/ |
| 283 | +theorem expected_log_eq_neg_klFinite_plus_negEntropy (p q : ι → ℝ) |
| 284 | + (hAC : ∀ i, p i ≠ 0 → q i ≠ 0) : |
| 285 | + (∑ i, p i * log (q i)) = -klFinite p q + (∑ i, p i * log (p i)) := by |
| 286 | + rw [klFinite_eq_negEntropy_sub_crossEntropy p q hAC]; ring |
| 287 | + |
| 288 | +end KLDivergence |
| 289 | + |
| 290 | +/-! ## Hellinger family (Bhattacharyya, squared-Hellinger, Hellinger distance) |
| 291 | +@cite{herbstritt-franke-2019} |
| 292 | +
|
| 293 | +Finite-distribution Hellinger family used as an alternative speaker utility |
| 294 | +in RSA pragmatics: @cite{herbstritt-franke-2019} argue (footnote 8) that |
| 295 | +Hellinger distance is necessary for probability expressions because KL |
| 296 | +assigns infinite disutility to messages whose literal interpretation |
| 297 | +assigns zero probability to states the speaker considers possible. |
| 298 | +The §-Hellinger-vs-KL inequality `2 · H²(P, Q) ≤ KL(P ‖ Q)` |
| 299 | +(Bretagnolle–Huber, sorried) makes the Hellinger speaker's permissiveness |
| 300 | +over the KL speaker a proved corollary rather than a docstring claim. -/ |
| 301 | + |
| 302 | +section Hellinger |
| 303 | + |
| 304 | +variable {ι : Type*} [Fintype ι] |
| 305 | + |
| 306 | +/-- Bhattacharyya coefficient: `BC(P, Q) = Σᵢ √(Pᵢ · Qᵢ)`. |
| 307 | +
|
| 308 | + For probability distributions `BC = 1 ↔ P = Q` and `BC = 0 ↔` disjoint |
| 309 | + support. -/ |
| 310 | +noncomputable def bhattacharyyaCoeff (P Q : ι → ℝ) : ℝ := |
| 311 | + ∑ i : ι, √(P i * Q i) |
| 312 | + |
| 313 | +/-- Squared Hellinger distance: `H²(P, Q) = 1 − BC(P, Q)`. |
| 314 | +
|
| 315 | + Ranges from 0 (identical distributions) to 1 (disjoint support). |
| 316 | + Equivalent to the standard form `(1/2) Σᵢ (√Pᵢ − √Qᵢ)²` for |
| 317 | + distributions summing to 1. -/ |
| 318 | +noncomputable def hellingerDistSq (P Q : ι → ℝ) : ℝ := |
| 319 | + 1 - bhattacharyyaCoeff P Q |
| 320 | + |
| 321 | +/-- Hellinger distance: `HD(P, Q) = √H²(P, Q)`. |
| 322 | +
|
| 323 | + Satisfies `0 ≤ HD ≤ 1` for probability distributions. Unlike KL, |
| 324 | + Hellinger distance is always finite and is a proper metric (symmetric, |
| 325 | + triangle inequality). |
| 326 | +
|
| 327 | + The @cite{herbstritt-franke-2019} speaker utility (their eq. 16) is |
| 328 | + `EU(m, o, a) = −HD(P_belief, P_LL)`. -/ |
| 329 | +noncomputable def hellingerDist (P Q : ι → ℝ) : ℝ := |
| 330 | + √(hellingerDistSq P Q) |
| 331 | + |
| 332 | +/-- Squared Hellinger distance is non-negative when `BC ≤ 1`. |
| 333 | +
|
| 334 | + For normalised distributions `Σ Pᵢ = Σ Qᵢ = 1`, Cauchy-Schwarz gives |
| 335 | + `BC(P, Q) ≤ 1`, hence `H² ≥ 0`. -/ |
| 336 | +theorem hellingerDistSq_nonneg_of_bc_le_one (P Q : ι → ℝ) |
| 337 | + (h : bhattacharyyaCoeff P Q ≤ 1) : |
| 338 | + 0 ≤ hellingerDistSq P Q := by |
| 339 | + unfold hellingerDistSq; linarith |
| 340 | + |
| 341 | +/-- **Bretagnolle–Huber inequality**: `2 · H²(P, Q) ≤ KL(P ‖ Q)`. |
| 342 | +
|
| 343 | + The standard sharp comparison between Hellinger and KL on probability |
| 344 | + distributions. Combined with `H² ≥ 0`, yields `H²(P, Q) ≤ KL(P ‖ Q)`, |
| 345 | + making the Hellinger speaker's choice set a **superset** of the KL |
| 346 | + speaker's: any utterance the KL speaker can consider, the Hellinger |
| 347 | + speaker can too — but not conversely. |
| 348 | +
|
| 349 | + **Proof sketch (TODO):** Pointwise `klFun(x) ≥ 2(√x − 1)²` (factor-of-2 |
| 350 | + convexity bound). Multiply by `qᵢ` and sum: `Σ qᵢ klFun(pᵢ/qᵢ) ≥ |
| 351 | + 2 Σ (√pᵢ − √qᵢ)²`. The LHS equals `KL(p ‖ q)` via `kl_eq_sum_klFun`; |
| 352 | + the RHS equals `2 · 2 · H²(p, q) = 4 H²` for normalised `p, q`, but only |
| 353 | + `2 H²` is needed. Standard reference: Bretagnolle–Huber (1979). -/ |
| 354 | +theorem two_hellingerDistSq_le_klFinite [Nonempty ι] (P Q : ι → ℝ) |
| 355 | + (_hP_nonneg : ∀ i, 0 ≤ P i) (_hQ_pos : ∀ i, 0 < Q i) |
| 356 | + (_hP_sum : ∑ i, P i = 1) (_hQ_sum : ∑ i, Q i = 1) : |
| 357 | + 2 * hellingerDistSq P Q ≤ klFinite P Q := by |
| 358 | + sorry |
| 359 | + |
| 360 | +end Hellinger |
| 361 | + |
179 | 362 | end Core.InformationTheory |
0 commit comments