Skip to content

Commit 89129c3

Browse files
Add batched quantile queries for KLL sketch
1 parent 076714b commit 89129c3

File tree

3 files changed

+74
-15
lines changed

3 files changed

+74
-15
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ Fast, mergeable **KLL** sketch for streaming quantiles — deterministic, zero d
1818
- **Weighted ingestion** via `add(x, weight)` for aggregated data
1919
- **Mergeable** sketches for distributed/parallel ingestion
2020
- **Serializable** (`to_bytes` / `from_bytes`)
21-
- **Convenience helpers** such as `quantiles(m)` for evenly spaced cuts
21+
- **Convenience helpers** such as `quantiles(m)` and `quantiles_at(qs)` for
22+
evenly spaced or ad-hoc cuts
2223
- **Zero dependencies**, Python 3.9+
2324

2425
---
@@ -60,6 +61,7 @@ assert abs(a2.quantile(0.5) - a.quantile(0.5)) < 1e-12
6061
| `size()` | Total number of ingested items `n`. |
6162
| `quantile(q)` | Approximate `q`-quantile for `q∈[0,1]`. |
6263
| `quantiles(m)` | Evenly spaced cut points. |
64+
| `quantiles_at(qs)` | Batched quantiles for arbitrary `qs`. |
6365
| `median()` | Convenience for `quantile(0.5)`. |
6466
| `rank(x)` | Approximate rank of `x` in `[0, n]`. |
6567
| `cdf(xs)` | CDF values for a sequence `xs`. |
@@ -81,6 +83,7 @@ This implementation follows **Karnin–Lang–Liberty (2016)**: a space-optimal
8183
* Typical error ≈ **O(1/k)** in rank space (increase `capacity` to tighten ε).
8284
* Updates amortized **O(1)** with occasional compactions.
8385
* Queries merge level buffers (**k-way**) and scan weights to the target rank.
86+
Use `quantiles_at` to answer multiple quantiles with a single scan.
8487

8588
> Tip: For heavy query loads, cache materialized arrays between queries.
8689

kll_sketch/kll_sketch.py

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import struct
1313
import random
1414
import heapq
15+
from bisect import bisect_left
1516
from typing import Iterable, List, Tuple, Optional
1617

1718

@@ -32,8 +33,8 @@ class KLL:
3233
This guarantees total weight conservation: Σ(weights) == n.
3334
3435
Public API:
35-
add(x, weight=1), extend(xs), quantile(q), quantiles(m), median(), rank(x),
36-
cdf(xs), merge(other), to_bytes(), from_bytes()
36+
add(x, weight=1), extend(xs), quantile(q), quantiles(m), quantiles_at(qs),
37+
median(), rank(x), cdf(xs), merge(other), to_bytes(), from_bytes()
3738
"""
3839

3940
# ---------------------------- Tunable constants ----------------------------
@@ -100,17 +101,22 @@ def median(self) -> float:
100101
def quantile(self, q: float) -> float:
101102
if not (0.0 <= q <= 1.0):
102103
raise ValueError("q must be in [0,1]")
103-
if self._n == 0:
104-
raise ValueError("empty sketch")
105-
vals, wts = self._materialize_aligned()
106-
# invariant: sum(wts) == n
107-
target = q * (self._n - 1) # rank target in [0, n-1]
108-
cum = 0.0
109-
for v, w in zip(vals, wts):
110-
cum += w
111-
if cum >= target - 1e-12:
112-
return v
113-
return vals[-1]
104+
return self._batched_quantiles([q])[0]
105+
106+
def quantiles_at(self, probabilities: Iterable[float]) -> List[float]:
    """Compute approximate quantiles for a batch of probabilities.

    Every entry of ``probabilities`` is converted with ``float`` first, then
    the whole batch is range-checked and handed to ``_batched_quantiles`` in
    a single call, instead of issuing one :meth:`quantile` call per entry.

    Args:
        probabilities: Any iterable of probabilities, each in ``[0, 1]``.

    Returns:
        The values produced by ``_batched_quantiles`` for the converted
        probabilities, or an empty list when ``probabilities`` is empty.

    Raises:
        ValueError: if any probability lies outside ``[0, 1]`` (NaN counts
            as outside, because the range comparison is false for it).
    """
    requested = list(map(float, probabilities))
    # Written as "all in range" rather than "< 0 or > 1" so NaN is rejected.
    if not all(0.0 <= p <= 1.0 for p in requested):
        raise ValueError("all probabilities must be in [0,1]")
    return self._batched_quantiles(requested) if requested else []
114120

115121
def rank(self, x: float) -> float:
116122
"""Approximate rank in [0, n]."""
@@ -350,7 +356,8 @@ def quantiles(self, m: int) -> List[float]:
350356
if m == 1:
351357
return [self.quantile(0.5)]
352358
step = 1.0 / m
353-
return [self.quantile(step * i) for i in range(1, m)]
359+
qs = [step * i for i in range(1, m)]
360+
return self.quantiles_at(qs)
354361

355362
# ---------------------- weighted ingestion internals ----------------------
356363
def _ingest_weighted_value(self, value: float, weight: int) -> None:
@@ -368,6 +375,31 @@ def _ingest_weighted_value(self, value: float, weight: int) -> None:
368375
if self._capacity_exceeded():
369376
self._compress_until_ok()
370377

378+
def _batched_quantiles(self, qs: List[float]) -> List[float]:
    """Answer every quantile in ``qs`` from one materialized view of the sketch.

    The sketch is materialized once into parallel value/weight sequences and
    the weights are turned into running totals. Each probability ``q`` maps
    to the zero-based rank target ``q * (n - 1)``; the answer is the first
    value whose running total strictly exceeds that target, i.e. the value
    that covers that rank. Consequently ``q = 0`` selects the first
    materialized value and ``q = 1`` the last one that carries weight.

    Args:
        qs: Probabilities to evaluate. No range check happens here; callers
            are expected to have validated that each lies in ``[0, 1]``.

    Returns:
        One value per entry of ``qs``, in the same order as ``qs``.

    Raises:
        ValueError: if the sketch holds no items.
    """
    # Local import: the module's top-level imports only provide bisect_left.
    from bisect import bisect_right

    if self._n == 0:
        raise ValueError("empty sketch")
    vals, wts = self._materialize_aligned()
    if not vals:
        raise ValueError("empty sketch")

    # prefix[i] is the total weight of vals[0..i].
    # NOTE(review): the binary search assumes vals come back in ascending
    # order with non-negative weights -- confirm in _materialize_aligned.
    prefix: List[float] = []
    total = 0.0
    for w in wts:
        total += w
        prefix.append(total)

    last = len(vals) - 1
    # Visit the probabilities in ascending order so each search can resume
    # from the previous hit; results are written back by original index.
    ordered = sorted(enumerate(qs), key=lambda item: item[1])
    out = [0.0] * len(qs)
    search_lo = 0
    for idx, q in ordered:
        target = q * (self._n - 1)
        # vals[i] covers the zero-based ranks [prefix[i-1], prefix[i]), so
        # the owner of `target` is the first entry with prefix > target.
        # Using ">=" here would return the predecessor whenever the target
        # is an exact integer rank (e.g. q=1 would never reach the maximum).
        # The +1e-12 absorbs rounding that leaves the target marginally
        # below an exact integer rank.
        pos = bisect_right(prefix, target + 1e-12, lo=search_lo)
        # Defensive clamp in case the weights sum to less than the target.
        pos = min(pos, last)
        out[idx] = vals[pos]
        search_lo = pos
    return out
402+
371403

372404
# ----------------------------- quick self-test --------------------------------
373405
if __name__ == "__main__":

kll_sketch/tests/test_kll.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,3 +206,27 @@ def test_quantiles_helper_even_spacing() -> None:
206206

207207
median_only = sketch.quantiles(1)
208208
assert median_only == pytest.approx([sketch.median()])
209+
210+
211+
def test_quantiles_at_matches_repeated_calls() -> None:
    """A batched query must agree with one ``quantile`` call per probability."""
    source = random.Random(123)
    samples = [source.gauss(0.0, 2.0) for _ in range(5000)]

    kll = KLL(capacity=200)
    kll.extend(samples)

    probabilities = [0.05, 0.2, 0.33, 0.5, 0.75, 0.95]
    from_batch = kll.quantiles_at(probabilities)
    one_by_one = [kll.quantile(p) for p in probabilities]
    assert from_batch == pytest.approx(one_by_one, abs=1e-12)
222+
223+
224+
def test_quantiles_at_accepts_unsorted_probabilities() -> None:
    """Results must line up with the request order, not the sorted order."""
    kll = KLL(capacity=128)
    kll.extend(range(1000))

    probabilities = [0.9, 0.1, 0.5]
    answers = kll.quantiles_at(probabilities)
    # Compare position by position against the single-query API.
    for answer, p in zip(answers, probabilities):
        assert answer == pytest.approx(kll.quantile(p))

0 commit comments

Comments
 (0)