Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions kll_sketch/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,25 @@ classifiers = [
# Project URL metadata.
# NOTE(review): "yourname" looks like a placeholder account — confirm before publishing.
[project.urls]
Homepage = "https://github.com/yourname/kll_sketch"
Repository = "https://github.com/yourname/kll_sketch"

# Optional dependency group "test": the tools needed to run the test suite.
[project.optional-dependencies]
test = [
"pytest>=7.4",
"hypothesis>=6.88",
"pytest-cov>=4.1",
]

# pytest settings.
[tool.pytest.ini_options]
# Strict config/marker checking, plus coverage of kll_sketch reported with missing lines.
addopts = "--strict-config --strict-markers --cov=kll_sketch --cov-report=term-missing"
# NOTE(review): pytest resolves testpaths relative to its rootdir; confirm this
# path matches where the tests directory sits relative to this file.
testpaths = ["kll_sketch/tests"]
# Turn every warning raised during a test run into an error.
filterwarnings = [
"error",
]

# coverage.py measurement settings: branch coverage, limited to the kll_sketch source.
[tool.coverage.run]
branch = true
source = ["kll_sketch"]

# coverage.py report settings: two decimal places, and list the missing lines.
[tool.coverage.report]
precision = 2
show_missing = true
176 changes: 163 additions & 13 deletions kll_sketch/tests/test_kll.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,169 @@
"""Deterministic regression tests for :mod:`kll_sketch`."""
from __future__ import annotations

import bisect
import math
import random
from typing import Iterable

import pytest

from kll_sketch import KLL

def _truth_quantile(xs: Iterable[float], q: float) -> float:
    """Return the exact ``q``-quantile of ``xs`` by sorting and indexing.

    The element is picked at index ``int(q * (n - 1))`` of the sorted values,
    i.e. the fractional position is truncated rather than interpolated.

    Args:
        xs: Values to rank; they are copied into a new sorted list.
        q: Quantile to look up, expected in ``[0, 1]``.

    Returns:
        The sorted element at index ``int(q * (n - 1))``.

    Raises:
        ValueError: If ``xs`` is empty or ``q`` lies outside ``[0, 1]``.
    """
    ordered = sorted(xs)
    if not ordered:
        raise ValueError("empty iterable")
    # Reject out-of-range q explicitly: a negative q would otherwise become a
    # negative index and silently read from the end of the sorted list, while
    # q > 1 would surface as an unrelated IndexError.
    if not 0.0 <= q <= 1.0:
        raise ValueError("q must be in [0, 1]")
    return ordered[int(q * (len(ordered) - 1))]


def test_basic_quantiles_regression() -> None:
    """The sketch stays within a tight absolute error on a pseudo-random stream.

    A fixed seed makes the 20,000-value stream reproducible. Each estimate is
    compared with the exact order statistic at index ``int(q * (n - 1))``.
    """
    rng = random.Random(1)
    xs = [rng.random() for _ in range(20_000)]
    truth = sorted(xs)

    sketch = KLL(capacity=256)
    sketch.extend(xs)

    # Deterministic checkpoints chosen to exercise both tails and the median.
    for q in [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]:
        estimate = sketch.quantile(q)
        reference = truth[int(q * (len(xs) - 1))]
        assert abs(estimate - reference) <= 0.015, f"q={q}: {estimate} vs {reference}"


@pytest.mark.parametrize("capacity", [64, 128, 256])
def test_quantile_rank_consistency(capacity: int) -> None:
    """``rank(quantile(q))`` lands close to ``q * size()`` for each capacity.

    The stream is 2,000 uniform draws from ``[-5, 5]``, seeded with the
    capacity under test. The allowed gap is 5% of the sketch size, with a
    floor of five positions.
    """
    source = random.Random(capacity)
    stream = [source.uniform(-5.0, 5.0) for _ in range(2_000)]

    sketch = KLL(capacity=capacity)
    sketch.extend(stream)

    for q in (0.0, 0.25, 0.5, 0.75, 1.0):
        # Feed the quantile answer straight back through the rank query.
        observed = sketch.rank(sketch.quantile(q))
        total = sketch.size()
        assert abs(observed - q * total) <= max(5.0, 0.05 * total)


def test_rank_and_cdf_are_monotone() -> None:
    """Ranks never decrease over ascending inputs and CDF values stay ordered.

    The probe points are the 500 inserted values in ascending order. The CDF
    output must also lie inside ``[0, 1]``.
    """
    source = random.Random(7)
    points = sorted(source.uniform(-1.0, 1.0) for _ in range(500))

    sketch = KLL(capacity=200)
    sketch.extend(points)

    ranks = [sketch.rank(point) for point in points]
    # Pair each rank with its predecessor; -1.0 stands in before the first one.
    for earlier, later in zip([-1.0, *ranks], ranks):
        assert later >= earlier

    fractions = sketch.cdf(points)
    assert all(0.0 <= value <= 1.0 for value in fractions)
    assert fractions == sorted(fractions)

def test_weight_conservation() -> None:
    """Materialised weights sum to ``sketch.size()`` and align with the values.

    Uses a small capacity (64) against a 50,000-value stream, with fixed seeds
    for both the data generator and the sketch.
    """
    rng = random.Random(0)
    sketch = KLL(capacity=64, rng_seed=777)
    xs = [rng.random() for _ in range(50_000)]
    sketch.extend(xs)
    vals, wts = sketch._materialize_aligned()
    # Values and weights are parallel sequences, so their lengths must agree.
    assert len(vals) == len(wts)
    assert abs(sum(wts) - sketch.size()) < 1e-9


def test_merge_matches_single_stream() -> None:
    """Merging two half-stream sketches tracks one sketch fed both halves.

    Quantiles from the merged sketch and the single sketch are compared at
    five checkpoints using ``math.isclose`` with a relative tolerance of 0.05
    and an absolute tolerance of 0.02.
    """
    source = random.Random(321)
    first_half = [source.random() for _ in range(5_000)]
    second_half = [source.random() for _ in range(5_000)]

    # Reference: one sketch that sees both halves back to back.
    single = KLL(capacity=200)
    for part in (first_half, second_half):
        single.extend(part)

    # Candidate: one sketch per half, then fold the second into the first.
    head = KLL(capacity=200)
    tail = KLL(capacity=200)
    head.extend(first_half)
    tail.extend(second_half)
    head.merge(tail)

    for q in (0.01, 0.1, 0.5, 0.9, 0.99):
        from_merge = head.quantile(q)
        from_single = single.quantile(q)
        assert math.isclose(from_merge, from_single, rel_tol=0.05, abs_tol=0.02)


@pytest.mark.parametrize(
    "sample",
    [
        [],
        [0.0],
        [0.0, 0.0, 0.0],
        [-1.5, 0.0, 1.5],
        [float(i) for i in range(10)],
    ],
)
def test_serialization_roundtrip(sample: list[float]) -> None:
    """``from_bytes(to_bytes())`` reproduces size, levels and the median.

    For the empty sample the restored sketch must raise ``ValueError`` on a
    quantile query instead of returning a value.
    """
    original = KLL(capacity=64)
    original.extend(sample)

    clone = KLL.from_bytes(original.to_bytes())
    assert clone.size() == original.size()
    assert clone._levels == original._levels

    if not original.size():
        # Nothing was inserted, so there is no median to compare.
        with pytest.raises(ValueError):
            clone.quantile(0.5)
        return

    assert clone.quantile(0.5) == pytest.approx(original.quantile(0.5))


def test_invalid_inputs_raise() -> None:
    """Non-finite values and invalid quantile queries raise ``ValueError``."""
    sketch = KLL(capacity=64)

    # Non-finite values must be rejected on insertion.
    for bad_value in (float("nan"), float("inf")):
        with pytest.raises(ValueError):
            sketch.add(bad_value)

    # -0.01 and 1.5 fall outside [0, 1]; 0.5 is a valid q, but the sketch is
    # still empty because every add above was expected to fail.
    for bad_q in (-0.01, 1.5, 0.5):
        with pytest.raises(ValueError):
            sketch.quantile(bad_q)


@pytest.mark.parametrize("q", [0.0, 0.25, 0.5, 0.75, 1.0])
@pytest.mark.parametrize("values", [[1.0], [1.0, 2.0, 3.0], [5.0] * 10])
def test_quantile_matches_truth_for_small_inputs(values: list[float], q: float) -> None:
    """Small inputs give estimates close to the exact quantile.

    The estimate must be within 1.0 of the value from ``_truth_quantile``,
    must lie between the smallest and largest input, and the rank of the exact
    quantile must fall in ``[0, len(values)]``.
    """
    sketch = KLL(capacity=64)
    sketch.extend(values)
    truth = _truth_quantile(values, q)
    estimate = sketch.quantile(q)
    assert estimate == pytest.approx(truth, abs=1.0)
    # No emptiness guard is needed here: every parametrised input is non-empty,
    # and _truth_quantile above would already have raised on an empty list.
    assert min(values) <= estimate <= max(values)
    rank_estimate = sketch.rank(truth)
    assert 0.0 <= rank_estimate <= len(values)


def test_rank_brackets_quantile() -> None:
    """The true rank interval of each estimate stays near the target rank.

    For every checkpoint the estimate is located in the sorted input with
    ``bisect``. The resulting ``[lower, upper]`` position interval must come
    within 200 positions of ``q * (n - 1)`` for the 3,000-value stream.
    """
    rng = random.Random(99)
    xs = [rng.uniform(-10, 10) for _ in range(3_000)]

    sketch = KLL(capacity=128)
    sketch.extend(xs)

    # Sort once up front: the reference ordering is the same for every q.
    ordered = sorted(xs)
    slack = 200

    for q in [0.0, 0.1, 0.5, 0.9, 1.0]:
        estimate = sketch.quantile(q)
        lower = bisect.bisect_left(ordered, estimate)
        upper = bisect.bisect_right(ordered, estimate)
        target_rank = q * (len(xs) - 1)
        assert lower <= target_rank + slack
        assert upper >= target_rank - slack
103 changes: 103 additions & 0 deletions kll_sketch/tests/test_properties.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Property-based tests exercising probabilistic guarantees of :mod:`kll_sketch`."""
from __future__ import annotations

import bisect
from typing import Sequence

import math

import pytest

hypothesis = pytest.importorskip("hypothesis")
st = hypothesis.strategies
given = hypothesis.given
settings = hypothesis.settings

from kll_sketch import KLL


def _sorted_list(seq: Sequence[float]) -> list[float]:
    """Return a new ascending list of the items in ``seq``; ``seq`` is not modified."""
    return sorted(seq)


@given(
    st.lists(
        st.floats(min_value=-1e6, max_value=1e6, allow_nan=False, allow_infinity=False),
        min_size=1,
        max_size=2_000,
    ),
    st.floats(min_value=0.0, max_value=1.0),
)
@settings(max_examples=75, deadline=None)
def test_quantile_rank_error_is_bounded(xs: list[float], q: float) -> None:
    """The rank of a quantile estimate stays within a coarse bound of its target.

    The estimate is located in the sorted input. The interval of positions it
    occupies there must come within the margin of ``q * (n - 1)``.
    """
    sketch = KLL(capacity=256)
    sketch.extend(xs)
    answer = sketch.quantile(q)

    truth = _sorted_list(xs)
    wanted = q * (len(xs) - 1)

    # Positions in the exact ordering where the answer could be inserted.
    lo = bisect.bisect_left(truth, answer)
    hi = bisect.bisect_right(truth, answer)

    # Coarse margin: 4% of the input length, and never fewer than three
    # positions so that tiny or heavily duplicated inputs are not over-constrained.
    margin = max(3.0, 0.04 * len(xs))
    assert lo <= wanted + margin
    assert hi >= wanted - margin


@given(
    st.lists(
        st.floats(min_value=-1e3, max_value=1e3, allow_nan=False, allow_infinity=False),
        min_size=0,
        max_size=1_000,
    ),
    st.lists(
        st.floats(min_value=-1e3, max_value=1e3, allow_nan=False, allow_infinity=False),
        min_size=0,
        max_size=1_000,
    ),
)
@settings(max_examples=60, deadline=None)
def test_merge_matches_extending(xs: list[float], ys: list[float]) -> None:
    """Merging two sketches tracks one sketch fed the concatenated input.

    When both generated lists are empty there is nothing to compare. In that
    case both sketches are expected to raise ``ValueError`` on a quantile
    query. Otherwise quantiles are compared at five checkpoints with
    ``math.isclose``.
    """
    combined = xs + ys

    serial = KLL(capacity=128)
    serial.extend(combined)

    a = KLL(capacity=128)
    b = KLL(capacity=128)
    a.extend(xs)
    b.extend(ys)
    a.merge(b)

    if not combined:
        # Both strategies allow empty lists, so this case is reachable; a
        # quantile query on an empty sketch is expected to raise, not return.
        for empty_sketch in (a, serial):
            with pytest.raises(ValueError):
                empty_sketch.quantile(0.5)
        return

    # NOTE(review): value-space closeness between two independently compacted
    # sketches may be fragile for widely spread inputs — confirm the tolerance.
    for q in [0.0, 0.1, 0.5, 0.9, 1.0]:
        assert math.isclose(a.quantile(q), serial.quantile(q), rel_tol=0.05, abs_tol=0.05)


@given(
    st.lists(
        st.floats(min_value=-1e4, max_value=1e4, allow_nan=False, allow_infinity=False),
        min_size=0,
        max_size=1_500,
    )
)
@settings(max_examples=60, deadline=None)
def test_serialization_roundtrip_matches_levels(xs: list[float]) -> None:
    """A byte round-trip preserves size, levels and quantile answers.

    Quantiles are only compared when the generated input is non-empty.
    """
    original = KLL(capacity=200)
    original.extend(xs)
    clone = KLL.from_bytes(original.to_bytes())

    assert clone.size() == original.size()
    assert clone._levels == original._levels

    if not xs:
        # An empty sketch has no quantiles to compare.
        return

    for q in (0.0, 0.25, 0.5, 0.75, 1.0):
        from_clone = clone.quantile(q)
        from_original = original.quantile(q)
        assert math.isclose(from_clone, from_original, rel_tol=1e-9, abs_tol=1e-9)