Skip to content

Commit 70c8436

Browse files
test: expand suite with property checks and coverage config
1 parent dbb31b5 commit 70c8436

File tree

3 files changed

+281
-13
lines changed

3 files changed

+281
-13
lines changed

kll_sketch/pyproject.toml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,25 @@ classifiers = [
2424
[project.urls]
2525
Homepage = "https://github.com/yourname/kll_sketch"
2626
Repository = "https://github.com/yourname/kll_sketch"
27+
28+
[project.optional-dependencies]
29+
test = [
30+
"pytest>=7.4",
31+
"hypothesis>=6.88",
32+
"pytest-cov>=4.1",
33+
]
34+
35+
[tool.pytest.ini_options]
36+
addopts = "--strict-config --strict-markers --cov=kll_sketch --cov-report=term-missing"
37+
testpaths = ["kll_sketch/tests"]
38+
filterwarnings = [
39+
"error",
40+
]
41+
42+
[tool.coverage.run]
43+
branch = true
44+
source = ["kll_sketch"]
45+
46+
[tool.coverage.report]
47+
precision = 2
48+
show_missing = true

kll_sketch/tests/test_kll.py

Lines changed: 160 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,166 @@
1+
"""Deterministic regression tests for :mod:`kll_sketch`."""
2+
from __future__ import annotations
3+
4+
import bisect
5+
import math
16
import random
7+
from typing import Iterable
8+
9+
import pytest
10+
211
from kll_sketch import KLL
312

4-
def test_basic_quantiles():
13+
14+
def _truth_quantile(xs: Iterable[float], q: float) -> float:
15+
ordered = sorted(xs)
16+
if not ordered:
17+
raise ValueError("empty iterable")
18+
idx = int(q * (len(ordered) - 1))
19+
return ordered[idx]
20+
21+
22+
def test_basic_quantiles_regression() -> None:
23+
"""The sketch stays within a tight absolute error on a pseudo-random stream."""
524
rng = random.Random(1)
6-
xs = [rng.random() for _ in range(50_000)]
7-
srt = sorted(xs)
8-
sk = KLL(capacity=200); sk.extend(xs)
9-
for q in [0.01,0.1,0.25,0.5,0.75,0.9,0.99]:
10-
est = sk.quantile(q); tru = srt[int(q*(len(xs)-1))]
11-
assert abs(est - tru) <= 0.02
25+
xs = [rng.random() for _ in range(20_000)]
26+
truth = sorted(xs)
27+
28+
sketch = KLL(capacity=256)
29+
sketch.extend(xs)
30+
31+
# Deterministic checkpoints chosen to exercise both tails and the median.
32+
for q in [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]:
33+
estimate = sketch.quantile(q)
34+
reference = truth[int(q * (len(xs) - 1))]
35+
assert abs(estimate - reference) <= 0.015
36+
37+
38+
@pytest.mark.parametrize("capacity", [64, 128, 256])
39+
def test_quantile_rank_consistency(capacity: int) -> None:
40+
"""Quantile answers are compatible with the public ``rank`` helper."""
41+
rng = random.Random(capacity)
42+
xs = [rng.uniform(-5.0, 5.0) for _ in range(2_000)]
43+
44+
sketch = KLL(capacity=capacity)
45+
sketch.extend(xs)
46+
47+
for q in [0.0, 0.25, 0.5, 0.75, 1.0]:
48+
estimate = sketch.quantile(q)
49+
approx_rank = sketch.rank(estimate)
50+
target_rank = q * sketch.size()
51+
# rank() returns a value in [0, n]; allow a small tolerance in samples.
52+
assert math.isclose(approx_rank, target_rank, rel_tol=0.05, abs_tol=2.0)
53+
54+
55+
def test_rank_and_cdf_are_monotone() -> None:
56+
rng = random.Random(7)
57+
xs = sorted(rng.uniform(-1.0, 1.0) for _ in range(500))
58+
59+
sketch = KLL(capacity=200)
60+
sketch.extend(xs)
61+
62+
last_rank = -1.0
63+
for x in xs:
64+
rank = sketch.rank(x)
65+
assert rank >= last_rank
66+
last_rank = rank
67+
68+
cdf_values = sketch.cdf(xs)
69+
assert all(0.0 <= value <= 1.0 for value in cdf_values)
70+
assert cdf_values == sorted(cdf_values)
1271

13-
def test_weight_conservation():
72+
73+
def test_weight_conservation() -> None:
1474
rng = random.Random(0)
15-
sk = KLL(capacity=64, rng_seed=777)
16-
xs = [rng.random() for _ in range(200_000)]
17-
sk.extend(xs)
18-
vals, wts = sk._materialize_aligned()
19-
assert abs(sum(wts) - sk.size()) < 1e-9
75+
sketch = KLL(capacity=64, rng_seed=777)
76+
xs = [rng.random() for _ in range(50_000)]
77+
sketch.extend(xs)
78+
vals, wts = sketch._materialize_aligned()
79+
assert len(vals) == len(wts)
80+
assert abs(sum(wts) - sketch.size()) < 1e-9
81+
82+
83+
def test_merge_matches_single_stream() -> None:
84+
rng = random.Random(321)
85+
left = [rng.random() for _ in range(5_000)]
86+
right = [rng.random() for _ in range(5_000)]
87+
88+
merged = KLL(capacity=200)
89+
merged.extend(left)
90+
merged.extend(right)
91+
92+
a = KLL(capacity=200)
93+
b = KLL(capacity=200)
94+
a.extend(left)
95+
b.extend(right)
96+
a.merge(b)
97+
98+
checkpoints = [0.01, 0.1, 0.5, 0.9, 0.99]
99+
for q in checkpoints:
100+
assert math.isclose(a.quantile(q), merged.quantile(q), rel_tol=0.02, abs_tol=0.01)
101+
102+
103+
@pytest.mark.parametrize(
104+
"sample",
105+
[
106+
[],
107+
[0.0],
108+
[0.0, 0.0, 0.0],
109+
[-1.5, 0.0, 1.5],
110+
[float(i) for i in range(10)],
111+
],
112+
)
113+
def test_serialization_roundtrip(sample: list[float]) -> None:
114+
sketch = KLL(capacity=64)
115+
sketch.extend(sample)
116+
117+
restored = KLL.from_bytes(sketch.to_bytes())
118+
assert restored.size() == sketch.size()
119+
assert restored._levels == sketch._levels
120+
if sketch.size():
121+
assert restored.quantile(0.5) == pytest.approx(sketch.quantile(0.5))
122+
else:
123+
with pytest.raises(ValueError):
124+
restored.quantile(0.5)
125+
126+
127+
def test_invalid_inputs_raise() -> None:
128+
sketch = KLL(capacity=64)
129+
with pytest.raises(ValueError):
130+
sketch.add(float("nan"))
131+
with pytest.raises(ValueError):
132+
sketch.add(float("inf"))
133+
with pytest.raises(ValueError):
134+
sketch.quantile(-0.01)
135+
with pytest.raises(ValueError):
136+
sketch.quantile(1.5)
137+
with pytest.raises(ValueError):
138+
sketch.quantile(0.5) # empty sketch
139+
140+
141+
@pytest.mark.parametrize("q", [0.0, 0.25, 0.5, 0.75, 1.0])
142+
@pytest.mark.parametrize("values", [[1.0], [1.0, 2.0, 3.0], [5.0] * 10])
143+
def test_quantile_matches_truth_for_small_inputs(values: list[float], q: float) -> None:
144+
sketch = KLL(capacity=64)
145+
sketch.extend(values)
146+
truth = _truth_quantile(values, q)
147+
assert sketch.quantile(q) == pytest.approx(truth)
148+
rank_estimate = sketch.rank(truth)
149+
assert 0.0 <= rank_estimate <= len(values)
150+
151+
152+
def test_rank_brackets_quantile() -> None:
153+
rng = random.Random(99)
154+
xs = [rng.uniform(-10, 10) for _ in range(3_000)]
155+
156+
sketch = KLL(capacity=128)
157+
sketch.extend(xs)
158+
159+
for q in [0.0, 0.1, 0.5, 0.9, 1.0]:
160+
estimate = sketch.quantile(q)
161+
ordered = sorted(xs)
162+
lower = bisect.bisect_left(ordered, estimate)
163+
upper = bisect.bisect_right(ordered, estimate)
164+
target_rank = q * (len(xs) - 1)
165+
assert lower <= target_rank + 200
166+
assert upper >= target_rank - 200
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
"""Property-based tests exercising probabilistic guarantees of :mod:`kll_sketch`."""
2+
from __future__ import annotations
3+
4+
import bisect
5+
from typing import Sequence
6+
7+
import math
8+
9+
import hypothesis.strategies as st
10+
from hypothesis import given, settings
11+
12+
from kll_sketch import KLL
13+
14+
15+
def _sorted_list(seq: Sequence[float]) -> list[float]:
16+
ordered = list(seq)
17+
ordered.sort()
18+
return ordered
19+
20+
21+
@given(
22+
st.lists(
23+
st.floats(min_value=-1e6, max_value=1e6, allow_nan=False, allow_infinity=False),
24+
min_size=1,
25+
max_size=2_000,
26+
),
27+
st.floats(min_value=0.0, max_value=1.0),
28+
)
29+
@settings(max_examples=75, deadline=None)
30+
def test_quantile_rank_error_is_bounded(xs: list[float], q: float) -> None:
31+
sketch = KLL(capacity=256)
32+
sketch.extend(xs)
33+
34+
estimate = sketch.quantile(q)
35+
ordered = _sorted_list(xs)
36+
target_rank = q * (len(xs) - 1)
37+
38+
# Compute the realised rank interval for the estimate in the truth data.
39+
left = bisect.bisect_left(ordered, estimate)
40+
right = bisect.bisect_right(ordered, estimate)
41+
42+
# Allow a tolerance proportional to 1/k (here ~0.004) plus a small constant
43+
# for discrete datasets. The assert keeps the property coarse but useful.
44+
slack = max(3.0, 0.04 * len(xs))
45+
assert left <= target_rank + slack
46+
assert right >= target_rank - slack
47+
48+
49+
@given(
50+
st.lists(
51+
st.floats(min_value=-1e3, max_value=1e3, allow_nan=False, allow_infinity=False),
52+
min_size=0,
53+
max_size=1_000,
54+
),
55+
st.lists(
56+
st.floats(min_value=-1e3, max_value=1e3, allow_nan=False, allow_infinity=False),
57+
min_size=0,
58+
max_size=1_000,
59+
),
60+
)
61+
@settings(max_examples=60, deadline=None)
62+
def test_merge_matches_extending(xs: list[float], ys: list[float]) -> None:
63+
combined = xs + ys
64+
65+
serial = KLL(capacity=128)
66+
serial.extend(combined)
67+
68+
a = KLL(capacity=128)
69+
b = KLL(capacity=128)
70+
a.extend(xs)
71+
b.extend(ys)
72+
a.merge(b)
73+
74+
for q in [0.0, 0.1, 0.5, 0.9, 1.0]:
75+
assert math.isclose(a.quantile(q), serial.quantile(q), rel_tol=0.05, abs_tol=0.05)
76+
77+
78+
@given(
79+
st.lists(
80+
st.floats(min_value=-1e4, max_value=1e4, allow_nan=False, allow_infinity=False),
81+
min_size=0,
82+
max_size=1_500,
83+
)
84+
)
85+
@settings(max_examples=60, deadline=None)
86+
def test_serialization_roundtrip_matches_levels(xs: list[float]) -> None:
87+
sketch = KLL(capacity=200)
88+
sketch.extend(xs)
89+
payload = sketch.to_bytes()
90+
restored = KLL.from_bytes(payload)
91+
92+
assert restored.size() == sketch.size()
93+
assert restored._levels == sketch._levels
94+
95+
if xs:
96+
for q in [0.0, 0.25, 0.5, 0.75, 1.0]:
97+
restored_q = restored.quantile(q)
98+
sketch_q = sketch.quantile(q)
99+
assert math.isclose(restored_q, sketch_q, rel_tol=1e-9, abs_tol=1e-9)

0 commit comments

Comments
 (0)