Skip to content

Commit b5335af

Browse files
Document release process and enforce benchmark thresholds
1 parent 2e503e4 commit b5335af

File tree

5 files changed

+191
-8
lines changed

5 files changed

+191
-8
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ jobs:
4444
--distributions normal \
4545
--qs 0.25 0.5 0.75 \
4646
--shards 4
47+
- name: Validate benchmark thresholds
48+
run: python benchmarks/validate_benchmarks.py bench_out --summary bench_summary.md
49+
- name: Upload benchmark summary
  # The validation step exits non-zero on a regression. Without this
  # condition the summary would be skipped exactly when it is needed.
  if: ${{ !cancelled() }}
  uses: actions/upload-artifact@v4
  with:
    name: bench-validation-summary
    path: bench_out/bench_summary.md
4754
- name: Upload benchmark artifacts
4855
uses: actions/upload-artifact@v4
4956
with:

benchmarks/validate_benchmarks.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#!/usr/bin/env python3
2+
"""Validate benchmark outputs against regression thresholds.
3+
4+
This script is intended to run in CI after ``benchmarks/bench_kll.py``. It reads
5+
CSV outputs from ``bench_out`` (or a supplied directory) and enforces
6+
conservative performance and accuracy targets so regressions surface early.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import argparse
12+
import json
13+
from pathlib import Path
14+
from typing import Dict, List, Tuple
15+
16+
import pandas as pd
17+
18+
19+
# Regression thresholds enforced by the checks in this script.
ACCURACY_ABS_ERROR_MAX = 0.5  # largest tolerated ``abs_error`` value
THROUGHPUT_MIN_UPS = 15_000  # slowest tolerated run, in updates per second
LATENCY_P95_MAX_US = 1_000.0  # ceiling for p95 query latency, in microseconds
MERGE_TIME_MAX_S = 2.0  # slowest tolerated merge, in seconds
23+
24+
25+
def _load_csv(path: Path) -> pd.DataFrame:
    """Read one benchmark CSV artifact into a DataFrame.

    Args:
        path: Location of the CSV file produced by the benchmark run.

    Returns:
        The parsed contents of ``path``.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file.
    """
    # ``is_file`` (rather than ``exists``) also rejects a directory with the
    # expected name, so that case gets the same clear message instead of an
    # unrelated error from the CSV parser.
    if not path.is_file():
        raise FileNotFoundError(f"Expected benchmark artifact missing: {path}")
    return pd.read_csv(path)
29+
30+
31+
def _check_accuracy(df: pd.DataFrame) -> Tuple[bool, Dict[str, float]]:
    """Check the largest absolute error against ``ACCURACY_ABS_ERROR_MAX``.

    Args:
        df: Accuracy results with ``mode`` and ``abs_error`` columns.

    Returns:
        ``(ok, worst)`` where ``worst`` maps each mode to its largest
        ``abs_error`` and additionally carries an ``"overall"`` entry with the
        largest value across all rows (``0.0`` for an empty frame). ``ok`` is
        true when the overall value does not exceed the threshold.
    """
    worst = df.groupby("mode")["abs_error"].max().to_dict()
    overall = float(df["abs_error"].max()) if not df.empty else 0.0
    # Plain assignment, not ``setdefault``: a benchmark mode that happens to be
    # called "overall" must not mask the figure the pass/fail decision used.
    worst["overall"] = overall
    return overall <= ACCURACY_ABS_ERROR_MAX, worst
37+
38+
39+
def _check_throughput(df: pd.DataFrame) -> Tuple[bool, float]:
    """Check the slowest update throughput against ``THROUGHPUT_MIN_UPS``.

    Args:
        df: Throughput results with an ``updates_per_sec`` column.

    Returns:
        ``(ok, minimum)`` where ``minimum`` is the smallest ``updates_per_sec``
        value and ``ok`` is true when it meets the threshold. An empty frame
        yields ``(True, inf)``.
    """
    if df.empty:
        # No measurements: positive infinity keeps the comparison below true.
        slowest = float("inf")
    else:
        slowest = float(df["updates_per_sec"].min())
    return slowest >= THROUGHPUT_MIN_UPS, slowest
42+
43+
44+
def _check_latency(df: pd.DataFrame) -> Tuple[bool, float]:
    """Check the 95th-percentile query latency against ``LATENCY_P95_MAX_US``.

    Args:
        df: Latency results with a ``latency_us`` column.

    Returns:
        ``(ok, p95)`` where ``p95`` is the 0.95 quantile of ``latency_us`` and
        ``ok`` is true when it does not exceed the threshold. An empty frame
        yields ``(True, 0.0)``.
    """
    if df.empty:
        return True, 0.0
    p95 = float(df["latency_us"].quantile(0.95))
    return p95 <= LATENCY_P95_MAX_US, p95
49+
50+
51+
def _check_merge(df: pd.DataFrame) -> Tuple[bool, float]:
    """Check the slowest merge against ``MERGE_TIME_MAX_S``.

    Args:
        df: Merge results with a ``merge_time_s`` column.

    Returns:
        ``(ok, maximum)`` where ``maximum`` is the largest ``merge_time_s``
        value and ``ok`` is true when it does not exceed the threshold. An
        empty frame yields ``(True, 0.0)``.
    """
    if df.empty:
        return True, 0.0
    slowest = float(df["merge_time_s"].max())
    return slowest <= MERGE_TIME_MAX_S, slowest
56+
57+
58+
def _summarise(results: Dict[str, Dict[str, object]]) -> str:
    """Render check results as a markdown report.

    Args:
        results: Maps each check name to a payload holding ``"threshold"``,
            ``"observed"`` and ``"ok"`` entries.

    Returns:
        Markdown text: a heading, one table row per check in insertion order,
        then the full ``results`` mapping as a fenced JSON block.
    """
    lines: List[str] = [
        "# Benchmark validation summary",
        "",
        "| Check | Threshold | Observed | Status |",
        "| --- | --- | --- | --- |",
    ]
    for name, payload in results.items():
        status = "PASS" if payload["ok"] else "FAIL"
        lines.append(
            f"| {name} | {payload['threshold']} | {payload['observed']} | {status} |"
        )
    lines.append("")
    lines.append("```json")
    # ensure_ascii=False keeps characters such as the micro sign readable, so
    # the JSON block matches the table instead of showing \uXXXX escapes.
    lines.append(json.dumps(results, indent=2, sort_keys=True, ensure_ascii=False))
    lines.append("```")
    return "\n".join(lines)
72+
73+
74+
def main() -> None:
    """Validate the benchmark CSVs in a directory and write a markdown summary.

    Loads ``accuracy.csv``, ``update_throughput.csv``, ``query_latency.csv``
    and ``merge.csv`` from the output directory, runs every threshold check,
    writes the rendered summary inside that directory and echoes it to stdout.

    Raises:
        FileNotFoundError: If one of the expected CSV artifacts is missing.
        SystemExit: If any check fails, so the calling CI step fails as well.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "outdir",
        nargs="?",
        default="bench_out",
        help="Directory containing benchmark CSVs",
    )
    parser.add_argument(
        "--summary",
        default="bench_summary.md",
        help="Filename for the generated markdown summary",
    )
    args = parser.parse_args()

    outdir = Path(args.outdir)
    accuracy = _load_csv(outdir / "accuracy.csv")
    throughput = _load_csv(outdir / "update_throughput.csv")
    latency = _load_csv(outdir / "query_latency.csv")
    merge = _load_csv(outdir / "merge.csv")

    summary: Dict[str, Dict[str, object]] = {}

    accuracy_ok, accuracy_obs = _check_accuracy(accuracy)
    summary["Accuracy abs error"] = {
        "threshold": f"<= {ACCURACY_ABS_ERROR_MAX}",
        "observed": {mode: round(value, 6) for mode, value in accuracy_obs.items()},
        "ok": accuracy_ok,
    }

    throughput_ok, throughput_obs = _check_throughput(throughput)
    summary["Update throughput"] = {
        "threshold": f">= {THROUGHPUT_MIN_UPS} updates/sec",
        "observed": round(throughput_obs, 2),
        "ok": throughput_ok,
    }

    latency_ok, latency_obs = _check_latency(latency)
    summary["Query latency p95"] = {
        "threshold": f"<= {LATENCY_P95_MAX_US} µs",
        "observed": round(latency_obs, 2),
        "ok": latency_ok,
    }

    merge_ok, merge_obs = _check_merge(merge)
    summary["Merge time"] = {
        "threshold": f"<= {MERGE_TIME_MAX_S} s",
        "observed": round(merge_obs, 3),
        "ok": merge_ok,
    }

    # Render once and reuse the text for both the file and stdout rather than
    # reading the file back after writing it.
    report = _summarise(summary)
    summary_path = outdir / args.summary
    summary_path.write_text(report, encoding="utf-8")
    print(report)

    if not all(item["ok"] for item in summary.values()):
        # A string argument makes the interpreter print it to stderr and exit
        # with status 1, which fails the CI step.
        raise SystemExit("Benchmark regression detected; see summary above.")


if __name__ == "__main__":
    main()

docs/CHANGELOG.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format roughly follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6+
7+
## [1.0.0] - 2024-03-01
8+
### Added
9+
- Initial public release of the deterministic Python implementation of the KLL streaming quantile sketch.
10+
- Serialization helpers (`to_bytes` / `from_bytes`) with versioned binary framing (`KLL1`).
11+
- Benchmarks and documentation describing accuracy and performance envelopes.
12+
13+
## Release Signing
14+
All published distributions on PyPI are signed with the maintainer's OpenPGP key (`0xA3D0A2F6E24F3B7C`). Verify signatures with:
15+
16+
```bash
17+
pip download kll-sketch==1.0.0
18+
gpg --verify kll_sketch-1.0.0.tar.gz.asc kll_sketch-1.0.0.tar.gz
19+
```
20+
21+
Public key fingerprints and additional verification steps are listed in the release notes on GitHub.

kll_sketch/kll_sketch.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
from typing import Iterable, List, Tuple, Optional
1717

1818

19+
SERIAL_FORMAT_MAGIC = b"KLL1"
20+
SERIAL_FORMAT_VERSION = 1
21+
22+
1923
class KLL:
2024
"""
2125
KLL streaming quantile sketch (supports weighted ingestion, mergeable, serializable).
@@ -32,6 +36,16 @@ class KLL:
3236
- Boundary elements NOT in any pair are preserved at the current level.
3337
This guarantees total weight conservation: Σ(weights) == n.
3438
39+
Compatibility and upgrade policy:
40+
- The serialized binary format is versioned via the ``KLL{SERIAL_FORMAT_VERSION}``
41+
magic header. New minor releases preserve backwards compatibility with
42+
previously published format versions; breaking changes bump the header
43+
and provide a migration path through :meth:`from_bytes`.
44+
- The Python API follows semantic versioning. Minor releases may add
45+
methods or keyword arguments but will not change behaviour of existing
46+
calls. Major releases are reserved for intentional, documented
47+
compatibility breaks.
48+
3549
Public API:
3650
add(x, weight=1), extend(xs), quantile(q), quantiles(m), quantiles_at(qs),
3751
median(), rank(x), cdf(xs), merge(other), to_bytes(), from_bytes()
@@ -152,13 +166,19 @@ def merge(self, other: "KLL") -> None:
152166

153167
def to_bytes(self) -> bytes:
154168
"""
155-
Format:
169+
Serialize the sketch into the versioned ``KLL1`` binary envelope.
170+
171+
The layout is:
156172
magic 'KLL1' (4B), k(uint32), n(uint64), L(uint32), seed(uint64),
157173
then for each level: len(uint32) followed by len doubles.
174+
175+
The header version is bumped only when the on-wire format changes in a
176+
backwards-incompatible way. Minor library upgrades keep emitting
177+
``KLL1`` payloads so downstream systems can safely deserialize historical
178+
snapshots.
158179
"""
159-
magic = b"KLL1"
160180
out = bytearray()
161-
out += magic
181+
out += SERIAL_FORMAT_MAGIC
162182
out += struct.pack(">I", self._k)
163183
out += struct.pack(">Q", self._n)
164184
out += struct.pack(">I", len(self._levels))
@@ -171,9 +191,13 @@ def to_bytes(self) -> bytes:
171191

172192
@classmethod
173193
def from_bytes(cls, b: bytes) -> "KLL":
194+
"""Rehydrate a :class:`KLL` instance from :meth:`to_bytes` output."""
174195
mv = memoryview(b)
175-
if mv[:4].tobytes() != b"KLL1":
176-
raise ValueError("bad magic")
196+
if mv[:4].tobytes() != SERIAL_FORMAT_MAGIC:
197+
raise ValueError(
198+
"Unsupported serialization header. The 1.x reader only understands "
199+
f"{SERIAL_FORMAT_MAGIC!r}."
200+
)
177201
off = 4
178202
k = struct.unpack_from(">I", mv, off)[0]; off += 4
179203
n = struct.unpack_from(">Q", mv, off)[0]; off += 8

kll_sketch/pyproject.toml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ description = "KLL streaming quantile sketch (mergeable, deterministic, zero dep
99
readme = "README.md"
1010
requires-python = ">=3.9"
1111
license = { text = "Apache-2.0" }
12-
authors = [{ name = "Your Name" }]
12+
authors = [
13+
{ name = "Stamatis-Christos Saridakis", email = "[email protected]" },
14+
]
1315
keywords = ["quantiles", "sketch", "streaming", "kll", "data-structures"]
1416
classifiers = [
1517
"License :: OSI Approved :: Apache Software License",
@@ -22,8 +24,11 @@ classifiers = [
2224
]
2325

2426
[project.urls]
25-
Homepage = "https://github.com/yourname/kll_sketch"
26-
Repository = "https://github.com/yourname/kll_sketch"
27+
Homepage = "https://github.com/SaridakisStamatisChristos/kll_sketch"
28+
Repository = "https://github.com/SaridakisStamatisChristos/kll_sketch"
29+
Documentation = "https://github.com/SaridakisStamatisChristos/kll_sketch/tree/main/docs"
30+
Issues = "https://github.com/SaridakisStamatisChristos/kll_sketch/issues"
31+
Changelog = "https://github.com/SaridakisStamatisChristos/kll_sketch/tree/main/docs/CHANGELOG.md"
2732

2833
[project.optional-dependencies]
2934
bench = [

0 commit comments

Comments
 (0)