feat(perf): infra perf-benchmark substrate — journeys, integrity contracts, percentile ratchet (0.90.0)

drewstone · drewstone · commit a310acd1e337 · 2026-06-10T07:33:22.000-06:00
New domain-agnostic /perf subpath for infra performance benchmarking,
complementing the judge-panel BenchmarkRunner (quality) with latency /
reliability scoring over flat metric records:

- JourneySpec + expandMatrix + scenarioKey: journeys × free-form axes
  cartesian matrix with sorted-dim stable keys and a combo filter.
- checkRecordIntegrity + assertRecordIntegrity: a pass=true record must
  carry its journey's requiredFields / minimums / phaseFields; failed
  records are exempt.
- summarizeRecords + gatePerf: nearest-rank p50/p90 PerfStat baselines
  and a tolerance ratchet with improvements, missing/new scenario
  detection, and a minSamples floor; null metrics never become fake
  zeros.

Exported from the root barrel and the new ./perf subpath (tsup entry +
package.json exports). Version 0.90.0 across npm + PyPI; CHANGELOG entry
added. 25 vitest cases, each mutation-verified (7/7 mutants killed).
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,15 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
 
 ---
 
-## [0.86.0] — 2026-06-09 — fleet-rebuilt eval primitives
+## [0.90.0] — 2026-06-10 — infra perf-benchmark substrate (`/perf`)
+
+Domain-agnostic infra-performance benchmarking: a journeys × axes scenario matrix, record-integrity contracts over flat metric records, and a percentile ratchet. Complements the judge-panel `BenchmarkRunner` (root) — that one scores QUALITY via judges; `/perf` scores LATENCY / RELIABILITY. All additive — no existing export changed.
+
+### Added
+
+- **`JourneySpec` + `expandMatrix` + `scenarioKey` (`/perf` + root).** A journey is one measurable user path (`provision.cold`, `chat.ttft`) carrying its own data contract: `requiredFields` (must be non-null on a passing record), `minimums` (numeric floors, e.g. `event_count ≥ 1` for streaming), `phaseFields` (per-phase breakdown fields, likewise required non-null on a passing record and flagged as phase fields in the violation detail), and `requiresLLM` (nightly vs per-PR scheduling). `expandMatrix` does the Cartesian expansion over free-form `ScenarioAxes` (driver × region × …) with an optional `filter` predicate (return `false` to drop an invalid combo); scenario keys are `journeyId|dim=value|…` with dims sorted, so the key is stable across axes-object insertion order.
+- **`checkRecordIntegrity` + `assertRecordIntegrity` (`/perf` + root).** A record claiming `pass === true` must actually carry its journey's required measurements — a "passing" run with a null `total_ms` is an integrity violation, not a pass. Violations are either `null-required-field` (a required or phase field is null/undefined) or `below-minimum` (a value under its declared floor, or a non-numeric value where a floor is declared). Failed records are exempt (an errored run legitimately has nulls); `resolveJourney` returning null skips the record. The assert variant throws a single `Error` listing every violation.
+- **`summarizeRecords` + `gatePerf` (`/perf` + root).** Percentile ratchet: fold flat records into per-scenario `PerfStat` (`p50` / `p90` / `n`, nearest-rank on sorted values), then gate a current `PerfBaseline` against a committed one. Null / non-numeric metric values are excluded from `n` and a zero-sample field is omitted — no fake zeros. Regressions trip when p50 OR p90 exceed `tolerancePct` (default 10) over baseline; strict improvements are reported with negative `overBy`; scenarios under `minSamples` (default 3) in current are surfaced in `missingScenarios` and never gated; baseline/current key drift lands in `missingScenarios` / `newScenarios`.
 
 One clean, canonical version of five generic patterns the fleet kept hand-rolling across 2–4 product agents each. All additive — no existing export changed.
 
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.89.0"
+version = "0.90.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
@@ -58,7 +58,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.89.0"
+    __version__ = "0.90.0"
 
 __all__ = [
     "Client",
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.89.0",
+  "version": "0.90.0",
   "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
@@ -109,6 +109,11 @@
       "import": "./dist/matrix/index.js",
       "default": "./dist/matrix/index.js"
     },
+    "./perf": {
+      "types": "./dist/perf/index.d.ts",
+      "import": "./dist/perf/index.js",
+      "default": "./dist/perf/index.js"
+    },
     "./multishot": {
       "types": "./dist/multishot/index.d.ts",
       "import": "./dist/multishot/index.js",
diff --git a/src/index.ts b/src/index.ts
@@ -1333,6 +1333,30 @@ export type {
   AttestedReport,
 } from './attestation'
 export { ATTESTATION_ALGORITHM, attest, verifyAttestation } from './attestation'
+// ── Perf — infra-performance benchmarking substrate ──────────────────
+// Journeys × axes scenario matrix, record-integrity contracts, and the
+// percentile ratchet (summarize → baseline → gate). Scores LATENCY /
+// RELIABILITY over flat metric records; the judge-panel BenchmarkRunner
+// (./benchmark) scores QUALITY. Also on the `/perf` subpath.
+export type {
+  IntegrityResult,
+  IntegrityViolation,
+  JourneySpec,
+  PerfBaseline,
+  PerfGateResult,
+  PerfRegression,
+  PerfScenario,
+  PerfStat,
+  ScenarioAxes,
+} from './perf'
+export {
+  assertRecordIntegrity,
+  checkRecordIntegrity,
+  expandMatrix,
+  gatePerf,
+  scenarioKey,
+  summarizeRecords,
+} from './perf'
 // ── Anytime-valid sequential testing (e-process core) ────────────────
 // The betting test-martingale behind the sequential gates. Gate-level
 // machinery (sequentialPairedGate, sequentialDecide) lives on the /campaign
diff --git a/src/perf/index.ts b/src/perf/index.ts
@@ -0,0 +1,17 @@
+/**
+ * @tangle-network/agent-eval/perf
+ *
+ * Domain-agnostic infra-performance benchmarking substrate: a journeys ×
+ * axes scenario matrix, record-integrity contracts over flat metric
+ * records, and a percentile ratchet (summarize → baseline → gate).
+ *
+ * Complements the judge-panel `BenchmarkRunner` (root): that one scores
+ * QUALITY; this one scores LATENCY / RELIABILITY over flat metric records.
+ */
+
+export type { IntegrityResult, IntegrityViolation } from './integrity'
+export { assertRecordIntegrity, checkRecordIntegrity } from './integrity'
+export type { JourneySpec, PerfScenario, ScenarioAxes } from './journey'
+export { expandMatrix, scenarioKey } from './journey'
+export type { PerfBaseline, PerfGateResult, PerfRegression, PerfStat } from './ratchet'
+export { gatePerf, summarizeRecords } from './ratchet'
diff --git a/src/perf/integrity.ts b/src/perf/integrity.ts
@@ -0,0 +1,106 @@
+/**
+ * Record-integrity contracts for perf metric records.
+ *
+ * A record that claims `pass === true` must actually carry the journey's
+ * required measurements — a "passing" provision run with a null
+ * `total_ms` is a lying record, not a pass. Failed records are exempt:
+ * a run that errored mid-flight legitimately has nulls.
+ */
+
+import type { JourneySpec } from './journey'
+
+export interface IntegrityViolation {
+  recordIndex: number
+  journeyId: string
+  field: string
+  reason: 'null-required-field' | 'below-minimum'
+  detail: string
+}
+
+export interface IntegrityResult {
+  succeeded: boolean
+  violations: IntegrityViolation[]
+}
+
+function isMissing(value: unknown): boolean {
+  return value === null || value === undefined
+}
+
+/**
+ * Validates flat metric records (Record<string, unknown> with a boolean
+ * `pass` field) against their journey contract. Only records with
+ * pass === true are checked — a failed record may legitimately have nulls.
+ * resolveJourney maps a record to its JourneySpec (or null to skip).
+ */
+export function checkRecordIntegrity(
+  records: ReadonlyArray<Record<string, unknown>>,
+  resolveJourney: (record: Record<string, unknown>) => JourneySpec | null,
+): IntegrityResult {
+  const violations: IntegrityViolation[] = []
+  for (const [recordIndex, record] of records.entries()) {
+    if (record.pass !== true) continue
+    const journey = resolveJourney(record)
+    if (journey === null) continue
+    for (const field of journey.requiredFields) {
+      if (isMissing(record[field])) {
+        violations.push({
+          recordIndex,
+          journeyId: journey.id,
+          field,
+          reason: 'null-required-field',
+          detail: `required field '${field}' is ${record[field] === null ? 'null' : 'undefined'} on a passing '${journey.id}' record`,
+        })
+      }
+    }
+    for (const field of journey.phaseFields ?? []) {
+      if (isMissing(record[field])) {
+        violations.push({
+          recordIndex,
+          journeyId: journey.id,
+          field,
+          reason: 'null-required-field',
+          detail: `phase field '${field}' is ${record[field] === null ? 'null' : 'undefined'} on a passing '${journey.id}' record`,
+        })
+      }
+    }
+    for (const { field, min } of journey.minimums ?? []) {
+      const value = record[field]
+      if (isMissing(value)) continue // null-ness is the required/phase fields' contract
+      if (typeof value !== 'number' || Number.isNaN(value)) {
+        violations.push({
+          recordIndex,
+          journeyId: journey.id,
+          field,
+          reason: 'below-minimum',
+          detail: `field '${field}' has non-numeric value ${JSON.stringify(value)} on a passing '${journey.id}' record (minimum ${min})`,
+        })
+        continue
+      }
+      if (value < min) {
+        violations.push({
+          recordIndex,
+          journeyId: journey.id,
+          field,
+          reason: 'below-minimum',
+          detail: `field '${field}' is ${value}, below minimum ${min} on a passing '${journey.id}' record`,
+        })
+      }
+    }
+  }
+  return { succeeded: violations.length === 0, violations }
+}
+
+/** Throws an Error listing every violation when the result fails. */
+export function assertRecordIntegrity(
+  records: ReadonlyArray<Record<string, unknown>>,
+  resolveJourney: (record: Record<string, unknown>) => JourneySpec | null,
+): void {
+  const result = checkRecordIntegrity(records, resolveJourney)
+  if (result.succeeded) return
+  const lines = result.violations.map(
+    (v) => `  [record ${v.recordIndex}] ${v.journeyId}.${v.field} (${v.reason}): ${v.detail}`,
+  )
+  throw new Error(
+    `Record integrity check failed with ${result.violations.length} violation(s):\n${lines.join('\n')}`,
+  )
+}
diff --git a/src/perf/journey.ts b/src/perf/journey.ts
@@ -0,0 +1,76 @@
+/**
+ * Journey × axes matrix for infra performance benchmarks.
+ *
+ * A journey is one measurable user path ("provision.cold", "chat.ttft");
+ * axes are free-form scenario dimensions (driver, region, image…). The
+ * matrix expansion is pure bookkeeping — running the scenarios and
+ * recording metrics is the caller's job. This module complements the
+ * judge-panel `BenchmarkRunner` (src/benchmark.ts): that one scores
+ * QUALITY via judges, this one structures LATENCY / RELIABILITY runs
+ * over flat metric records.
+ */
+
+/** One measurable user journey (e.g. "provision.cold", "chat.ttft"). */
+export interface JourneySpec {
+  id: string
+  description: string
+  /** Needs a real LLM call — schedule nightly, not per-PR. */
+  requiresLLM: boolean
+  /**
+   * Fields that MUST be non-null on a passing record of this journey.
+   * A "passing" record missing one is an integrity violation, not a pass.
+   */
+  requiredFields: ReadonlyArray<string>
+  /** Numeric floors, e.g. {field: 'event_count', min: 1} for streaming. */
+  minimums?: ReadonlyArray<{ field: string; min: number }>
+  /** Per-phase breakdown fields. Checked like requiredFields (must be non-null on a passing record); violations are labelled as phase fields in their detail. */
+  phaseFields?: ReadonlyArray<string>
+}
+
+export interface ScenarioAxes {
+  /** e.g. driver: ['docker','firecracker'] — every key is a free-form dimension. */
+  [dimension: string]: ReadonlyArray<string>
+}
+
+export interface PerfScenario {
+  /** `${journeyId}|${dim1}=${v1}|${dim2}=${v2}` (dims sorted). */
+  key: string
+  journey: JourneySpec
+  axes: Record<string, string>
+}
+
+/** Stable scenario key: journey id then `dim=value` pairs in sorted-dim order. */
+export function scenarioKey(journeyId: string, axes: Record<string, string>): string {
+  const parts = Object.keys(axes)
+    .sort()
+    .map((dim) => `${dim}=${axes[dim]}`)
+  return [journeyId, ...parts].join('|')
+}
+
+/** Cartesian expansion of journeys × axes; a combo is dropped when the optional `filter` returns false (e.g. firecracker×resume). */
+export function expandMatrix(
+  journeys: ReadonlyArray<JourneySpec>,
+  axes: ScenarioAxes,
+  filter?: (journeyId: string, combo: Record<string, string>) => boolean,
+): PerfScenario[] {
+  const dims = Object.keys(axes).sort()
+  let combos: Record<string, string>[] = [{}]
+  for (const dim of dims) {
+    const values = axes[dim] as ReadonlyArray<string>
+    const next: Record<string, string>[] = []
+    for (const combo of combos) {
+      for (const value of values) {
+        next.push({ ...combo, [dim]: value })
+      }
+    }
+    combos = next
+  }
+  const scenarios: PerfScenario[] = []
+  for (const journey of journeys) {
+    for (const combo of combos) {
+      if (filter && !filter(journey.id, combo)) continue
+      scenarios.push({ key: scenarioKey(journey.id, combo), journey, axes: combo })
+    }
+  }
+  return scenarios
+}
diff --git a/src/perf/ratchet.ts b/src/perf/ratchet.ts
diff --git a/tests/perf.test.ts b/tests/perf.test.ts
diff --git a/tsup.config.ts b/tsup.config.ts