tangle-network
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/package.json b/package.json
Lines changed: 1 addition & 1 deletion
diff --git a/src/campaign/types.ts b/src/campaign/types.ts
Lines changed: 20 additions & 0 deletions
diff --git a/src/completion-verifier.test.ts b/src/completion-verifier.test.ts
Lines changed: 48 additions & 0 deletions
diff --git a/src/completion-verifier.ts b/src/completion-verifier.ts
Lines changed: 35 additions & 8 deletions
diff --git a/src/contract/analyze-runs.ts b/src/contract/analyze-runs.ts
Lines changed: 4 additions & 4 deletions
diff --git a/src/contract/index.ts b/src/contract/index.ts
Lines changed: 8 additions & 0 deletions
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.86.0"
+version = "0.87.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"
 
@@ -58,7 +58,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.86.0"
+    __version__ = "0.87.0"
 
 __all__ = [
     "Client",
 
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.86.0",
+  "version": "0.87.0",
   "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
 
@@ -106,10 +106,30 @@ export interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
   appliesTo?: (scenario: TScenario) => boolean
 }
 
+/** The canonical judge verdict shape — one declaration, shared by campaign
+ *  judges and the multishot judge runner (which re-exports this type).
+ *
+ *  Scale is PRODUCER-DEFINED: campaign convention is [0,1]; the legacy
+ *  multishot runner emits 0-10. Cross-scale comparison must go through
+ *  `detectScale` (src/campaign/gates/statistical-heldout.ts, used by
+ *  promotion-policy) — never renormalize a producer's values in place, as
+ *  downstream thresholds (`composite >= 5` in multishot/matrix.ts, live-soak
+ *  `>= 7` gates) key on the producer's native scale. */
 export interface JudgeScore {
   dimensions: Record<string, number>
   composite: number
   notes: string
+  /** Set when the judge itself failed (call error, unparseable output).
+   *  `composite`/`dimensions` carry no signal — aggregators MUST exclude
+   *  failed scores from means instead of folding them into zeros. */
+  failed?: true
+  /** Ensemble extras (populated by `ensembleJudge`): max per-dimension
+   *  spread across surviving judges — the inter-rater signal. */
+  maxDisagreement?: number
+  /** Ensemble extras: judge identities whose verdict failed. */
+  failedJudges?: string[]
+  /** Ensemble extras: each surviving judge's per-dimension scores. */
+  perJudge?: Record<string, Record<string, number>>
 }
 
 // ── Optimization (population + generations + mutator) ─────────────────
 
@@ -13,10 +13,12 @@ import type { Artifact } from './artifact-validator'
 import {
   type CompletionRequirement,
   type CorrectnessChecker,
+  completionVerdict,
   createLlmCorrectnessChecker,
   createTokenRecallChecker,
   type ProducedState,
   parseCorrectnessResponse,
+  type RequirementCheck,
   type TaskGold,
   verifyCompletion,
 } from './completion-verifier'
@@ -281,3 +283,49 @@ describe('createTokenRecallChecker — deterministic content checker', () => {
     expect(v.fullyComplete).toBe(true)
   })
 })
+
+describe('completionVerdict — spine derivation', () => { // exercises the constructor directly, then once through verifyCompletion
+  const check = (reqId: string, satisfied: boolean): RequirementCheck => ({ // fixture: a RequirementCheck whose flags all follow `satisfied`
+    reqId,
+    title: reqId,
+    structurallyPresent: satisfied,
+    correct: satisfied ? true : null, // unsatisfied fixtures carry null here, not false
+    satisfied,
+    evidence: [],
+  })
+
+  it('derives completionRate/fullyComplete and the spine fields together', () => {
+    const v = completionVerdict({
+      taskId: 't1',
+      requirements: [check('a', true), check('b', false)], // 1 of 2 satisfied
+    })
+    expect(v.completionRate).toBeCloseTo(0.5, 5)
+    expect(v.fullyComplete).toBe(false)
+    expect(v.valid).toBe(false) // valid follows fullyComplete
+    expect(v.score).toBeCloseTo(0.5, 5) // score follows completionRate
+  })
+
+  it('valid mirrors fullyComplete when everything is satisfied', () => {
+    const v = completionVerdict({
+      taskId: 't1',
+      requirements: [check('a', true), check('b', true)], // 2 of 2 satisfied
+    })
+    expect(v.fullyComplete).toBe(true)
+    expect(v.valid).toBe(true)
+    expect(v.score).toBe(1)
+  })
+
+  it('throws on zero requirement checks', () => {
+    expect(() => completionVerdict({ taskId: 't1', requirements: [] })).toThrow( // an empty check list is rejected rather than scored
+      /no requirement checks/,
+    )
+  })
+
+  it('verifyCompletion verdicts carry the spine fields by construction', async () => {
+    const v = await verifyCompletion(gold([DISPUTE_REQ]), emptyState(), alwaysCorrect) // NOTE(review): gold, DISPUTE_REQ, emptyState and alwaysCorrect are defined outside this hunk — presumably an empty produced state leaves the requirement unsatisfied; confirm
+    expect(v.valid).toBe(v.fullyComplete)
+    expect(v.score).toBe(v.completionRate)
+    expect(v.valid).toBe(false)
+    expect(v.score).toBe(0)
+  })
+})
@@ -22,6 +22,7 @@
 
 import type { TCloud } from '@tangle-network/tcloud'
 import type { Artifact } from './artifact-validator'
+import type { DefaultVerdict } from './verdict'
 
 /** What kind of produced state can satisfy a requirement structurally. */
 export type SatisfiedBy = 'artifact' | 'proposal' | 'tool-call' | 'any'
@@ -77,7 +78,10 @@ export interface RequirementCheck {
   evidence: string[]
 }
 
-export interface CompletionVerdict {
+/** Extends the substrate verdict spine: `valid` = `fullyComplete` and
+ *  `score` = `completionRate` — derived in `completionVerdict()`, the one
+ *  place those equalities hold by construction. */
+export interface CompletionVerdict extends DefaultVerdict {
   taskId: string
   requirements: RequirementCheck[]
   /** satisfied / total requirements. */
@@ -86,6 +90,35 @@ export interface CompletionVerdict {
   fullyComplete: boolean
 }
 
+/**
+ * Construct a `CompletionVerdict` from the per-requirement checks, deriving
+ * `completionRate` / `fullyComplete` and the spine fields (`valid` =
+ * `fullyComplete`, `score` = `completionRate`) in one place. Throws on zero
+ * requirements — a verdict over nothing is a misconfiguration, mirroring
+ * `verifyCompletion`'s gold-spec guard.
+ */
+export function completionVerdict(input: {
+  taskId: string
+  requirements: RequirementCheck[]
+}): CompletionVerdict {
+  if (input.requirements.length === 0) { // guard first: the division below would otherwise be 0 / 0 (NaN)
+    throw new Error(
+      `completionVerdict: task '${input.taskId}' has no requirement checks — nothing to derive a verdict from`,
+    )
+  }
+  const satisfiedCount = input.requirements.filter((r) => r.satisfied).length // number of checks flagged `satisfied`
+  const completionRate = satisfiedCount / input.requirements.length // fraction satisfied; the denominator is non-zero thanks to the guard
+  const fullyComplete = satisfiedCount === input.requirements.length // true only when every check is satisfied
+  return {
+    taskId: input.taskId,
+    requirements: input.requirements, // same array reference as the input, not a copy
+    completionRate,
+    fullyComplete,
+    valid: fullyComplete, // spine field: always equal to fullyComplete
+    score: completionRate, // spine field: always equal to completionRate
+  }
+}
+
 /**
  * Decides whether a produced item's content actually fulfils a requirement.
  * Injected so the structural verifier stays pure and unit-testable; the
@@ -294,13 +327,7 @@ export async function verifyCompletion(
     })
   }
 
-  const satisfiedCount = requirements.filter((r) => r.satisfied).length
-  return {
-    taskId: gold.taskId,
-    requirements,
-    completionRate: satisfiedCount / requirements.length,
-    fullyComplete: satisfiedCount === requirements.length,
-  }
+  return completionVerdict({ taskId: gold.taskId, requirements })
 }
 
 export interface LlmCorrectnessCheckerOpts {
 
@@ -735,10 +735,10 @@ async function computeFailureClusters(
   const clusters = new Map<string, { exemplars: string[]; share: number }>()
   for (const run of failed) {
     try {
-      const result = await analyst.run(run.runId, {
-        kind: 'run-record',
-        run,
-      } as Parameters<typeof analyst.run>[1])
+      // AnalystRunInputs routes by field name: run-record analysts read
+      // `runRecord`. Any other shape makes every analyst skip with
+      // "missing input" and the clusters come back silently empty.
+      const result = await analyst.run(run.runId, { runRecord: run })
       for (const finding of result.findings as AnalystFinding[]) {
         const key = finding.area || finding.analyst_id || 'unclassified'
         const c = clusters.get(key) ?? { exemplars: [], share: 0 }
 
@@ -188,6 +188,14 @@ export {
 // `analyzeRuns()` direct callers (observed runs, no loop) get the same
 // `InsightReport` shape.
 
+// The stable analyst entry: build the canonical registry (feeds
+// `AnalyzeRunsOptions.analyst` → `failureClusters`) and read its findings.
+// The full analyst machinery stays under `@tangle-network/agent-eval/analyst`.
+export {
+  buildDefaultAnalystRegistry,
+  type DefaultAnalystRegistryOptions,
+} from '../analyst/default-registry'
+export type { AnalystFinding } from '../analyst/types'
 export type { AnalyzeRunsOptions } from './analyze-runs'
 export { analyzeRuns } from './analyze-runs'
 export type {
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@tangle-network/agent-eval",`
`3`		`- "version": "0.86.0",`
	`3`	`+ "version": "0.87.0",`
`4`	`4`	`"description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",`
`5`	`5`	`"homepage": "https://github.com/tangle-network/agent-eval#readme",`
`6`	`6`	`"repository": {`