Skip to content

Commit 287c63f

Browse files
authored
feat(verdict): canonical verdict spine — JudgeScore dedupe, ensembleJudge, eval tools, fail-loud judges (#236)
One JudgeScore declaration (campaign/types.ts; multishot re-exports, scale producer-defined via detectScale). CompletionVerdict + VerificationReport extend DefaultVerdict with derived valid/score; completionVerdict() helper. VerifiableReward carries required components. parseJudgeResponse throws typed JudgeParseError instead of fabricating zero rows; executor and traced ensemble record failed judges. multishot runJudge marks failed:true and the matrix excludes failed scores from cell means. analyzeRuns failure clustering passes the real AnalystRunInputs shape (was silently empty). New ensembleJudge panel (cross-family gate, retry, collision-suffixed votes) and makeEvalTools/toOpenAiTool agent toolset. Contract exports the stable analyst entry. Version trio 0.87.0.
1 parent 534ce7d commit 287c63f

28 files changed

Lines changed: 1410 additions & 89 deletions

clients/python/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "agent-eval-rpc"
7-
version = "0.86.0"
7+
version = "0.87.0"
88
description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
99
readme = "README.md"
1010
requires-python = ">=3.10"

clients/python/src/agent_eval_rpc/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
try:
5959
__version__ = version("agent-eval-rpc")
6060
except PackageNotFoundError:
61-
__version__ = "0.86.0"
61+
__version__ = "0.87.0"
6262

6363
__all__ = [
6464
"Client",

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@tangle-network/agent-eval",
3-
"version": "0.86.0",
3+
"version": "0.87.0",
44
"description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
55
"homepage": "https://github.com/tangle-network/agent-eval#readme",
66
"repository": {

src/campaign/types.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,30 @@ export interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
106106
appliesTo?: (scenario: TScenario) => boolean
107107
}
108108

/**
 * The canonical judge verdict shape — one declaration, shared by campaign
 * judges and the multishot judge runner (which re-exports this type).
 *
 * Scale is PRODUCER-DEFINED: campaign convention is [0,1]; the legacy
 * multishot runner emits 0-10. Cross-scale comparison must go through
 * `detectScale` (src/campaign/gates/statistical-heldout.ts, used by
 * promotion-policy) — never renormalize a producer's values in place, as
 * downstream thresholds (`composite >= 5` in multishot/matrix.ts, live-soak
 * `>= 7` gates) key on the producer's native scale.
 */
export interface JudgeScore {
  /** Per-dimension scores keyed by dimension name, on the producer's scale. */
  dimensions: Record<string, number>
  /** Overall score on the producer's scale (see the scale note above). */
  composite: number
  /** Judge notes as a string — NOTE(review): presumably free-form rationale; confirm with producers. */
  notes: string
  /**
   * Set when the judge itself failed (call error, unparseable output).
   * `composite`/`dimensions` carry no signal — aggregators MUST exclude
   * failed scores from means instead of folding them into zeros.
   */
  failed?: true
  /**
   * Ensemble extras (populated by `ensembleJudge`): max per-dimension
   * spread across surviving judges — the inter-rater signal.
   */
  maxDisagreement?: number
  /** Ensemble extras: judge identities whose verdict failed. */
  failedJudges?: string[]
  /** Ensemble extras: each surviving judge's per-dimension scores. */
  perJudge?: Record<string, Record<string, number>>
}
114134

115135
// ── Optimization (population + generations + mutator) ─────────────────

src/completion-verifier.test.ts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@ import type { Artifact } from './artifact-validator'
1313
import {
1414
type CompletionRequirement,
1515
type CorrectnessChecker,
16+
completionVerdict,
1617
createLlmCorrectnessChecker,
1718
createTokenRecallChecker,
1819
type ProducedState,
1920
parseCorrectnessResponse,
21+
type RequirementCheck,
2022
type TaskGold,
2123
verifyCompletion,
2224
} from './completion-verifier'
@@ -281,3 +283,49 @@ describe('createTokenRecallChecker — deterministic content checker', () => {
281283
expect(v.fullyComplete).toBe(true)
282284
})
283285
})
286+
// Pins the spine derivation in `completionVerdict()`: `valid` must equal
// `fullyComplete` and `score` must equal `completionRate`, both for verdicts
// built directly and for verdicts produced by `verifyCompletion`.
describe('completionVerdict — spine derivation', () => {
  // Minimal RequirementCheck fixture: a satisfied check is structurally
  // present and correct; an unsatisfied one is absent with `correct: null`.
  const check = (reqId: string, satisfied: boolean): RequirementCheck => ({
    reqId,
    title: reqId,
    structurallyPresent: satisfied,
    correct: satisfied ? true : null,
    satisfied,
    evidence: [],
  })

  it('derives completionRate/fullyComplete and the spine fields together', () => {
    const v = completionVerdict({
      taskId: 't1',
      requirements: [check('a', true), check('b', false)],
    })
    expect(v.completionRate).toBeCloseTo(0.5, 5)
    expect(v.fullyComplete).toBe(false)
    expect(v.valid).toBe(false)
    expect(v.score).toBeCloseTo(0.5, 5)
  })

  it('valid mirrors fullyComplete when everything is satisfied', () => {
    const v = completionVerdict({
      taskId: 't1',
      requirements: [check('a', true), check('b', true)],
    })
    expect(v.fullyComplete).toBe(true)
    expect(v.valid).toBe(true)
    expect(v.score).toBe(1)
  })

  it('throws on zero requirement checks', () => {
    expect(() => completionVerdict({ taskId: 't1', requirements: [] })).toThrow(
      /no requirement checks/,
    )
  })

  it('verifyCompletion verdicts carry the spine fields by construction', async () => {
    const v = await verifyCompletion(gold([DISPUTE_REQ]), emptyState(), alwaysCorrect)
    expect(v.valid).toBe(v.fullyComplete)
    expect(v.score).toBe(v.completionRate)
    expect(v.valid).toBe(false)
    expect(v.score).toBe(0)
  })
})

src/completion-verifier.ts

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import type { TCloud } from '@tangle-network/tcloud'
2424
import type { Artifact } from './artifact-validator'
25+
import type { DefaultVerdict } from './verdict'
2526

2627
/** What kind of produced state can satisfy a requirement structurally. */
2728
export type SatisfiedBy = 'artifact' | 'proposal' | 'tool-call' | 'any'
@@ -77,7 +78,10 @@ export interface RequirementCheck {
7778
evidence: string[]
7879
}
7980

80-
export interface CompletionVerdict {
81+
/** Extends the substrate verdict spine: `valid` = `fullyComplete` and
82+
* `score` = `completionRate` — derived in `completionVerdict()`, the one
83+
* place those equalities hold by construction. */
84+
export interface CompletionVerdict extends DefaultVerdict {
8185
taskId: string
8286
requirements: RequirementCheck[]
8387
/** satisfied / total requirements. */
@@ -86,6 +90,35 @@ export interface CompletionVerdict {
8690
fullyComplete: boolean
8791
}
8892

/**
 * Construct a `CompletionVerdict` from the per-requirement checks, deriving
 * `completionRate` / `fullyComplete` and the spine fields (`valid` =
 * `fullyComplete`, `score` = `completionRate`) in one place. Throws on zero
 * requirements — a verdict over nothing is a misconfiguration, mirroring
 * `verifyCompletion`'s gold-spec guard.
 *
 * @param input.taskId - Identifier of the task the verdict is for; copied
 *   onto the verdict and quoted in the error message.
 * @param input.requirements - One check per requirement; the array is stored
 *   on the verdict as-is (not copied).
 * @returns The verdict, with `completionRate` = satisfied / total and
 *   `fullyComplete` true only when every check is satisfied.
 * @throws Error when `input.requirements` is empty.
 */
export function completionVerdict(input: {
  taskId: string
  requirements: RequirementCheck[]
}): CompletionVerdict {
  // Guard first: with zero checks the rate below would be 0 / 0 (NaN).
  if (input.requirements.length === 0) {
    throw new Error(
      `completionVerdict: task '${input.taskId}' has no requirement checks — nothing to derive a verdict from`,
    )
  }
  const satisfiedCount = input.requirements.filter((r) => r.satisfied).length
  const completionRate = satisfiedCount / input.requirements.length
  const fullyComplete = satisfiedCount === input.requirements.length
  return {
    taskId: input.taskId,
    requirements: input.requirements,
    completionRate,
    fullyComplete,
    // Spine fields are derived here so the equalities hold by construction.
    valid: fullyComplete,
    score: completionRate,
  }
}
121+
89122
/**
90123
* Decides whether a produced item's content actually fulfils a requirement.
91124
* Injected so the structural verifier stays pure and unit-testable; the
@@ -294,13 +327,7 @@ export async function verifyCompletion(
294327
})
295328
}
296329

297-
const satisfiedCount = requirements.filter((r) => r.satisfied).length
298-
return {
299-
taskId: gold.taskId,
300-
requirements,
301-
completionRate: satisfiedCount / requirements.length,
302-
fullyComplete: satisfiedCount === requirements.length,
303-
}
330+
return completionVerdict({ taskId: gold.taskId, requirements })
304331
}
305332

306333
export interface LlmCorrectnessCheckerOpts {

src/contract/analyze-runs.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -735,10 +735,10 @@ async function computeFailureClusters(
735735
const clusters = new Map<string, { exemplars: string[]; share: number }>()
736736
for (const run of failed) {
737737
try {
738-
const result = await analyst.run(run.runId, {
739-
kind: 'run-record',
740-
run,
741-
} as Parameters<typeof analyst.run>[1])
738+
// AnalystRunInputs routes by field name: run-record analysts read
739+
// `runRecord`. Any other shape makes every analyst skip with
740+
// "missing input" and the clusters come back silently empty.
741+
const result = await analyst.run(run.runId, { runRecord: run })
742742
for (const finding of result.findings as AnalystFinding[]) {
743743
const key = finding.area || finding.analyst_id || 'unclassified'
744744
const c = clusters.get(key) ?? { exemplars: [], share: 0 }

src/contract/index.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,14 @@ export {
188188
// `analyzeRuns()` direct callers (observed runs, no loop) get the same
189189
// `InsightReport` shape.
190190

191+
// The stable analyst entry: build the canonical registry (feeds
192+
// `AnalyzeRunsOptions.analyst` → `failureClusters`) and read its findings.
193+
// The full analyst machinery stays under `@tangle-network/agent-eval/analyst`.
194+
export {
195+
buildDefaultAnalystRegistry,
196+
type DefaultAnalystRegistryOptions,
197+
} from '../analyst/default-registry'
198+
export type { AnalystFinding } from '../analyst/types'
191199
export type { AnalyzeRunsOptions } from './analyze-runs'
192200
export { analyzeRuns } from './analyze-runs'
193201
export type {

0 commit comments

Comments
 (0)