Skip to content

Commit ddbef93

Browse files
authored
feat: define policy reward calibration (#228)
Co-authored-by: devkade <devkade@users.noreply.github.com>
1 parent cfae401 commit ddbef93

5 files changed

Lines changed: 119 additions & 45 deletions

File tree

README.md

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -251,7 +251,9 @@ Runtime storage, adapter configuration, and worker retention are described in [`
251251

252252
Future learning-runtime boundaries are designed in [`docs/learning-runtime-boundaries.md`](docs/learning-runtime-boundaries.md). That document connects existing `WorkflowState` and RunContract projection responsibilities to a future `RunState` execution envelope while separating completion authority, runtime readiness authority, and advisory evaluation/learning signals.
253253

254-
The domain `PolicySelector` in `src/domain/policy-selector.ts` is an advisory pre-dispatch primitive. It generates a fixed initial policy set, simulates objective-weighted candidate outcomes, records a selected policy plus rejected alternatives, and emits prediction ids that reward-ledger entries can later calibrate. It does not launch agents, mutate workflow state, or hard-block execution from simulated score alone.
254+
The domain `PolicySelector` in `src/domain/policy-selector.ts` is an advisory pre-dispatch primitive. It generates a fixed initial policy set across conservative, balanced, aggressive, high-assurance, and learning-exploration strategies; simulates objective-weighted candidate outcomes from task complexity, expected module touch count, dependency depth, adapter mix, isolation mode, verification depth, historical success, and recent reward calibration; records estimator outputs for conflict risk, regression risk, repair likelihood, elapsed/tool cost, review burden, learning value, confidence, and utility; and emits prediction ids that reward-ledger entries can later calibrate. Human overrides are explicit (`selector: "human"` plus reason) and remain bounded by exploration/conflict/regression safety caps. The selector does not launch agents, mutate workflow state, or hard-block execution from simulated score alone.
255+
256+
The objective/reward domain in `src/domain/objective.ts` converts evidence-backed `EvaluationResult` records into append-only `reward-ledger.v1` events. Reward records include outcome status, prediction-vs-actual delta, penalty taxonomy, anti-Goodhart checks tied to [`docs/runcontract-harness-evaluator.md`](docs/runcontract-harness-evaluator.md), advisory `PolicyHint` values, and calibration metadata. Reward data may inform future `PolicySelection` records, but it must not silently mutate objective weights, selected policy, worker count, adapter choices, or completion authority; human-approved objective calibration must be recorded explicitly.
255257

256258
## Thin Harness Standard
257259

src/domain/objective.ts

Lines changed: 21 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -12,13 +12,16 @@ export interface GuardrailViolation { guardrailId: string; reason: string }
1212
export interface EvaluationResult { schemaVersion: "evaluation-result.v1"; id: string; objectiveId: string; target: ObjectiveTarget; score: number; verdict: EvaluationVerdict; metricScores: MetricScore[]; guardrailViolations: GuardrailViolation[]; reasons: string[]; requiredRepairs: string[]; evidenceRefs: string[]; at: string }
1313
export interface EvaluateObjectiveInput { id: string; target: ObjectiveTarget; metricValues: Record<string, number>; guardrailChecks: GuardrailCheck[]; evidenceRefs: string[]; at: string }
1414
export type RewardOutcomeStatus = "success" | "repair" | "failure";
15+
export type RewardPenaltyKind = "repair" | "regression" | "conflict" | "stale_evidence" | "anti_goodhart" | "human_rejection" | "cost";
1516
export interface PolicyPrediction { id: string; policyId: string; predictedScore: number; confidence: number }
1617
export interface RewardLedgerContext { runId: string; taskId?: string; workerId?: string }
1718
export interface RewardActualOutcome { status: RewardOutcomeStatus; score: number; elapsedSeconds: number; repairCount: number }
18-
export interface RewardPenalty { id: string; amount: number; reason: string }
19-
export interface PolicyHint { id: string; policyId: string; summary: string; authority: "advisory" }
20-
export interface RewardLedgerEvent { schemaVersion: "reward-ledger.v1"; id: string; at: string; context: RewardLedgerContext; selectedPolicyId: string; prediction: PolicyPrediction; evaluation: EvaluationResult; actualOutcome: RewardActualOutcome; reward: number; predictionDelta: number; penalties: RewardPenalty[]; policyHints: PolicyHint[] }
21-
export type RewardLedgerAppendInput = Omit<RewardLedgerEvent, "schemaVersion" | "reward" | "predictionDelta" | "penalties"> & { penalties?: RewardPenalty[] };
19+
export interface RewardPenalty { id: string; kind: RewardPenaltyKind; amount: number; reason: string; evidenceRefs: string[] }
20+
export interface AntiGoodhartCheck { id: string; passed: boolean; concern: string; evidenceRefs: string[] }
21+
export interface RewardCalibration { predictionId: string; predictionDelta: number; confidence: number; humanApprovedWeightChange: boolean; notes: string[] }
22+
export interface PolicyHint { id: string; policyId: string; summary: string; authority: "advisory"; strength?: "weak" | "normal" | "strong"; source?: "reward_ledger" | "human" | "policy_simulation" }
23+
export interface RewardLedgerEvent { schemaVersion: "reward-ledger.v1"; id: string; at: string; context: RewardLedgerContext; selectedPolicyId: string; prediction: PolicyPrediction; evaluation: EvaluationResult; actualOutcome: RewardActualOutcome; reward: number; predictionDelta: number; penalties: RewardPenalty[]; antiGoodhartChecks: AntiGoodhartCheck[]; calibration: RewardCalibration; policyHints: PolicyHint[] }
24+
export type RewardLedgerAppendInput = Omit<RewardLedgerEvent, "schemaVersion" | "reward" | "predictionDelta" | "penalties" | "calibration"> & { penalties?: RewardPenalty[]; calibration?: Partial<RewardCalibration> };
2225

2326
export function evaluateObjective(objective: ObjectiveFunction, input: EvaluateObjectiveInput): EvaluationResult {
2427
assertValidObjective(objective);
@@ -45,10 +48,13 @@ export function evaluateObjective(objective: ObjectiveFunction, input: EvaluateO
4548
export function appendRewardLedgerEvent(ledger: RewardLedgerEvent[], input: RewardLedgerAppendInput): RewardLedgerEvent[] {
4649
assertValidRewardAppendInput(input);
4750
const penalties = input.penalties ?? [];
48-
const penaltyTotal = penalties.reduce((sum, penalty) => sum + penalty.amount, 0);
51+
const antiGoodhartPenalty = input.antiGoodhartChecks.some((check) => !check.passed) ? 0.15 : 0;
52+
const penaltyTotal = penalties.reduce((sum, penalty) => sum + penalty.amount, antiGoodhartPenalty);
4953
const outcomePenalty = input.actualOutcome.status === "success" ? 0 : input.actualOutcome.status === "repair" ? 0.2 : 0.5;
5054
const reward = roundScore(input.actualOutcome.score - outcomePenalty - input.actualOutcome.repairCount * 0.05 - Math.min(input.actualOutcome.elapsedSeconds / 10_000, 0.2) - penaltyTotal);
51-
const event: RewardLedgerEvent = { schemaVersion: "reward-ledger.v1", id: input.id, at: input.at, context: { ...input.context }, selectedPolicyId: input.selectedPolicyId, prediction: { ...input.prediction }, evaluation: cloneEvaluation(input.evaluation), actualOutcome: { ...input.actualOutcome }, reward, predictionDelta: roundScore(input.actualOutcome.score - input.prediction.predictedScore), penalties: penalties.map((penalty) => ({ ...penalty })), policyHints: input.policyHints.map((hint) => ({ ...hint, authority: "advisory" })) };
55+
const predictionDelta = roundScore(input.actualOutcome.score - input.prediction.predictedScore);
56+
const calibration: RewardCalibration = { predictionId: input.prediction.id, predictionDelta, confidence: input.prediction.confidence, humanApprovedWeightChange: input.calibration?.humanApprovedWeightChange ?? false, notes: [...(input.calibration?.notes ?? [])] };
57+
const event: RewardLedgerEvent = { schemaVersion: "reward-ledger.v1", id: input.id, at: input.at, context: { ...input.context }, selectedPolicyId: input.selectedPolicyId, prediction: { ...input.prediction }, evaluation: cloneEvaluation(input.evaluation), actualOutcome: { ...input.actualOutcome }, reward, predictionDelta, penalties: penalties.map((penalty) => ({ ...penalty, evidenceRefs: [...penalty.evidenceRefs] })), antiGoodhartChecks: input.antiGoodhartChecks.map((check) => ({ ...check, evidenceRefs: [...check.evidenceRefs] })), calibration, policyHints: input.policyHints.map((hint) => ({ ...hint, authority: "advisory" })) };
5258
return [...ledger.map(cloneRewardLedgerEvent), event];
5359
}
5460

@@ -90,7 +96,7 @@ function cloneEvaluation(evaluation: EvaluationResult): EvaluationResult {
9096
}
9197

9298
function cloneRewardLedgerEvent(event: RewardLedgerEvent): RewardLedgerEvent {
93-
return { ...event, context: { ...event.context }, prediction: { ...event.prediction }, evaluation: cloneEvaluation(event.evaluation), actualOutcome: { ...event.actualOutcome }, penalties: event.penalties.map((penalty) => ({ ...penalty })), policyHints: event.policyHints.map((hint) => ({ ...hint })) };
99+
return { ...event, context: { ...event.context }, prediction: { ...event.prediction }, evaluation: cloneEvaluation(event.evaluation), actualOutcome: { ...event.actualOutcome }, penalties: event.penalties.map((penalty) => ({ ...penalty, evidenceRefs: [...penalty.evidenceRefs] })), antiGoodhartChecks: event.antiGoodhartChecks.map((check) => ({ ...check, evidenceRefs: [...check.evidenceRefs] })), calibration: { ...event.calibration, notes: [...event.calibration.notes] }, policyHints: event.policyHints.map((hint) => ({ ...hint })) };
94100
}
95101

96102
function assertValidRewardAppendInput(input: RewardLedgerAppendInput): void {
@@ -100,13 +106,20 @@ function assertValidRewardAppendInput(input: RewardLedgerAppendInput): void {
100106
assertValidScore("actualOutcome score", input.actualOutcome.score);
101107
if (!Number.isFinite(input.actualOutcome.elapsedSeconds) || input.actualOutcome.elapsedSeconds < 0) throw new Error("elapsedSeconds must be non-negative");
102108
if (!Number.isInteger(input.actualOutcome.repairCount) || input.actualOutcome.repairCount < 0) throw new Error("repairCount must be a non-negative integer");
103-
for (const penalty of input.penalties ?? []) if (!Number.isFinite(penalty.amount) || penalty.amount < 0) throw new Error(`penalty ${penalty.id} amount must be non-negative`);
109+
for (const penalty of input.penalties ?? []) {
110+
if (!penalty.id.trim()) throw new Error("penalty id is required");
111+
if (!Number.isFinite(penalty.amount) || penalty.amount < 0) throw new Error(`penalty ${penalty.id} amount must be non-negative`);
112+
if (!penalty.evidenceRefs.length) throw new Error(`penalty ${penalty.id} requires evidence refs`);
113+
}
114+
for (const check of input.antiGoodhartChecks) if (!check.id.trim() || !check.concern.trim() || !check.evidenceRefs.length) throw new Error("anti-Goodhart checks require id, concern, and evidence refs");
104115
}
105116

106117
function validateParsedRewardLedgerEvent(value: unknown): RewardLedgerEvent {
107118
if (!value || typeof value !== "object") throw new Error("reward ledger event must be an object");
108119
const event = value as Partial<RewardLedgerEvent>;
109120
if (event.schemaVersion !== "reward-ledger.v1") throw new Error("unsupported reward ledger schema version");
121+
if (!event.calibration?.predictionId) throw new Error("reward ledger event requires calibration prediction id");
122+
if (!Array.isArray(event.antiGoodhartChecks)) throw new Error("reward ledger event requires anti-Goodhart checks");
110123
return event as RewardLedgerEvent;
111124
}
112125

0 commit comments

Comments
 (0)