Skip to content

Commit 5d6c422

Browse files
committed
chore: dirty-worktree checkpoint (core retrieval/trajectory + dspy primitives + benchmarks retrieval defaults)
1 parent 88d83d2 commit 5d6c422

13 files changed

Lines changed: 1380 additions & 6 deletions

File tree

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/**
2+
* Per-tier retrieval defaults for the action retrieval / RRF system.
3+
*
4+
* Wave 2-C output: the Pareto sweep recommends a `topK` and stage-weight
5+
* profile for each `ModelTier`. Smaller tiers prefer high-precision
6+
* stages (exact match + BM25) and tighter top-K to keep the action block
7+
* short; frontier tiers can afford to spread retrieval across more
8+
* stages with a wider top-K because the planner has the context budget
9+
* to disambiguate.
10+
*
11+
* Values are heuristic / Pareto-driven, not magic — re-run
12+
* `scripts/lifeops-retrieval-pareto.mjs` against fresh trajectories to
13+
* recalibrate.
14+
*
15+
* Consumers:
16+
* - `action-retrieval.ts` reads these via `tierOverrides` to apply the
17+
* `topK` cap and stage weights at fusion time.
18+
* - The benchmark runners read these by `MODEL_TIER` and pass them
19+
* through to `retrieveActions`.
20+
*/
21+
22+
import type { ModelTier } from "./model-tiers.ts";
23+
24+
/**
 * Canonical retrieval stage names.
 *
 * Kept in sync by hand with `RetrievalStageName` from `@elizaos/core`. The
 * union is duplicated here so this package doesn't take a runtime
 * dependency on core; when a stage is added to or removed from core, this
 * union has to be updated to match.
 */
export type RetrievalStageName =
  | "exact"
  | "regex"
  | "keyword"
  | "bm25"
  | "embedding"
  | "contextMatch";
36+
37+
/** Retrieval settings recommended for a single model tier. */
export interface RetrievalTierDefaults {
  /** Final fused-top-K cap. Monotone non-decreasing across tiers. */
  topK: number;
  /**
   * Per-stage RRF weight. Default weight per stage is 1.0 — values >1
   * up-weight that stage, values <1 down-weight it. Missing stages
   * default to 1.0.
   */
  stageWeights: Partial<Record<RetrievalStageName, number>>;
}
47+
48+
/**
 * Initial Pareto-derived defaults. Rationale (per
 * `docs/audits/lifeops-2026-05-11/retrieval-pareto.md`):
 *
 * - `small` — Qwen 0.6B: short context, brittle at long action blocks.
 *   Prefer exact+BM25 (high precision, deterministic). topK=5 keeps the
 *   action block under ~1.5KB.
 * - `mid` — Qwen 1.7B: tolerates more candidates but still benefits
 *   from precision-heavy weighting. topK=8.
 * - `large` — Cerebras gpt-oss-120b: long context, embedding ranking
 *   pays off here. Balanced weights, topK=12.
 * - `frontier` — Opus 4.7: context-rich planner — let it see a wider
 *   slate. topK=20, embedding/keyword weighted up to surface long-tail
 *   matches.
 *
 * NOTE(review): the retrieval code describes context matching as an
 * additive bump applied in its main loop rather than as a fused ranking
 * source, so the `contextMatch` weights below may never be applied by
 * reciprocal-rank fusion — confirm before tuning them.
 */
export const RETRIEVAL_DEFAULTS_BY_TIER: Record<ModelTier, RetrievalTierDefaults> = {
  small: {
    topK: 5,
    stageWeights: {
      exact: 1.5,
      regex: 1.3,
      bm25: 1.2,
      keyword: 1.0,
      embedding: 0.7,
      contextMatch: 0.9,
    },
  },
  mid: {
    topK: 8,
    stageWeights: {
      exact: 1.4,
      regex: 1.2,
      bm25: 1.15,
      keyword: 1.0,
      embedding: 0.85,
      contextMatch: 1.0,
    },
  },
  large: {
    topK: 12,
    stageWeights: {
      exact: 1.2,
      regex: 1.1,
      bm25: 1.0,
      keyword: 1.0,
      embedding: 1.0,
      contextMatch: 1.0,
    },
  },
  frontier: {
    topK: 20,
    stageWeights: {
      exact: 1.0,
      regex: 1.0,
      bm25: 1.0,
      keyword: 1.1,
      embedding: 1.2,
      contextMatch: 1.0,
    },
  },
};
109+
110+
/**
 * Resolve retrieval defaults from `MODEL_TIER` (or a passed-in env).
 *
 * The value is trimmed and then matched exactly (case-sensitively)
 * against the known tier names. A missing or unrecognised value falls
 * back to `large`. Mirrors the resolution policy in `resolveTier`.
 *
 * @param env Environment to read `MODEL_TIER` from; defaults to `process.env`.
 * @returns A fresh copy of the tier's defaults, so callers can mutate the
 *   result without poisoning the module-level registry.
 */
export function resolveRetrievalDefaults(
  env: NodeJS.ProcessEnv = process.env,
): RetrievalTierDefaults {
  const raw = env.MODEL_TIER?.trim();
  const tier: ModelTier =
    raw === "small" || raw === "mid" || raw === "large" || raw === "frontier"
      ? raw
      : "large";

  const source = RETRIEVAL_DEFAULTS_BY_TIER[tier];
  return {
    topK: source.topK,
    // Copy the weight map so edits to the result never reach the registry.
    stageWeights: { ...source.stageWeights },
  };
}

packages/core/src/runtime/action-retrieval.ts

Lines changed: 114 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,42 @@ export type RetrieveActionsInput = {
3131
* while still preferring on-context candidates when scores are close.
3232
*/
3333
selectedContexts?: readonly string[];
34+
/**
35+
* When `true`, capture each stage's full pre-fusion output and emit it
36+
* in `response.measurement`. Default `false` — no allocation cost in
37+
* production. Toggle via the `MILADY_RETRIEVAL_MEASUREMENT=1` env var
38+
* on the caller side.
39+
*/
40+
measurementMode?: boolean;
41+
/**
42+
* Optional per-tier overrides for retrieval. When provided, the call
43+
* uses these instead of the in-file constants. Wired by the benchmark
44+
* harness from `RETRIEVAL_DEFAULTS_BY_TIER`.
45+
*/
46+
tierOverrides?: {
47+
topK?: number;
48+
stageWeights?: Partial<Record<RetrievalStageName, number>>;
49+
};
50+
};
51+
52+
/**
 * One action's standing within a single retrieval stage, as reported in
 * measurement output: the stage's score for the action and the action's
 * position in that stage's ranking.
 */
export type RetrievalStageEntry = {
  actionName: string;
  score: number;
  rank: number;
};
57+
58+
/**
 * Pre-fusion output of every retrieval stage, keyed by stage name. Each
 * stage always has an array; a stage with no entries is represented by
 * an empty array rather than a missing key.
 */
export type RetrievalPerStageScores = {
  exact: RetrievalStageEntry[];
  regex: RetrievalStageEntry[];
  keyword: RetrievalStageEntry[];
  bm25: RetrievalStageEntry[];
  embedding: RetrievalStageEntry[];
  contextMatch: RetrievalStageEntry[];
};
66+
67+
/**
 * Measurement payload attached to a retrieval response when measurement
 * mode is on.
 *
 * - `perStageScores` holds each stage's ranking before fusion.
 * - `fusedTopK` holds the fused (RRF) ranking with 1-based ranks. Despite
 *   the name, the producer in `retrieveActions` emits every fused entry
 *   and does not truncate the list to the top-K limit.
 */
export type RetrievalMeasurement = {
  perStageScores: RetrievalPerStageScores;
  fusedTopK: Array<{ actionName: string; rrfScore: number; rank: number }>;
};
3571

3672
export type ActionRetrievalResult = {
@@ -53,6 +89,12 @@ export type ActionRetrievalResponse = {
5389
candidateActions: string[];
5490
parentActionHints: string[];
5591
};
92+
/**
93+
* Per-stage retrieval funnel. Populated only when
94+
* `input.measurementMode === true`. The benchmark harness consumes
95+
* this to compute stage-by-stage recall.
96+
*/
97+
measurement?: RetrievalMeasurement;
5698
};
5799

58100
const BM25_K1 = 0.9;
@@ -99,7 +141,8 @@ export function retrieveActions(
99141
bm25: rankScores(bm25Scores),
100142
embedding: rankScores(embeddingScores),
101143
};
102-
const rrfScores = reciprocalRankFusion(stageRankings);
144+
const stageWeights = input.tierOverrides?.stageWeights;
145+
const rrfScores = reciprocalRankFusion(stageRankings, stageWeights);
103146
const maxRrf = Math.max(0, ...rrfScores.values());
104147
const maxKeyword = Math.max(0, ...keywordScores.values());
105148
const maxBm25 = Math.max(0, ...bm25Scores.values());
@@ -195,15 +238,63 @@ export function retrieveActions(
195238
);
196239
});
197240

198-
const limit = Number.isFinite(input.limit)
199-
? Math.max(0, input.limit ?? 0)
241+
const effectiveLimit =
242+
input.tierOverrides?.topK ??
243+
(Number.isFinite(input.limit) ? input.limit : undefined);
244+
const limit = Number.isFinite(effectiveLimit)
245+
? Math.max(0, effectiveLimit ?? 0)
200246
: 0;
201247
const limited = limit > 0 ? results.slice(0, limit) : results;
202248

203249
for (let index = 0; index < limited.length; index += 1) {
204250
limited[index].rank = index + 1;
205251
}
206252

253+
let measurement: RetrievalMeasurement | undefined;
254+
if (input.measurementMode === true) {
255+
// Capture each stage's pre-fusion ranking so the analyzer can compute
256+
// stage-by-stage recall. Context-match scores are recomputed from the
257+
// per-parent boost so they're available alongside the other five
258+
// stages even though they're applied as an additive bump in the main
259+
// loop, not as a ranking source.
260+
const selectedContextSetForMeasurement = selectedContextSet;
261+
const contextMatchScores = new Map<string, number>();
262+
for (const parent of input.catalog.parents) {
263+
const parentContexts = Array.isArray(parent.contexts)
264+
? (parent.contexts as readonly unknown[])
265+
: [];
266+
if (
267+
selectedContextSetForMeasurement.size > 0 &&
268+
parentContexts.length > 0 &&
269+
parentContexts.some((c) =>
270+
selectedContextSetForMeasurement.has(String(c).toLowerCase()),
271+
)
272+
) {
273+
contextMatchScores.set(parent.normalizedName, 1);
274+
}
275+
}
276+
277+
measurement = {
278+
perStageScores: {
279+
exact: mapToStageEntries(exactScores),
280+
regex: mapToStageEntries(regexScores),
281+
keyword: mapToStageEntries(keywordScores),
282+
bm25: mapToStageEntries(bm25Scores),
283+
embedding: mapToStageEntries(embeddingScores),
284+
contextMatch: mapToStageEntries(contextMatchScores),
285+
},
286+
fusedTopK: Array.from(rrfScores.entries())
287+
.sort(([leftName, leftScore], [rightName, rightScore]) => {
288+
return rightScore - leftScore || leftName.localeCompare(rightName);
289+
})
290+
.map(([name, rrfScore], index) => ({
291+
actionName: name,
292+
rrfScore: roundScore(rrfScore),
293+
rank: index + 1,
294+
})),
295+
};
296+
}
297+
207298
return {
208299
results: limited,
209300
warnings: input.catalog.warnings,
@@ -213,9 +304,23 @@ export function retrieveActions(
213304
candidateActions,
214305
parentActionHints,
215306
},
307+
...(measurement ? { measurement } : {}),
216308
};
217309
}
218310

311+
/**
 * Convert one stage's raw score map into a ranked entry list for
 * measurement output.
 *
 * Entries whose score is not strictly positive are dropped. The rest are
 * ordered by descending score, with ties broken by `localeCompare` on the
 * action name, and are assigned 1-based ranks in that order. Each emitted
 * score is passed through `roundScore`; sorting uses the unrounded value.
 *
 * @param scores Map from action name to that stage's raw score.
 * @returns Ranked entries; empty when no score is positive.
 */
function mapToStageEntries(scores: Map<string, number>): RetrievalStageEntry[] {
  return Array.from(scores.entries())
    .filter(([, score]) => score > 0)
    .sort(([leftName, leftScore], [rightName, rightScore]) => {
      return rightScore - leftScore || leftName.localeCompare(rightName);
    })
    .map(([actionName, score], index) => ({
      actionName,
      score: roundScore(score),
      rank: index + 1,
    }));
}
323+
219324
export function tokenizeActionSearchText(text: string): string[] {
220325
return String(text ?? "")
221326
.replace(/([a-z0-9])([A-Z])/g, "$1 $2")
@@ -445,16 +550,20 @@ function rankScores(scores: Map<string, number>): Map<string, number> {
445550

446551
function reciprocalRankFusion(
447552
stageRankings: Partial<Record<RetrievalStageName, Map<string, number>>>,
553+
stageWeights?: Partial<Record<RetrievalStageName, number>>,
448554
): Map<string, number> {
449555
const scores = new Map<string, number>();
450556

451-
for (const ranking of Object.values(stageRankings)) {
557+
for (const [stageName, ranking] of Object.entries(stageRankings) as Array<
558+
[RetrievalStageName, Map<string, number> | undefined]
559+
>) {
452560
if (!ranking) {
453561
continue;
454562
}
563+
const weight = stageWeights?.[stageName] ?? 1;
455564

456565
for (const [name, rank] of ranking.entries()) {
457-
scores.set(name, (scores.get(name) ?? 0) + 1 / (RRF_K + rank));
566+
scores.set(name, (scores.get(name) ?? 0) + weight / (RRF_K + rank));
458567
}
459568
}
460569

packages/core/src/runtime/trajectory-recorder.ts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,31 @@ export interface RecordedToolStage {
135135
truncated?: RecordedTruncationMarker[];
136136
}
137137

138+
/**
 * Per-stage retrieval entry captured when measurement mode is on. One
 * entry per (action, stage) pair, recorded BEFORE reciprocal-rank-fusion
 * so the funnel analyzer can see what each individual stage produced.
 */
export interface RecordedRetrievalStageEntry {
  actionName: string;
  score: number;
  rank: number;
}
148+
149+
/**
 * Per-stage retrieval scores captured under `MILADY_RETRIEVAL_MEASUREMENT=1`.
 * Default `undefined` on the recording stage — no perf cost in production
 * unless the env var is explicitly enabled. When present, every stage has
 * an array (possibly empty).
 */
export interface RecordedRetrievalPerStageScores {
  exact: RecordedRetrievalStageEntry[];
  regex: RecordedRetrievalStageEntry[];
  keyword: RecordedRetrievalStageEntry[];
  bm25: RecordedRetrievalStageEntry[];
  embedding: RecordedRetrievalStageEntry[];
  contextMatch: RecordedRetrievalStageEntry[];
}
162+
138163
/**
139164
* Snapshot of the tool-search / action-retrieval phase. Logged once per
140165
* planner turn before the LLM call so reviewers can see which actions
@@ -158,6 +183,29 @@ export interface RecordedToolSearchStage {
158183
tier: { tierA: string[]; tierB: string[]; omitted: number };
159184
durationMs: number;
160185
fallback?: string;
186+
/**
187+
* Per-stage retrieval funnel. Populated only when the retrieval call
188+
* ran with measurement mode on (`MILADY_RETRIEVAL_MEASUREMENT=1`).
189+
*/
190+
perStageScores?: RecordedRetrievalPerStageScores;
191+
/**
192+
* Top-K fused (RRF) results. Mirrors `results` but exposes the raw
193+
* `rrfScore` field directly so downstream analyzers don't need to
194+
* unify the two shapes. Populated only under measurement mode.
195+
*/
196+
fusedTopK?: Array<{ actionName: string; rrfScore: number; rank: number }>;
197+
/**
198+
* Actions the planner ultimately invoked this turn. Recorded by the
199+
* caller after the planner loop resolves — the retrieval call itself
200+
* does not know which results were selected.
201+
*/
202+
selectedActions?: string[];
203+
/**
204+
* Ground-truth actions for this scenario, when available. Sourced from
205+
* the scenario manifest by the benchmark harness; never inferred from
206+
* the trajectory.
207+
*/
208+
correctActions?: string[];
161209
}
162210

163211
export interface RecordedEvaluationStage extends EvaluationResult {

0 commit comments

Comments
 (0)