elizaOS
diff --git a/‎packages/benchmarks/lib/src/retrieval-defaults.ts‎
Lines changed: 130 additions & 0 deletions b/‎packages/benchmarks/lib/src/retrieval-defaults.ts‎
Lines changed: 130 additions & 0 deletions
diff --git a/‎packages/core/src/runtime/action-retrieval.ts‎
Lines changed: 114 additions & 5 deletions b/‎packages/core/src/runtime/action-retrieval.ts‎
Lines changed: 114 additions & 5 deletions
diff --git a/‎packages/core/src/runtime/trajectory-recorder.ts‎
Lines changed: 48 additions & 0 deletions b/‎packages/core/src/runtime/trajectory-recorder.ts‎
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,130 @@
+/**
+ * Per-tier retrieval defaults for the action retrieval / RRF system.
+ *
+ * Wave 2-C output: the Pareto sweep recommends a `topK` and stage-weight
+ * profile for each `ModelTier`. Smaller tiers prefer high-precision
+ * stages (exact match + BM25) and tighter top-K to keep the action block
+ * short; frontier tiers can afford to spread retrieval across more
+ * stages with a wider top-K because the planner has the context budget
+ * to disambiguate.
+ *
+ * Values are heuristic / Pareto-driven, not magic — re-run
+ * `scripts/lifeops-retrieval-pareto.mjs` against fresh trajectories to
+ * recalibrate.
+ *
+ * Consumers:
+ * - `action-retrieval.ts` reads these via `tierOverrides` to apply the
+ *   `topK` cap and stage weights at fusion time.
+ * - The benchmark runners read these by `MODEL_TIER` and pass them
+ *   through to `retrieveActions`.
+ */
+
+import type { ModelTier } from "./model-tiers.ts";
+
+/**
+ * Canonical retrieval stage names — kept in sync with
+ * `@elizaos/core` `RetrievalStageName`. Duplicated here so this package
+ * doesn't take a runtime dep on core.
+ *
+ * NOTE(review): the sync is by convention only — this file does not
+ * import the core type, so a stage added or renamed in core will not
+ * produce a compile error here. Confirm both lists match when editing
+ * either one.
+ */
+export type RetrievalStageName =
+  | "exact"
+  | "regex"
+  | "keyword"
+  | "bm25"
+  | "embedding"
+  | "contextMatch";
+
+export interface RetrievalTierDefaults {
+  /**
+   * Final fused-top-K cap: the maximum number of fused results to keep.
+   * Monotone non-decreasing across tiers.
+   */
+  topK: number;
+  /**
+   * Per-stage RRF weight. Default weight per stage is 1.0 — values >1
+   * up-weight that stage, values <1 down-weight. Missing stages default
+   * to 1.0.
+   *
+   * NOTE(review): `action-retrieval.ts` describes context matching as an
+   * additive bump rather than a ranking source, so the `contextMatch`
+   * weight may never reach the fusion step — confirm before tuning it.
+   */
+  stageWeights: Partial<Record<RetrievalStageName, number>>;
+}
+
+/**
+ * Initial Pareto-derived defaults. Rationale (per
+ * `docs/audits/lifeops-2026-05-11/retrieval-pareto.md`):
+ *
+ * - `small` — Qwen 0.6B: short context, brittle at long action blocks.
+ *   Prefer exact+BM25 (high precision, deterministic). topK=5 keeps the
+ *   action block under ~1.5KB.
+ * - `mid` — Qwen 1.7B: tolerates more candidates but still benefits
+ *   from precision-heavy weighting. topK=8.
+ * - `large` — Cerebras gpt-oss-120b: long context, embedding ranking
+ *   pays off here. Balanced weights, topK=12.
+ * - `frontier` — Opus 4.7: context-rich planner — let it see a wider
+ *   slate. topK=20, embedding/keyword weighted up to surface long-tail
+ *   matches.
+ *
+ * The object is exported as a plain mutable value (it is not frozen);
+ * `resolveRetrievalDefaults` hands out copies so callers that want to
+ * tweak a profile do not have to edit this table in place.
+ */
+export const RETRIEVAL_DEFAULTS_BY_TIER: Record<ModelTier, RetrievalTierDefaults> = {
+  small: {
+    topK: 5,
+    stageWeights: {
+      exact: 1.5,
+      regex: 1.3,
+      bm25: 1.2,
+      keyword: 1.0,
+      embedding: 0.7,
+      contextMatch: 0.9,
+    },
+  },
+  mid: {
+    topK: 8,
+    stageWeights: {
+      exact: 1.4,
+      regex: 1.2,
+      bm25: 1.15,
+      keyword: 1.0,
+      embedding: 0.85,
+      contextMatch: 1.0,
+    },
+  },
+  large: {
+    topK: 12,
+    stageWeights: {
+      exact: 1.2,
+      regex: 1.1,
+      bm25: 1.0,
+      keyword: 1.0,
+      embedding: 1.0,
+      contextMatch: 1.0,
+    },
+  },
+  frontier: {
+    topK: 20,
+    stageWeights: {
+      exact: 1.0,
+      regex: 1.0,
+      bm25: 1.0,
+      keyword: 1.1,
+      embedding: 1.2,
+      contextMatch: 1.0,
+    },
+  },
+};
+
+/**
+ * Pick the retrieval profile for the tier named by `MODEL_TIER`.
+ *
+ * The variable is read from `env` and trimmed; it must then equal
+ * `small`, `mid`, `large` or `frontier` exactly (the comparison is
+ * case-sensitive). A missing or unrecognised value selects `large`.
+ * Intended to mirror the resolution policy in `resolveTier`.
+ *
+ * @param env - Environment to read `MODEL_TIER` from; defaults to
+ *   `process.env`.
+ * @returns A new object holding the tier's `topK` and a shallow copy of
+ *   its `stageWeights`, so mutating the result leaves
+ *   `RETRIEVAL_DEFAULTS_BY_TIER` untouched.
+ */
+export function resolveRetrievalDefaults(
+  env: NodeJS.ProcessEnv = process.env,
+): RetrievalTierDefaults {
+  const requested = env.MODEL_TIER?.trim();
+  let tier: ModelTier = "large";
+  switch (requested) {
+    case "small":
+    case "mid":
+    case "large":
+    case "frontier":
+      tier = requested;
+      break;
+    default:
+      break;
+  }
+  const { topK, stageWeights } = RETRIEVAL_DEFAULTS_BY_TIER[tier];
+  // Copy the weights: the registry entry must not be shared with callers.
+  return { topK, stageWeights: { ...stageWeights } };
+}
@@ -31,6 +31,42 @@ export type RetrieveActionsInput = {
 	 * while still preferring on-context candidates when scores are close.
 	 */
 	selectedContexts?: readonly string[];
+	/**
+	 * When `true`, capture each stage's full pre-fusion output and emit it
+	 * in `response.measurement`. Default `false` — no allocation cost in
+	 * production. Toggle via the `MILADY_RETRIEVAL_MEASUREMENT=1` env var
+	 * on the caller side.
+	 */
+	measurementMode?: boolean;
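+	/**
+	 * Optional per-tier overrides for retrieval, wired by the benchmark
+	 * harness from `RETRIEVAL_DEFAULTS_BY_TIER`. When `topK` is set it
+	 * takes precedence over `limit` as the cap on returned results;
+	 * `stageWeights` scales each stage's contribution to reciprocal-rank
+	 * fusion (stages not listed keep a weight of 1).
+	 */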
+	tierOverrides?: {
+		topK?: number;
+		stageWeights?: Partial<Record<RetrievalStageName, number>>;
+	};
+};
+
+/**
+ * One action's standing within a single retrieval stage, as produced by
+ * `mapToStageEntries` for measurement mode.
+ */
+export type RetrievalStageEntry = {
+	/** Key of the action in that stage's score map. */
+	actionName: string;
+	/** Stage score after `roundScore`; only scores greater than 0 are emitted. */
+	score: number;
+	/** 1-based position after sorting by score descending, then by name. */
+	rank: number;
+};
+
+/**
+ * Pre-fusion entries for each retrieval stage, keyed by stage name.
+ * Every key is required; a stage with no positive scores is represented
+ * by an empty array.
+ */
+export type RetrievalPerStageScores = {
+	exact: RetrievalStageEntry[];
+	regex: RetrievalStageEntry[];
+	keyword: RetrievalStageEntry[];
+	bm25: RetrievalStageEntry[];
+	embedding: RetrievalStageEntry[];
+	contextMatch: RetrievalStageEntry[];
+};
+
+/**
+ * Measurement payload attached to the retrieval response when
+ * `measurementMode` is `true`.
+ *
+ * `fusedTopK` is built from every entry in the RRF score map, sorted by
+ * score descending and then by name, with `rrfScore` passed through
+ * `roundScore`. Despite the name, `retrieveActions` does not truncate
+ * it to the result limit.
+ */
+export type RetrievalMeasurement = {
+	perStageScores: RetrievalPerStageScores;
+	fusedTopK: Array<{ actionName: string; rrfScore: number; rank: number }>;
 };
 
 export type ActionRetrievalResult = {
@@ -53,6 +89,12 @@ export type ActionRetrievalResponse = {
 		candidateActions: string[];
 		parentActionHints: string[];
 	};
+	/**
+	 * Per-stage retrieval funnel. Populated only when
+	 * `input.measurementMode === true`. The benchmark harness consumes
+	 * this to compute stage-by-stage recall.
+	 */
+	measurement?: RetrievalMeasurement;
 };
 
 const BM25_K1 = 0.9;
@@ -99,7 +141,8 @@ export function retrieveActions(
 		bm25: rankScores(bm25Scores),
 		embedding: rankScores(embeddingScores),
 	};
-	const rrfScores = reciprocalRankFusion(stageRankings);
+	const stageWeights = input.tierOverrides?.stageWeights;
+	const rrfScores = reciprocalRankFusion(stageRankings, stageWeights);
 	const maxRrf = Math.max(0, ...rrfScores.values());
 	const maxKeyword = Math.max(0, ...keywordScores.values());
 	const maxBm25 = Math.max(0, ...bm25Scores.values());
@@ -195,15 +238,63 @@ export function retrieveActions(
 		);
 	});
 
-	const limit = Number.isFinite(input.limit)
-		? Math.max(0, input.limit ?? 0)
+	const effectiveLimit =
+		input.tierOverrides?.topK ??
+		(Number.isFinite(input.limit) ? input.limit : undefined);
+	const limit = Number.isFinite(effectiveLimit)
+		? Math.max(0, effectiveLimit ?? 0)
 		: 0;
 	const limited = limit > 0 ? results.slice(0, limit) : results;
 
 	for (let index = 0; index < limited.length; index += 1) {
 		limited[index].rank = index + 1;
 	}
 
+	let measurement: RetrievalMeasurement | undefined;
+	if (input.measurementMode === true) {
+		// Capture each stage's pre-fusion ranking so the analyzer can compute
+		// stage-by-stage recall. Context-match scores are recomputed from the
+		// per-parent boost so they're available alongside the other five
+		// stages even though they're applied as an additive bump in the main
+		// loop, not as a ranking source.
+		const selectedContextSetForMeasurement = selectedContextSet;
+		const contextMatchScores = new Map<string, number>();
+		for (const parent of input.catalog.parents) {
+			const parentContexts = Array.isArray(parent.contexts)
+				? (parent.contexts as readonly unknown[])
+				: [];
+			if (
+				selectedContextSetForMeasurement.size > 0 &&
+				parentContexts.length > 0 &&
+				parentContexts.some((c) =>
+					selectedContextSetForMeasurement.has(String(c).toLowerCase()),
+				)
+			) {
+				contextMatchScores.set(parent.normalizedName, 1);
+			}
+		}
+
+		measurement = {
+			perStageScores: {
+				exact: mapToStageEntries(exactScores),
+				regex: mapToStageEntries(regexScores),
+				keyword: mapToStageEntries(keywordScores),
+				bm25: mapToStageEntries(bm25Scores),
+				embedding: mapToStageEntries(embeddingScores),
+				contextMatch: mapToStageEntries(contextMatchScores),
+			},
+			fusedTopK: Array.from(rrfScores.entries())
+				.sort(([leftName, leftScore], [rightName, rightScore]) => {
+					return rightScore - leftScore || leftName.localeCompare(rightName);
+				})
+				.map(([name, rrfScore], index) => ({
+					actionName: name,
+					rrfScore: roundScore(rrfScore),
+					rank: index + 1,
+				})),
+		};
+	}
+
 	return {
 		results: limited,
 		warnings: input.catalog.warnings,
@@ -213,9 +304,23 @@ export function retrieveActions(
 			candidateActions,
 			parentActionHints,
 		},
+		...(measurement ? { measurement } : {}),
 	};
 }
 
+/**
+ * Convert one stage's score map into a ranked list for measurement output.
+ *
+ * Entries whose score is not greater than 0 are dropped. The rest are
+ * ordered by score descending, ties broken by `localeCompare` on the
+ * action name, and numbered from 1.
+ *
+ * @param scores - Action name to raw stage score.
+ * @returns Ranked entries with scores passed through `roundScore`.
+ */
+function mapToStageEntries(scores: Map<string, number>): RetrievalStageEntry[] {
+	const positive: Array<[string, number]> = [];
+	for (const pair of scores) {
+		if (pair[1] > 0) {
+			positive.push(pair);
+		}
+	}
+
+	const byScoreThenName = (
+		left: [string, number],
+		right: [string, number],
+	): number => {
+		const scoreDelta = right[1] - left[1];
+		return scoreDelta || left[0].localeCompare(right[0]);
+	};
+	positive.sort(byScoreThenName);
+
+	return positive.map((pair, position) => {
+		const entry: RetrievalStageEntry = {
+			actionName: pair[0],
+			score: roundScore(pair[1]),
+			rank: position + 1,
+		};
+		return entry;
+	});
+}
+
 export function tokenizeActionSearchText(text: string): string[] {
 	return String(text ?? "")
 		.replace(/([a-z0-9])([A-Z])/g, "$1 $2")
@@ -445,16 +550,20 @@ function rankScores(scores: Map<string, number>): Map<string, number> {
 
 function reciprocalRankFusion(
 	stageRankings: Partial<Record<RetrievalStageName, Map<string, number>>>,
+	stageWeights?: Partial<Record<RetrievalStageName, number>>,
 ): Map<string, number> {
 	const scores = new Map<string, number>();
 
-	for (const ranking of Object.values(stageRankings)) {
+	for (const [stageName, ranking] of Object.entries(stageRankings) as Array<
+		[RetrievalStageName, Map<string, number> | undefined]
+	>) {
 		if (!ranking) {
 			continue;
 		}
+		const weight = stageWeights?.[stageName] ?? 1;
 
 		for (const [name, rank] of ranking.entries()) {
-			scores.set(name, (scores.get(name) ?? 0) + 1 / (RRF_K + rank));
+			scores.set(name, (scores.get(name) ?? 0) + weight / (RRF_K + rank));
 		}
 	}
 
 
@@ -135,6 +135,31 @@ export interface RecordedToolStage {
 	truncated?: RecordedTruncationMarker[];
 }
 
+/**
+ * Per-stage retrieval entry captured when measurement mode is on. One
+ * entry per (action, stage) pair, recorded BEFORE reciprocal-rank-fusion
+ * so the funnel analyzer can see what each individual stage produced.
+ *
+ * Field-for-field the same shape as `RetrievalStageEntry` in
+ * `action-retrieval.ts`: the action's name, its score in that stage, and
+ * its rank within that stage.
+ */
+export interface RecordedRetrievalStageEntry {
+	actionName: string;
+	score: number;
+	rank: number;
+}
+
+/**
+ * Per-stage retrieval scores captured under `MILADY_RETRIEVAL_MEASUREMENT=1`.
+ * Default `undefined` — no perf cost in production unless the env var is
+ * explicitly enabled.
+ *
+ * When the object is present all six stage keys are required; it has
+ * the same keys as `RetrievalPerStageScores` in `action-retrieval.ts`.
+ */
+export interface RecordedRetrievalPerStageScores {
+	exact: RecordedRetrievalStageEntry[];
+	regex: RecordedRetrievalStageEntry[];
+	keyword: RecordedRetrievalStageEntry[];
+	bm25: RecordedRetrievalStageEntry[];
+	embedding: RecordedRetrievalStageEntry[];
+	contextMatch: RecordedRetrievalStageEntry[];
+}
+
 /**
  * Snapshot of the tool-search / action-retrieval phase. Logged once per
  * planner turn before the LLM call so reviewers can see which actions
@@ -158,6 +183,29 @@ export interface RecordedToolSearchStage {
 	tier: { tierA: string[]; tierB: string[]; omitted: number };
 	durationMs: number;
 	fallback?: string;
+	/**
+	 * Per-stage retrieval funnel. Populated only when the retrieval call
+	 * ran with measurement mode on (`MILADY_RETRIEVAL_MEASUREMENT=1`).
+	 */
+	perStageScores?: RecordedRetrievalPerStageScores;
+	/**
+	 * Top-K fused (RRF) results. Mirrors `results` but exposes the raw
+	 * `rrfScore` field directly so downstream analyzers don't need to
+	 * unify the two shapes. Populated only under measurement mode.
+	 */
+	fusedTopK?: Array<{ actionName: string; rrfScore: number; rank: number }>;
+	/**
+	 * Actions the planner ultimately invoked this turn. Recorded by the
+	 * caller after the planner loop resolves — the retrieval call itself
+	 * does not know which results were selected.
+	 */
+	selectedActions?: string[];
+	/**
+	 * Ground-truth actions for this scenario, when available. Sourced from
+	 * the scenario manifest by the benchmark harness; never inferred from
+	 * the trajectory.
+	 */
+	correctActions?: string[];
 }
 
 export interface RecordedEvaluationStage extends EvaluationResult {