elizaOS · NubsCarson · May 23, 2026 · May 23, 2026
diff --git a/packages/core/src/__tests__/planner-happy-path.test.ts b/packages/core/src/__tests__/planner-happy-path.test.ts
@@ -474,6 +474,10 @@ describe("v5 happy path — message handler → planner → executor → evaluat
 				text: "raw shell output with exact paths and metrics",
 				userFacingText:
 					"Root disk: 65% used, 138G available. Biggest cleanup candidate: /home/example/.bun (19G).",
+				// Marks userFacingText as canonical so the planner-loop will not
+				// fall back to the evaluator's paraphrase (which can hallucinate
+				// paths/numbers in this kind of structured output).
+				verifiedUserFacing: true,
 				data: { actionName: "CHECK_RUNTIME" },
 			}),
 		});

diff --git a/packages/core/src/runtime/__tests__/planner-loop-user-facing-text.test.ts b/packages/core/src/runtime/__tests__/planner-loop-user-facing-text.test.ts
@@ -1,5 +1,9 @@
 import { describe, expect, it, vi } from "vitest";
-import { runPlannerLoop } from "../planner-loop";
+import {
+	runPlannerLoop,
+	singleVerifiedUserFacingToolResultText,
+} from "../planner-loop";
+import type { PlannerTrajectory } from "../planner-types";
 import type { TrajectoryRecorder } from "../trajectory-recorder";
 
 /**
@@ -186,3 +190,78 @@ describe("planner-loop — user-facing tool text isolation", () => {
 		expect(result.finalMessage).toBe(evaluatorMessage);
 	});
 });
+
+describe("singleVerifiedUserFacingToolResultText — failed-step filter", () => {
+	// Greptile flagged that the previous implementation counted ALL steps
+	// with `toolCall + result` toward its uniqueness check — failed steps
+	// included — so a 2-tool plan whose first tool errored and whose
+	// second tool set `verifiedUserFacing: true` would silently fall
+	// through to the evaluator's reply. These tests pin the corrected
+	// filter (`step.result?.success === true`).
+	const trajectoryWith = (
+		steps: PlannerTrajectory["steps"],
+	): PlannerTrajectory => ({
+		context: { id: "ctx" },
+		steps,
+		archivedSteps: [],
+		plannedQueue: [],
+		evaluatorOutputs: [],
+	});
+
+	const failedStep = {
+		iteration: 0,
+		toolCall: { id: "call-1", name: "FLAKY", arguments: {} },
+		result: {
+			success: false as const,
+			text: "transient network error",
+			error: "ECONNRESET",
+		},
+	};
+	const verifiedStep = {
+		iteration: 1,
+		toolCall: { id: "call-2", name: "CHECK_CACHE", arguments: {} },
+		result: {
+			success: true as const,
+			text: "raw diag",
+			userFacingText: "Wrote 14 files to /home/example/.bun/install/cache.",
+			verifiedUserFacing: true,
+		},
+	};
+
+	it("returns the verified tool's text when a prior step failed", () => {
+		const trajectory = trajectoryWith([failedStep, verifiedStep]);
+		expect(singleVerifiedUserFacingToolResultText(trajectory)).toBe(
+			"Wrote 14 files to /home/example/.bun/install/cache.",
+		);
+	});
+
+	it("returns undefined when two successful tools both have results", () => {
+		// Genuine ambiguity — caller falls through to evaluator/fallback.
+		const secondVerified = {
+			...verifiedStep,
+			iteration: 2,
+			toolCall: { id: "call-3", name: "OTHER", arguments: {} },
+		};
+		const trajectory = trajectoryWith([
+			{ ...verifiedStep, iteration: 1 },
+			secondVerified,
+		]);
+		expect(singleVerifiedUserFacingToolResultText(trajectory)).toBeUndefined();
+	});
+
+	it("returns undefined when the single successful tool did not opt in", () => {
+		const trajectory = trajectoryWith([
+			failedStep,
+			{
+				...verifiedStep,
+				result: { ...verifiedStep.result, verifiedUserFacing: false },
+			},
+		]);
+		expect(singleVerifiedUserFacingToolResultText(trajectory)).toBeUndefined();
+	});
+
+	it("returns undefined when there are no successful tools", () => {
+		const trajectory = trajectoryWith([failedStep]);
+		expect(singleVerifiedUserFacingToolResultText(trajectory)).toBeUndefined();
+	});
+});
diff --git a/packages/core/src/runtime/execute-planned-tool-call.ts b/packages/core/src/runtime/execute-planned-tool-call.ts
@@ -359,6 +359,7 @@ function actionResultToStreamingResult(
 		success: result.success,
 		text: result.text,
 		userFacingText: result.userFacingText,
+		verifiedUserFacing: result.verifiedUserFacing,
 		error: result.error ? stringifyError(result.error) : undefined,
 		data: result.data,
 		values: result.values,

diff --git a/packages/core/src/runtime/planner-loop.ts b/packages/core/src/runtime/planner-loop.ts
@@ -2178,15 +2178,35 @@ function latestToolResultText(
 	return undefined;
 }
 
-function singleSuccessfulUserFacingToolResultText(
+/**
+ * Returns the canonical user-facing text from a trajectory whose
+ * `verifiedUserFacing` opt-in is unambiguous: exactly one *successful*
+ * tool step set `verifiedUserFacing: true` with a non-empty
+ * `userFacingText`.
+ *
+ * Failed steps are intentionally ignored when counting toward the
+ * uniqueness check — a plan whose first tool errored and whose second
+ * tool emitted a verified canonical reply must still echo the verified
+ * reply. (Counting failed steps would silently fall through to the
+ * evaluator's `messageToUser`, defeating the whole point of the flag
+ * for any tool that runs after a recoverable error.)
+ *
+ * Tools that emit structured data the evaluator could paraphrase
+ * incorrectly (paths, ids, counts, numeric metrics) set the flag so the
+ * framework echoes their output verbatim instead of trusting the
+ * evaluator's rewording.
+ */
+// Exported for unit-test coverage of the success-filter / failed-step
+// invariant; not part of the public runtime surface.
+export function singleVerifiedUserFacingToolResultText(
 	trajectory: PlannerTrajectory,
 ): string | undefined {
-	const toolResultSteps = trajectory.steps.filter(
-		(step) => step.toolCall && step.result,
+	const successfulToolSteps = trajectory.steps.filter(
+		(step) => step.toolCall && step.result?.success === true,
 	);
-	if (toolResultSteps.length !== 1) return undefined;
-	const result = toolResultSteps[0]?.result;
-	if (result?.success !== true) return undefined;
+	if (successfulToolSteps.length !== 1) return undefined;
+	const result = successfulToolSteps[0]?.result;
+	if (result?.verifiedUserFacing !== true) return undefined;
 	const text = result.userFacingText?.trim();
 	return text || undefined;
 }
@@ -2196,8 +2216,27 @@ function preferredFinalMessageFromToolOrModel(
 	modelMessage?: unknown,
 	fallback?: unknown,
 ): string | undefined {
+	// Precedence:
+	//   1. A single successful tool whose result was explicitly marked
+	//      `verifiedUserFacing: true` — used for structured outputs
+	//      (paths, ids, counts) where evaluator paraphrase risks
+	//      hallucinating a value.
+	//   2. The model/evaluator's explicit `messageToUser` — authoritative
+	//      by default; the evaluator has seen the full trajectory and
+	//      chose what the user should read.
+	//   3. The most recent tool's `userFacingText` — fallback when neither
+	//      the model nor any verified tool provided a clean reply.
+	//   4. An explicit caller-provided fallback (e.g. failed-tool message).
+	//
+	// Regression coverage:
+	//   - `planner-loop-user-facing-text.test.ts` → "does not regress
+	//     evaluator's explicit messageToUser path" — evaluator wins when
+	//     no tool sets `verifiedUserFacing`.
+	//   - `planner-happy-path.test.ts` → "prefers a single tool's verified
+	//     user-facing text over evaluator paraphrase" — tool wins when it
+	//     opts in via `verifiedUserFacing: true`.
 	return (
-		singleSuccessfulUserFacingToolResultText(trajectory) ??
+		singleVerifiedUserFacingToolResultText(trajectory) ??
 		getNonEmptyString(modelMessage) ??
 		latestToolResultText(trajectory) ??
 		getNonEmptyString(fallback)
@@ -2460,6 +2499,7 @@ export function actionResultToPlannerToolResult(
 		success: result.success,
 		text: result.text,
 		userFacingText: result.userFacingText,
+		verifiedUserFacing: result.verifiedUserFacing,
 		data: Object.keys(data).length > 0 ? data : undefined,
 		error: result.error,
 		continueChain: result.continueChain,

diff --git a/packages/core/src/runtime/planner-types.ts b/packages/core/src/runtime/planner-types.ts
@@ -99,8 +99,29 @@ export interface PlannerToolResult {
 	 * undefined; in that case the framework falls through to the
 	 * evaluator's synthesized reply rather than dumping shell-wrapper
 	 * text into the user channel.
+	 *
+	 * By default an explicit evaluator `messageToUser` outranks this —
+	 * the evaluator has seen the full trajectory and chose what the
+	 * user should read. To mark `userFacingText` as canonical
+	 * (do-not-paraphrase) and have it outrank the evaluator's reply
+	 * when there is exactly one successful tool, set
+	 * `verifiedUserFacing: true`.
 	 */
 	userFacingText?: string;
+	/**
+	 * Marks `userFacingText` as the canonical answer for this turn —
+	 * the evaluator's `messageToUser` MUST NOT paraphrase it. When set
+	 * AND there is exactly one successful tool with `userFacingText`,
+	 * the planner-loop prefers the tool's text over the evaluator's
+	 * reply for the terminal-FINISH `finalMessage`.
+	 *
+	 * Use when the tool's output is structured data the evaluator can
+	 * easily hallucinate (paths, ids, counts, numeric metrics) and any
+	 * paraphrase risk is worse than echoing the tool verbatim. Leave
+	 * unset for natural-language answers where the evaluator may
+	 * legitimately rephrase or add framing.
+	 */
+	verifiedUserFacing?: boolean;
 	data?: Record<string, unknown>;
 	error?: unknown;
 	continueChain?: boolean;

diff --git a/packages/core/src/types/components.ts b/packages/core/src/types/components.ts
@@ -659,9 +659,23 @@ export interface ActionResult {
 	 * instead of the diagnostic `text`. Leave unset for log-emitting
 	 * actions (BASH, file readers); set for Q&A actions, REPLY actions,
 	 * and content generators.
+	 *
+	 * By default an explicit evaluator `messageToUser` outranks this.
+	 * Set `verifiedUserFacing: true` to mark this text as canonical
+	 * (do-not-paraphrase) — e.g. when it contains paths, ids, counts,
+	 * or numeric metrics the evaluator might otherwise hallucinate.
 	 */
 	userFacingText?: string;
 
+	/**
+	 * When `true` and `userFacingText` is set, the planner-loop prefers
+	 * the action's `userFacingText` over the evaluator's `messageToUser`
+	 * for the terminal-FINISH reply. Use for structured outputs
+	 * (paths, ids, counts, numeric metrics) where a paraphrase risk is
+	 * worse than echoing the action verbatim.
+	 */
+	verifiedUserFacing?: boolean;
+
 	/** Values to merge into the state */
 	values?: Record<string, ProviderValue>;