Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions packages/core/src/__tests__/planner-happy-path.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,10 @@ describe("v5 happy path — message handler → planner → executor → evaluat
text: "raw shell output with exact paths and metrics",
userFacingText:
"Root disk: 65% used, 138G available. Biggest cleanup candidate: /home/example/.bun (19G).",
// Marks userFacingText as canonical so the planner-loop echoes it
// verbatim instead of preferring the evaluator's paraphrase (which
// can hallucinate paths/numbers in this kind of structured output).
verifiedUserFacing: true,
data: { actionName: "CHECK_RUNTIME" },
}),
});
Expand Down
1 change: 1 addition & 0 deletions packages/core/src/runtime/execute-planned-tool-call.ts
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ function actionResultToStreamingResult(
success: result.success,
text: result.text,
userFacingText: result.userFacingText,
verifiedUserFacing: result.verifiedUserFacing,
error: result.error ? stringifyError(result.error) : undefined,
data: result.data,
values: result.values,
Expand Down
34 changes: 32 additions & 2 deletions packages/core/src/runtime/planner-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2178,7 +2178,16 @@ function latestToolResultText(
return undefined;
}

function singleSuccessfulUserFacingToolResultText(
/**
* Returns a single successful tool's `userFacingText` ONLY when the tool
* explicitly opted in to canonical-output via `verifiedUserFacing: true`.
*
* Tools that emit structured data the evaluator could easily paraphrase
* incorrectly (paths, ids, counts, numeric metrics) set the flag so the
* framework echoes their output verbatim instead of trusting the
* evaluator's rewording.
*/
function singleVerifiedUserFacingToolResultText(
trajectory: PlannerTrajectory,
): string | undefined {
const toolResultSteps = trajectory.steps.filter(
Expand All @@ -2187,6 +2196,7 @@ function singleSuccessfulUserFacingToolResultText(
if (toolResultSteps.length !== 1) return undefined;
const result = toolResultSteps[0]?.result;
if (result?.success !== true) return undefined;
if (result.verifiedUserFacing !== true) return undefined;
const text = result.userFacingText?.trim();
return text || undefined;
}
Expand All @@ -2196,8 +2206,27 @@ function preferredFinalMessageFromToolOrModel(
modelMessage?: unknown,
fallback?: unknown,
): string | undefined {
// Precedence:
// 1. A single successful tool whose result was explicitly marked
// `verifiedUserFacing: true` — used for structured outputs
// (paths, ids, counts) where evaluator paraphrase risks
// hallucinating a value.
// 2. The model/evaluator's explicit `messageToUser` — authoritative
// by default; the evaluator has seen the full trajectory and
// chose what the user should read.
// 3. The most recent tool's `userFacingText` — fallback when neither
// the model nor any verified tool provided a clean reply.
// 4. An explicit caller-provided fallback (e.g. failed-tool message).
//
// Regression coverage:
// - `planner-loop-user-facing-text.test.ts` → "does not regress
// evaluator's explicit messageToUser path" — evaluator wins when
// no tool sets `verifiedUserFacing`.
// - `planner-happy-path.test.ts` → "prefers a single tool's verified
// user-facing text over evaluator paraphrase" — tool wins when it
// opts in via `verifiedUserFacing: true`.
return (
singleSuccessfulUserFacingToolResultText(trajectory) ??
singleVerifiedUserFacingToolResultText(trajectory) ??
getNonEmptyString(modelMessage) ??
latestToolResultText(trajectory) ??
getNonEmptyString(fallback)
Expand Down Expand Up @@ -2460,6 +2489,7 @@ export function actionResultToPlannerToolResult(
success: result.success,
text: result.text,
userFacingText: result.userFacingText,
verifiedUserFacing: result.verifiedUserFacing,
data: Object.keys(data).length > 0 ? data : undefined,
error: result.error,
continueChain: result.continueChain,
Expand Down
21 changes: 21 additions & 0 deletions packages/core/src/runtime/planner-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,29 @@ export interface PlannerToolResult {
* undefined; in that case the framework falls through to the
* evaluator's synthesized reply rather than dumping shell-wrapper
* text into the user channel.
*
* By default an explicit evaluator `messageToUser` outranks this —
* the evaluator has seen the full trajectory and chose what the
* user should read. To mark `userFacingText` as canonical
* (do-not-paraphrase) and have it outrank the evaluator's reply
* when there is exactly one successful tool, set
* `verifiedUserFacing: true`.
*/
userFacingText?: string;
/**
* Marks `userFacingText` as the canonical answer for this turn, to
* be echoed verbatim instead of the evaluator's `messageToUser`
* paraphrase. When set AND there is exactly one successful tool
* with `userFacingText`, the planner-loop prefers the tool's text
* over the evaluator's reply for the terminal-FINISH `finalMessage`.
*
* Use when the tool's output is structured data the evaluator can
* easily hallucinate (paths, ids, counts, numeric metrics) and any
* paraphrase risk is worse than echoing the tool verbatim. Leave
* unset for natural-language answers where the evaluator may
* legitimately rephrase or add framing.
*/
verifiedUserFacing?: boolean;
data?: Record<string, unknown>;
error?: unknown;
continueChain?: boolean;
Expand Down
14 changes: 14 additions & 0 deletions packages/core/src/types/components.ts
Original file line number Diff line number Diff line change
Expand Up @@ -659,9 +659,23 @@ export interface ActionResult {
* instead of the diagnostic `text`. Leave unset for log-emitting
* actions (BASH, file readers); set for Q&A actions, REPLY actions,
* and content generators.
*
* By default an explicit evaluator `messageToUser` outranks this.
* Set `verifiedUserFacing: true` to mark this text as canonical
* (do-not-paraphrase) — e.g. when it contains paths, ids, counts,
* or numeric metrics the evaluator might otherwise hallucinate.
*/
userFacingText?: string;

/**
* When `true` and `userFacingText` is set, the planner-loop prefers
* the action's `userFacingText` over the evaluator's `messageToUser`
* for the terminal-FINISH reply, provided there is exactly one
* successful tool. Use for structured outputs (paths, ids, counts,
* numeric metrics) where the risk of a wrong paraphrase is worse
* than echoing the action's text verbatim.
*/
verifiedUserFacing?: boolean;

/** Values to merge into the state */
values?: Record<string, ProviderValue>;

Expand Down
Loading