Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions packages/core/src/__tests__/planner-happy-path.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,10 @@ describe("v5 happy path — message handler → planner → executor → evaluat
text: "raw shell output with exact paths and metrics",
userFacingText:
"Root disk: 65% used, 138G available. Biggest cleanup candidate: /home/example/.bun (19G).",
// Marks userFacingText as canonical so the planner-loop will not
// fall back to the evaluator's paraphrase (which can hallucinate
// paths/numbers in this kind of structured output).
verifiedUserFacing: true,
data: { actionName: "CHECK_RUNTIME" },
}),
});
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import { describe, expect, it, vi } from "vitest";
import { runPlannerLoop } from "../planner-loop";
import {
runPlannerLoop,
singleVerifiedUserFacingToolResultText,
} from "../planner-loop";
import type { PlannerTrajectory } from "../planner-types";
import type { TrajectoryRecorder } from "../trajectory-recorder";

/**
Expand Down Expand Up @@ -186,3 +190,78 @@ describe("planner-loop — user-facing tool text isolation", () => {
expect(result.finalMessage).toBe(evaluatorMessage);
});
});

describe("singleVerifiedUserFacingToolResultText — failed-step filter", () => {
  // Greptile flagged that the previous implementation counted ALL steps
  // with `toolCall + result` toward its uniqueness check — failed steps
  // included — so a 2-tool plan whose first tool errored and whose
  // second tool set `verifiedUserFacing: true` would silently fall
  // through to the evaluator's reply. These tests pin the corrected
  // filter (`step.result?.success === true`) and the rule that the
  // uniqueness check counts every successful tool step, whether or not
  // that step opted in.

  /** Builds a minimal trajectory whose only varying part is `steps`. */
  const trajectoryWith = (
    steps: PlannerTrajectory["steps"],
  ): PlannerTrajectory => ({
    context: { id: "ctx" },
    steps,
    archivedSteps: [],
    plannedQueue: [],
    evaluatorOutputs: [],
  });

  // A tool step that errored; must never count toward uniqueness.
  const failedStep = {
    iteration: 0,
    toolCall: { id: "call-1", name: "FLAKY", arguments: {} },
    result: {
      success: false as const,
      text: "transient network error",
      error: "ECONNRESET",
    },
  };
  // A successful tool step that opted in to verbatim echo.
  const verifiedStep = {
    iteration: 1,
    toolCall: { id: "call-2", name: "CHECK_CACHE", arguments: {} },
    result: {
      success: true as const,
      text: "raw diag",
      userFacingText: "Wrote 14 files to /home/example/.bun/install/cache.",
      verifiedUserFacing: true,
    },
  };

  it("returns the verified tool's text when a prior step failed", () => {
    const trajectory = trajectoryWith([failedStep, verifiedStep]);
    expect(singleVerifiedUserFacingToolResultText(trajectory)).toBe(
      "Wrote 14 files to /home/example/.bun/install/cache.",
    );
  });

  it("returns undefined when two successful tools both have results", () => {
    // Genuine ambiguity — caller falls through to evaluator/fallback.
    const secondVerified = {
      ...verifiedStep,
      iteration: 2,
      toolCall: { id: "call-3", name: "OTHER", arguments: {} },
    };
    const trajectory = trajectoryWith([verifiedStep, secondVerified]);
    expect(singleVerifiedUserFacingToolResultText(trajectory)).toBeUndefined();
  });

  it("returns undefined when a second successful tool did not opt in", () => {
    // Only one step is verified, but two steps succeeded: the uniqueness
    // check counts successful steps, not verified ones.
    const unverifiedStep = {
      iteration: 2,
      toolCall: { id: "call-3", name: "OTHER", arguments: {} },
      result: {
        success: true as const,
        text: "raw diag",
        userFacingText: "Cache is warm.",
      },
    };
    const trajectory = trajectoryWith([verifiedStep, unverifiedStep]);
    expect(singleVerifiedUserFacingToolResultText(trajectory)).toBeUndefined();
  });

  it("returns undefined when the single successful tool did not opt in", () => {
    const trajectory = trajectoryWith([
      failedStep,
      {
        ...verifiedStep,
        result: { ...verifiedStep.result, verifiedUserFacing: false },
      },
    ]);
    expect(singleVerifiedUserFacingToolResultText(trajectory)).toBeUndefined();
  });

  it("returns undefined when the verified text is only whitespace", () => {
    // The text is trimmed before use; an empty result is not a reply.
    const trajectory = trajectoryWith([
      {
        ...verifiedStep,
        result: { ...verifiedStep.result, userFacingText: "   " },
      },
    ]);
    expect(singleVerifiedUserFacingToolResultText(trajectory)).toBeUndefined();
  });

  it("returns undefined when there are no successful tools", () => {
    const trajectory = trajectoryWith([failedStep]);
    expect(singleVerifiedUserFacingToolResultText(trajectory)).toBeUndefined();
  });
});
1 change: 1 addition & 0 deletions packages/core/src/runtime/execute-planned-tool-call.ts
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ function actionResultToStreamingResult(
success: result.success,
text: result.text,
userFacingText: result.userFacingText,
verifiedUserFacing: result.verifiedUserFacing,
error: result.error ? stringifyError(result.error) : undefined,
data: result.data,
values: result.values,
Expand Down
54 changes: 47 additions & 7 deletions packages/core/src/runtime/planner-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2178,15 +2178,35 @@ function latestToolResultText(
return undefined;
}

/**
 * Returns the canonical user-facing text from a trajectory whose
 * `verifiedUserFacing` opt-in is unambiguous: the trajectory contains
 * exactly one *successful* tool step, and that step set
 * `verifiedUserFacing: true` with a non-empty `userFacingText`.
 *
 * Failed steps are intentionally ignored when counting toward the
 * uniqueness check — a plan whose first tool errored and whose second
 * tool emitted a verified canonical reply must still echo the verified
 * reply. (Counting failed steps would silently fall through to the
 * evaluator's `messageToUser`, defeating the whole point of the flag
 * for any tool that runs after a recoverable error.)
 *
 * Every successful tool step counts toward uniqueness, whether or not
 * it opted in: two successful steps yield `undefined` even when only
 * one of them is verified.
 *
 * Tools that emit structured data the evaluator could paraphrase
 * incorrectly (paths, ids, counts, numeric metrics) set the flag so the
 * framework echoes their output verbatim instead of trusting the
 * evaluator's rewording.
 *
 * Exported for unit-test coverage of the success-filter / failed-step
 * invariant; not part of the public runtime surface.
 *
 * @param trajectory - Planner trajectory whose `steps` are inspected.
 * @returns The trimmed `userFacingText` of the sole successful tool
 *   step when it is verified and non-empty; otherwise `undefined`.
 */
export function singleVerifiedUserFacingToolResultText(
  trajectory: PlannerTrajectory,
): string | undefined {
  // Only steps that made a tool call AND succeeded count; failures and
  // steps without a tool call never affect the uniqueness check.
  const successfulToolSteps = trajectory.steps.filter(
    (step) => step.toolCall && step.result?.success === true,
  );
  if (successfulToolSteps.length !== 1) return undefined;
  const result = successfulToolSteps[0]?.result;
  // Strict `=== true` comparison: unset or non-boolean values do not opt in.
  if (result?.verifiedUserFacing !== true) return undefined;
  const text = result.userFacingText?.trim();
  // Whitespace-only text trims to "" and is reported as "no reply".
  return text || undefined;
}
Expand All @@ -2196,8 +2216,27 @@ function preferredFinalMessageFromToolOrModel(
modelMessage?: unknown,
fallback?: unknown,
): string | undefined {
// Precedence:
// 1. A single successful tool whose result was explicitly marked
// `verifiedUserFacing: true` — used for structured outputs
// (paths, ids, counts) where evaluator paraphrase risks
// hallucinating a value.
// 2. The model/evaluator's explicit `messageToUser` — authoritative
// by default; the evaluator has seen the full trajectory and
// chose what the user should read.
// 3. The most recent tool's `userFacingText` — fallback when neither
// the model nor any verified tool provided a clean reply.
// 4. An explicit caller-provided fallback (e.g. failed-tool message).
//
// Regression coverage:
// - `planner-loop-user-facing-text.test.ts` → "does not regress
// evaluator's explicit messageToUser path" — evaluator wins when
// no tool sets `verifiedUserFacing`.
// - `planner-happy-path.test.ts` → "prefers a single tool's verified
// user-facing text over evaluator paraphrase" — tool wins when it
// opts in via `verifiedUserFacing: true`.
return (
singleSuccessfulUserFacingToolResultText(trajectory) ??
singleVerifiedUserFacingToolResultText(trajectory) ??
getNonEmptyString(modelMessage) ??
latestToolResultText(trajectory) ??
getNonEmptyString(fallback)
Expand Down Expand Up @@ -2460,6 +2499,7 @@ export function actionResultToPlannerToolResult(
success: result.success,
text: result.text,
userFacingText: result.userFacingText,
verifiedUserFacing: result.verifiedUserFacing,
data: Object.keys(data).length > 0 ? data : undefined,
error: result.error,
continueChain: result.continueChain,
Expand Down
21 changes: 21 additions & 0 deletions packages/core/src/runtime/planner-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,29 @@ export interface PlannerToolResult {
* undefined; in that case the framework falls through to the
* evaluator's synthesized reply rather than dumping shell-wrapper
* text into the user channel.
*
* By default an explicit evaluator `messageToUser` outranks this —
* the evaluator has seen the full trajectory and chose what the
* user should read. To mark `userFacingText` as canonical
* (do-not-paraphrase) and have it outrank the evaluator's reply
* when there is exactly one successful tool, set
* `verifiedUserFacing: true`.
*/
userFacingText?: string;
/**
* Marks `userFacingText` as the canonical answer for this turn —
* the evaluator's `messageToUser` MUST NOT paraphrase it. When set
* AND this is the only successful tool step in the trajectory AND
* its `userFacingText` is non-empty, the planner-loop prefers the
* tool's text over the evaluator's reply for the terminal-FINISH
* `finalMessage`. If any other tool step also succeeded, the flag
* has no effect and the default precedence applies.
*
* Use when the tool's output is structured data the evaluator can
* easily hallucinate (paths, ids, counts, numeric metrics) and any
* paraphrase risk is worse than echoing the tool verbatim. Leave
* unset for natural-language answers where the evaluator may
* legitimately rephrase or add framing.
*/
verifiedUserFacing?: boolean;
data?: Record<string, unknown>;
error?: unknown;
continueChain?: boolean;
Expand Down
14 changes: 14 additions & 0 deletions packages/core/src/types/components.ts
Original file line number Diff line number Diff line change
Expand Up @@ -659,9 +659,23 @@ export interface ActionResult {
* instead of the diagnostic `text`. Leave unset for log-emitting
* actions (BASH, file readers); set for Q&A actions, REPLY actions,
* and content generators.
*
* By default an explicit evaluator `messageToUser` outranks this.
* Set `verifiedUserFacing: true` to mark this text as canonical
* (do-not-paraphrase) — e.g. when it contains paths, ids, counts,
* or numeric metrics the evaluator might otherwise hallucinate.
*/
userFacingText?: string;

/**
* When `true` and `userFacingText` is set, the planner-loop prefers
* the action's `userFacingText` over the evaluator's `messageToUser`
* for the terminal-FINISH reply — but only when this action is the
* sole successful tool step in the trajectory; with more than one
* successful step the flag has no effect. Use for structured outputs
* (paths, ids, counts, numeric metrics) where a paraphrase risk is
* worse than echoing the action verbatim.
verifiedUserFacing?: boolean;

/** Values to merge into the state */
values?: Record<string, ProviderValue>;

Expand Down
Loading