Skip to content

Commit 8689237

Browse files
feat(evals): Playground compare defaults to Results + Actual loading spinner (#1879)
Co-authored-by: marcelo <marcelojimenezrocabado@gmail.com>
1 parent 9991df2 commit 8689237

6 files changed

Lines changed: 234 additions & 10 deletions

File tree

mcpjam-inspector/client/src/components/evals/__tests__/test-template-editor-open-compare-route.test.tsx

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ vi.mock("../trace-viewer", () => ({
9494
trace?: { messages?: Array<{ content?: unknown }> } | null;
9595
forcedViewMode?: string;
9696
isLoading?: boolean;
97+
expectedToolCalls?: Array<{ toolName: string }>;
9798
}) => {
9899
mockTraceViewer(props);
99100
const firstMessage = props.trace?.messages?.[0]?.content;
@@ -106,6 +107,7 @@ vi.mock("../trace-viewer", () => ({
106107
data-first-message={
107108
typeof firstMessage === "string" ? firstMessage : "non-string"
108109
}
110+
data-expected-tool-count={String(props.expectedToolCalls?.length ?? 0)}
109111
/>
110112
);
111113
},
@@ -252,6 +254,7 @@ describe("TestTemplateEditor run view from route", () => {
252254
trace?: { messages?: Array<{ content?: unknown }> } | null;
253255
forcedViewMode?: string;
254256
isLoading?: boolean;
257+
expectedToolCalls?: Array<{ toolName: string; arguments: Record<string, unknown> }>;
255258
};
256259
}
257260

@@ -673,6 +676,85 @@ describe("TestTemplateEditor run view from route", () => {
673676
expect(screen.queryByText("Running GPT-4…")).not.toBeInTheDocument();
674677
});
675678

679+
it("renders a tools preview (not generic spinner) before the first stream event when the case has expected tool calls", async () => {
680+
const user = userEvent.setup();
681+
const caseWithTools = {
682+
...caseDoc,
683+
isNegativeTest: false,
684+
expectedToolCalls: [{ toolName: "create_view", arguments: {} }],
685+
};
686+
687+
useQueryMock.mockImplementation((name: string, args: unknown) => {
688+
if (name === "testSuites:listTestCases") return [caseWithTools];
689+
if (name === "testSuites:getTestSuite") {
690+
return { _id: "suite-1", environment: { servers: ["srv"] } };
691+
}
692+
if (name === "testSuites:listTestIterations" && args !== "skip") {
693+
return [baseIteration];
694+
}
695+
if (
696+
name === "testSuites:getTestIteration" &&
697+
typeof args === "object" &&
698+
args !== null &&
699+
(args as { iterationId?: string }).iterationId === baseIteration._id
700+
) {
701+
return baseIteration;
702+
}
703+
return undefined;
704+
});
705+
706+
// Stream never resolves — keeps the run in "running, no iteration" state.
707+
streamEvalTestCaseMock.mockImplementation(
708+
async () => new Promise<void>(() => {}),
709+
);
710+
711+
renderWithProviders(
712+
<TestTemplateEditor
713+
suiteId="suite-1"
714+
selectedTestCaseId="case-1"
715+
connectedServerNames={new Set(["srv"])}
716+
workspaceId={null}
717+
availableModels={[
718+
{
719+
provider: "openai",
720+
id: "gpt-4",
721+
model: "gpt-4",
722+
name: "GPT-4",
723+
label: "GPT-4",
724+
} as any,
725+
]}
726+
/>,
727+
{ hostStyle: "claude" },
728+
);
729+
730+
await waitFor(() => {
731+
expect(screen.getByRole("button", { name: /run$/i })).toBeInTheDocument();
732+
});
733+
734+
await user.click(screen.getByRole("button", { name: /run$/i }));
735+
736+
await waitFor(() => {
737+
expect(streamEvalTestCaseMock).toHaveBeenCalledTimes(1);
738+
expect(screen.getByTestId("mock-trace-viewer")).toBeInTheDocument();
739+
});
740+
741+
// Must show tools view (not chat) and pass expected tool calls through.
742+
expect(screen.getByTestId("mock-trace-viewer")).toHaveAttribute(
743+
"data-view-mode",
744+
"tools",
745+
);
746+
expect(screen.getByTestId("mock-trace-viewer")).toHaveAttribute(
747+
"data-is-loading",
748+
"true",
749+
);
750+
expect(screen.getByTestId("mock-trace-viewer")).toHaveAttribute(
751+
"data-expected-tool-count",
752+
"1",
753+
);
754+
// Generic spinner must not appear.
755+
expect(screen.queryByText(/Running GPT-4/)).not.toBeInTheDocument();
756+
});
757+
676758
it("replaces the initial preview with streamed chat messages as soon as live trace data exists", async () => {
677759
const user = userEvent.setup();
678760
let emitEvent:
@@ -811,6 +893,94 @@ describe("TestTemplateEditor run view from route", () => {
811893
});
812894
});
813895

896+
it("defaults to Results tab when expected tool calls are on a non-first prompt turn (multi-turn case)", async () => {
897+
const user = userEvent.setup();
898+
// Multi-turn case: turn 1 has no expected tool calls, turn 2 has one.
899+
const multiTurnCase = {
900+
...caseDoc,
901+
isNegativeTest: false,
902+
expectedToolCalls: [],
903+
promptTurns: [
904+
{
905+
id: "turn-1",
906+
prompt: "First prompt",
907+
expectedToolCalls: [],
908+
},
909+
{
910+
id: "turn-2",
911+
prompt: "Second prompt",
912+
expectedToolCalls: [{ toolName: "some_tool", arguments: {} }],
913+
},
914+
],
915+
};
916+
917+
useQueryMock.mockImplementation((name: string, args: unknown) => {
918+
if (name === "testSuites:listTestCases") {
919+
return [multiTurnCase];
920+
}
921+
if (name === "testSuites:getTestSuite") {
922+
return {
923+
_id: "suite-1",
924+
environment: { servers: ["srv"] },
925+
};
926+
}
927+
if (name === "testSuites:listTestIterations" && args !== "skip") {
928+
return [baseIteration];
929+
}
930+
if (
931+
name === "testSuites:getTestIteration" &&
932+
typeof args === "object" &&
933+
args !== null &&
934+
(args as { iterationId?: string }).iterationId === baseIteration._id
935+
) {
936+
return baseIteration;
937+
}
938+
return undefined;
939+
});
940+
streamEvalTestCaseMock.mockImplementation(
941+
async () => new Promise<void>(() => {}),
942+
);
943+
944+
renderWithProviders(
945+
<TestTemplateEditor
946+
suiteId="suite-1"
947+
selectedTestCaseId="case-1"
948+
connectedServerNames={new Set(["srv"])}
949+
workspaceId={null}
950+
availableModels={[
951+
{
952+
provider: "openai",
953+
id: "gpt-4",
954+
model: "gpt-4",
955+
name: "GPT-4",
956+
label: "GPT-4",
957+
} as any,
958+
]}
959+
/>,
960+
{ hostStyle: "claude" },
961+
);
962+
963+
await waitFor(() => {
964+
expect(screen.getByRole("button", { name: /run$/i })).toBeInTheDocument();
965+
});
966+
967+
await user.click(screen.getByRole("button", { name: /run$/i }));
968+
969+
await waitFor(() => {
970+
expect(streamEvalTestCaseMock).toHaveBeenCalledTimes(1);
971+
});
972+
973+
// The pre-stream preview TraceViewer must be rendered in tools mode with
974+
// the expected tool call flattened from turn 2.
975+
await waitFor(() => {
976+
const props = getLatestTraceViewerProps();
977+
expect(props.forcedViewMode).toBe("tools");
978+
expect(props.expectedToolCalls).toEqual([
979+
{ toolName: "some_tool", arguments: {} },
980+
]);
981+
});
982+
});
983+
814984
it("shows the host-style pill only while the chat tab is active", async () => {
815985
const user = userEvent.setup();
816986
const caseWithExpectedToolCalls = {
@@ -882,6 +1052,10 @@ describe("TestTemplateEditor run view from route", () => {
8821052

8831053
const card = getCompareCard("GPT-4");
8841054

1055+
// The default tab is Results when the case has expected tool calls, so the host-style pill (shown only on the Chat tab) must be absent until Chat is selected.
1056+
expect(card.querySelector("[data-selected-host-style]")).toBeNull();
1057+
1058+
await user.click(within(card).getByRole("button", { name: /^Chat$/i }));
8851059
expect(card.querySelector('[data-selected-host-style="claude"]')).not.toBe(
8861060
null,
8871061
);

mcpjam-inspector/client/src/components/evals/__tests__/trace-viewer.test.tsx

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,21 @@ describe("TraceViewer", () => {
589589
expect(screen.getByText("Actual")).toBeInTheDocument();
590590
});
591591

592+
it("shows a loading spinner next to Actual in tools compare view when isLoading", () => {
593+
render(
594+
<TraceViewer
595+
trace={simpleTextTrace}
596+
estimatedDurationMs={100}
597+
expectedToolCalls={[{ toolName: "a", arguments: {} }]}
598+
actualToolCalls={[]}
599+
forcedViewMode="tools"
600+
isLoading
601+
/>,
602+
);
603+
expect(screen.getByTestId("trace-viewer-tools-compare")).toBeInTheDocument();
604+
expect(screen.getByTestId("trace-viewer-actual-loading")).toBeInTheDocument();
605+
});
606+
592607
it("hides Tools tab when there are no expected or actual tool calls", async () => {
593608
render(<TraceViewer trace={simpleTextTrace} estimatedDurationMs={100} />);
594609
expect(await screen.findByText("Estimated total only")).toBeInTheDocument();

mcpjam-inspector/client/src/components/evals/eval-trace-surface.tsx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ interface EvalTraceSurfaceProps {
3131
traceBlob?: TraceEnvelope | null;
3232
traceBlobLoading?: boolean;
3333
traceBlobError?: string | null;
34+
/** True while the run is in progress; shows a spinner beside "Actual" in Results (tools) mode, driven by the same signal as the metric spinners. */
35+
isLoading?: boolean;
3436
toolsMetadata: Record<string, Record<string, unknown>>;
3537
toolServerMap: ToolServerMap;
3638
connectedServerIds: string[];
@@ -78,6 +80,7 @@ export function EvalTraceSurface({
7880
traceBlob,
7981
traceBlobLoading,
8082
traceBlobError,
83+
isLoading = false,
8184
toolsMetadata,
8285
toolServerMap,
8386
connectedServerIds,
@@ -188,6 +191,7 @@ export function EvalTraceSurface({
188191
<TraceViewer
189192
trace={activeTrace}
190193
model={traceModel}
194+
isLoading={isLoading}
191195
toolsMetadata={toolsMetadata}
192196
toolServerMap={toolServerMap}
193197
connectedServerIds={connectedServerIds}

mcpjam-inspector/client/src/components/evals/test-template-editor.tsx

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ import {
5858
} from "./single-test-case-runner";
5959
import {
6060
deriveLegacyPromptFields,
61+
flattenAssertedExpectedToolCalls,
6162
resolveIterationDisplayExpectedToolCalls,
6263
resolvePromptTurns,
6364
stripPromptTurnsFromAdvancedConfig,
@@ -1246,12 +1247,15 @@ export function TestTemplateEditor({
12461247

12471248
compareHandlesInFlightRef.current += 1;
12481249
setIsRunningCompare(true);
1250+
const previewExpectedToolCalls = flattenAssertedExpectedToolCalls(savePayload);
1251+
const defaultRunColumnTab: RunColumnTab =
1252+
previewExpectedToolCalls.length > 0 ? "tools" : "chat";
12491253
setRunColumnTabByModel((previous) => ({
12501254
...previous,
12511255
...Object.fromEntries(
12521256
runModelValues.map((modelValue) => [
12531257
modelValue,
1254-
"chat" as RunColumnTab,
1258+
defaultRunColumnTab,
12551259
]),
12561260
),
12571261
}));
@@ -1284,6 +1288,7 @@ export function TestTemplateEditor({
12841288
completedAt: null,
12851289
error: null,
12861290
previewTrace: comparePreviewTrace,
1291+
previewExpectedToolCalls,
12871292
};
12881293
}
12891294
for (const { modelValue, modelLabel, error } of preparationFailures) {
@@ -2380,10 +2385,15 @@ function RunColumn({
23802385
record.completedAt ??
23812386
record.modelValue,
23822387
});
2383-
const expectedToolCalls = resolveIterationDisplayExpectedToolCalls(
2384-
record.iteration?.testCaseSnapshot,
2385-
testCase,
2386-
);
2388+
// Prefer the iteration snapshot (authoritative) once available; otherwise
2389+
// fall back to previewExpectedToolCalls captured from the in-memory form at
2390+
// run-start so unsaved edits are reflected in showToolsTab / the pre-stream
2391+
// Results preview before the persisted testCase is updated. When no preview was captured, resolve from the persisted testCase instead.
2392+
const expectedToolCalls = record.iteration?.testCaseSnapshot
2393+
? resolveIterationDisplayExpectedToolCalls(record.iteration.testCaseSnapshot, null)
2394+
: record.previewExpectedToolCalls != null
2395+
? record.previewExpectedToolCalls
2396+
: resolveIterationDisplayExpectedToolCalls(null, testCase);
23872397
const actualToolCalls =
23882398
record.iteration?.actualToolCalls ?? record.streamingActualToolCalls ?? [];
23892399
const showToolsTab =
@@ -2610,6 +2620,7 @@ function RunColumn({
26102620
traceBlob={persistedTraceBlob}
26112621
traceBlobLoading={persistedTraceLoading}
26122622
traceBlobError={persistedTraceError}
2623+
isLoading={isRunningRecord}
26132624
toolsMetadata={toolsMetadata}
26142625
toolServerMap={toolServerMap}
26152626
connectedServerIds={connectedServerIds}
@@ -2643,7 +2654,7 @@ function RunColumn({
26432654
<TraceViewer
26442655
trace={streamingTraceEnvelope}
26452656
forcedViewMode={traceMode}
2646-
isLoading={traceMode === "chat" && isRunningRecord}
2657+
isLoading={isRunningRecord}
26472658
expectedToolCalls={expectedToolCalls}
26482659
actualToolCalls={actualToolCalls}
26492660
toolsMetadata={toolsMetadata}
@@ -2655,7 +2666,7 @@ function RunColumn({
26552666
</div>
26562667
)
26572668
) : record.status === "running" && !record.iteration ? (
2658-
traceMode === "chat" && activeLiveChatTrace ? (
2669+
(traceMode === "chat" || traceMode === "tools") && activeLiveChatTrace ? (
26592670
<div className="flex min-h-0 min-w-0 flex-1 flex-col">
26602671
<TraceViewer
26612672
trace={activeLiveChatTrace}
@@ -2664,8 +2675,10 @@ function RunColumn({
26642675
name: record.modelLabel,
26652676
provider: record.provider as any,
26662677
}}
2667-
forcedViewMode="chat"
2678+
forcedViewMode={traceMode === "tools" ? "tools" : "chat"}
26682679
isLoading={true}
2680+
expectedToolCalls={expectedToolCalls}
2681+
actualToolCalls={actualToolCalls}
26692682
toolsMetadata={toolsMetadata}
26702683
toolServerMap={toolServerMap}
26712684
connectedServerIds={connectedServerIds}

mcpjam-inspector/client/src/components/evals/trace-viewer.tsx

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ export type TraceViewerEvalToolCall = {
5353
interface TraceViewerProps {
5454
trace: TraceEnvelope | TraceMessage | TraceMessage[] | null;
5555
model?: ModelDefinition;
56+
/**
57+
* Chat: forwarded to the transcript `Thread`. Tools (Results): shows a spinner
58+
* beside "Actual" while the run is still in progress.
59+
*/
5660
isLoading?: boolean;
5761
toolsMetadata?: Record<string, Record<string, any>>;
5862
toolServerMap?: ToolServerMap;
@@ -670,8 +674,15 @@ export function TraceViewer({
670674
)}
671675
</div>
672676
<div className="flex min-h-0 min-w-0 flex-1 flex-col gap-2 rounded-md border border-border/40 bg-muted/10 p-3">
673-
<div className="shrink-0 text-xs font-medium text-muted-foreground uppercase">
677+
<div className="flex shrink-0 items-center gap-1.5 text-xs font-medium text-muted-foreground uppercase">
674678
Actual
679+
{isLoading ? (
680+
<Loader2
681+
className="h-3 w-3 shrink-0 animate-spin"
682+
aria-hidden
683+
data-testid="trace-viewer-actual-loading"
684+
/>
685+
) : null}
675686
</div>
676687
{actualToolCalls.length === 0 ? (
677688
<div className="text-xs text-muted-foreground italic">

0 commit comments

Comments
 (0)