feat(evals): Playground compare defaults to Results + Actual loading spinner (#1879)

Vu-John · chelojimenez · web-flow · commit 86892372617a · 2026-04-21T00:03:22.000-07:00
Co-authored-by: marcelo <marcelojimenezrocabado@gmail.com>
diff --git a/mcpjam-inspector/client/src/components/evals/__tests__/test-template-editor-open-compare-route.test.tsx b/mcpjam-inspector/client/src/components/evals/__tests__/test-template-editor-open-compare-route.test.tsx
@@ -94,6 +94,7 @@ vi.mock("../trace-viewer", () => ({
     trace?: { messages?: Array<{ content?: unknown }> } | null;
     forcedViewMode?: string;
     isLoading?: boolean;
+    expectedToolCalls?: Array<{ toolName: string }>;
   }) => {
     mockTraceViewer(props);
     const firstMessage = props.trace?.messages?.[0]?.content;
@@ -106,6 +107,7 @@ vi.mock("../trace-viewer", () => ({
         data-first-message={
           typeof firstMessage === "string" ? firstMessage : "non-string"
         }
+        data-expected-tool-count={String(props.expectedToolCalls?.length ?? 0)}
       />
     );
   },
@@ -252,6 +254,7 @@ describe("TestTemplateEditor run view from route", () => {
       trace?: { messages?: Array<{ content?: unknown }> } | null;
       forcedViewMode?: string;
       isLoading?: boolean;
+      expectedToolCalls?: Array<{ toolName: string; arguments: Record<string, unknown> }>;
     };
   }
 
@@ -673,6 +676,85 @@ describe("TestTemplateEditor run view from route", () => {
     expect(screen.queryByText("Running GPT-4…")).not.toBeInTheDocument();
   });
 
+  it("renders a tools preview (not generic spinner) before the first stream event when the case has expected tool calls", async () => {
+    const user = userEvent.setup();
+    const caseWithTools = {
+      ...caseDoc,
+      isNegativeTest: false,
+      expectedToolCalls: [{ toolName: "create_view", arguments: {} }],
+    };
+
+    useQueryMock.mockImplementation((name: string, args: unknown) => {
+      if (name === "testSuites:listTestCases") return [caseWithTools];
+      if (name === "testSuites:getTestSuite") {
+        return { _id: "suite-1", environment: { servers: ["srv"] } };
+      }
+      if (name === "testSuites:listTestIterations" && args !== "skip") {
+        return [baseIteration];
+      }
+      if (
+        name === "testSuites:getTestIteration" &&
+        typeof args === "object" &&
+        args !== null &&
+        (args as { iterationId?: string }).iterationId === baseIteration._id
+      ) {
+        return baseIteration;
+      }
+      return undefined;
+    });
+
+    // Stream never resolves — keeps the run in "running, no iteration" state.
+    streamEvalTestCaseMock.mockImplementation(
+      async () => new Promise<void>(() => {}),
+    );
+
+    renderWithProviders(
+      <TestTemplateEditor
+        suiteId="suite-1"
+        selectedTestCaseId="case-1"
+        connectedServerNames={new Set(["srv"])}
+        workspaceId={null}
+        availableModels={[
+          {
+            provider: "openai",
+            id: "gpt-4",
+            model: "gpt-4",
+            name: "GPT-4",
+            label: "GPT-4",
+          } as any,
+        ]}
+      />,
+      { hostStyle: "claude" },
+    );
+
+    await waitFor(() => {
+      expect(screen.getByRole("button", { name: /run$/i })).toBeInTheDocument();
+    });
+
+    await user.click(screen.getByRole("button", { name: /run$/i }));
+
+    await waitFor(() => {
+      expect(streamEvalTestCaseMock).toHaveBeenCalledTimes(1);
+      expect(screen.getByTestId("mock-trace-viewer")).toBeInTheDocument();
+    });
+
+    // Must show tools view (not chat) and pass expected tool calls through.
+    expect(screen.getByTestId("mock-trace-viewer")).toHaveAttribute(
+      "data-view-mode",
+      "tools",
+    );
+    expect(screen.getByTestId("mock-trace-viewer")).toHaveAttribute(
+      "data-is-loading",
+      "true",
+    );
+    expect(screen.getByTestId("mock-trace-viewer")).toHaveAttribute(
+      "data-expected-tool-count",
+      "1",
+    );
+    // Generic spinner must not appear.
+    expect(screen.queryByText(/Running GPT-4/)).not.toBeInTheDocument();
+  });
+
   it("replaces the initial preview with streamed chat messages as soon as live trace data exists", async () => {
     const user = userEvent.setup();
     let emitEvent:
@@ -811,6 +893,94 @@ describe("TestTemplateEditor run view from route", () => {
     });
   });
 
+  it("defaults to Results tab when expected tool calls are on a non-first prompt turn (multi-turn case)", async () => {
+    const user = userEvent.setup();
+    // Multi-turn case: turn 1 has no expected tool calls, turn 2 has one.
+    const multiTurnCase = {
+      ...caseDoc,
+      isNegativeTest: false,
+      expectedToolCalls: [],
+      promptTurns: [
+        {
+          id: "turn-1",
+          prompt: "First prompt",
+          expectedToolCalls: [],
+        },
+        {
+          id: "turn-2",
+          prompt: "Second prompt",
+          expectedToolCalls: [{ toolName: "some_tool", arguments: {} }],
+        },
+      ],
+    };
+
+    useQueryMock.mockImplementation((name: string, args: unknown) => {
+      if (name === "testSuites:listTestCases") {
+        return [multiTurnCase];
+      }
+      if (name === "testSuites:getTestSuite") {
+        return {
+          _id: "suite-1",
+          environment: { servers: ["srv"] },
+        };
+      }
+      if (name === "testSuites:listTestIterations" && args !== "skip") {
+        return [baseIteration];
+      }
+      if (
+        name === "testSuites:getTestIteration" &&
+        typeof args === "object" &&
+        args !== null &&
+        (args as { iterationId?: string }).iterationId === baseIteration._id
+      ) {
+        return baseIteration;
+      }
+      return undefined;
+    });
+    streamEvalTestCaseMock.mockImplementation(
+      async () => new Promise<void>(() => {}),
+    );
+
+    renderWithProviders(
+      <TestTemplateEditor
+        suiteId="suite-1"
+        selectedTestCaseId="case-1"
+        connectedServerNames={new Set(["srv"])}
+        workspaceId={null}
+        availableModels={[
+          {
+            provider: "openai",
+            id: "gpt-4",
+            model: "gpt-4",
+            name: "GPT-4",
+            label: "GPT-4",
+          } as any,
+        ]}
+      />,
+      { hostStyle: "claude" },
+    );
+
+    await waitFor(() => {
+      expect(screen.getByRole("button", { name: /run$/i })).toBeInTheDocument();
+    });
+
+    await user.click(screen.getByRole("button", { name: /run$/i }));
+
+    await waitFor(() => {
+      expect(streamEvalTestCaseMock).toHaveBeenCalledTimes(1);
+    });
+
+    // The pre-stream preview TraceViewer must be rendered in tools mode with
+    // the expected tool call flattened from turn 2.
+    await waitFor(() => {
+      const props = getLatestTraceViewerProps();
+      expect(props.forcedViewMode).toBe("tools");
+      expect(props.expectedToolCalls).toEqual([
+        { toolName: "some_tool", arguments: {} },
+      ]);
+    });
+  });
+
   it("shows the host-style pill only while the chat tab is active", async () => {
     const user = userEvent.setup();
     const caseWithExpectedToolCalls = {
@@ -882,6 +1052,10 @@ describe("TestTemplateEditor run view from route", () => {
 
     const card = getCompareCard("GPT-4");
 
+    // Default tab is Results when the case has expected tools — host-style pill is Chat-only.
+    expect(card.querySelector("[data-selected-host-style]")).toBeNull();
+
+    await user.click(within(card).getByRole("button", { name: /^Chat$/i }));
     expect(card.querySelector('[data-selected-host-style="claude"]')).not.toBe(
       null,
     );
diff --git a/mcpjam-inspector/client/src/components/evals/__tests__/trace-viewer.test.tsx b/mcpjam-inspector/client/src/components/evals/__tests__/trace-viewer.test.tsx
@@ -589,6 +589,21 @@ describe("TraceViewer", () => {
     expect(screen.getByText("Actual")).toBeInTheDocument();
   });
 
+  it("shows a loading spinner next to Actual in tools compare view when isLoading", () => {
+    render(
+      <TraceViewer
+        trace={simpleTextTrace}
+        estimatedDurationMs={100}
+        expectedToolCalls={[{ toolName: "a", arguments: {} }]}
+        actualToolCalls={[]}
+        forcedViewMode="tools"
+        isLoading
+      />,
+    );
+    expect(screen.getByTestId("trace-viewer-tools-compare")).toBeInTheDocument();
+    expect(screen.getByTestId("trace-viewer-actual-loading")).toBeInTheDocument();
+  });
+
   it("hides Tools tab when there are no expected or actual tool calls", async () => {
     render(<TraceViewer trace={simpleTextTrace} estimatedDurationMs={100} />);
     expect(await screen.findByText("Estimated total only")).toBeInTheDocument();
diff --git a/mcpjam-inspector/client/src/components/evals/eval-trace-surface.tsx b/mcpjam-inspector/client/src/components/evals/eval-trace-surface.tsx
@@ -31,6 +31,8 @@ interface EvalTraceSurfaceProps {
   traceBlob?: TraceEnvelope | null;
   traceBlobLoading?: boolean;
   traceBlobError?: string | null;
+  /** Run in progress; shows a spinner beside "Actual" in Results (tools) mode, same signal as metric spinners. */
+  isLoading?: boolean;
   toolsMetadata: Record<string, Record<string, unknown>>;
   toolServerMap: ToolServerMap;
   connectedServerIds: string[];
@@ -78,6 +80,7 @@ export function EvalTraceSurface({
   traceBlob,
   traceBlobLoading,
   traceBlobError,
+  isLoading = false,
   toolsMetadata,
   toolServerMap,
   connectedServerIds,
@@ -188,6 +191,7 @@ export function EvalTraceSurface({
         <TraceViewer
           trace={activeTrace}
           model={traceModel}
+          isLoading={isLoading}
           toolsMetadata={toolsMetadata}
           toolServerMap={toolServerMap}
           connectedServerIds={connectedServerIds}
diff --git a/mcpjam-inspector/client/src/components/evals/test-template-editor.tsx b/mcpjam-inspector/client/src/components/evals/test-template-editor.tsx
@@ -58,6 +58,7 @@ import {
 } from "./single-test-case-runner";
 import {
   deriveLegacyPromptFields,
+  flattenAssertedExpectedToolCalls,
   resolveIterationDisplayExpectedToolCalls,
   resolvePromptTurns,
   stripPromptTurnsFromAdvancedConfig,
@@ -1246,12 +1247,15 @@ export function TestTemplateEditor({
 
     compareHandlesInFlightRef.current += 1;
     setIsRunningCompare(true);
+    const previewExpectedToolCalls = flattenAssertedExpectedToolCalls(savePayload);
+    const defaultRunColumnTab: RunColumnTab =
+      previewExpectedToolCalls.length > 0 ? "tools" : "chat";
     setRunColumnTabByModel((previous) => ({
       ...previous,
       ...Object.fromEntries(
         runModelValues.map((modelValue) => [
           modelValue,
-          "chat" as RunColumnTab,
+          defaultRunColumnTab,
         ]),
       ),
     }));
@@ -1284,6 +1288,7 @@ export function TestTemplateEditor({
           completedAt: null,
           error: null,
           previewTrace: comparePreviewTrace,
+          previewExpectedToolCalls,
         };
       }
       for (const { modelValue, modelLabel, error } of preparationFailures) {
@@ -2380,10 +2385,15 @@ function RunColumn({
         record.completedAt ??
         record.modelValue,
     });
-  const expectedToolCalls = resolveIterationDisplayExpectedToolCalls(
-    record.iteration?.testCaseSnapshot,
-    testCase,
-  );
+  // Prefer the iteration snapshot (authoritative) once available; otherwise
+  // fall back to previewExpectedToolCalls captured from the in-memory form at
+  // run-start so unsaved edits are reflected in showToolsTab / the pre-stream
+  // Results preview before the persisted testCase is updated.
+  const expectedToolCalls = record.iteration?.testCaseSnapshot
+    ? resolveIterationDisplayExpectedToolCalls(record.iteration.testCaseSnapshot, null)
+    : record.previewExpectedToolCalls != null
+      ? record.previewExpectedToolCalls
+      : resolveIterationDisplayExpectedToolCalls(null, testCase);
   const actualToolCalls =
     record.iteration?.actualToolCalls ?? record.streamingActualToolCalls ?? [];
   const showToolsTab =
@@ -2610,6 +2620,7 @@ function RunColumn({
           traceBlob={persistedTraceBlob}
           traceBlobLoading={persistedTraceLoading}
           traceBlobError={persistedTraceError}
+          isLoading={isRunningRecord}
           toolsMetadata={toolsMetadata}
           toolServerMap={toolServerMap}
           connectedServerIds={connectedServerIds}
@@ -2643,7 +2654,7 @@ function RunColumn({
         <TraceViewer
           trace={streamingTraceEnvelope}
           forcedViewMode={traceMode}
-          isLoading={traceMode === "chat" && isRunningRecord}
+          isLoading={isRunningRecord}
           expectedToolCalls={expectedToolCalls}
           actualToolCalls={actualToolCalls}
           toolsMetadata={toolsMetadata}
@@ -2655,7 +2666,7 @@ function RunColumn({
       </div>
     )
   ) : record.status === "running" && !record.iteration ? (
-    traceMode === "chat" && activeLiveChatTrace ? (
+    (traceMode === "chat" || traceMode === "tools") && activeLiveChatTrace ? (
       <div className="flex min-h-0 min-w-0 flex-1 flex-col">
         <TraceViewer
           trace={activeLiveChatTrace}
@@ -2664,8 +2675,10 @@ function RunColumn({
             name: record.modelLabel,
             provider: record.provider as any,
           }}
-          forcedViewMode="chat"
+          forcedViewMode={traceMode === "tools" ? "tools" : "chat"}
           isLoading={true}
+          expectedToolCalls={expectedToolCalls}
+          actualToolCalls={actualToolCalls}
           toolsMetadata={toolsMetadata}
           toolServerMap={toolServerMap}
           connectedServerIds={connectedServerIds}
diff --git a/mcpjam-inspector/client/src/components/evals/trace-viewer.tsx b/mcpjam-inspector/client/src/components/evals/trace-viewer.tsx
@@ -53,6 +53,10 @@ export type TraceViewerEvalToolCall = {
 interface TraceViewerProps {
   trace: TraceEnvelope | TraceMessage | TraceMessage[] | null;
   model?: ModelDefinition;
+  /**
+   * Chat: forwarded to the transcript `Thread`. Tools (Results): shows a spinner
+   * beside "Actual" while the run is still in progress.
+   */
   isLoading?: boolean;
   toolsMetadata?: Record<string, Record<string, any>>;
   toolServerMap?: ToolServerMap;
@@ -670,8 +674,15 @@ export function TraceViewer({
               )}
             </div>
             <div className="flex min-h-0 min-w-0 flex-1 flex-col gap-2 rounded-md border border-border/40 bg-muted/10 p-3">
-              <div className="shrink-0 text-xs font-medium text-muted-foreground uppercase">
+              <div className="flex shrink-0 items-center gap-1.5 text-xs font-medium text-muted-foreground uppercase">
                 Actual
+                {isLoading ? (
+                  <Loader2
+                    className="h-3 w-3 shrink-0 animate-spin"
+                    aria-hidden
+                    data-testid="trace-viewer-actual-loading"
+                  />
+                ) : null}
               </div>
               {actualToolCalls.length === 0 ? (
                 <div className="text-xs text-muted-foreground italic">
diff --git a/mcpjam-inspector/client/src/components/evals/types.ts b/mcpjam-inspector/client/src/components/evals/types.ts