Skip to content

Commit 465c99b

Browse files
chelojimenez and claude authored
feat(evals): suite-level Default Execution Config editor (#2043)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 9a569c9 commit 465c99b

9 files changed

Lines changed: 414 additions & 5 deletions

File tree

mcpjam-inspector/client/src/components/CiEvalsTab.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ export function CiEvalsTab({
282282
latestRunBySuiteId,
283283
evalsNavigationContext: "ci-evals",
284284
projectServers: ciProjectServers,
285+
availableModels,
285286
});
286287

287288
const suiteAggregate = useMemo(() => {

mcpjam-inspector/client/src/components/EvalsTab.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ export function EvalsTab({
150150
latestRunBySuiteId,
151151
projectServers,
152152
isDirectGuest,
153+
availableModels,
153154
});
154155
const {
155156
deletingSuiteId,

mcpjam-inspector/client/src/components/evals/eval-runner.tsx

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ interface EvalRunnerProps {
5252
inline?: boolean;
5353
onSuccess?: (suiteId?: string) => void;
5454
preselectedServer?: string;
55+
/** Pre-populate model selection from suite.defaultConfig.modelId. User can still change it. */
56+
defaultModelId?: string;
5557
}
5658

5759
type StepKey = (typeof WIZARD_STEPS)[number]["key"];
@@ -151,6 +153,7 @@ export function EvalRunner({
151153
inline = false,
152154
onSuccess,
153155
preselectedServer,
156+
defaultModelId,
154157
}: EvalRunnerProps) {
155158
const [open, setOpen] = useState(false);
156159
const [isSubmitting, setIsSubmitting] = useState(false);
@@ -260,6 +263,18 @@ export function EvalRunner({
260263
return;
261264
}
262265

266+
// Suite defaultConfig takes precedence over stored preferences when present
267+
if (defaultModelId) {
268+
const suiteDefault = availableModels.find(
269+
(m) => String(m.id) === defaultModelId,
270+
);
271+
if (suiteDefault) {
272+
setSelectedModels([suiteDefault]);
273+
setHasRestoredPreferences(true);
274+
return;
275+
}
276+
}
277+
263278
if (savedPreferences?.modelIds && savedPreferences.modelIds.length > 0) {
264279
const matches = availableModels.filter((model) =>
265280
savedPreferences.modelIds.includes(model.id),
@@ -270,7 +285,7 @@ export function EvalRunner({
270285
}
271286

272287
setHasRestoredPreferences(true);
273-
}, [availableModels, savedPreferences, hasRestoredPreferences]);
288+
}, [availableModels, savedPreferences, hasRestoredPreferences, defaultModelId]);
274289

275290
useEffect(() => {
276291
if (typeof window === "undefined") return;

mcpjam-inspector/client/src/components/evals/suite-execution-config-editor.tsx

Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
import { useEffect, useState } from "react";
2+
import { Loader2, Settings2, Trash2 } from "lucide-react";
3+
import { Button } from "@mcpjam/design-system/button";
4+
import { Label } from "@mcpjam/design-system/label";
5+
import {
6+
Select,
7+
SelectContent,
8+
SelectItem,
9+
SelectTrigger,
10+
SelectValue,
11+
} from "@mcpjam/design-system/select";
12+
import { Slider } from "@mcpjam/design-system/slider";
13+
import { Textarea } from "@mcpjam/design-system/textarea";
14+
import type { EvalSuite } from "./types";
15+
import type { ModelDefinition } from "@/shared/types";
16+
17+
/** A suite default config that is guaranteed to be present (the `onSave` payload). */
type SuiteDefaultConfig = NonNullable<EvalSuite["defaultConfig"]>;

/** Props accepted by `SuiteExecutionConfigEditor`. */
type SuiteExecutionConfigEditorProps = {
  /** Suite being edited; only its id and saved default config are read. */
  suite: Pick<EvalSuite, "_id" | "defaultConfig">;
  /** Models offered in the model dropdown. */
  availableModels: ModelDefinition[];
  /** Persists the drafted config; invoked when the user clicks "Save config". */
  onSave: (defaultConfig: SuiteDefaultConfig) => Promise<void>;
  /**
   * Removes the saved config. Optional: the "Remove" button is rendered only
   * when this is provided and the suite currently has a `defaultConfig`.
   */
  onClear?: () => Promise<void>;
};
25+
26+
// Fallback temperature used for the draft, the saved-value comparison, and the
// slider callback whenever no temperature value is available.
const DEFAULT_TEMPERATURE = 0.7;
27+
28+
/**
 * Editor for a suite's default execution config: model (with provider),
 * system prompt, and temperature.
 *
 * The component keeps a local draft seeded from `suite.defaultConfig`. The
 * draft is re-seeded whenever the suite id or any saved scalar changes, and
 * is compared against the saved values to enable/disable Reset and Save.
 *
 * `onSave` / `onClear` may reject. A rejection is caught here so it does not
 * surface as an unhandled promise rejection (the handlers are fired from
 * click callbacks via `void`); the draft is left untouched so the user can
 * retry. Reporting the failure to the user is the caller's responsibility.
 */
export function SuiteExecutionConfigEditor({
  suite,
  availableModels,
  onSave,
  onClear,
}: SuiteExecutionConfigEditorProps) {
  const [modelId, setModelId] = useState(suite.defaultConfig?.modelId ?? "");
  const [provider, setProvider] = useState(suite.defaultConfig?.provider ?? "");
  const [systemPrompt, setSystemPrompt] = useState(
    suite.defaultConfig?.systemPrompt ?? ""
  );
  const [temperature, setTemperature] = useState(
    suite.defaultConfig?.temperature ?? DEFAULT_TEMPERATURE
  );
  const [isSaving, setIsSaving] = useState(false);
  const [isClearing, setIsClearing] = useState(false);

  // All inputs and actions are locked while either request is in flight.
  const isBusy = isSaving || isClearing;

  // Depend on scalar values, not the object reference: a parent re-render
  // that produces a fresh `suite.defaultConfig` with identical values would
  // otherwise stomp in-progress edits. `suite._id` is also a dep so drafts
  // can't leak across suites when the editor stays mounted (parent doesn't
  // key it by suite id) and two suites happen to share the same scalars.
  useEffect(() => {
    setModelId(suite.defaultConfig?.modelId ?? "");
    setProvider(suite.defaultConfig?.provider ?? "");
    setSystemPrompt(suite.defaultConfig?.systemPrompt ?? "");
    setTemperature(suite.defaultConfig?.temperature ?? DEFAULT_TEMPERATURE);
  }, [
    suite._id,
    suite.defaultConfig?.modelId,
    suite.defaultConfig?.provider,
    suite.defaultConfig?.systemPrompt,
    suite.defaultConfig?.temperature,
  ]);

  // For suites saved before provider was tracked, fall back to the first
  // matching model so the Select still renders the saved choice. Computed at
  // render time so we don't have to add availableModels as an effect dep
  // (which would risk stomping in-progress edits on parent re-renders).
  const displayProvider =
    provider ||
    (modelId
      ? availableModels.find((m) => String(m.id) === modelId)?.provider ?? ""
      : "");

  const savedModelId = suite.defaultConfig?.modelId ?? "";
  const savedProvider = suite.defaultConfig?.provider ?? "";
  const savedSystemPrompt = suite.defaultConfig?.systemPrompt ?? "";
  const savedTemperature =
    suite.defaultConfig?.temperature ?? DEFAULT_TEMPERATURE;

  // The draft is dirty as soon as any field differs from the saved config.
  const isDirty =
    modelId !== savedModelId ||
    provider !== savedProvider ||
    systemPrompt !== savedSystemPrompt ||
    temperature !== savedTemperature;

  // Discard the draft and return every field to its saved value.
  const handleReset = () => {
    setModelId(savedModelId);
    setProvider(savedProvider);
    setSystemPrompt(savedSystemPrompt);
    setTemperature(savedTemperature);
  };

  const handleSave = async () => {
    if (!modelId) return;
    setIsSaving(true);
    try {
      await onSave({
        modelId,
        // `displayProvider` already prefers the explicit provider and only
        // falls back to the looked-up one; omit the field when neither exists.
        provider: displayProvider || undefined,
        systemPrompt,
        temperature,
      });
    } catch {
      // This handler is invoked as `void handleSave()`, so a rejection that
      // escaped here would become an unhandled promise rejection. The caller
      // of `onSave` is expected to report the failure; the draft is kept so
      // the user can retry.
    } finally {
      setIsSaving(false);
    }
  };

  const handleClear = async () => {
    if (!onClear) return;
    setIsClearing(true);
    try {
      await onClear();
    } catch {
      // Same reasoning as handleSave: avoid an unhandled rejection and leave
      // the current state in place for a retry.
    } finally {
      setIsClearing(false);
    }
  };

  return (
    <section className="space-y-3">
      <div>
        <h2 className="text-base font-semibold text-foreground">
          Default Execution Config
        </h2>
        <p className="mt-1 text-xs text-muted-foreground">
          The model and parameters all iterations in this suite inherit. Per-case{" "}
          <code className="rounded bg-muted px-1 py-0.5 text-[10px]">
            advancedConfig
          </code>{" "}
          overrides take precedence.
        </p>
      </div>

      <div className="space-y-4 rounded-xl border bg-card/60 p-4">
        {/* Model */}
        <div>
          <Label className="text-xs font-medium text-muted-foreground">
            Model
          </Label>
          {/* Encode provider into the Select value so colliding model ids
              across providers (e.g. native OpenAI gpt-4o vs OpenRouter
              gpt-4o) are saved with the correct provider. */}
          <Select
            value={modelId ? `${displayProvider}:${modelId}` : ""}
            onValueChange={(value) => {
              // Split on the FIRST colon only: everything after it is the
              // model id, which may itself contain colons.
              const sep = value.indexOf(":");
              const nextProvider = sep >= 0 ? value.slice(0, sep) : "";
              const nextId = sep >= 0 ? value.slice(sep + 1) : value;
              setProvider(nextProvider);
              setModelId(nextId);
            }}
            disabled={isBusy}
          >
            <SelectTrigger className="mt-1.5 border-0 bg-muted/50 transition-colors hover:bg-muted">
              <SelectValue placeholder="Select a model" />
            </SelectTrigger>
            <SelectContent>
              {availableModels.map((model) => {
                const value = `${model.provider}:${String(model.id)}`;
                return (
                  <SelectItem key={value} value={value}>
                    {model.name}
                  </SelectItem>
                );
              })}
            </SelectContent>
          </Select>
        </div>

        {/* System prompt */}
        <div>
          <Label className="text-xs font-medium text-muted-foreground">
            System prompt
          </Label>
          <p className="mb-1.5 mt-0.5 text-[10px] text-muted-foreground">
            Instructions given to the model at the start of each run.
          </p>
          <Textarea
            value={systemPrompt}
            onChange={(e) => setSystemPrompt(e.target.value)}
            placeholder="You are a helpful assistant…"
            className="min-h-[80px] resize-y border-0 bg-muted/50 text-sm"
            disabled={isBusy}
          />
        </div>

        {/* Temperature */}
        <div>
          <div className="flex items-center justify-between">
            <Label className="text-xs font-medium text-muted-foreground">
              Temperature
            </Label>
            <span className="text-xs text-muted-foreground">
              {temperature.toFixed(2)}
            </span>
          </div>
          <Slider
            min={0}
            max={2}
            step={0.05}
            value={[temperature]}
            onValueChange={(values) =>
              setTemperature(values[0] ?? DEFAULT_TEMPERATURE)
            }
            className="mt-3"
            disabled={isBusy}
          />
        </div>
      </div>

      <div className="flex flex-wrap items-center justify-between gap-3 rounded-lg bg-muted/40 px-3 py-2 text-xs text-muted-foreground">
        <div className="flex items-center gap-2">
          <Settings2 className="h-3.5 w-3.5" />
          <span>
            {modelId
              ? `Default: ${modelId}`
              : "No default model configured"}
          </span>
        </div>
        <div className="flex items-center gap-2">
          {onClear && suite.defaultConfig ? (
            <Button
              type="button"
              variant="ghost"
              size="sm"
              onClick={() => void handleClear()}
              disabled={isBusy}
              className="text-destructive hover:text-destructive"
            >
              {isClearing ? (
                <Loader2 className="mr-1 h-3.5 w-3.5 animate-spin" />
              ) : (
                <Trash2 className="mr-1 h-3.5 w-3.5" />
              )}
              Remove
            </Button>
          ) : null}
          <Button
            type="button"
            variant="ghost"
            size="sm"
            onClick={handleReset}
            disabled={!isDirty || isBusy}
          >
            Reset
          </Button>
          <Button
            type="button"
            size="sm"
            onClick={() => void handleSave()}
            disabled={!isDirty || isBusy || !modelId}
          >
            {isSaving ? (
              <Loader2 className="mr-2 h-4 w-4 animate-spin" />
            ) : null}
            Save config
          </Button>
        </div>
      </div>
    </section>
  );
}

mcpjam-inspector/client/src/components/evals/suite-iterations-view.tsx

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import { TestCaseDetailView } from "./test-case-detail-view";
1818
import { SuiteDashboard } from "./suite-dashboard";
1919
import { EvalExportModal } from "./eval-export-modal";
2020
import { SuiteEnvironmentEditor } from "./suite-environment-editor";
21+
import { SuiteExecutionConfigEditor } from "./suite-execution-config-editor";
2122
import { useSuiteData, useRunDetailData } from "./use-suite-data";
2223
import type {
2324
EvalCase,
@@ -853,6 +854,53 @@ export function SuiteIterationsView({
853854
/>
854855
) : null}
855856

857+
<SuiteExecutionConfigEditor
858+
suite={suite}
859+
availableModels={availableModels}
860+
onSave={async (defaultConfig) => {
861+
try {
862+
await updateSuite({
863+
suiteId: suite._id,
864+
defaultConfig,
865+
});
866+
toast.success("Suite execution config updated");
867+
} catch (error) {
868+
toast.error(
869+
getBillingErrorMessage(
870+
error,
871+
"Failed to update suite execution config",
872+
),
873+
);
874+
console.error(
875+
"Failed to update suite execution config:",
876+
error,
877+
);
878+
throw error;
879+
}
880+
}}
881+
onClear={async () => {
882+
try {
883+
await updateSuite({
884+
suiteId: suite._id,
885+
defaultConfig: null,
886+
});
887+
toast.success("Suite execution config removed");
888+
} catch (error) {
889+
toast.error(
890+
getBillingErrorMessage(
891+
error,
892+
"Failed to remove suite execution config",
893+
),
894+
);
895+
console.error(
896+
"Failed to remove suite execution config:",
897+
error,
898+
);
899+
throw error;
900+
}
901+
}}
902+
/>
903+
856904
{/* Suite Description Section */}
857905
<div className="space-y-3">
858906
<div>

mcpjam-inspector/client/src/components/evals/types.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ export type EvalSuite = {
4545
};
4646
_creationTime?: number; // Convex auto field
4747
tags?: string[];
48+
defaultConfig?: {
49+
modelId: string;
50+
provider?: string;
51+
systemPrompt: string;
52+
temperature: number;
53+
};
4854
};
4955

5056
export type EvalCase = {

0 commit comments

Comments
 (0)