Skip to content

Commit b1ecb2d

Browse files
authored
Merge pull request #13 from Significant-Gravitas/symphony/SYM-16
feat(cli): add --scenario flag with name matching and list command
2 parents e150b1f + 3055fbe commit b1ecb2d

8 files changed

Lines changed: 584 additions & 8 deletions

File tree

docs/product-specs/current-state.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ Last validated against `platform.md`: 2026-04-13
77
- [x] YAML validation succeeds for well-formed data
88
- [x] Evaluation run records ordered results and artifacts
99
- [x] Scenario filters narrow execution to matching scenarios
10+
- [x] List command shows available scenarios
1011
- [x] Dry-run mode records intent without contacting external systems
1112
- [x] Judge requests preserve cache-friendly prompt prefixes
1213
- [x] Parallel mode overlaps scenario execution while preserving ordering

docs/product-specs/e2e-checklist.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ Derived from `platform.md`. Every scenario should have a coverage owner.
77
| YAML validation succeeds for well-formed data | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
88
| Evaluation run records ordered results and artifacts | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
99
| Scenario filters narrow execution to matching scenarios | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
10+
| List command shows available scenarios | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
1011
| Dry-run mode records intent without contacting external systems | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
1112
| Judge requests preserve cache-friendly prompt prefixes | `tests/unit/judge.test.ts` | ✅ covered |
1213
| Parallel mode overlaps scenario execution while preserving ordering | `tests/e2e/cli.e2e.test.ts` | ✅ covered |

docs/product-specs/platform.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,20 @@ humans can use to inspect pass/fail outcomes
2727

2828
**Given** valid endpoint, scenario, persona, and rubric YAML files that define
2929
multiple scenarios and tags
30-
**When** the user runs an evaluation suite with `--scenario-id` or `--tags`
30+
**When** the user runs an evaluation suite with `--scenario` (or `--scenario-id`)
31+
or `--tags`
3132
**Then** the CLI runs only the matching scenarios, records the selected
3233
scenario IDs in run history, and fails fast before any endpoint traffic when no
33-
scenario matches the requested filters
34+
scenario matches the requested filters. The `--scenario` flag accepts one or
35+
more comma-separated values that match by scenario ID or scenario name. When no
36+
match is found, the error message lists all available scenario IDs and names.
37+
38+
### List command shows available scenarios
39+
40+
**Given** a scenario file or directory containing scenario YAML files
41+
**When** the user runs the `list` command with `--scenarios`
42+
**Then** the CLI prints each scenario's ID, name, and tags, and returns a
43+
non-zero exit code when no scenarios remain after the optional `--tags` filter
3444

3545
### Dry-run mode records intent without contacting external systems
3646

src/cli/main.ts

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,20 @@ function selectDashboardScenarios(options: {
237237
.map((item) => item.trim())
238238
.filter(Boolean),
239239
);
240+
const requestedScenarioIds = options.scenarioId
241+
? new Set(
242+
options.scenarioId
243+
.split(",")
244+
.map((item) => item.trim())
245+
.filter(Boolean),
246+
)
247+
: undefined;
240248
const selectedScenarios = scenarioCollection.scenarios.filter((scenario) => {
241-
if (options.scenarioId && scenario.id !== options.scenarioId) {
249+
if (
250+
requestedScenarioIds &&
251+
!requestedScenarioIds.has(scenario.id) &&
252+
!requestedScenarioIds.has(scenario.name)
253+
) {
242254
return false;
243255
}
244256
if (
@@ -310,6 +322,9 @@ async function handleRun(args: string[]): Promise<number> {
310322
);
311323
}
312324

325+
const scenarioId =
326+
parseOption(args, "--scenario") ?? parseOption(args, "--scenario-id");
327+
313328
const client = new OpenAiResponsesClient();
314329
client.assertConfigured();
315330
const recorder = new SqliteRunRecorder(
@@ -331,7 +346,7 @@ async function handleRun(args: string[]): Promise<number> {
331346
dashboard.state.primeScenarios(
332347
selectDashboardScenarios({
333348
scenariosPath: scenarios,
334-
scenarioId: parseOption(args, "--scenario-id"),
349+
scenarioId,
335350
tags: parseOption(args, "--tags"),
336351
repeat,
337352
}),
@@ -345,7 +360,7 @@ async function handleRun(args: string[]): Promise<number> {
345360
scenarios,
346361
personas,
347362
rubric,
348-
scenarioId: parseOption(args, "--scenario-id"),
363+
scenarioId,
349364
tags: parseOption(args, "--tags"),
350365
client,
351366
recorder,
@@ -365,6 +380,45 @@ async function handleRun(args: string[]): Promise<number> {
365380
}
366381
}
367382

383+
async function handleList(
384+
args: string[],
385+
globalDataPath?: string,
386+
): Promise<number> {
387+
const scenariosPath =
388+
parseOption(args, "--scenarios") ?? globalDataPath ?? "data";
389+
const tags = parseOption(args, "--tags");
390+
const scenarioCollection = parseScenariosInput(scenariosPath);
391+
392+
const requestedTags = new Set(
393+
(tags ?? "")
394+
.split(",")
395+
.map((item) => item.trim())
396+
.filter(Boolean),
397+
);
398+
399+
const selectedScenarios = scenarioCollection.scenarios.filter((scenario) => {
400+
if (
401+
requestedTags.size > 0 &&
402+
!scenario.tags.some((tag) => requestedTags.has(tag))
403+
) {
404+
return false;
405+
}
406+
return true;
407+
});
408+
409+
if (selectedScenarios.length === 0) {
410+
console.error("No scenarios found.");
411+
return 1;
412+
}
413+
414+
for (const scenario of selectedScenarios) {
415+
const tagSuffix =
416+
scenario.tags.length > 0 ? ` [${scenario.tags.join(", ")}]` : "";
417+
console.log(`${scenario.id}: ${scenario.name}${tagSuffix}`);
418+
}
419+
return 0;
420+
}
421+
368422
async function handleReport(
369423
args: string[],
370424
globalDataPath?: string,
@@ -449,6 +503,9 @@ export async function executeCli(argv: string[]): Promise<number> {
449503
if (command === "validate") {
450504
return await handleValidate(rest, globalDataPath);
451505
}
506+
if (command === "list") {
507+
return await handleList(rest, globalDataPath);
508+
}
452509
if (command === "run") {
453510
return await handleRun(rest);
454511
}

src/domains/evaluation/run-suite.ts

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -937,10 +937,17 @@ export async function runSuite(options: {
937937
.filter(Boolean),
938938
);
939939

940+
const requestedIds = new Set(
941+
(options.scenarioId ?? "")
942+
.split(",")
943+
.map((item) => item.trim())
944+
.filter(Boolean),
945+
);
946+
940947
let selectedScenarios = [...scenarioCollection.scenarios];
941-
if (options.scenarioId) {
948+
if (requestedIds.size > 0) {
942949
selectedScenarios = selectedScenarios.filter(
943-
(item) => item.id === options.scenarioId,
950+
(item) => requestedIds.has(item.id) || requestedIds.has(item.name),
944951
);
945952
}
946953
if (requestedTags.size > 0) {
@@ -949,6 +956,14 @@ export async function runSuite(options: {
949956
);
950957
}
951958
if (selectedScenarios.length === 0) {
959+
if (options.scenarioId) {
960+
const available = scenarioCollection.scenarios.map(
961+
(s) => `${s.id} (${s.name})`,
962+
);
963+
throw new AgentProbeConfigError(
964+
`No scenario matching "${options.scenarioId}" found. Available: ${available.join(", ")}`,
965+
);
966+
}
952967
throw new AgentProbeConfigError(
953968
"No scenarios matched the requested filters.",
954969
);

tests/e2e/cli.e2e.test.ts

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,85 @@ describe("bun e2e baseline for the typescript cli", () => {
528528
expect(backend.countByKind("send_message")).toBe(1);
529529
});
530530

531+
test("--scenario flag alias works like --scenario-id", async () => {
532+
await workspace.writeOpenAiScript(buildOpenAiRules());
533+
534+
const result = await runAgentprobe(
535+
[
536+
"run",
537+
"--endpoint",
538+
workspace.endpointPath,
539+
"--scenarios",
540+
workspace.scenariosPath,
541+
"--personas",
542+
workspace.personasPath,
543+
"--rubric",
544+
workspace.rubricPath,
545+
"--scenario",
546+
"billing-followup",
547+
],
548+
{
549+
backendUrl: backend.url,
550+
suiteDir: workspace.suiteDir,
551+
workspace,
552+
},
553+
);
554+
555+
expect(result.exitCode).toBe(0);
556+
expect(result.stdout).not.toContain("refund-smoke");
557+
expect(result.stdout).toContain("PASS billing-followup score=0.80");
558+
559+
const runRows = queryRows(
560+
workspace.dbPath,
561+
["selected_scenario_ids_json"],
562+
"runs",
563+
"started_at DESC",
564+
);
565+
expect(runRows[0]?.selected_scenario_ids_json).toEqual([
566+
"billing-followup",
567+
]);
568+
});
569+
570+
test("--scenario filters by scenario name", async () => {
571+
await workspace.writeOpenAiScript(buildOpenAiRules());
572+
573+
const result = await runAgentprobe(
574+
[
575+
"run",
576+
"--endpoint",
577+
workspace.endpointPath,
578+
"--scenarios",
579+
workspace.scenariosPath,
580+
"--personas",
581+
workspace.personasPath,
582+
"--rubric",
583+
workspace.rubricPath,
584+
"--scenario",
585+
"Billing escalation follow-up",
586+
],
587+
{
588+
backendUrl: backend.url,
589+
suiteDir: workspace.suiteDir,
590+
workspace,
591+
},
592+
);
593+
594+
expect(result.exitCode).toBe(0);
595+
expect(result.stdout).not.toContain("refund-smoke");
596+
expect(result.stdout).toContain("PASS billing-followup score=0.80");
597+
598+
const runRows = queryRows(
599+
workspace.dbPath,
600+
["selected_scenario_ids_json"],
601+
"runs",
602+
"started_at DESC",
603+
);
604+
expect(runRows[0]?.selected_scenario_ids_json).toEqual([
605+
"billing-followup",
606+
]);
607+
expect(backend.countByKind("send_message")).toBe(1);
608+
});
609+
531610
test("tag filtering runs only matching scenarios", async () => {
532611
await workspace.writeOpenAiScript(buildOpenAiRules());
533612

@@ -566,6 +645,83 @@ describe("bun e2e baseline for the typescript cli", () => {
566645
expect(backend.countByKind("send_message")).toBe(1);
567646
});
568647

648+
test("comma-separated --scenario-id runs multiple specific scenarios", async () => {
649+
await workspace.writeOpenAiScript(buildOpenAiRules());
650+
651+
const result = await runAgentprobe(
652+
[
653+
"run",
654+
"--endpoint",
655+
workspace.endpointPath,
656+
"--scenarios",
657+
workspace.scenariosPath,
658+
"--personas",
659+
workspace.personasPath,
660+
"--rubric",
661+
workspace.rubricPath,
662+
"--scenario-id",
663+
"refund-smoke,billing-followup",
664+
],
665+
{
666+
backendUrl: backend.url,
667+
suiteDir: workspace.suiteDir,
668+
workspace,
669+
},
670+
);
671+
672+
expect(result.exitCode).toBe(0);
673+
expect(result.stdout).toContain("PASS refund-smoke score=1.00");
674+
expect(result.stdout).toContain("PASS billing-followup score=0.80");
675+
676+
const runRows = queryRows(
677+
workspace.dbPath,
678+
["selected_scenario_ids_json"],
679+
"runs",
680+
"started_at DESC",
681+
);
682+
expect(runRows[0]?.selected_scenario_ids_json).toEqual([
683+
"refund-smoke",
684+
"billing-followup",
685+
]);
686+
expect(backend.countByKind("send_message")).toBe(2);
687+
});
688+
689+
test("list command shows available scenarios", async () => {
690+
const result = await runAgentprobe(
691+
["list", "--scenarios", workspace.scenariosPath],
692+
{
693+
backendUrl: backend.url,
694+
suiteDir: workspace.suiteDir,
695+
workspace,
696+
},
697+
);
698+
699+
expect(result.exitCode).toBe(0);
700+
expect(result.stdout).toContain(
701+
"refund-smoke: Refund smoke question [smoke]",
702+
);
703+
expect(result.stdout).toContain(
704+
"billing-followup: Billing escalation follow-up [regression]",
705+
);
706+
});
707+
708+
test("list command with --tags filters scenarios", async () => {
709+
const result = await runAgentprobe(
710+
["list", "--scenarios", workspace.scenariosPath, "--tags", "smoke"],
711+
{
712+
backendUrl: backend.url,
713+
suiteDir: workspace.suiteDir,
714+
workspace,
715+
},
716+
);
717+
718+
expect(result.exitCode).toBe(0);
719+
expect(result.stdout).toContain(
720+
"refund-smoke: Refund smoke question [smoke]",
721+
);
722+
expect(result.stdout).not.toContain("billing-followup");
723+
});
724+
569725
test("no-match filtering returns a configuration error without target traffic", async () => {
570726
await workspace.writeOpenAiScript({ rules: [] });
571727

@@ -598,6 +754,37 @@ describe("bun e2e baseline for the typescript cli", () => {
598754
expect(await readOpenAiLog(workspace.openAiLogPath)).toHaveLength(0);
599755
});
600756

757+
test("no-match scenario-id returns a configuration error with available ids", async () => {
758+
await workspace.writeOpenAiScript({ rules: [] });
759+
760+
const result = await runAgentprobe(
761+
[
762+
"run",
763+
"--endpoint",
764+
workspace.endpointPath,
765+
"--scenarios",
766+
workspace.scenariosPath,
767+
"--personas",
768+
workspace.personasPath,
769+
"--rubric",
770+
workspace.rubricPath,
771+
"--scenario-id",
772+
"does-not-exist",
773+
],
774+
{
775+
backendUrl: backend.url,
776+
suiteDir: workspace.suiteDir,
777+
workspace,
778+
},
779+
);
780+
781+
expect(result.exitCode).toBe(2);
782+
expect(result.stderr).toContain("does-not-exist");
783+
expect(result.stderr).toContain("refund-smoke");
784+
expect(result.stderr).toContain("billing-followup");
785+
expect(backend.requestLog).toHaveLength(0);
786+
});
787+
601788
test("dry-run avoids backend and openai calls while still recording the run", async () => {
602789
await workspace.writeOpenAiScript({ rules: [] });
603790

tests/unit/db.test.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,8 @@ describe("sqlite recorder", () => {
414414
expect(configRun.exitCode).toBe(2);
415415
expect(configRun.finalError).toEqual({
416416
type: "AgentProbeConfigError",
417-
message: "No scenarios matched the requested filters.",
417+
message:
418+
'No scenario matching "missing-scenario" found. Available: smoke-scenario (Smoke)',
418419
});
419420

420421
const runtimeRoot = makeTempDir("db-runtime-error");

0 commit comments

Comments
 (0)