Merge pull request #13 from Significant-Gravitas/symphony/SYM-16

Swiftyos · web-flow · commit b1ecb2d868e8 · 2026-04-15T17:23:38.000+02:00
feat(cli): add --scenario flag with name matching and list command
diff --git a/docs/product-specs/current-state.md b/docs/product-specs/current-state.md
@@ -7,6 +7,7 @@ Last validated against `platform.md`: 2026-04-13
 - [x] YAML validation succeeds for well-formed data
 - [x] Evaluation run records ordered results and artifacts
 - [x] Scenario filters narrow execution to matching scenarios
+- [x] List command shows available scenarios
 - [x] Dry-run mode records intent without contacting external systems
 - [x] Judge requests preserve cache-friendly prompt prefixes
 - [x] Parallel mode overlaps scenario execution while preserving ordering
diff --git a/docs/product-specs/e2e-checklist.md b/docs/product-specs/e2e-checklist.md
@@ -7,6 +7,7 @@ Derived from `platform.md`. Every scenario should have a coverage owner.
 | YAML validation succeeds for well-formed data | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
 | Evaluation run records ordered results and artifacts | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
 | Scenario filters narrow execution to matching scenarios | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
+| List command shows available scenarios | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
 | Dry-run mode records intent without contacting external systems | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
 | Judge requests preserve cache-friendly prompt prefixes | `tests/unit/judge.test.ts` | ✅ covered |
 | Parallel mode overlaps scenario execution while preserving ordering | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
diff --git a/docs/product-specs/platform.md b/docs/product-specs/platform.md
@@ -27,10 +27,20 @@ humans can use to inspect pass/fail outcomes
 
 **Given** valid endpoint, scenario, persona, and rubric YAML files that define
 multiple scenarios and tags
-**When** the user runs an evaluation suite with `--scenario-id` or `--tags`
+**When** the user runs an evaluation suite with `--scenario` (or `--scenario-id`)
+or `--tags`
 **Then** the CLI runs only the matching scenarios, records the selected
 scenario IDs in run history, and fails fast before any endpoint traffic when no
-scenario matches the requested filters
+scenario matches the requested filters. The `--scenario` flag accepts one or
+more comma-separated values that match by scenario ID or scenario name. When no
+match is found, the error message lists all available scenario IDs and names.
+
+### List command shows available scenarios
+
+**Given** a scenario file or directory containing scenario YAML files
+**When** the user runs the `list` command with `--scenarios`
+**Then** the CLI prints each scenario's ID, name, and tags, and returns a
+non-zero exit code when none exist or none match the optional `--tags` filter
 
 ### Dry-run mode records intent without contacting external systems
 
diff --git a/src/cli/main.ts b/src/cli/main.ts
@@ -237,8 +237,20 @@ function selectDashboardScenarios(options: {
       .map((item) => item.trim())
       .filter(Boolean),
   );
+  const requestedScenarioIds = options.scenarioId
+    ? new Set(
+        options.scenarioId
+          .split(",")
+          .map((item) => item.trim())
+          .filter(Boolean),
+      )
+    : undefined;
   const selectedScenarios = scenarioCollection.scenarios.filter((scenario) => {
-    if (options.scenarioId && scenario.id !== options.scenarioId) {
+    if (
+      requestedScenarioIds &&
+      !requestedScenarioIds.has(scenario.id) &&
+      !requestedScenarioIds.has(scenario.name)
+    ) {
       return false;
     }
     if (
@@ -310,6 +322,9 @@ async function handleRun(args: string[]): Promise<number> {
     );
   }
 
+  const scenarioId =
+    parseOption(args, "--scenario") ?? parseOption(args, "--scenario-id");
+
   const client = new OpenAiResponsesClient();
   client.assertConfigured();
   const recorder = new SqliteRunRecorder(
@@ -331,7 +346,7 @@ async function handleRun(args: string[]): Promise<number> {
       dashboard.state.primeScenarios(
         selectDashboardScenarios({
           scenariosPath: scenarios,
-          scenarioId: parseOption(args, "--scenario-id"),
+          scenarioId,
           tags: parseOption(args, "--tags"),
           repeat,
         }),
@@ -345,7 +360,7 @@ async function handleRun(args: string[]): Promise<number> {
       scenarios,
       personas,
       rubric,
-      scenarioId: parseOption(args, "--scenario-id"),
+      scenarioId,
       tags: parseOption(args, "--tags"),
       client,
       recorder,
@@ -365,6 +380,45 @@ async function handleRun(args: string[]): Promise<number> {
   }
 }
 
+async function handleList(
+  args: string[],
+  globalDataPath?: string,
+): Promise<number> {
+  const scenariosPath =
+    parseOption(args, "--scenarios") ?? globalDataPath ?? "data";
+  const tags = parseOption(args, "--tags");
+  const scenarioCollection = parseScenariosInput(scenariosPath);
+
+  const requestedTags = new Set(
+    (tags ?? "")
+      .split(",")
+      .map((item) => item.trim())
+      .filter(Boolean),
+  );
+
+  const selectedScenarios = scenarioCollection.scenarios.filter((scenario) => {
+    if (
+      requestedTags.size > 0 &&
+      !scenario.tags.some((tag) => requestedTags.has(tag))
+    ) {
+      return false;
+    }
+    return true;
+  });
+
+  if (selectedScenarios.length === 0) {
+    console.error("No scenarios found.");
+    return 1;
+  }
+
+  for (const scenario of selectedScenarios) {
+    const tagSuffix =
+      scenario.tags.length > 0 ? ` [${scenario.tags.join(", ")}]` : "";
+    console.log(`${scenario.id}: ${scenario.name}${tagSuffix}`);
+  }
+  return 0;
+}
+
 async function handleReport(
   args: string[],
   globalDataPath?: string,
@@ -449,6 +503,9 @@ export async function executeCli(argv: string[]): Promise<number> {
     if (command === "validate") {
       return await handleValidate(rest, globalDataPath);
     }
+    if (command === "list") {
+      return await handleList(rest, globalDataPath);
+    }
     if (command === "run") {
       return await handleRun(rest);
     }
diff --git a/src/domains/evaluation/run-suite.ts b/src/domains/evaluation/run-suite.ts
@@ -937,10 +937,17 @@ export async function runSuite(options: {
         .filter(Boolean),
     );
 
+    const requestedIds = new Set(
+      (options.scenarioId ?? "")
+        .split(",")
+        .map((item) => item.trim())
+        .filter(Boolean),
+    );
+
     let selectedScenarios = [...scenarioCollection.scenarios];
-    if (options.scenarioId) {
+    if (requestedIds.size > 0) {
       selectedScenarios = selectedScenarios.filter(
-        (item) => item.id === options.scenarioId,
+        (item) => requestedIds.has(item.id) || requestedIds.has(item.name),
       );
     }
     if (requestedTags.size > 0) {
@@ -949,6 +956,14 @@ export async function runSuite(options: {
       );
     }
     if (selectedScenarios.length === 0) {
+      if (options.scenarioId) {
+        const available = scenarioCollection.scenarios.map(
+          (s) => `${s.id} (${s.name})`,
+        );
+        throw new AgentProbeConfigError(
+          `No scenario matching "${options.scenarioId}" found. Available: ${available.join(", ")}`,
+        );
+      }
       throw new AgentProbeConfigError(
         "No scenarios matched the requested filters.",
       );
diff --git a/tests/e2e/cli.e2e.test.ts b/tests/e2e/cli.e2e.test.ts
@@ -528,6 +528,85 @@ describe("bun e2e baseline for the typescript cli", () => {
     expect(backend.countByKind("send_message")).toBe(1);
   });
 
+  test("--scenario flag alias works like --scenario-id", async () => {
+    await workspace.writeOpenAiScript(buildOpenAiRules());
+
+    const result = await runAgentprobe(
+      [
+        "run",
+        "--endpoint",
+        workspace.endpointPath,
+        "--scenarios",
+        workspace.scenariosPath,
+        "--personas",
+        workspace.personasPath,
+        "--rubric",
+        workspace.rubricPath,
+        "--scenario",
+        "billing-followup",
+      ],
+      {
+        backendUrl: backend.url,
+        suiteDir: workspace.suiteDir,
+        workspace,
+      },
+    );
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout).not.toContain("refund-smoke");
+    expect(result.stdout).toContain("PASS billing-followup score=0.80");
+
+    const runRows = queryRows(
+      workspace.dbPath,
+      ["selected_scenario_ids_json"],
+      "runs",
+      "started_at DESC",
+    );
+    expect(runRows[0]?.selected_scenario_ids_json).toEqual([
+      "billing-followup",
+    ]);
+  });
+
+  test("--scenario filters by scenario name", async () => {
+    await workspace.writeOpenAiScript(buildOpenAiRules());
+
+    const result = await runAgentprobe(
+      [
+        "run",
+        "--endpoint",
+        workspace.endpointPath,
+        "--scenarios",
+        workspace.scenariosPath,
+        "--personas",
+        workspace.personasPath,
+        "--rubric",
+        workspace.rubricPath,
+        "--scenario",
+        "Billing escalation follow-up",
+      ],
+      {
+        backendUrl: backend.url,
+        suiteDir: workspace.suiteDir,
+        workspace,
+      },
+    );
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout).not.toContain("refund-smoke");
+    expect(result.stdout).toContain("PASS billing-followup score=0.80");
+
+    const runRows = queryRows(
+      workspace.dbPath,
+      ["selected_scenario_ids_json"],
+      "runs",
+      "started_at DESC",
+    );
+    expect(runRows[0]?.selected_scenario_ids_json).toEqual([
+      "billing-followup",
+    ]);
+    expect(backend.countByKind("send_message")).toBe(1);
+  });
+
   test("tag filtering runs only matching scenarios", async () => {
     await workspace.writeOpenAiScript(buildOpenAiRules());
 
@@ -566,6 +645,83 @@ describe("bun e2e baseline for the typescript cli", () => {
     expect(backend.countByKind("send_message")).toBe(1);
   });
 
+  test("comma-separated --scenario-id runs multiple specific scenarios", async () => {
+    await workspace.writeOpenAiScript(buildOpenAiRules());
+
+    const result = await runAgentprobe(
+      [
+        "run",
+        "--endpoint",
+        workspace.endpointPath,
+        "--scenarios",
+        workspace.scenariosPath,
+        "--personas",
+        workspace.personasPath,
+        "--rubric",
+        workspace.rubricPath,
+        "--scenario-id",
+        "refund-smoke,billing-followup",
+      ],
+      {
+        backendUrl: backend.url,
+        suiteDir: workspace.suiteDir,
+        workspace,
+      },
+    );
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout).toContain("PASS refund-smoke score=1.00");
+    expect(result.stdout).toContain("PASS billing-followup score=0.80");
+
+    const runRows = queryRows(
+      workspace.dbPath,
+      ["selected_scenario_ids_json"],
+      "runs",
+      "started_at DESC",
+    );
+    expect(runRows[0]?.selected_scenario_ids_json).toEqual([
+      "refund-smoke",
+      "billing-followup",
+    ]);
+    expect(backend.countByKind("send_message")).toBe(2);
+  });
+
+  test("list command shows available scenarios", async () => {
+    const result = await runAgentprobe(
+      ["list", "--scenarios", workspace.scenariosPath],
+      {
+        backendUrl: backend.url,
+        suiteDir: workspace.suiteDir,
+        workspace,
+      },
+    );
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout).toContain(
+      "refund-smoke: Refund smoke question [smoke]",
+    );
+    expect(result.stdout).toContain(
+      "billing-followup: Billing escalation follow-up [regression]",
+    );
+  });
+
+  test("list command with --tags filters scenarios", async () => {
+    const result = await runAgentprobe(
+      ["list", "--scenarios", workspace.scenariosPath, "--tags", "smoke"],
+      {
+        backendUrl: backend.url,
+        suiteDir: workspace.suiteDir,
+        workspace,
+      },
+    );
+
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout).toContain(
+      "refund-smoke: Refund smoke question [smoke]",
+    );
+    expect(result.stdout).not.toContain("billing-followup");
+  });
+
   test("no-match filtering returns a configuration error without target traffic", async () => {
     await workspace.writeOpenAiScript({ rules: [] });
 
@@ -598,6 +754,37 @@ describe("bun e2e baseline for the typescript cli", () => {
     expect(await readOpenAiLog(workspace.openAiLogPath)).toHaveLength(0);
   });
 
+  test("no-match scenario-id returns a configuration error with available ids", async () => {
+    await workspace.writeOpenAiScript({ rules: [] });
+
+    const result = await runAgentprobe(
+      [
+        "run",
+        "--endpoint",
+        workspace.endpointPath,
+        "--scenarios",
+        workspace.scenariosPath,
+        "--personas",
+        workspace.personasPath,
+        "--rubric",
+        workspace.rubricPath,
+        "--scenario-id",
+        "does-not-exist",
+      ],
+      {
+        backendUrl: backend.url,
+        suiteDir: workspace.suiteDir,
+        workspace,
+      },
+    );
+
+    expect(result.exitCode).toBe(2);
+    expect(result.stderr).toContain("does-not-exist");
+    expect(result.stderr).toContain("refund-smoke");
+    expect(result.stderr).toContain("billing-followup");
+    expect(backend.requestLog).toHaveLength(0);
+  });
+
   test("dry-run avoids backend and openai calls while still recording the run", async () => {
     await workspace.writeOpenAiScript({ rules: [] });
 
diff --git a/tests/unit/db.test.ts b/tests/unit/db.test.ts
@@ -414,7 +414,8 @@ describe("sqlite recorder", () => {
     expect(configRun.exitCode).toBe(2);
     expect(configRun.finalError).toEqual({
       type: "AgentProbeConfigError",
-      message: "No scenarios matched the requested filters.",
+      message:
+        'No scenario matching "missing-scenario" found. Available: smoke-scenario (Smoke)',
     });
 
     const runtimeRoot = makeTempDir("db-runtime-error");
diff --git a/tests/unit/runner.test.ts b/tests/unit/runner.test.ts