diff --git a/.github/skills/.vally.yaml b/.github/skills/.vally.yaml index 4ee187c6e8e..eaea06f0450 100644 --- a/.github/skills/.vally.yaml +++ b/.github/skills/.vally.yaml @@ -12,12 +12,15 @@ paths: evalFilenames: ["eval.yaml", "*.eval.yaml"] environments: + # Launch the pre-built DLLs via `dotnet <dll>`, NOT `dotnet run` — avoids the + # MSBuild boot race under parallel workers. See issue #15948. + # CI builds the DLLs in the 'Build MCP servers' step of skill-eval.yml. azsdk-mcp: mcpServers: azure-sdk-mcp: type: stdio command: dotnet - args: ["run", "--project", "../../tools/azsdk-cli/Azure.Sdk.Tools.Cli", "--", "start"] + args: ["../../artifacts/bin/Azure.Sdk.Tools.Cli/Debug/net8.0/azsdk.dll", "start"] timeout: "60s" env: AZSDKTOOLS_AGENT_TESTING: "true" @@ -27,5 +30,5 @@ environments: azure-sdk-mcp: type: stdio command: dotnet - args: ["run", "--project", "../../tools/azsdk-cli/Azure.Sdk.Tools.Mock"] + args: ["../../artifacts/bin/Azure.Sdk.Tools.Mock/Debug/net8.0/azsdk-mock.dll"] timeout: "60s" diff --git a/eng/pipelines/skill-eval.yml b/eng/pipelines/skill-eval.yml index c8d5d2e3ebd..24e930d9e0e 100644 --- a/eng/pipelines/skill-eval.yml +++ b/eng/pipelines/skill-eval.yml @@ -42,6 +42,14 @@ jobs: - script: npm install -g @github/copilot-sdk displayName: 'Install Copilot SDK' + # Pre-build the MCP servers so vally launches `dotnet <dll>` instead of + # `dotnet run` — avoids the MSBuild boot race under parallel workers. + # See issue #15948. 
+ - script: | + dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli -c Debug --nologo + dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug --nologo + displayName: 'Build MCP servers' + - script: | input_areas=$(echo "${{ parameters.areas }}" | xargs) if [ -n "$input_areas" ]; then diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs b/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs index b092008b4db..24acb887424 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs @@ -64,12 +64,15 @@ public class UpdateReleasePlanHandler : IMockToolHandler public class GetReleasePlanForSpecPrHandler : IMockToolHandler { public string ToolName => "azsdk_get_release_plan_for_spec_pr"; + // Deterministic "not found" — keeps the create-release-plan flow honest in + // eval scenarios. Stimuli that target an existing plan pass the work-item + // ID directly and call azsdk_get_release_plan instead. See #15948. public CommandResponse Handle(Dictionary? 
arguments) => new ReleasePlanResponse { TypeSpecProject = "specification/contosowidgetmanager/Contoso.WidgetManager", PackageType = SdkType.Dataplane, - Message = "Release plan found for spec PR (mock)", - ReleasePlanDetails = ReleasePlanMockResponses.ContosoWorkItem() + Message = "No release plan found for the given spec PR (mock)", + ReleasePlanDetails = null }; } diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore new file mode 100644 index 00000000000..80a68f12750 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore @@ -0,0 +1,2 @@ +vally-results/ +results/ diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml new file mode 100644 index 00000000000..20254a644c8 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -0,0 +1,92 @@ +# Vally configuration for Azure SDK Tools MCP tool / scenario evaluations. +# See: https://vally.dev/reference/vally-config +# +# These are scenario evals (does the agent invoke the right MCP tool(s) for a +# given prompt?) and are intentionally separate from the per-skill evals under +# .github/skills/. See README.md for context. + +paths: + evals: [evals/] + evalFilenames: ["*.eval.yaml"] + results: results/ + +environments: + # Launch the pre-built DLL via `dotnet `, NOT `dotnet run` — avoids the + # MSBuild boot race under parallel workers. See issue #15948. + # Run `dotnet build ../Azure.Sdk.Tools.Mock -c Debug` once before vally. + azsdk-mcp-mock: + mcpServers: + azure-sdk-mcp: + type: stdio + command: dotnet + args: ["../../../artifacts/bin/Azure.Sdk.Tools.Mock/Debug/net8.0/azsdk-mock.dll"] + timeout: "30s" + + # Live MCP. AZSDKTOOLS_AGENT_TESTING=true keeps write tools inside the test + # area. Pre-built DLL pattern — see issue #15948. + # Run `dotnet build ../Azure.Sdk.Tools.Cli -c Debug` once before vally. 
+ azsdk-mcp-live: + mcpServers: + azure-sdk-mcp: + type: stdio + command: dotnet + args: ["../../../artifacts/bin/Azure.Sdk.Tools.Cli/Debug/net8.0/azsdk.dll", "start"] + timeout: "5m" + env: + AZSDKTOOLS_AGENT_TESTING: "true" + AZSDKTOOLS_COLLECT_TELEMETRY: "false" + +# Suites group evals for selective execution. +# +# Layout maps directly to suites — no tag-based mock/live filtering. Vally's +# suite filter is positive-match only (AND across keys, OR within values), +# so subfolders are the cleanest way to split mock vs live. See +# https://github.com/microsoft/vally suite-filter source. +suites: + # ---- by tier ---- + unit: + description: | + Hermetic single-tool / trigger evals. No external I/O. Fast; the + foundation of the PR gate. + evals: ["evals/tools/*.eval.yaml"] + + scenarios-mock: + description: | + Multi-tool scenarios against the mock MCP environment. Hermetic; safe + for PR gate. + evals: ["evals/workflow-scenarios/mock/*.eval.yaml"] + + scenarios-live: + description: | + Scenarios against live MCP — real DevOps / GitHub / pipelines. Slow; + nightly only. Prime any required clones first via + `evals/setup/ensure-specs-clone.ps1`. + evals: ["evals/workflow-scenarios/live/*.eval.yaml"] + + # ---- composite suites ---- + pr-gate: + description: Hermetic tiers only (unit + scenarios-mock). Target for CI PR check. + evals: + - "evals/tools/*.eval.yaml" + - "evals/workflow-scenarios/mock/*.eval.yaml" + nightly: + description: All tiers including live scenarios. + evals: ["evals/**/*.eval.yaml"] + + # ---- by feature area (tag-filtered) ---- + release-plan: + description: All evals tagged area=release-plan. + filter: { area: release-plan } + evals: ["evals/**/*.eval.yaml"] + typespec: + description: All evals tagged area=typespec. + filter: { area: typespec } + evals: ["evals/**/*.eval.yaml"] + pipeline: + description: All evals tagged area=pipeline. 
+ filter: { area: pipeline } + evals: ["evals/**/*.eval.yaml"] + github: + description: All evals tagged area=github. + filter: { area: github } + evals: ["evals/**/*.eval.yaml"] diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md new file mode 100644 index 00000000000..8359c4dc9ba --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -0,0 +1,341 @@ +# Azure.Sdk.Tools.Vally + +MCP-tool / end-to-end scenario evaluations for the `azsdk` MCP server, run via +[`@microsoft/vally-cli`](https://www.npmjs.com/package/@microsoft/vally-cli). + +## Tool-scenario evals vs. skill evals + +The repo runs **two complementary eval surfaces**, both via the same +`@microsoft/vally-cli` binary. They answer different questions and live in +different folders. A full end-to-end gate runs *both*. + +| | **Tool-scenario evals** (this project) | **Skill evals** | +|---|---|---| +| **Question** | Given a user prompt, does the agent invoke the right MCP tool(s) with the right shape? | Given a user prompt, does the agent route to the right skill and follow its instructions? 
| +| **Catches** | Tool name / description / parameter regressions; multi-tool ordering; tool-catalog conflicts | Skill frontmatter / `description` / instruction regressions; skill-routing collisions | +| **Path** | [`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/`](evals/) (`tools/` + `workflow-scenarios/`) | [`.github/skills//evals/*.eval.yaml`](../../../.github/skills/) (and `evaluate/evals/` for capability suites) | +| **Loaded subject** | Production MCP server (`Azure.Sdk.Tools.Cli`) over stdio — real tools, real network calls | Skill's `SKILL.md` + frontmatter; the agent picks tools itself | +| **Primary grader** | `tool-calls` — checks the recorded trajectory for required tool names | Trigger / routing graders + per-skill rubric | +| **Run command** | `vally eval --eval-spec evals/tools/.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/` *from repo root* | +| **CI status** | Not wired yet (see follow-ups) | `vally lint` runs in [.github/workflows/skill-eval.yml](../../../.github/workflows/skill-eval.yml); full `eval` job pending | +| **Cost profile** | Higher — each run spins up the MCP server, real LLM turns (~5–15), real tool calls | Variable — trigger evals are cheap; capability evals (e.g. `azure-typespec-author`) are expensive | + +### Why both? + +A skill *uses* tools, but a tool can be invoked **without** any skill +(Copilot picks it directly from the catalog when the user prompt doesn't +trigger a skill — which is most prompts in practice). Concretely: + +- Drop tool-scenario evals → you stop catching regressions when someone + renames a tool, edits its description, or adds an overlapping tool that + the model now prefers. +- Drop skill evals → you stop catching regressions when someone edits a + skill's `description`, frontmatter, or instruction body and the router + stops invoking it for the right prompts. 
+ +For workflows where a skill is a thin wrapper around one tool, the two +evals have meaningful overlap and you may keep just one. For workflows +where the skill does real orchestration (multi-tool sequencing, +conditional branches, recovery), both matter independently. + +### Scenarios checked in today + +**Tool-scenario evals (this project)** — organised by the standard test pyramid under [`evals/`](evals/). The folder is the **cost tier** (and CI cadence); the feature **area** is a tag inside each YAML so cross-cuts work via `.vally.yaml` suite filters. + +#### `evals/unit/` — hermetic single-tool evals (18) + +One prompt → one expected MCP tool. No `environment.git`, no fixtures. Fast; safe to run on every PR. Includes the per-tool **trigger** coverage ported from [#15183](https://github.com/Azure/azure-sdk-tools/pull/15183) (`triggers-*.eval.yaml`). + +| Scenario | Area | Shape | +|---|---|---| +| [`check-public-repo`](evals/unit/check-public-repo.eval.yaml) | typespec | Is a TypeSpec project published in `azure-rest-api-specs`? 
| +| [`validate-typespec`](evals/unit/validate-typespec.eval.yaml) | typespec | Run `tsp` linter/validation | +| [`get-modified-typespec-projects`](evals/unit/get-modified-typespec-projects.eval.yaml) | typespec | Git-aware tool against current branch | +| [`add-arm-resource`](evals/unit/add-arm-resource.eval.yaml) | typespec | Calls `azsdk_typespec_generate_authoring_plan` for an ARM resource | +| [`create-release-plan`](evals/unit/create-release-plan.eval.yaml) | release-plan | Create a release-plan work item | +| [`link-namespace-approval-issue`](evals/unit/link-namespace-approval-issue.eval.yaml) | release-plan | Link an existing approval issue to a release plan | +| [`get-pr-link-current-branch`](evals/unit/get-pr-link-current-branch.eval.yaml) | github | Resolve the PR for the active git branch | +| [`check-sdk-generation-status`](evals/unit/check-sdk-generation-status.eval.yaml) | pipeline | Pipeline status lookup | +| [`triggers-apiview`](evals/unit/triggers-apiview.eval.yaml) | apiview | `azsdk_apiview_*` | +| [`triggers-config`](evals/unit/triggers-config.eval.yaml) | engsys | `azsdk_check_service_label`, `azsdk_create_service_label` | +| [`triggers-engsys`](evals/unit/triggers-engsys.eval.yaml) | engsys | `azsdk_analyze_log_file`, failed-test tools, codeowner-cache | +| [`triggers-github`](evals/unit/triggers-github.eval.yaml) | github | `azsdk_create_pull_request`, `azsdk_get_pull_request*`, `azsdk_get_github_user_details` | +| [`triggers-package`](evals/unit/triggers-package.eval.yaml) | package | `azsdk_package_*`, `azsdk_release_sdk` | +| [`triggers-pipeline`](evals/unit/triggers-pipeline.eval.yaml) | pipeline | `azsdk_analyze_pipeline`, `azsdk_get_pipeline_*` | +| [`triggers-releaseplan`](evals/unit/triggers-releaseplan.eval.yaml) | release-plan | `azsdk_*_release_plan*`, `azsdk_run_generate_sdk`, `azsdk_link_*` | +| [`triggers-typespec`](evals/unit/triggers-typespec.eval.yaml) | typespec | `azsdk_typespec_*`, `azsdk_convert_swagger_to_typespec`, 
`azsdk_customized_code_update`, `azsdk_run_typespec_validation` | +| [`triggers-verify`](evals/unit/triggers-verify.eval.yaml) | engsys | `azsdk_verify_setup` | + +The companion [`scripts/Validate-EvalTools.ps1`](scripts/Validate-EvalTools.ps1) cross-checks that every tool referenced in `evals/unit/triggers-*.eval.yaml` exists on the running MCP server, and every server tool has at least one trigger. + +#### `evals/scenarios/` — multi-tool scenarios (4) + +Multi-step prompts that exercise 2+ MCP tools end-to-end. Split into +`mock/` (hermetic, runs on PR gate) and `live/` (real DevOps / GitHub / +pipelines, runs nightly). + +| Scenario | Area | Mode | Shape | +|---|---|---|---| +| [`check-public-repo-then-validate`](evals/scenarios/mock/check-public-repo-then-validate.eval.yaml) | typespec | mock | Validate, then check public-repo presence | +| [`typespec-generation-step02`](evals/scenarios/mock/typespec-generation-step02.eval.yaml) | typespec | mock | Step in the spec-PR generation flow | +| [`rename-client-property`](evals/scenarios/mock/rename-client-property.eval.yaml) | typespec | mock | Stub — needs `expected-diff` grader + sparse clone | +| [`release-planner`](evals/scenarios/live/release-planner.eval.yaml) | release-plan | **live** | Create + re-fetch a release plan, kick off SDK gen, link PR back — real DevOps test-area writes | + +Live scenarios need a primed `azure-rest-api-specs` clone — run +[`evals/setup/ensure-specs-clone.ps1`](evals/setup/ensure-specs-clone.ps1) +(auto-refreshes every 24h) before invoking the `scenarios-live` / `nightly` suite. + +**Skill evals (already in repo, *not* part of this PR)** — for reference: + +- **Trigger evals** (one per skill, verify routing): see e.g. 
+ [`.github/skills/azsdk-common-prepare-release-plan/evals/trigger.eval.yaml`](../../../.github/skills/azsdk-common-prepare-release-plan/evals/trigger.eval.yaml), + plus `azsdk-common-sdk-release`, `azsdk-common-pipeline-troubleshooting`, + `azsdk-common-apiview-feedback-resolution`, `sensei`, + `skill-authoring`, `markdown-token-optimizer`. +- **Capability suite** for [`azure-typespec-author`](../../../.github/skills/azure-typespec-author/) — + 29 numbered cases under + [`.github/skills/azure-typespec-author/evaluate/evals/`](../../../.github/skills/azure-typespec-author/evaluate/evals/) + (`001001.eval.yaml` … `005001.eval.yaml`). These are the data-driven + TypeSpec authoring scenarios that *would* have been our follow-up #1 + here — they're already covered as skill evals, so this project doesn't + re-port them. + +This project supersedes the deleted `Azure.Sdk.Tools.Cli.Benchmarks` project +(removed in [#15697](https://github.com/Azure/azure-sdk-tools/pull/15697)) and +tracks the migration in +[#15124](https://github.com/Azure/azure-sdk-tools/issues/15124). + +## Layout + +``` +Azure.Sdk.Tools.Vally/ +├── .vally.yaml # Vally config (environments + suites) +├── evals/ +│ ├── tools/ # tool-shape + per-skill trigger evals, hermetic +│ ├── workflow-scenarios/ +│ │ ├── mock/ # multi-tool scenarios, hermetic (PR gate) +│ │ └── live/ # multi-tool scenarios, live MCP (nightly) +│ ├── setup/ # helper scripts (e.g. ensure-specs-clone.ps1) +│ └── fixtures/ # (future) pinned SHAs + per-eval mocks +├── fixtures/ # Per-scenario static input files (env.files) +│ └── <scenario>/... +├── scripts/ # Repo-side helpers (Validate-EvalTools.ps1, …) +└── Graders/ # (future) Custom .NET graders + └── Azure.Sdk.Tools.Vally.csproj # added when first custom grader lands +``` + +Folder = tier (cost / CI cadence): `tools/` is hermetic + fast, +`workflow-scenarios/mock/` is multi-tool hermetic, `workflow-scenarios/live/` is multi-tool +against real services. 
Vally's suite filter is positive-match only, so the +mock-vs-live split lives on disk rather than in tags. Feature **area** still +lives as a `tags:` entry inside each YAML so cross-cuts (e.g. "all +release-plan evals") select via [`.vally.yaml`](.vally.yaml) suite filters +or `vally eval --tag`. + +## Quickstart — run one scenario + +The fastest path from a fresh clone to a green eval. Swap the path after +`-e` for any other `.eval.yaml` to try a different scenario. + +### 1. One-time setup + +```powershell +# From repo root +cd eng/skill-eval; npm ci; cd ../.. +dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli -c Debug +dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug +``` + +Rebuild the MCP servers any time you edit tool source. Vally does **not** +rebuild them — it just spawns the existing DLL. + +### 2. Move into this project and stash the paths + +All commands below run from here: + +```powershell +cd tools/azsdk-cli/Azure.Sdk.Tools.Vally +$vally = '../../../eng/skill-eval/node_modules/.bin/vally.cmd' +$skills = '../../../.github/skills' +``` + +### 3. Run a scenario + +**One trigger eval** (~30 s, hermetic): + +```powershell +& $vally eval -e evals/tools/create-release-plan.eval.yaml --skill-dir $skills +``` + +**The release-planner mock workflow** (~4 min, 5 stimuli, hermetic): + +```powershell +& $vally eval -e evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml --skill-dir $skills +``` + +**The release-planner live workflow** (~15 min, real DevOps writes to the +test area; prime the spec clone once): + +```powershell +./evals/setup/ensure-specs-clone.ps1 +& $vally eval -e evals/workflow-scenarios/live/release-planner.eval.yaml --skill-dir $skills --workers 1 +``` + +### 4. 
Pick a different scenario + +```powershell +# List everything you can pass to -e +Get-ChildItem evals -Recurse -Filter *.eval.yaml | ForEach-Object FullName +``` + +Common swaps: + +| What you want | Replace `-e` value with | +|---|---| +| A different release-plan trigger | `evals/tools/link-namespace-approval-issue.eval.yaml` | +| A TypeSpec workflow | `evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml` | +| All triggers for one feature | drop `-e` and use `--suite typespec` (or `release-plan`, `github`, …) | +| Everything hermetic | drop `-e` and use `--suite pr-gate` | + +### 5. Read the results + +- Live PASS/FAIL table prints to the terminal. +- Full trajectories land in `results//`. The most useful file + is `results.jsonl` — one line per stimulus run, with the prompt, every + tool call, and the final agent message. +- Add `--output-dir vally-results/` if you want a stable path + to re-open later. + +## Running locally (advanced) + +Prereqs are the same as the [Quickstart](#quickstart--run-one-scenario) +plus Node 22+ and a .NET SDK matching `global.json`. + +Run a suite (recommended): + +```powershell +cd tools/azsdk-cli/Azure.Sdk.Tools.Vally +$vally = '../../../eng/skill-eval/node_modules/.bin/vally.cmd' +$skills = '../../../.github/skills' + +# Fast tiers only — PR-gate candidate +& $vally eval --suite pr-gate --skill-dir $skills + +# A single tier +& $vally eval --suite unit --skill-dir $skills +& $vally eval --suite scenarios-mock --skill-dir $skills + +# By feature area (cross-cuts tiers via tag filter) +& $vally eval --suite release-plan --skill-dir $skills +& $vally eval --suite typespec --skill-dir $skills +``` + +> `--skill-dir` is **required** for workflow-scenario evals — without it, +> the agent never loads the project skills and the `skill-invocation` +> grader fails even when the tool calls are correct. 
+> +> Each agent still boots its own MCP child process, but `.vally.yaml` +> launches the **pre-built** `azsdk-mock.dll` / `azsdk.dll` via +> `dotnet <dll>` (read-only memory-map, no MSBuild on the hot path), so +> `--workers 6+` is safe for `scenarios-mock`. The old MSBuild boot race +> is gone; the only remaining concurrency limit is the rate limit on the +> Copilot CLI subprocesses. + +Run a single eval: + +```powershell +& $vally eval --eval-spec evals/tools/check-public-repo.eval.yaml --skill-dir $skills +``` + +Run the live scenarios tier (first, prime a per-user clone of +`azure-rest-api-specs`; the helper refreshes it every 24h): + +```powershell +./evals/setup/ensure-specs-clone.ps1 +& $vally eval --suite scenarios-live --skill-dir $skills --workers 1 +``` + +## Adding a new scenario + +1. **Pick a tier** — the folder you drop the YAML into: + - `evals/tools/` — one prompt, one MCP tool, no environment hooks. + - `evals/workflow-scenarios/mock/` — multi-tool flow against + `azsdk-mcp-mock`. Hermetic; runs on PR gate. + - `evals/workflow-scenarios/live/` — needs real DevOps / GitHub / + pipelines; bind `environment: azsdk-mcp-live`. Nightly only. +2. Pick a short, kebab-case name (e.g. `create-release-plan`). +3. Create `evals/<tier>/<name>.eval.yaml`. Start from a sibling in the same + tier as a template. +4. **Tag it** so suite filters pick it up: + ```yaml + tags: + area: release-plan # or typespec / pipeline / github / engsys / apiview / package + ``` +5. If the scenario needs input files, add them under + `fixtures/<name>/...` and reference them via `environment.files` in the + eval (relative paths from the eval file). +6. Pick graders — they’re a **list**, stack as many as you need: + - `tool-calls` — verify the agent invoked the expected MCP tool(s). + - `skill-invocation` — verify the right skill was invoked (workflow scenarios only). + - `tool-call-count` / `token-budget` / `turn-count` — chattiness / budget guards. + - `output-matches` / `output-contains` — assert final-message shape. 
+ - `file-matches` / `file-exists` — verify produced/modified files. + - `prompt` — LLM-as-judge for free-form quality checks. + - Custom (`Graders/`) — add a .NET grader when no built-in fits. +7. The suite picks it up automatically (folders are globbed). Add a new + tag-filtered suite to [`.vally.yaml`](.vally.yaml) only if you’re + introducing a brand-new feature area. +8. Run locally to confirm it passes, then open a PR. + +## Recovery checklist (from deleted benchmark) + +Tracked in [#15124](https://github.com/Azure/azure-sdk-tools/issues/15124). +All 9 deleted scenarios have been ported as Vally `tool-calls` evals (presence +checks). Items marked with **(stub)** have known gaps documented inline in the +eval file: + +- [x] `check-public-repo` +- [x] `check-public-repo-then-validate` +- [x] `validate-typespec` +- [x] `typespec-generation-step02` +- [x] `get-modified-typespec-projects` **(stub — needs git-repo fixture / setup hook)** +- [x] `add-arm-resource` **(stub — needs fixtures + `npx tsp compile` post-check)** +- [x] `create-release-plan` +- [x] `link-namespace-approval-issue` +- [x] `get-pr-link-current-branch` +- [x] `check-sdk-generation-status` +- [x] `rename-client-property` **(stub — needs `expected-diff` grader + sparse-clone of `azure-rest-api-specs`)** + +### Known gaps vs. the original benchmark + +The current `tool-calls` grader only checks tool *names*. The deleted +benchmark's `ToolCallValidator` additionally asserted: + +1. **Argument values** (e.g. `serviceTreeId`, `buildId`, `typeSpecProjectPath`). +2. **Forbidden tools** (e.g. "must NOT call `azsdk_verify_setup`"). +3. **Call order** (e.g. validate before check-public-repo). +4. **Optional tools** (calls that are allowed but not required). + +Recovering 1–4 requires either upstream grader support in +`@microsoft/vally-cli` or a custom .NET grader under `Graders/`. Until then +those constraints are captured in prompt text and inline `TODO:` comments. 
+ +### Follow-ups + +- [ ] Port `Evaluate_PromptToToolMatch` + `Evaluate_ToolDescriptionSimilarity` + from `Azure.Sdk.Tools.Cli.Evaluations` (still uses Copilot-SDK evaluator). +- [ ] File upstream issue against `@microsoft/vally-cli` to add `forbidden`, + `optional`, argument-matching, and ordering to the built-in `tool-calls` + grader (or accept that those gaps need custom graders). +- [ ] Wire a `vally eval` CI job for this project (current + [`.github/workflows/skill-eval.yml`](../../../.github/workflows/skill-eval.yml) + runs `vally lint` only and is scoped to skills). See + [#15126](https://github.com/Azure/azure-sdk-tools/issues/15126) and + [#15127](https://github.com/Azure/azure-sdk-tools/issues/15127). +- [ ] Decide on `AuthoringScenario` parity: the 29 TypeSpec authoring cases + are already covered as **skill evals** under + [`.github/skills/azure-typespec-author/evaluate/evals/`](../../../.github/skills/azure-typespec-author/evaluate/evals/). + Tracked as [#15767](https://github.com/Azure/azure-sdk-tools/issues/15767) — + likely close as duplicate unless we also want tool-level coverage of the + same prompts (catches catalog regressions even when the skill isn't + triggered). diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 new file mode 100644 index 00000000000..918d544edf8 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 @@ -0,0 +1,70 @@ +<# +.SYNOPSIS + Ensures a per-user shallow+sparse cache clone of Azure/azure-rest-api-specs + exists and is reasonably fresh. + +.DESCRIPTION + Run this before invoking a live suite (vally eval --suite scenarios-live, or nightly). + Maintains a cache clone that Vally's `environment.git.source` points at, + so individual eval YAMLs don't need a pre-existing checkout. 
+ + - First run: shallow + blobless + cone-sparse clone (only + specification/contosowidgetmanager/ to keep size minimal). + - Subsequent runs within -MaxAgeHours: noop. + - Subsequent runs past -MaxAgeHours: `git fetch --depth 1 origin main` and + fast-forward `main`. + + Cache lives at: + Windows: $env:USERPROFILE\.vally-cache\azure-rest-api-specs + *nix: $HOME/.vally-cache/azure-rest-api-specs + +.PARAMETER MaxAgeHours + Skip the `git fetch` if the cache was last refreshed within this many + hours. Default: 24. + +.PARAMETER SparseCheckoutPaths + Cone-sparse paths to include. Default: specification/contosowidgetmanager. + Pass @() to disable sparse-checkout (full tree). +#> +[CmdletBinding()] +param( + [int] $MaxAgeHours = 24, + [string[]] $SparseCheckoutPaths = @('specification/contosowidgetmanager') +) + +$ErrorActionPreference = 'Stop' +Set-StrictMode -Version 4 + +$cacheRoot = if ($env:USERPROFILE) { Join-Path $env:USERPROFILE '.vally-cache' } else { Join-Path $HOME '.vally-cache' } +$cache = Join-Path $cacheRoot 'azure-rest-api-specs' +$stamp = Join-Path $cache '.vally-last-fetch' + +if (-not (Test-Path (Join-Path $cache '.git'))) { + Write-Host "[ensure-specs-clone] Cloning azure-rest-api-specs into cache: $cache" + New-Item -ItemType Directory -Force -Path $cacheRoot | Out-Null + git clone --depth 1 --filter=blob:none --no-checkout ` + https://github.com/Azure/azure-rest-api-specs.git $cache | Out-Null + if ($SparseCheckoutPaths.Count -gt 0) { + git -C $cache sparse-checkout init --cone | Out-Null + git -C $cache sparse-checkout set @SparseCheckoutPaths | Out-Null + } + git -C $cache checkout main | Out-Null + Set-Content -Path $stamp -Value (Get-Date -Format o) +} else { + $isStale = $true + if (Test-Path $stamp) { + $age = (Get-Date) - (Get-Item $stamp).LastWriteTime + $isStale = $age.TotalHours -gt $MaxAgeHours + } + if ($isStale) { + Write-Host "[ensure-specs-clone] Refreshing cache (>$MaxAgeHours h old): $cache" + git -C $cache fetch --depth 1 origin 
main | Out-Null + git -C $cache reset --hard origin/main | Out-Null + Set-Content -Path $stamp -Value (Get-Date -Format o) + } else { + Write-Host "[ensure-specs-clone] Cache is fresh (<$MaxAgeHours h): $cache" + } +} + +# Echo the cache path so the wrapper can capture it. +Write-Output $cache diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/add-arm-resource.eval.yaml new file mode 100644 index 00000000000..fc96cc2be73 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/add-arm-resource.eval.yaml @@ -0,0 +1,44 @@ +name: azsdk-mcp-tool-scenarios +description: | + Add-arm-resource: end-to-end scenario for authoring a new ARM resource + via TypeSpec. This is a complex, file-producing scenario (not a single + tool-call check) that needs a real fixture + tsp compile verification. +version: "1.0" +type: capability + + +tags: + tier: unit + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: add-arm-resource + prompt: | + In the specification/widget/resource-manager/Microsoft.Widget/Widget project, + add an ARM resource named 'Asset' with CRUD operations. + constraints: + max_turns: 20 + max_tokens: 50000 + # TODO: seed a fixture (environment.files or git) for the Microsoft.Widget + # project, add `file-exists` + `file-contains` graders on the produced + # asset.tsp, and a `run-command` grader to verify `npx tsp compile`. 
+ graders: + - type: tool-calls + config: + required: + - edit + - azsdk_typespec_generate_authoring_plan + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-public-repo.eval.yaml new file mode 100644 index 00000000000..51e2ec3d129 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-public-repo.eval.yaml @@ -0,0 +1,43 @@ +name: azsdk-mcp-tool-scenarios +description: | + Tool-scenario evaluation suite for the azsdk MCP server. Verifies the + agent invokes the right MCP tools for given prompts, independent of any + specific skill. +version: "1.0" +type: capability + + +tags: + tier: unit + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: check-public-repo + prompt: | + Check if my TypeSpec project is in the public repo. + My setup has already been verified, do not run azsdk_verify_setup. + Project root: specification/contosowidgetmanager/Contoso.WidgetManager. + constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azsdk_typespec_check_project_in_public_repo + disallowed: + - azsdk_verify_setup + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-sdk-generation-status.eval.yaml new file mode 100644 index 00000000000..714d31cd732 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-sdk-generation-status.eval.yaml @@ -0,0 +1,42 @@ +name: azsdk-mcp-tool-scenarios +description: | + Check-sdk-generation-status: the agent should call azsdk_get_pipeline_status + to check the SDK generation pipeline status. 
+version: "1.0" +type: capability + + +tags: + tier: unit + area: pipeline + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: check-sdk-generation-status + prompt: | + Check the SDK generation pipeline status for build ID 5513110. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 5 + max_tokens: 5000 + # TODO: assert buildId=5513110 — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). + graders: + - type: tool-calls + config: + required: + - azsdk_get_pipeline_status + disallowed: + - azsdk_verify_setup + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/create-release-plan.eval.yaml new file mode 100644 index 00000000000..7b4a25f0725 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/create-release-plan.eval.yaml @@ -0,0 +1,112 @@ +# ============================================================================= +# Scenario: create-release-plan +# ----------------------------------------------------------------------------- +# Purpose: +# Tier-1 "tool-call" eval. Verify that, given a fully-specified prompt with +# all required context, the agent invokes `azsdk_create_release_plan` (presence +# check only) and does NOT redundantly call `azsdk_verify_setup` (the prompt already +# states setup is verified). +# +# What this eval is NOT: +# - Not an end-to-end flow (see workflow-scenarios/live/release-planner.eval.yaml for that). +# - Does not validate argument values yet — see TODO below + #15833. +# - Does not need azure-rest-api-specs cloned; runs against the mock MCP +# server (the `azsdk-mcp-mock` environment defined in +# ../../.vally.yaml). 
+# +# How to run locally: +# cd tools/azsdk-cli/Azure.Sdk.Tools.Vally +# ../../../eng/skill-eval/node_modules/.bin/vally.cmd eval \ +# --eval-spec evals/tools/create-release-plan.eval.yaml --verbose +# ============================================================================= + +name: azsdk-mcp-tool-scenarios +description: | + Create-release-plan: the agent should call azsdk_create_release_plan with + the supplied service-tree / product-tree / spec PR context. +version: "1.0" +type: capability + + +tags: + tier: unit + area: release-plan + +# `environment: azsdk-mcp-mock` refers to the named environment defined in +# ../../.vally.yaml (configures the mock azsdk MCP server). +environment: azsdk-mcp-mock + +config: + runs: 1 # bump for flakiness sampling (e.g. runs: 5) + timeout: 30m # total wall-clock budget for ALL stimuli in this file + model: gpt-5.4 # model alias — see .vally.yaml `models:` map + executor: copilot-sdk + +stimuli: + - name: create-release-plan + prompt: | + Create a release plan for the Contoso Widget Manager, no need to get it afterwards only create. + My setup has already been verified, do not run azsdk_verify_setup. Here is all the context you need: + TypeSpec project located at "specification/contosowidgetmanager/Contoso.WidgetManager". + Use service tree ID "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f", + product tree ID "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e", + target release timeline "December 2025", + API version "2022-11-01-preview", + SDK release type "beta", + and link it to the spec pull request "https://github.com/Azure/azure-rest-api-specs/pull/38387". + + # Per-stimulus guardrails. Anything beyond these fails the run. 
+ constraints: + max_turns: 8 # agent loop iterations + max_tokens: 8000 # cumulative token spend + + # TODO: assert serviceTreeId / productTreeId / specApiVersion / specPullRequestUrl / sdkReleaseType / typeSpecProjectPath — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). + # + # `graders:` is a LIST — stack as many as you want. Each grader produces a + # score in [0,1]; the `scoring.weights` block below combines them into the + # final scenario score. Available grader `type:` values include: + # + # static (deterministic, free): + # tool-calls, skill-invocation, has-output, no-errors, turn-completed, + # token-budget, tool-call-count, turn-count, error-count, wall-time, + # program, run-command, stdout-contains, stdout-matches, + # stderr-contains, exit-code, file-exists, file-contains, + # file-matches, output-contains, output-matches + # llm (model-judged, costs tokens): + # prompt, pairwise + # + # Example of stacking multiple graders (uncomment to use): + # + # graders: + # - type: tool-calls + # config: + # required: [azsdk_create_release_plan] + # disallowed: [azsdk_verify_setup] + # - type: skill-invocation # was a specific skill invoked? + # config: + # required: [release-planner] + # - type: tool-call-count # cap chattiness + # config: + # max: 5 + # - type: prompt # llm-judged correctness + # config: + # model: gpt-5.4 + # rubric: | + # Did the final assistant message confirm the release plan was + # created and surface its ID? Answer "pass" or "fail". + graders: + - type: tool-calls + config: + required: + - azsdk_create_release_plan + disallowed: + - azsdk_verify_setup + +# Combine grader scores into the final scenario score. +# Keys must match the grader `type:` (or its `name:` if you set one). +# `threshold` is the minimum weighted score for the scenario to PASS. 
+scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-modified-typespec-projects.eval.yaml new file mode 100644 index 00000000000..b62e3536c8b --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-modified-typespec-projects.eval.yaml @@ -0,0 +1,45 @@ +name: azsdk-mcp-tool-scenarios +description: | + Get-modified-typespec-projects: the agent should call + azsdk_get_modified_typespec_projects to list TypeSpec projects modified + in the current branch. +version: "1.0" +type: capability + + +tags: + tier: unit + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: get-modified-typespec-projects + prompt: | + List the TypeSpec projects modified in my current branch compared to main. + My setup has already been verified, do not run azsdk_verify_setup. + The repository root is the relative path ./azure-rest-api-specs. + constraints: + max_turns: 5 + max_tokens: 5000 + # TODO: seed a git worktree fixture (environment.git) with a modified + # tspconfig.yaml so the tool actually has a diff to report. 
+ graders: + - type: tool-calls + config: + required: + - azsdk_get_modified_typespec_projects + disallowed: + - azsdk_verify_setup + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-pr-link-current-branch.eval.yaml new file mode 100644 index 00000000000..bb61cd1f5c3 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-pr-link-current-branch.eval.yaml @@ -0,0 +1,43 @@ +name: azsdk-mcp-tool-scenarios +description: | + Get-pr-link-current-branch: the agent should call + azsdk_get_pull_request_link_for_current_branch when asked about the + status of the spec PR on the current branch. +version: "1.0" +type: capability + + +tags: + tier: unit + area: github + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: get-pr-link-current-branch + prompt: | + What's the status of the spec PR in my current branch? Only check the status once. + My setup has already been verified, do not run azsdk_verify_setup. + The repository root is the relative path ./azure-rest-api-specs. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azsdk_get_pull_request_link_for_current_branch + disallowed: + - azsdk_verify_setup + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/link-namespace-approval-issue.eval.yaml new file mode 100644 index 00000000000..aeddc254dd0 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/link-namespace-approval-issue.eval.yaml @@ -0,0 +1,42 @@ +name: azsdk-mcp-tool-scenarios +description: | + Link-namespace-approval-issue: the agent should call + azsdk_link_namespace_approval_issue to link an issue to a release plan. +version: "1.0" +type: capability + + +tags: + tier: unit + area: release-plan + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: link-namespace-approval-issue + prompt: | + Link namespace approval issue https://github.com/Azure/azure-sdk/issues/1234 to release plan 12345. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 5 + max_tokens: 5000 + # TODO: assert releasePlanWorkItemId=12345 and namespaceApprovalIssue URL — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). 
+ graders: + - type: tool-calls + config: + required: + - azsdk_link_namespace_approval_issue + disallowed: + - azsdk_verify_setup + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-apiview.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-apiview.eval.yaml new file mode 100644 index 00000000000..a1de2a6541f --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-apiview.eval.yaml @@ -0,0 +1,115 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. +version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + tier: unit + area: apiview + priority: p0 + +stimuli: + + # ==== azsdk_apiview_get_comments triggers ==== + - name: invoke-azsdk-apiview-get-comments-1 + prompt: "Get all the APIView comments for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_comments" + - name: invoke-azsdk-apiview-get-comments-2 + prompt: "Show me the API review feedback for this package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_comments" + - name: invoke-azsdk-apiview-get-comments-3 + prompt: "What comments did the API reviewers leave?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_comments" + + # ==== azsdk_apiview_get_copilot_review triggers ==== + - name: invoke-azsdk-apiview-get-copilot-review-1 + prompt: "Check if my Copilot review is done" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_copilot_review" + - name: invoke-azsdk-apiview-get-copilot-review-2 + prompt: "Get the results of my automated API review job" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_copilot_review" + - name: invoke-azsdk-apiview-get-copilot-review-3 + prompt: "What comments did the Copilot generate for my API review?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_copilot_review" + + # ==== azsdk_apiview_get_review_url triggers ==== + - name: invoke-azsdk-apiview-get-review-url-1 + prompt: "Get the APIView review link for the Azure.Storage.Blobs C# package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_review_url" + - name: invoke-azsdk-apiview-get-review-url-2 + prompt: "What is the APIView URL for the azure-core Python package?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_review_url" + - name: invoke-azsdk-apiview-get-review-url-3 + prompt: > + Give me the link to the API review page for the Java storage blob package version 12.32.0 + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_review_url" + + # ==== azsdk_apiview_request_copilot_review triggers ==== + - name: invoke-azsdk-apiview-request-copilot-review-1 + prompt: "Request a Copilot review for this API" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_request_copilot_review" + - name: invoke-azsdk-apiview-request-copilot-review-2 + prompt: "Run an automated review on my package's API surface" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_request_copilot_review" + - name: invoke-azsdk-apiview-request-copilot-review-3 + prompt: "Submit this APIView URL for an automated Copilot review" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_request_copilot_review" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-config.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-config.eval.yaml new file mode 100644 index 00000000000..d03ba74ad0a --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-config.eval.yaml @@ -0,0 +1,54 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + tier: unit + area: engsys + priority: p0 + +stimuli: + + # ==== azsdk_check_service_label triggers ==== + - name: invoke-azsdk-check-service-label-1 + prompt: "Check if a service label exists for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_check_service_label" + - name: invoke-azsdk-check-service-label-2 + prompt: "Does the service label for Contoso already exist?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_check_service_label" + + # ==== azsdk_create_service_label triggers ==== + - name: invoke-azsdk-create-service-label-1 + prompt: "Create a new service label for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_service_label" + - name: invoke-azsdk-create-service-label-2 + prompt: "Add a service label for Contoso Widget Manager" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_service_label" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-engsys.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-engsys.eval.yaml new file mode 100644 index 00000000000..5ee6b9c584b --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-engsys.eval.yaml @@ -0,0 +1,102 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + tier: unit + area: engsys + priority: p0 + +stimuli: + + # ==== azsdk_analyze_log_file triggers ==== + - name: invoke-azsdk-analyze-log-file-1 + prompt: "Analyze this log file for errors" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_analyze_log_file" + - name: invoke-azsdk-analyze-log-file-2 + prompt: "What errors are in this build log?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_analyze_log_file" + + # ==== azsdk_cleanup_ai_agents triggers ==== + - name: invoke-azsdk-cleanup-ai-agents-1 + prompt: "Clean up AI agents in my project" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_cleanup_ai_agents" + + # ==== azsdk_get_failed_test_case_data triggers ==== + - name: invoke-azsdk-get-failed-test-case-data-1 + prompt: "Get detailed information about a specific failed test" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_case_data" + - name: invoke-azsdk-get-failed-test-case-data-2 + prompt: "Show me the error message and stack trace for the failed test TestAuthentication" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_case_data" + + # ==== azsdk_get_failed_test_cases triggers ==== + - name: invoke-azsdk-get-failed-test-cases-1 + prompt: "Get the list of failed test cases from my test run" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_cases" + - name: invoke-azsdk-get-failed-test-cases-2 + prompt: "What tests failed in this TRX file?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_cases" + - name: invoke-azsdk-get-failed-test-cases-3 + prompt: "Show me which tests failed" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_cases" + + # ==== azsdk_get_failed_test_run_data triggers ==== + - name: invoke-azsdk-get-failed-test-run-data-1 + prompt: "Get complete details for all failed tests" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_run_data" + - name: invoke-azsdk-get-failed-test-run-data-2 + prompt: "Show me full information about all test failures including stack traces" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_run_data" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-github.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-github.eval.yaml new file mode 100644 index 00000000000..50ed5954c62 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-github.eval.yaml @@ -0,0 +1,79 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +tags: + tier: unit + area: github + priority: p0 + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +stimuli: + + # ==== azsdk_create_pull_request triggers ==== + - name: invoke-azsdk-create-pull-request-1 + prompt: "Create a pull request for my changes" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_pull_request" + + # ==== azsdk_get_github_user_details triggers ==== + - name: invoke-azsdk-get-github-user-details-1 + prompt: "Get details for GitHub user octocat" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_github_user_details" + - name: invoke-azsdk-get-github-user-details-2 + prompt: "Who is the GitHub user johndoe?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_github_user_details" + + # ==== azsdk_get_pull_request triggers ==== + - name: invoke-azsdk-get-pull-request-1 + prompt: "Get the details of my pull request" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pull_request" + - name: invoke-azsdk-get-pull-request-2 + prompt: "Show me the status and comments on PR #1234" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pull_request" + + # ==== azsdk_get_pull_request_link_for_current_branch triggers ==== + - name: invoke-azsdk-get-pull-request-link-for-current-branch-1 + prompt: "Get the PR link for my current branch" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pull_request_link_for_current_branch" + - name: invoke-azsdk-get-pull-request-link-for-current-branch-2 + prompt: "What's the pull request URL for this branch?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pull_request_link_for_current_branch" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-package.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-package.eval.yaml new file mode 100644 index 00000000000..e2d3217175a --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-package.eval.yaml @@ -0,0 +1,233 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. +version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +tags: + tier: unit + area: package + priority: p0 + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +stimuli: + + # ==== azsdk_package_build_code triggers ==== + - name: invoke-azsdk-package-build-code-1 + prompt: "Build my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_build_code" + - name: invoke-azsdk-package-build-code-2 + prompt: "Compile the code for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_build_code" + + # ==== azsdk_package_generate_code triggers ==== + - name: invoke-azsdk-package-generate-code-1 + prompt: "Generate SDK code from my TypeSpec" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_code" + - name: invoke-azsdk-package-generate-code-2 + prompt: "Run code generation for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_code" + + # ==== azsdk_package_generate_samples triggers ==== + - name: invoke-azsdk-package-generate-samples-1 + prompt: "Generate sample code for my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_samples" + - name: 
invoke-azsdk-package-generate-samples-2 + prompt: "Create sample code for my package based on these scenarios" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_samples" + - name: invoke-azsdk-package-generate-samples-3 + prompt: "Generate samples for my package using this prompt" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_samples" + + # ==== azsdk_package_pack triggers ==== + - name: invoke-azsdk-package-pack-1 + prompt: "Pack my SDK package into a distributable artifact" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_pack" + - name: invoke-azsdk-package-pack-2 + prompt: "Create distributable package artifacts for my SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_pack" + - name: invoke-azsdk-package-pack-3 + prompt: "Generate package artifacts for my SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_pack" + + # ==== azsdk_package_run_check triggers ==== + - name: invoke-azsdk-package-run-check-1 + prompt: "Run the azsdk package check command to validate my SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_check" + - name: invoke-azsdk-package-run-check-2 + prompt: "Run validation checks on my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_check" + - name: invoke-azsdk-package-run-check-3 + prompt: "Validate the changelog and dependencies for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_check" + + # ==== azsdk_package_run_tests triggers ==== + - name: invoke-azsdk-package-run-tests-1 + prompt: "Run tests for my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_tests" + - name: invoke-azsdk-package-run-tests-2 + prompt: "Run the tests for my specified SDK package" + graders: + - type: tool-calls + 
config: + required: + - name: "azsdk_package_run_tests" + + # ==== azsdk_package_translate_samples triggers ==== + - name: invoke-azsdk-package-translate-samples-1 + prompt: "Translate the sample code from the Python package to the Java package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_translate_samples" + - name: invoke-azsdk-package-translate-samples-2 + prompt: "Convert samples from the source package to the target language package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_translate_samples" + - name: invoke-azsdk-package-translate-samples-3 + prompt: "Translate SDK samples from one language to another" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_translate_samples" + + # ==== azsdk_package_update_changelog_content triggers ==== + - name: invoke-azsdk-package-update-changelog-content-1 + prompt: "Update the changelog for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_changelog_content" + - name: invoke-azsdk-package-update-changelog-content-2 + prompt: "Update the changelog content for my package with new release notes" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_changelog_content" + + # ==== azsdk_package_update_metadata triggers ==== + - name: invoke-azsdk-package-update-metadata-1 + prompt: "Update the package metadata" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_metadata" + - name: invoke-azsdk-package-update-metadata-2 + prompt: "Update the package metadata for my SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_metadata" + + # ==== azsdk_package_update_version triggers ==== + - name: invoke-azsdk-package-update-version-1 + prompt: "Update my package version" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_version" + - name: 
invoke-azsdk-package-update-version-2 + prompt: "Bump the version to 1.2.0" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_version" + + # ==== azsdk_release_sdk triggers ==== + - name: invoke-azsdk-release-sdk-1 + prompt: "Release my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_release_sdk" + - name: invoke-azsdk-release-sdk-2 + prompt: "Trigger the release pipeline for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_release_sdk" + - name: invoke-azsdk-release-sdk-3 + prompt: "Start the SDK release process for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_release_sdk" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-pipeline.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-pipeline.eval.yaml new file mode 100644 index 00000000000..5e283686437 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-pipeline.eval.yaml @@ -0,0 +1,77 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. +version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +tags: + tier: unit + area: pipeline + priority: p0 + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +stimuli: + + # ==== azsdk_analyze_pipeline triggers ==== + - name: invoke-azsdk-analyze-pipeline-1 + prompt: "Analyze my pipeline run" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_analyze_pipeline" + - name: invoke-azsdk-analyze-pipeline-2 + prompt: "What happened in this pipeline build?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_analyze_pipeline" + + # ==== azsdk_get_pipeline_llm_artifacts triggers ==== + - name: invoke-azsdk-get-pipeline-llm-artifacts-1 + prompt: "Get the LLM artifacts from my pipeline" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_llm_artifacts" + - name: invoke-azsdk-get-pipeline-llm-artifacts-2 + prompt: "Download the analysis artifacts from the pipeline run" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_llm_artifacts" + + # ==== azsdk_get_pipeline_status triggers ==== + - name: invoke-azsdk-get-pipeline-status-1 + prompt: "Check the status of my Azure pipeline build 12345678" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_status" + - name: invoke-azsdk-get-pipeline-status-2 + prompt: "Get the pipeline build status for run 9876543" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_status" + - name: invoke-azsdk-get-pipeline-status-3 + prompt: "Get the pipeline build status for my CI run" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_status" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-releaseplan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-releaseplan.eval.yaml new file mode 100644 index 00000000000..1c8f9277ede --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-releaseplan.eval.yaml @@ -0,0 +1,317 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +tags: + tier: unit + area: release-plan + priority: p0 + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +stimuli: + + # ==== azsdk_abandon_release_plan triggers ==== + - name: invoke-azsdk-abandon-release-plan-1 + prompt: "Abandon the release plan for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + - name: invoke-azsdk-abandon-release-plan-2 + prompt: "Cancel and abandon my release plan for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + - name: invoke-azsdk-abandon-release-plan-3 + prompt: "Mark the release plan for work item 12345 as abandoned" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + - name: invoke-azsdk-abandon-release-plan-4 + prompt: "Abandon my release plan for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + - name: invoke-azsdk-abandon-release-plan-5 + prompt: "Cancel the release plan for my service for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + + # ==== azsdk_check_api_spec_ready_for_sdk triggers ==== + - name: invoke-azsdk-check-api-spec-ready-for-sdk-1 + prompt: "Check if my API spec is ready to generate SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_check_api_spec_ready_for_sdk" + - name: invoke-azsdk-check-api-spec-ready-for-sdk-2 + prompt: "Is my TypeSpec ready for SDK generation?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_check_api_spec_ready_for_sdk" + + # ==== azsdk_create_release_plan triggers ==== + - name: invoke-azsdk-create-release-plan-1 + prompt: "Create a release plan for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_release_plan" + - name: invoke-azsdk-create-release-plan-2 + prompt: "Create a release plan for Contoso Widget Manager service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_release_plan" + + # ==== azsdk_get_release_plan triggers ==== + - name: invoke-azsdk-get-release-plan-1 + prompt: "Get the release plan for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_release_plan" + - name: invoke-azsdk-get-release-plan-2 + prompt: "Show me the release plan details" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_release_plan" + + # ==== azsdk_get_release_plan_for_spec_pr triggers ==== + - name: invoke-azsdk-get-release-plan-for-spec-pr-1 + prompt: "Get the release plan for my spec PR" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_release_plan_for_spec_pr" + - name: invoke-azsdk-get-release-plan-for-spec-pr-2 + prompt: "What release plan is associated with this spec pull request?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_release_plan_for_spec_pr" + + # ==== azsdk_get_sdk_pull_request_link triggers ==== + - name: invoke-azsdk-get-sdk-pull-request-link-1 + prompt: "Get the SDK pull request link from the generation pipeline" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_sdk_pull_request_link" + - name: invoke-azsdk-get-sdk-pull-request-link-2 + prompt: "Where is the PR created by SDK generation?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_sdk_pull_request_link" + + # ==== azsdk_get_service_details_by_typespec_path triggers ==== + - name: invoke-azsdk-get-service-details-by-typespec-path-1 + prompt: "Get the service tree details for my TypeSpec project path" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-2 + prompt: "Look up the service and product details using the TypeSpec project path" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-3 + prompt: "What service tree ID and product info is associated with this TypeSpec path?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-4 + prompt: "Find product details for my typespec project" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-5 + prompt: "What service does this TypeSpec project belong to?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-6 + prompt: > + Get service and service tree product details for a product using TypeSpec project path + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + + # ==== azsdk_link_namespace_approval_issue triggers ==== + - name: invoke-azsdk-link-namespace-approval-issue-1 + prompt: "Link namespace approval issue to release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_link_namespace_approval_issue" + - name: invoke-azsdk-link-namespace-approval-issue-2 + prompt: "Associate the namespace approval with my release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_link_namespace_approval_issue" + + # ==== azsdk_link_sdk_pull_request_to_release_plan triggers ==== + - name: invoke-azsdk-link-sdk-pull-request-to-release-plan-1 + prompt: "Link my SDK pull request to the release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_link_sdk_pull_request_to_release_plan" + - name: invoke-azsdk-link-sdk-pull-request-to-release-plan-2 + prompt: "Link SDK pull request #5678 to release plan 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_link_sdk_pull_request_to_release_plan" + + # ==== azsdk_run_generate_sdk triggers ==== + - name: invoke-azsdk-run-generate-sdk-1 + prompt: "Generate SDK from my TypeSpec project using the pipeline" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_run_generate_sdk" + - name: invoke-azsdk-run-generate-sdk-2 + prompt: "Generate SDK for my TypeSpec project using the pipeline" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_run_generate_sdk" + + # ==== azsdk_update_api_spec_pull_request_in_release_plan triggers ==== + - name: 
invoke-azsdk-update-api-spec-pull-request-in-release-plan-1 + prompt: "Update the TypeSpec PR URL in the release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_api_spec_pull_request_in_release_plan" + - name: invoke-azsdk-update-api-spec-pull-request-in-release-plan-2 + prompt: "Update the TypeSpec pull request URL in my release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_api_spec_pull_request_in_release_plan" + + # ==== azsdk_update_language_exclusion_justification triggers ==== + - name: invoke-azsdk-update-language-exclusion-justification-1 + prompt: "Update the language exclusion justification" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_language_exclusion_justification" + - name: invoke-azsdk-update-language-exclusion-justification-2 + prompt: "Explain why Python is excluded from this release" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_language_exclusion_justification" + + # ==== azsdk_update_release_plan triggers ==== + - name: invoke-azsdk-update-release-plan-1 + prompt: "Update the release plan with my TypeSpec project path and API version" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + - name: invoke-azsdk-update-release-plan-2 + prompt: "Update the spec PR URL and SDK release type in my release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + - name: invoke-azsdk-update-release-plan-3 + prompt: "Update the existing release plan for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + - name: invoke-azsdk-update-release-plan-4 + prompt: "Update my release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + - name: invoke-azsdk-update-release-plan-5 + prompt: "Update TypeSpec project in release plan" + graders: + - 
type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + + # ==== azsdk_update_sdk_details_in_release_plan triggers ==== + - name: invoke-azsdk-update-sdk-details-in-release-plan-1 + prompt: "Update SDK details in the release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_sdk_details_in_release_plan" + - name: invoke-azsdk-update-sdk-details-in-release-plan-2 + prompt: "Change the SDK package name in the release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_sdk_details_in_release_plan" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-typespec.eval.yaml new file mode 100644 index 00000000000..63e4b9182f3 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-typespec.eval.yaml @@ -0,0 +1,190 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + tier: unit + area: typespec + priority: p0 + +stimuli: + + # ==== azsdk_convert_swagger_to_typespec triggers ==== + - name: invoke-azsdk-convert-swagger-to-typespec-1 + prompt: "Convert my swagger to TypeSpec" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_convert_swagger_to_typespec" + - name: invoke-azsdk-convert-swagger-to-typespec-2 + prompt: "Migrate my API from swagger to TypeSpec" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_convert_swagger_to_typespec" + + # ==== azsdk_customized_code_update triggers ==== + - name: invoke-azsdk-customized-code-update-1 + prompt: "Update customized code with patches to fix build errors" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_customized_code_update" + - name: invoke-azsdk-customized-code-update-2 + prompt: "Apply customized code patches and rebuild to fix errors" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_customized_code_update" + + # ==== azsdk_get_modified_typespec_projects triggers ==== + - name: invoke-azsdk-get-modified-typespec-projects-1 + prompt: "What TypeSpec projects were modified in my branch?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_modified_typespec_projects" + - name: invoke-azsdk-get-modified-typespec-projects-2 + prompt: "List the changed TypeSpec projects" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_modified_typespec_projects" + + # ==== azsdk_run_typespec_validation triggers ==== + - name: invoke-azsdk-run-typespec-validation-1 + prompt: "Run TypeSpec validation on my project" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_run_typespec_validation" + - name: invoke-azsdk-run-typespec-validation-2 + prompt: "Run TypeSpec configuration validation for my project root path" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_run_typespec_validation" + + # ==== azsdk_typespec_check_project_in_public_repo triggers ==== + - name: invoke-azsdk-typespec-check-project-in-public-repo-1 + prompt: "Check if my TypeSpec project is in the public repo" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_check_project_in_public_repo" + - name: invoke-azsdk-typespec-check-project-in-public-repo-2 + prompt: "Check if my TypeSpec project is in the public spec repo" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_check_project_in_public_repo" + + # ==== azsdk_typespec_delegate_apiview_feedback triggers ==== + - name: invoke-azsdk-typespec-delegate-apiview-feedback-1 + prompt: "Delegate the APIView feedback to Copilot for resolution" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-2 + prompt: "Address the APIView comments by creating a GitHub issue and assigning Copilot" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-3 + prompt: "Resolve the APIView reviewer feedback from 
this URL" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-4 + prompt: > + Help me fix these comments: https://spa.apiview.dev/review/c375391d5ab9419f83e3bdsfas9asdfadf2e?activeApiRevisionId=fc2a4adfasdfasdagae3w3hhtd + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-5 + prompt: > + Fix this feedback: https://spa.apiview.dev/review/c375391d5ab9419f83e3bdsfas9asdfadf2e?activeApiRevisionId=fc2a4adfasdfasdagae3w3hhtd + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-6 + prompt: > + Create an issue and assign to copilot to fix this: https://spa.apiview.dev/review/adfaset5391d5ab9419f83e3bds9asdfadf2e?activeApiRevisionId=adf34adfasadastasdagae3w3hhtd + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + + # ==== azsdk_typespec_generate_authoring_plan triggers ==== + - name: invoke-azsdk-typespec-generate-authoring-plan-1 + prompt: > + Generate a solution to add a new resource 'asset' for service widget with TypeSpec. + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_generate_authoring_plan" + - name: invoke-azsdk-typespec-generate-authoring-plan-2 + prompt: "Generate a solution to add a new api version for service widget with TypeSpec." + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_generate_authoring_plan" + - name: invoke-azsdk-typespec-generate-authoring-plan-3 + prompt: > + Generate a solution to set a default value `21` for property `age` in model EmployeeProperties from a api version say 2025-11-01 with TypeSpec. 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_generate_authoring_plan" + + # ==== azsdk_typespec_init_project triggers ==== + - name: invoke-azsdk-typespec-init-project-1 + prompt: "Initialize a new TypeSpec project" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_init_project" + - name: invoke-azsdk-typespec-init-project-2 + prompt: "Initialize a new TypeSpec project for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_init_project" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-verify.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-verify.eval.yaml new file mode 100644 index 00000000000..24e43a38451 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-verify.eval.yaml @@ -0,0 +1,45 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp-mock + +config: + runs: 5 + timeout: "120s" + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + tier: unit + area: engsys + priority: p0 + +stimuli: + + # ==== azsdk_verify_setup triggers ==== + - name: invoke-azsdk-verify-setup-1 + prompt: "Verify my environment setup" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_verify_setup" + - name: invoke-azsdk-verify-setup-2 + prompt: "Verify my developer environment setup for MCP tools" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_verify_setup" + - name: invoke-azsdk-verify-setup-3 + prompt: "Verify my MCP release tool setup" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_verify_setup" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/validate-typespec.eval.yaml new file mode 100644 index 00000000000..75993329a60 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/validate-typespec.eval.yaml @@ -0,0 +1,39 @@ +name: azsdk-mcp-tool-scenarios +description: | + Validate-typespec: the agent should run TypeSpec validation when asked to + validate a TypeSpec project. +version: "1.0" +type: capability + + +tags: + tier: unit + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: validate-typespec + prompt: | + Validate my typespec project. It is already confirmed we are in a public repository. + The path to my typespec is specification/contosowidgetmanager/Contoso.WidgetManager/main.tsp. 
+ constraints: + max_turns: 8 + max_tokens: 8000 + graders: + - type: tool-calls + config: + required: + - azsdk_run_typespec_validation + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml new file mode 100644 index 00000000000..c3ffb9356ea --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml @@ -0,0 +1,151 @@ +name: azsdk-mcp-tool-scenarios +description: | + Live end-to-end demo for the full release-planner -> generate-SDK flow. + + Drives the *real* azsdk-cli MCP server against real DevOps APIs, inside a + real git worktree of azure-rest-api-specs. The MCP server runs with + AZSDKTOOLS_AGENT_TESTING=true (set globally in .vally.yaml), so work items + route to the DevOps test area path and are safe to leave around / re-run. + + This scenario walks the agent through a multi-step chain that exercises + multiple skills back-to-back in a single conversation: + + 1. Release-plan skill -> azsdk_create_release_plan, azsdk_get_release_plan + 2. Generate-SDK skill -> azsdk_run_generate_sdk + 3. Release-plan skill -> azsdk_link_sdk_pull_request_to_release_plan + + The goal is to verify Vally end-to-end (live agent + live MCP + live DevOps) + can: + - route each turn to the correct skill, + - call the correct tool on that skill, + - and do so in the expected order across multiple steps. + + Demonstrates Vally's environment.git fixture hook + live MCP + Copilot SDK + executor + real DevOps in one shot. + + Prerequisite: a clone of Azure/azure-rest-api-specs at the path referenced + by environment.git.source below. Locally, run + evals/setup/ensure-specs-clone.ps1 to prime a per-user cache + (auto-refresh every 24h) at the path this source points at. 
CI should + clone the repo as a pipeline checkout step instead. + +version: "1.0" +type: capability + +tags: + area: release-plan +metadata: + repos: + - name: Azure/azure-rest-api-specs + +# Bound to the live env because the scenario asserts real DevOps writes +# (test-area path) + real generation pipeline. Mock can't satisfy those +# graders. Picked up by the `scenarios-live` / `nightly` suite via folder. +environment: azsdk-mcp-live + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: release-planner-e2e + environment: + # Source is the per-user cache populated by evals/setup/ensure-specs-clone.ps1 + # (idempotent shallow+sparse clone, auto-refresh every 24h). + # NOTE: hardcoded absolute path — Vally does not currently expand + # ${USERPROFILE} / env vars in env.git.source. Adjust per machine + # or replace with a CI-provided path. See upstream issue: + # https://github.com/microsoft/vally/issues (TODO: file env-var expansion) + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main + prompt: | + Walk me through the full + release-plan + SDK-generation flow for the Contoso Widget Manager + end-to-end. Do every step below, in order, and use real tools (no + dry-run, no simulation): + + 1. Create a release plan using: + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f" + - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e" + - target release timeline: "December 2026" + - API version: "2022-11-01-preview" + - SDK release type: "beta" + - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" + + 2. Fetch the release plan you just created back from DevOps to confirm + it was saved, and tell me its work-item ID. + + 3. Kick off SDK generation for that same TypeSpec project via the + generation pipeline (Python SDK is fine). Use the work-item ID + from step 2. + + 4. 
Once the generation pipeline reports a pull request URL, link + that SDK pull request back to the release plan from step 2. + + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 20 + max_tokens: 30000 + # TODO: assert strict ordering create -> get -> generate -> link + # — blocked on https://github.com/microsoft/vally/issues/453 (tool-calls grader sequence:). + # TODO: assert args (serviceTreeId / productTreeId / typeSpecProjectPath / workItemId) + # — blocked on https://github.com/microsoft/vally/issues/454 (tool-calls grader generic args:). + # TODO: add `azsdk-common-generate-sdk-locally` (or the equivalent pipeline- + # driven skill) to skill-invocation `required` once a skill that owns + # `azsdk_run_generate_sdk` is registered. Today the only skill that + # declares any of the tools in this scenario is azsdk-common-prepare-release-plan. + graders: + # 1. Skill-routing check (FIRST — fast, deterministic, free): did the + # agent dispatch to the right skill at all? If this fails, the + # tool-calls grader below is meaningless. + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + # 2. Tool-call check: given the right skill was loaded, did it call + # the right MCP tools? Each tool here is owned by the skill above + # except azsdk_run_generate_sdk (see TODO). + - type: tool-calls + config: + required: + - azsdk_create_release_plan + - azsdk_get_release_plan + - azsdk_run_generate_sdk + - azsdk_link_sdk_pull_request_to_release_plan + disallowed: + - azsdk_verify_setup + # 3. Final-answer correctness (LLM-judged): the deterministic graders + # above only verify the agent *did* the right things, not that it + # *reported* them back to the user correctly. Tools can fire + # successfully while the final message hallucinates IDs / URLs. + # This grader uses gpt-5.4 as judge against a free-form rubric so + # minor wording variants (`WI 29262`, `work-item #29262`) all pass. 
+ - type: prompt + config: + model: gpt-5.4 + rubric: | + Did the final assistant message clearly state BOTH of the + following, consistent with the tools that were actually called? + + 1. A numeric DevOps work-item ID for the release plan that was + created (or confirmed). Any unambiguous format is fine + (e.g. "work item 29262", "WI #29262", "/_workitems/edit/29262"). + + 2. A GitHub pull request URL on + github.com/Azure/azure-sdk-for-* that was linked back to + that release plan. + + Answer "pass" only if BOTH are present. Otherwise answer "fail" + and briefly say which one is missing. + +scoring: + weights: + skill-invocation: 1 + tool-calls: 1 + prompt: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml new file mode 100644 index 00000000000..95c2ea0f5f2 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml @@ -0,0 +1,48 @@ +name: azsdk-mcp-tool-scenarios +description: | + Validate-then-check-public-repo: the agent should run TypeSpec validation, + then check if the project is in the public repo. +version: "1.0" +type: capability + + +tags: + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: validate-then-check-public-repo + prompt: | + Run TypeSpec validation, then check if the project is in the public repo. + Project path: specification/contosowidgetmanager/Contoso.WidgetManager. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 8 + max_tokens: 8000 + # TODO: assert ordering (validate before check) — blocked on https://github.com/Azure/azure-sdk-tools/issues/15832 (Vally tool-calls grader needs sequence:). 
+ graders: + - type: tool-calls + config: + required: + - azsdk_run_typespec_validation + - azsdk_typespec_check_project_in_public_repo + disallowed: + - azsdk_verify_setup + - type: skill-invocation + config: + required: + - azure-typespec-author + +scoring: + weights: + tool-calls: 1 + skill-invocation: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml new file mode 100644 index 00000000000..6a7d1c77230 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml @@ -0,0 +1,214 @@ +name: azsdk-mcp-tool-scenarios +description: | + Mock-environment workflow scenarios derived from the release-planner + replacement test plan (#15835). Each stimulus mirrors one of the four + high-level scenarios that release-planner-dashboard must hand off to the + agent: + + 1. Create a release plan (private preview / public preview / GA) + 2. Generate SDK for all languages in an existing release plan + 3. Link a different spec PR to an existing release plan + 4. Update SDK details (package names) in a release plan + + Plus an end-to-end "create + generate" flow used as the headline demo + prompt. + + Bound to the mock MCP — these graders only inspect skill routing and tool + selection, not real DevOps writes. The full live e2e flow lives in + evals/workflow-scenarios/live/release-planner.eval.yaml. + +version: "1.0" +type: capability + +tags: + area: release-plan + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + # --- Scenario 1: Create release plan --------------------------------- + - name: create-public-preview-release-plan + environment: + # Per-user cache populated by evals/setup/ensure-specs-clone.ps1 + # (idempotent shallow+sparse clone, auto-refresh every 24h).
Same + # source the live e2e uses — keeps the relative TypeSpec path + # resolvable on disk even though the MCP responses are mocked. + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main + prompt: | + Create a public preview + release plan for the Contoso Widget Manager. Here is all the context + you need: + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f" + - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e" + - target release timeline: "June 2026" + - API version: "2022-11-01-preview" + - SDK release type: "beta" + - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 6 + max_tokens: 8000 + graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_create_release_plan + disallowed: + - azsdk_verify_setup + + # --- End-to-end demo prompt: create + generate ----------------------- + - name: create-release-plan-and-generate-sdk + environment: + # Same per-user azure-rest-api-specs worktree as the create stimulus + # above so the agent sees a real on-disk spec repo. 
+ git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main + prompt: | + Walk me through creating + a release plan and then generating SDK for the Contoso Widget Manager: + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - API release type: "Public Preview" + - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f" + - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e" + - target release timeline: "June 2026" + - API version: "2022-11-01-preview" + - SDK release type: "beta" + - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" + After the release plan is created, generate SDK for all languages + using the work-item ID from the created release plan. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 12 + max_tokens: 16000 + # TODO: assert ordering create -> get -> generate + # — blocked on Vally tool-calls grader sequence: support. + graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_create_release_plan + - azsdk_run_generate_sdk + disallowed: + - azsdk_verify_setup + + # --- Scenario 2: Generate SDK for an existing release plan ----------- + - name: generate-sdk-for-existing-release-plan + environment: + # Same per-user azure-rest-api-specs worktree as the create stimuli + # above so the agent can locate the TypeSpec project on disk while + # driving the release-planner flow. + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main + prompt: | + Using the release-planner + flow, generate SDK for all languages for the Contoso Widget Manager + release plan. 
Here is the context you need: + - release plan work item ID: "29262" + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 8 + max_tokens: 10000 + graders: + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_run_generate_sdk + disallowed: + - azsdk_verify_setup + - azsdk_create_release_plan + + # --- Scenario 3: Link a different spec PR to an existing release plan + - name: link-different-spec-pr-to-release-plan + environment: + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main + prompt: | + Using the release-planner + flow, update the API spec pull request on an existing Contoso Widget + Manager release plan. Here is the context you need: + - release plan work item ID: "29262" + - new spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38500" + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 6 + max_tokens: 8000 + graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_update_api_spec_pull_request_in_release_plan + disallowed: + - azsdk_verify_setup + - azsdk_create_release_plan + + # --- Scenario 4: Update SDK details (package names) ------------------ + - name: update-sdk-details-in-release-plan + environment: + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main + prompt: | + Using the release-planner + flow, refresh the SDK package-name details on an existing Contoso + Widget Manager release plan from the on-disk TypeSpec emitter + configuration. 
Here is the context you need: + - release plan work item ID: "29262" + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - tspconfig path: "specification/contosowidgetmanager/Contoso.WidgetManager/tspconfig.yaml" + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 8 + max_tokens: 10000 + graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_update_sdk_details_in_release_plan + disallowed: + - azsdk_verify_setup + - azsdk_create_release_plan + +scoring: + weights: + skill-invocation: 1 + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/rename-client-property.eval.yaml new file mode 100644 index 00000000000..0892601a25e --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/rename-client-property.eval.yaml @@ -0,0 +1,48 @@ +name: azsdk-mcp-tool-scenarios +description: | + Rename-client-property: the agent should rename @clientName("uri", "csharp") + to @clientName("imageUri", "csharp") on the AddFaceFromUrlRequest.url + property in specification/ai/Face/models.common.tsp. +version: "1.0" +type: capability + + +tags: + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: rename-client-property + prompt: | + In the specification/ai/Face project, find the AddFaceFromUrlRequest model. + It has a property called 'url' that's been renamed to "uri" in c#. + Change that to imageUri for c#. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + # TODO: seed a git worktree (environment.git) at specification/ai/Face and + # add a `file-matches` grader on models.common.tsp to verify the + # @clientName("uri", "csharp") → @clientName("imageUri", "csharp") rename. + graders: + - type: tool-calls + config: + required: + - edit + - type: skill-invocation + config: + required: + - azure-typespec-author + +scoring: + weights: + tool-calls: 1 + skill-invocation: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml new file mode 100644 index 00000000000..954188d4b5b --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml @@ -0,0 +1,47 @@ +name: azsdk-mcp-tool-scenarios +description: | + TypeSpec generation workflow step 2: the agent should check whether the + project is in the public repo as part of the validation step. +version: "1.0" +type: capability + + +tags: + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: typespec-generation-step02-validation + prompt: | + I'm working on the TypeSpec generation workflow. I need to validate my TypeSpec project + as part of step 2. Please check if my TypeSpec project is in the public repo. + The project is at specification/contosowidgetmanager/Contoso.WidgetManager. + My setup has already been verified, do not run azsdk_verify_setup. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azsdk_typespec_check_project_in_public_repo + disallowed: + - azsdk_verify_setup + - type: skill-invocation + config: + required: + - azure-typespec-author + +scoring: + weights: + tool-calls: 1 + skill-invocation: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep new file mode 100644 index 00000000000..6f799cb330a --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep @@ -0,0 +1,8 @@ +# Scenario fixtures live here, one folder per scenario name +# (matching the `name:` field in the corresponding evals/*.eval.yaml). +# +# Reference them from the eval via: +# environment: +# files: +# - src: ../fixtures/<scenario-name>/ +# dest: <path-in-workspace> diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 new file mode 100644 index 00000000000..fcd257e4cc8 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 @@ -0,0 +1,160 @@ +<# +.SYNOPSIS + Validates that all tool names referenced in tool-trigger eval files exist in the MCP server. + +.DESCRIPTION + This script: + 1. Runs `azsdk list` to get all registered MCP tool names from the server. + 2. Parses all `triggers-*.eval.yaml` files under the unit/ directory. + 3. Reports any eval tool references that don't exist on the server, + and any server tools that are missing eval coverage. + +.PARAMETER ProjectPath + Path to the Azure.Sdk.Tools.Cli project. Defaults to ../../Azure.Sdk.Tools.Cli relative to this script's directory. + +.PARAMETER EvalPath + Path to the directory containing `triggers-*.eval.yaml` files. + Defaults to ../evals/unit relative to this script. + +.PARAMETER SkipBuild + If set, passes --no-build to dotnet run (requires a prior build).
+#>
+[CmdletBinding()]
+param(
+    [string]$ProjectPath,
+    [string]$EvalPath,
+    [switch]$SkipBuild
+)
+
+Set-StrictMode -Version 4
+$ErrorActionPreference = 'Stop'
+
+# Writes $Message to the error stream and ends the script with exit code 1.
+# Because $ErrorActionPreference is 'Stop', a plain Write-Error is terminating,
+# so a following `return 1` would never run (and `return` does not set the
+# process exit code anyway). -ErrorAction Continue keeps the error record
+# non-terminating so the explicit `exit 1` below is what the caller observes.
+function Exit-WithError {
+    param([Parameter(Mandatory)][string]$Message)
+
+    Write-Error $Message -ErrorAction Continue
+    exit 1
+}
+
+# Resolve defaults relative to this script: <vallyRoot>/scripts/<this file>,
+# with the CLI project as a sibling of <vallyRoot>.
+$scriptDir = $PSScriptRoot
+$vallyRoot = (Resolve-Path (Join-Path $scriptDir "..")).Path
+$cliParent = (Resolve-Path (Join-Path $vallyRoot "..")).Path
+
+if (-not $ProjectPath) {
+    $ProjectPath = Join-Path $cliParent "Azure.Sdk.Tools.Cli"
+}
+if (-not $EvalPath) {
+    $EvalPath = Join-Path $vallyRoot "evals/unit"
+}
+
+if (-not (Test-Path $ProjectPath)) {
+    Exit-WithError "CLI project not found at: $ProjectPath"
+}
+if (-not (Test-Path $EvalPath)) {
+    Exit-WithError "Evaluations directory not found at: $EvalPath"
+}
+
+# Step 1: Get tool names from the MCP server via `azsdk list`
+Write-Host "Running 'azsdk list' to get registered MCP tools..." -ForegroundColor Cyan
+
+$dotnetArgs = @("run", "--project", $ProjectPath)
+if ($SkipBuild) {
+    $dotnetArgs += "--no-build"
+}
+$dotnetArgs += @("--", "list", "--output", "json")
+
+$listOutput = & dotnet @dotnetArgs 2>&1
+
+# Fail on a non-zero exit code here, with the captured output, instead of
+# letting a build/run failure surface later as a confusing JSON parse error.
+if ($LASTEXITCODE -ne 0) {
+    $dotnetOutputText = ($listOutput | Out-String).Trim()
+    Exit-WithError "'dotnet run' for 'azsdk list' exited with code $LASTEXITCODE. Output:`n$dotnetOutputText"
+}
+
+# Keep only string items (non-string items, e.g. records merged in by 2>&1,
+# are dropped) and skip the "Using launch settings" banner line.
+$jsonLines = $listOutput | Where-Object { $_ -is [string] -and $_ -notmatch "^Using launch settings" }
+$jsonText = $jsonLines -join "`n"
+
+try {
+    $parsed = $jsonText | ConvertFrom-Json
+    [string[]]$serverTools = @($parsed.Tools | ForEach-Object { $_.McpToolName } | Where-Object { $_ } | Sort-Object -Unique)
+} catch {
+    Exit-WithError "Failed to parse 'azsdk list --output json'. Error: $_"
+}
+
+# Filter out tools that are excluded from eval coverage (example, test, and utility tools)
+$excludedTools = @(
+    "azsdk_hello_world",
+    "azsdk_hello_world_fail",
+    "azsdk_example_process_execution",
+    "azsdk_example_powershell_execution",
+    "azsdk_example_azure_service",
+    "azsdk_example_ai_service",
+    "azsdk_example_error_handling",
+    "azsdk_example_agent_fibonacci",
+    "azsdk_example_github_service",
+    "azsdk_example_devops_service",
+    "azsdk_upgrade",
+    "azsdk_engsys_codeowner_view",
+    "azsdk_engsys_codeowner_add_label_owner",
+    "azsdk_engsys_codeowner_remove_label_owner",
+    "azsdk_engsys_codeowner_add_package_owner",
+    "azsdk_engsys_codeowner_remove_package_owner",
+    "azsdk_engsys_codeowner_add_package_label",
+    "azsdk_engsys_codeowner_remove_package_label"
+)
+
+[string[]]$serverTools = @($serverTools | Where-Object { $_ -notin $excludedTools })
+
+if ($serverTools.Count -eq 0) {
+    Exit-WithError "No tools found from 'azsdk list'. Check that the CLI project builds and runs correctly."
+}
+
+Write-Host "Found $($serverTools.Count) tools registered on the MCP server ($($excludedTools.Count) excluded).`n" -ForegroundColor Green
+
+# Step 2: Parse all triggers-*.eval.yaml files in the unit directory for tool name references
+# @() guarantees an array, so .Count is safe under strict mode for zero or one match.
+$evalFiles = @(Get-ChildItem -Path $EvalPath -Filter "triggers-*.eval.yaml")
+
+if ($evalFiles.Count -eq 0) {
+    Exit-WithError "No triggers-*.eval.yaml files found in: $EvalPath"
+}
+
+# Per-file tool lists (used to report where a bad reference comes from) and
+# the union of every tool name referenced by any eval file.
+$evalToolsByFile = @{}
+$allEvalTools = [System.Collections.Generic.HashSet[string]]::new()
+
+foreach ($file in $evalFiles) {
+    $key = $file.BaseName
+    # Only double-quoted `name: "azsdk_..."` entries are recognized as tool references.
+    $matchResults = Select-String -Path $file.FullName -Pattern 'name:\s*"(azsdk_[^"]+)"'
+    [string[]]$tools = @($matchResults | ForEach-Object { $_.Matches[0].Groups[1].Value } | Sort-Object -Unique)
+    $evalToolsByFile[$key] = $tools
+    foreach ($t in $tools) {
+        [void]$allEvalTools.Add($t)
+    }
+}
+
+Write-Host "Found $($allEvalTools.Count) unique tools across $($evalFiles.Count) eval files.`n" -ForegroundColor Green
+
+# Step 3: Compare
+[string[]]$missingFromServer = @($allEvalTools | Where-Object { $_ -notin $serverTools } | Sort-Object)
+[string[]]$missingFromEvals = @($serverTools | Where-Object { $_ -notin $allEvalTools } | Sort-Object)
+
+$hasErrors = $false
+
+if ($missingFromServer.Count -gt 0) {
+    $hasErrors = $true
+    Write-Host "ERROR: Eval references tools NOT found on the MCP server:" -ForegroundColor Red
+    foreach ($tool in $missingFromServer) {
+        # Find which eval file references it
+        $sources = $evalToolsByFile.GetEnumerator() | Where-Object { $_.Value -contains $tool } | ForEach-Object { $_.Key }
+        Write-Host " - $tool (referenced in: $($sources -join ', '))" -ForegroundColor Red
+    }
+    Write-Host ""
+}
+
+if ($missingFromEvals.Count -gt 0) {
+    $hasErrors = $true
+    Write-Host "ERROR: Server tools with no eval coverage:" -ForegroundColor Red
+    foreach ($tool in $missingFromEvals) {
+        Write-Host " - $tool" -ForegroundColor Red
+    }
+    Write-Host ""
+}
+
+Write-Host ""
+if ($hasErrors) {
+    Write-Host "RESULT: FAIL - Eval tools and MCP server tools are out of sync." -ForegroundColor Red
+    exit 1
+} else {
+    Write-Host "RESULT: PASS - All eval tools exist on the MCP server." -ForegroundColor Green
+    exit 0
+}
diff --git a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md new file mode 100644 index 00000000000..629f42bfdca --- /dev/null +++ b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md @@ -0,0 +1,426 @@ +# Spec: 8 Operations — Agent Evaluation Strategy + +## Table of Contents + +- [Definitions](#definitions) +- [Background / Problem Statement](#background--problem-statement) +- [Goals and Exceptions/Limitations](#goals-and-exceptionslimitations) +- [Design Proposal](#design-proposal) +- [Agent Prompts](#agent-prompts) +- [Success Criteria](#success-criteria) +- [Open Questions](#open-questions) +- [Implementation Plan](#implementation-plan) + +--- + +## Definitions + +- **Agent**: a live LLM conversation driving Azure SDK MCP tools through skills. +- **Skill**: a markdown contract under `.github/skills//` telling the + agent *when* to engage and *which* tools/workflow to use. +- **MCP tool**: a discrete capability exposed by the Azure SDK MCP server. +- **Workflow scenario**: a user prompt that crosses multiple tools / skills + end-to-end (e.g. *create release plan → generate SDK → link the SDK PR*). +- **Stimulus**: one prompt + its expected behavior — the unit of an eval. +- **Three graders per stimulus**: `skill-invocation` (right skill picked), + `tool-calls` (right tools / order / args), and `prompt` (right final answer). +- **Mock MCP**: an in-memory fake of the Azure SDK MCP server — no network, + no side effects. **Live MCP**: the real server hitting real DevOps / GitHub.
+ + +--- + +## Background / Problem Statement + +We're shipping agent-driven replacements for manual SDK workflows — starting +with the release planner. When someone +asks *"does the agent actually do what we said it does?"*, today the only +honest answer is "I tried a few prompts on my laptop." That is not good +enough to hand to partner teams or to keep regressions out as more workflows +land. + +We need a small, shared set of prompts we promise to support, run regularly, +with a clear pass/fail per prompt — so we can point at the report instead +of re-demoing. + +--- + +## Goals and Exceptions/Limitations + +### Goals + +- [ ] **One file per workflow, three graders per prompt** — skill picked, + tools called, final answer. +- [ ] **Mock MCP by default, live MCP only on opt-in** — no accidental writes + to DevOps / GitHub; release / publish tools stay mock-only. +- [ ] **Mock covers every tool the scenarios call**, with realistic responses. +- [ ] **Anyone can clone and run** — env vars, no hard-coded paths; live + scenarios declare what repos they need. +- [ ] **The run produces a status table** of pass/fail per prompt plus a + trajectory per prompt — readable by non-engineers. +- [ ] **Reports come out in the formats people actually use** — markdown + for humans, JUnit for CI, CSV for spreadsheets and dashboards. +- [ ] **Adding a partner-reported prompt is one new stimulus**, no runner + or CI changes. +- [ ] **Multi-step chains work** (e.g. *validate TypeSpec → create release + plan → generate SDK → link the SDK PR*). + +### Exceptions and Limitations + +- **Some prompts can only be checked against live MCP** — the mock can't + prove a release plan was really created. Those run opt-in only. +- **The agent is not deterministic.** Same prompt, different wording or + turn count each run. We grade shape, not exact strings, and accept some + flake. + +--- + + +## Design Proposal + +### The three eval kinds + +We organize evals around what's actually being tested. 
No tier numbers — +use the names. The first three columns are the same axis (what does this +prove); the last two say where each lives and what backend it needs. + +| Kind | What it proves | Agent | MCP | Lives in | +|---|---|---|---|---| +| **Skills** | A user prompt routes to the right skill. | live | none | `.github/skills//evals/` | +| **Workflows — Mock** | Agent picks the right skills, calls the right tools in the right order with the right args, returns the right answer. | live | **mock** | `evals/workflow-scenarios/mock/` | +| **Workflows — Live** | Same as above, but against the real backend — catches drift the mock can't see (TypeSpec ordering, real codegen output, real DevOps state). | live | **live** | `evals/workflow-scenarios/live/` | + +Plus a hermetic tool-shape layer that isn't agent-driven: + +| Kind | What it proves | Lives in | +|---|---|---| +| **Tools** | Tool X exists and returns the right shape for these inputs. Cross-skill trigger tables. | `evals/tools/` | + +#### Required graders by kind + +Mock and live workflow scenarios share the same scenario format but +differ in which graders are *required* vs *optional*: + +| Kind | `tool-calls` | `skill-invocation` | response grader (`prompt` / LLM-judge) | +|---|---|---|---| +| **Workflows — Mock** | required | optional | not applicable — mock responses are stubbed, so a response grader has nothing meaningful to assert | +| **Workflows — Live** | required | required | required — only live runs produce a real assistant answer worth grading | + +Rationale: the mock backend deterministically replays canned data, so +"the agent said the right thing" reduces to "the agent called the right +tools." Live runs are the only place a free-form response can drift, so +that's where the response grader earns its cost. 
+ + +### Folder layout + +``` +evals/ +├── tools/ tool-shape + cross-skill triggers (hermetic) +├── workflow-scenarios/ +│ ├── mock/ workflow scenarios run against the mock MCP +│ └── live/ workflow scenarios run against the live MCP +└── setup/ shared fixture scripts (repo clone, etc.) +``` + +A scenario lives under `mock/` or `live/` based on which backend the +graders are written against, not based on the prompt. A prompt can +have a `mock/` and a `live/` variant (release-planner does). + +**Scenarios are environment-agnostic.** A scenario file declares the +prompt, expected skills, expected tool sequence, and graders — nothing +about whether MCP is mock or live. Same file, same graders; the MCP +backend is picked at run time. + +| Run mode | MCP | Repos? | When | Coverage | +|---|---|---|---|---| +| Workflows — Mock | mock (stub, no LLM) | azure-sdk-tools only | nightly + on demand | every scenario | +| Workflows — Live | live (real backends) | azure-sdk-tools + shallow/sparse clones of the spec & language SDK repos each scenario declares | weekly | scenarios tagged `live-safe` (curated subset) | + +When live and mock results disagree, the mock is wrong — the divergence +points straight at the missing or stale handler. Every scenario that +runs on mock therefore drives the mock to grow handlers for the tools +it exercises. + +### Where each eval lives + +| What it tests | Lives in | +|---|---| +| **One skill** (does this skill route, call its tools, return a sensible answer) | `.github/skills//evals/` | +| **Cross-skill / cross-tool** (multi-step chains, e2e flows, mock-server integration, anything that doesn't belong to one skill) | `tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/` | + +Skill evals stay next to `SKILL.md` — that's the convention skill +authors expect, and it keeps everything about a skill in one folder. +Existing skill eval files do not move. + +#### Skill eval suite — current state and direction + +The per-skill suite predates this project. 
Today roughly a dozen skills +have eval files; some are missing thresholds and pass without asserting +anything, and most capability stimuli are graded only by a single +substring check — they pass whether the agent called the right tool, +the wrong tool, or just echoed the prompt. + +*Direction.* Raise the bar on what counts as a per-skill eval: adopt +the four-layer pattern — skill-invocation + tool-calls + structural +output match + optional LLM-judge — as the required shape for every +capability stimulus. A `skill-eval-authoring` skill packages the +pattern, grader catalog, and anti-patterns so other Azure SDK teams +adopt without re-learning the gotchas. + +### Decision tree — where does my new eval go? + +``` +Do you only care that the agent picks the right skill +(you don't care which tools it then calls)? +└── yes → .github/skills//evals/ (not this project) + +Do you want to check that one MCP tool returns the right shape +for a given input — no agent in the loop? +└── yes → evals/tools/ + +Is it a multi-step / multi-tool agent flow? +└── yes → Workflow scenario + ├── Default → evals/workflow-scenarios/mock/ + │ Runs against the mock MCP. Use this unless the mock can't + │ faithfully cover the behavior. + └── Also need live coverage → add an evals/workflow-scenarios/live/ + variant. Reserve for cases where the real backend's behavior + matters (TypeSpec ordering, real codegen output, real DevOps + state). +``` + +### CI + +The suite runs on a schedule, not on every pull request. Agent runs +talk to an LLM — they cost money and they flake in ways that have +nothing to do with the code under review. 
+ +| When | What runs | Backend | +|---|---|---| +| Nightly | All workflow scenarios + the hermetic tool layer | mock | +| Weekly | Workflow scenarios marked safe to run live | live (with safe-mode flag on writes) | +| On demand | Any suite, any backend | author's choice | + +#### PR gate for essential workflows (open) + +A case for *narrow* PR gating: a small curated set of mock scenarios +covering the workflows we have already promised to partner teams +(release-planner today; more as they ship) could run on PRs that touch +the agent, skills, or MCP tools — so we catch a regression in the +workflows users actually rely on before merge, instead of the morning +after. + +Unresolved trade-offs: which scenarios count as "essential"; how to +keep the gate from flaking on LLM non-determinism (retries? loose +thresholds? quorum across N runs?); whether the cost of the gated +subset is acceptable for every PR; and which paths actually trigger it +(agent-only? skills? MCP server? all of the above?). + +See [Open Questions](#open-questions). + +#### Pre-run setup for live scenarios + +**The problem.** A real workflow crosses repos. The release planner +reads a TypeSpec project from `azure-rest-api-specs`, generates code +into a language SDK repo, and links a PR back. The tools the agent +calls expect those files on disk. If a repo is missing, the agent +fails for the wrong reason and we learn nothing. + +**The setup step.** Each live scenario declares the repos (and +optionally the commit) it needs. One setup step reads all live +scenarios, takes the union, and makes sure each repo is present at the +requested commit before any eval runs. + +**Locally.** A single script. Run it once; it clones into a cache +folder under your home directory and reuses the clone on subsequent +runs. Same script CI uses. + +**In CI.** The weekly live job runs the same script. 
The cache folder +is a build-cache artifact keyed on the set of repos the scenarios +declare; it's invalidated only when that set changes. + +**Pinning.** A scenario can pin a commit when reproducibility matters. +Otherwise the setup step takes the default branch and records the +commit it used in the run output. + +The nightly mock job runs no setup — mock evals touch no external repos. + + +### Mock MCP server status + +#### How it works + +`Azure.Sdk.Tools.Mock` reflects over the real CLI's tool list at boot and +registers a mock proxy for **every** tool the real `Azure.Sdk.Tools.Cli` +advertises, preserving each tool's name, description, and input schema. +At call time the proxy looks up a handler by tool name: + +- **Custom handler exists** → scripted, type-correct response. +- **No custom handler** → fallback `{ Message = "Success" }`. + + + +### Results + +The goal: anyone — partner team, manager, the engineer who broke +something — should be able to open a run and understand what passed, +what failed, and why, without help. + +Each run writes three files into the output directory: + +| File | What it is | Who reads it | +|---|---|---| +| `eval-results.md` | Human status table: one row per prompt, pass/fail per grader. | Reviewers, partner teams, anyone scanning a run. | +| `results.jsonl` | The full agent trajectory — every tool call, args, return values, timings. One JSON object per line. | Engineers debugging a failure with tooling. | +| `junit.xml` | Standard test-results format the CI test-results widget already understands. | CI dashboards. | + +The JSONL is rich but hard to read raw. We add two post-processors +on top of it: + +- **Trajectory HTML** — one self-contained web page per prompt, opens + straight from `file://`. Shows the same trajectory as `results.jsonl` + but readable by someone who has never seen JSONL. +- **CSV history** — one row per prompt, appended across runs. 
Lets us + ask *"how often did release-planner pass in the last 30 nightlies?"* + and feed a dashboard later. + +In CI: trajectories + JSONL are uploaded as build artifacts you can +download from the run page; the CSV gets appended to a long-lived +history branch. + +### Performance and cost controls + +Why this section exists: agent evals are *slow* and *expensive*. Every +run talks to a real LLM — every tool call is a round trip, every turn +is tokens billed against our subscription. Without limits, a single +badly-written scenario can sit in a loop for an hour and burn through +the budget while still reporting *"passed"*. + +Concrete example: one real release-planner end-to-end run took **17 +minutes wall time, 1.78M tokens, 41 turns**. + +The framework therefore enforces three things: + +**1. Per-scenario budgets.** Every scenario file declares an upper +bound on: + +- **Turns** — how many times the agent loops. +- **Wall time** — how long the whole run can take. +- **Billable tokens** — input + output tokens we actually pay for. +- **Tool calls** — catches an agent stuck calling the same tool forever. + +The runner warns at 50% of any limit, fails the scenario at 100%, and +kills the whole run at 200% so a runaway can't bleed indefinitely. + +**2. Tiered defaults.** Mock runs nightly against an in-memory fake — +cheap and fast, so the limits are tight. Live runs weekly against real +backends — slower by nature, so the limits are looser. + +| Tier | Turns | Wall (s) | Billable tokens | +|---|---|---|---| +| Nightly mock | 30 | 300 | 200k | +| Weekly live | 60 | 600 | 500k | + +A scenario that needs more must opt in with a justification comment in +the scenario file. If reviewers reject the opt-in, the scenario has to +be rewritten to fit, or moved to mock — budgets don't widen. + +**3. 
Background guardrails** — things the scenario author never has +to think about, baked into the framework: + +- Polling tools (`*_get_*_status`) return a terminal state on the first poll under safe mode — no agent stuck waiting for *"in progress"* to flip. +- LLM-judge graders default to a cheaper model than the agent itself. +- CI cancels superseded runs when a branch gets a new push. + + +--- + +## Agent Prompts + +The list of prompts the agent is promised to support. Each lives as a +stimulus in `evals/workflow-scenarios/mock/<workflow>.eval.yaml` (plus a +`live/` counterpart where applicable). Adding a new prompt is one new +entry in the matching file. + +### Release-planner workflow + +Derived from the release-planner replacement test plan +([#15835](https://github.com/Azure/azure-sdk-tools/issues/15835)). All +five route to the `azsdk-common-prepare-release-plan` skill. + +| Prompt | What the agent must do | Required tool calls | +|---|---|---| +| Create a public-preview release plan for a TypeSpec spec, target month June 2026 | Pick the prepare-release-plan skill; check for an existing plan; create one. | `azsdk_get_release_plan`, `azsdk_create_release_plan` | +| Create a release plan **and** generate SDK for a TypeSpec spec, release type beta | End-to-end chain: create, then generate, then back-fill SDK details. | `azsdk_get_release_plan`, `azsdk_create_release_plan`, `azsdk_run_generate_sdk`, `azsdk_update_sdk_details_in_release_plan` | +| Generate SDK for all languages for an existing release plan id | Look up the plan, run generation against the languages it lists. | `azsdk_get_release_plan`, `azsdk_run_generate_sdk` | +| Link a different spec PR (`https://github.com/Azure/azure-rest-api-specs/pull/...`) to an existing release plan | Look up the plan, swap the spec-PR field.
| `azsdk_get_release_plan`, `azsdk_update_api_spec_pull_request_in_release_plan` | +| Update SDK details (package names) on an existing release plan from `tspconfig.yaml` | Look up the plan, update the SDK details from emitter config. | `azsdk_get_release_plan`, `azsdk_update_sdk_details_in_release_plan` | + +All five forbid `azsdk_verify_setup` (the setup gate runs once at the +top of the workflow, not per prompt) and forbid the irrelevant +`azsdk_create_release_plan` in the three "existing plan" prompts so we +catch the agent creating a duplicate. + +### Other workflows in the first round + +| Workflow | File | Coverage | +|---|---|---| +| Check spec is in public repo then validate TypeSpec | `check-public-repo-then-validate.eval.yaml` | TypeSpec authoring routing + validation tool call. | +| TypeSpec generation — step 2 of the authoring flow | `typespec-generation-step02.eval.yaml` | TypeSpec authoring skill + public-repo check tool (`azsdk_typespec_check_project_in_public_repo`). | +| Rename a client property in a generated SDK | `rename-client-property.eval.yaml` | Customization skill + customize-code tool. | + +The live counterpart of release-planner lives at +`evals/workflow-scenarios/live/release-planner.eval.yaml` and adds a +prompt-grader that checks the real DevOps response. + +--- + +## Success Criteria + +- A single command runs the full mock suite locally and produces + `eval-results.md`, `results.jsonl`, JUnit XML, the per-prompt + trajectory HTML, and a `history.csv` row. +- Every release-planner prompt above is green in the mock suite. +- Every MCP tool a green scenario calls has a custom mock handler + returning a chainable, type-correct response. +- A new contributor can clone the repo, set the documented env vars, + and reproduce the same `eval-results.md` verdict table on their + machine. +- A partner team reporting *"I tried this prompt and the agent didn't + do anything"* can be answered by pasting their prompt as a new + stimulus and re-running the workflow file — no runner or CI changes.
+- The status table is what we hand to reviewers (Renhe, Laurent, + partner teams) to answer *"what does the agent currently support?"* + +--- + +## Open Questions + +### CI cadence and PR gating + +**Cadence.** Current proposal: nightly mock + weekly live + on-demand. +Open: is nightly the right frequency for mock, or do we want it on +every push to `main`? Is weekly enough for live, given live is the +only thing that catches real-backend drift? + +**PR gate for essential workflows.** Should a curated subset of mock +scenarios block merge on PRs that touch the agent, skills, or MCP +tools? Specifically to answer: + +- *Which workflows are "essential"* — just release-planner today, or + a broader set? Who decides when a new workflow joins or leaves the + gated set? +- *Which paths trigger the gate* — agent code, skill markdown, MCP + tool code, mock handlers, all of the above? Anything else? +- *How do we tame flake* — retries on failure, quorum across N runs, + loose thresholds, or just accept some red and require a human + override? Hard requirement: a green PR must mean *the gated + scenarios passed*, not *we got lucky this run*. +- *What's the cost ceiling* — the gated subset runs on every PR push + to a touched path; what's the per-PR token / wall-time budget we're + willing to spend before we move it back off the PR? + +We need owners' input on all four before turning the gate on. + +-