From a0f9233f680d2176ca415f681de62c876cab229e Mon Sep 17 00:00:00 2001 From: helen229 Date: Wed, 27 May 2026 15:03:15 -0700 Subject: [PATCH 01/24] Scaffold Azure.Sdk.Tools.Vally tool-scenario eval suite (#15124) Adds a new Vally eval suite under tools/azsdk-cli/Azure.Sdk.Tools.Vally/ for MCP tool / scenario evaluations, replacing the deleted Azure.Sdk.Tools.Cli.Benchmarks project (#15697). - README documents project intent, layout, local run instructions, and how to add a new scenario. - .vally.yaml wires the azsdk-mcp environment (stdio dotnet run against Azure.Sdk.Tools.Cli) and defines 'typespec' and 'all' suites. - evals/check-public-repo.eval.yaml is the first ported scenario (from the deleted CheckPublicRepoScenario): verifies the agent invokes azsdk_typespec_check_project_in_public_repo for a public-repo check prompt. Lints clean via 'vally lint --eval-spec'. - fixtures/.gitkeep reserves the per-scenario fixtures layout. Remaining scenarios from the deleted benchmark are tracked as a checklist in the project README and in #15124. --- .../Azure.Sdk.Tools.Vally/.vally.yaml | 30 ++++++ .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 95 +++++++++++++++++++ .../evals/check-public-repo.eval.yaml | 46 +++++++++ .../Azure.Sdk.Tools.Vally/fixtures/.gitkeep | 8 ++ 4 files changed, 179 insertions(+) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml new file mode 100644 index 00000000000..7cecb229901 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -0,0 +1,30 @@ +# Vally configuration for Azure SDK Tools MCP tool / scenario evaluations. 
+# See: https://vally.dev/reference/vally-config +# +# These are scenario evals (does the agent invoke the right MCP tool(s) for a +# given prompt?) and are intentionally separate from the per-skill evals under +# .github/skills/. See README.md for context. + +paths: + evals: [evals/] + evalFilenames: ["*.eval.yaml"] + results: results/ + +environments: + azsdk-mcp: + mcpServers: + azure-sdk-mcp: + type: stdio + command: dotnet + args: ["run", "--project", "../Azure.Sdk.Tools.Cli", "--", "start"] + timeout: 300000 + env: + AZSDKTOOLS_AGENT_TESTING: "false" + AZSDKTOOLS_COLLECT_TELEMETRY: "false" + +suites: + typespec: + evals: + - "evals/check-public-repo.eval.yaml" + all: + evals: ["evals/*.eval.yaml"] diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md new file mode 100644 index 00000000000..75c1b51f1d3 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -0,0 +1,95 @@ +# Azure.Sdk.Tools.Vally + +MCP-tool / end-to-end scenario evaluations for the `azsdk` MCP server, run via +[`@microsoft/vally-cli`](https://www.npmjs.com/package/@microsoft/vally-cli). + +These evals are **distinct from the skill evals under `.github/skills/`**: + +- **Skill evals** test that the agent picks and follows a specific skill + (routing + skill capability). +- **Tool-scenario evals here** test that, given a user prompt, the agent + invokes the right MCP tool(s) end-to-end — independent of any one skill. + This includes single-tool checks (e.g. "agent invokes + `azsdk_typespec_check_project_in_public_repo`") as well as multi-step + scenarios that span several MCP calls (release-plan, SDK generation, + release status, etc.). + +This project supersedes the deleted `Azure.Sdk.Tools.Cli.Benchmarks` project +(removed in [#15697](https://github.com/Azure/azure-sdk-tools/pull/15697)) and +tracks the migration in +[#15124](https://github.com/Azure/azure-sdk-tools/issues/15124). 
+ +## Layout + +``` +Azure.Sdk.Tools.Vally/ +├── .vally.yaml # Vally config (environments + suites) +├── evals/ # Scenario eval YAML files +│ └── *.eval.yaml +├── fixtures/ # Per-scenario file fixtures +│ └── <scenario-name>/... +└── Graders/ # (future) Custom .NET graders + └── Azure.Sdk.Tools.Vally.csproj # added when first custom grader lands +``` + +## Running locally + +Prereqs: + +- Node 22+ +- .NET SDK matching the rest of the repo (see `global.json`) +- `@microsoft/vally-cli` installed via the repo's pinned lockfile: + + ```powershell + cd eng/skill-eval + npm ci + ``` + +Run all tool-scenario evals from this directory: + +```powershell +cd tools/azsdk-cli/Azure.Sdk.Tools.Vally +../../../eng/skill-eval/node_modules/.bin/vally run . +``` + +Run a single eval: + +```powershell +../../../eng/skill-eval/node_modules/.bin/vally run evals/check-public-repo.eval.yaml +``` + +## Adding a new scenario + +1. Pick a short, kebab-case name (e.g. `create-release-plan`). +2. Create `evals/<scenario-name>.eval.yaml`. Start from + [`evals/check-public-repo.eval.yaml`](evals/check-public-repo.eval.yaml) as + a template. +3. If the scenario needs input files, add them under + `fixtures/<scenario-name>/...` and reference them via `environment.files` in the + eval (relative paths from the eval file). +4. Pick graders: + - `tool-calls` — verify the agent invoked the expected MCP tool(s). + - `file-matches` — verify the agent produced/modified files correctly. + - `prompt` — LLM-as-judge for free-form quality checks. + - Custom (`Graders/`) — add a .NET grader when none of the built-ins fit + (and add the `Azure.Sdk.Tools.Vally.csproj` when the first one lands). +5. Add the new eval path to the relevant `suites:` entry in + [`.vally.yaml`](.vally.yaml). +6. Run locally to confirm it passes, then open a PR. 
+ +## Recovery checklist (from deleted benchmark) + +Tracked in [#15124](https://github.com/Azure/azure-sdk-tools/issues/15124): + +- [x] `check-public-repo` (reference scenario) +- [ ] `check-public-repo-then-validate` +- [ ] `validate-typespec` +- [ ] `typespec-generation-step02` +- [ ] `get-modified-typespec-projects` +- [ ] `add-arm-resource` +- [ ] `create-release-plan` +- [ ] `link-namespace-approval-issue` +- [ ] `get-pr-link-current-branch` +- [ ] `check-sdk-generation-status` +- [ ] Port `Evaluate_PromptToToolMatch` + `Evaluate_ToolDescriptionSimilarity` + from `Azure.Sdk.Tools.Cli.Evaluations` (uses Copilot-SDK evaluator today). diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml new file mode 100644 index 00000000000..60c269444a7 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml @@ -0,0 +1,46 @@ +name: azsdk-mcp-tool-scenarios +description: | + Tool-scenario evaluation suite for the azsdk MCP server. Verifies the + agent invokes the right MCP tools for given prompts, independent of any + specific skill. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +# Test cases +stimuli: + - name: check-public-repo + # Migrated from the deleted Azure.Sdk.Tools.Cli.Benchmarks CheckPublicRepoScenario + # (see PR #15697 for deletion, PR #14507 for the original benchmark form). + # Validates that the agent invokes `azsdk_typespec_check_project_in_public_repo` + # when asked whether a TypeSpec project is in the public repo, and that it + # does NOT call `azsdk_verify_setup` after being told setup is verified. + prompt: | + Check if my TypeSpec project is in the public repo. + My setup has already been verified, do not run azsdk_verify_setup. + Project root: specification/contosowidgetmanager/Contoso.WidgetManager. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + # NOTE: the deleted benchmark also asserted that `azsdk_verify_setup` + # was NOT called (the prompt explicitly tells the agent to skip it). The + # current `tool-calls` grader does not support a `forbidden` list, so we + # rely on the prompt for now. Revisit once Vally supports negative + # tool-call assertions, or add a custom grader under `Graders/`. + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_typespec_check_project_in_public_repo + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep new file mode 100644 index 00000000000..6f799cb330a --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep @@ -0,0 +1,8 @@ +# Scenario fixtures live here, one folder per scenario name +# (matching the `name:` field in the corresponding evals/*.eval.yaml). +# +# Reference them from the eval via: +# environment: +# files: +# - src: ../fixtures/<scenario-name>/ +# dest: <dest-path> From 701b7f8e45331d4ea892484ef7c5fc7960e27069 Mon Sep 17 00:00:00 2001 From: helen229 Date: Wed, 27 May 2026 15:15:40 -0700 Subject: [PATCH 02/24] Port remaining 9 benchmark scenarios to Vally (#15124) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds eval YAMLs for every scenario that was deleted from Azure.Sdk.Tools.Cli.Benchmarks in #15697: - check-public-repo-then-validate - validate-typespec - typespec-generation-step02 - get-modified-typespec-projects (stub — needs git-repo fixture / setup hook) - add-arm-resource (stub — needs fixtures + npx tsp compile post-check) - create-release-plan - link-namespace-approval-issue - get-pr-link-current-branch - check-sdk-generation-status Each eval uses the built-in tool-calls grader for presence checks; the original benchmark's argument/order/forbidden/optional assertions are captured in prompt text + inline TODOs 
(require custom graders or upstream Vally support, documented in README). Also adds release-plan/github/pipeline suites to .vally.yaml. All 10 evals pass 'vally lint --eval-spec'. --- .../Azure.Sdk.Tools.Vally/.vally.yaml | 15 +++++ .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 52 +++++++++++++----- .../evals/add-arm-resource.eval.yaml | 55 +++++++++++++++++++ .../check-public-repo-then-validate.eval.yaml | 39 +++++++++++++ .../check-sdk-generation-status.eval.yaml | 36 ++++++++++++ .../evals/create-release-plan.eval.yaml | 45 +++++++++++++++ .../get-modified-typespec-projects.eval.yaml | 51 +++++++++++++++++ .../get-pr-link-current-branch.eval.yaml | 36 ++++++++++++ .../link-namespace-approval-issue.eval.yaml | 37 +++++++++++++ .../typespec-generation-step02.eval.yaml | 38 +++++++++++++ .../evals/validate-typespec.eval.yaml | 37 +++++++++++++ 11 files changed, 428 insertions(+), 13 deletions(-) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index 7cecb229901..577e3de3a79 100644 --- 
a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -26,5 +26,20 @@ suites: typespec: evals: - "evals/check-public-repo.eval.yaml" + - "evals/check-public-repo-then-validate.eval.yaml" + - "evals/validate-typespec.eval.yaml" + - "evals/typespec-generation-step02.eval.yaml" + - "evals/get-modified-typespec-projects.eval.yaml" + - "evals/add-arm-resource.eval.yaml" + release-plan: + evals: + - "evals/create-release-plan.eval.yaml" + - "evals/link-namespace-approval-issue.eval.yaml" + github: + evals: + - "evals/get-pr-link-current-branch.eval.yaml" + pipeline: + evals: + - "evals/check-sdk-generation-status.eval.yaml" all: evals: ["evals/*.eval.yaml"] diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index 75c1b51f1d3..1f929a3e876 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -79,17 +79,43 @@ Run a single eval: ## Recovery checklist (from deleted benchmark) -Tracked in [#15124](https://github.com/Azure/azure-sdk-tools/issues/15124): - -- [x] `check-public-repo` (reference scenario) -- [ ] `check-public-repo-then-validate` -- [ ] `validate-typespec` -- [ ] `typespec-generation-step02` -- [ ] `get-modified-typespec-projects` -- [ ] `add-arm-resource` -- [ ] `create-release-plan` -- [ ] `link-namespace-approval-issue` -- [ ] `get-pr-link-current-branch` -- [ ] `check-sdk-generation-status` +Tracked in [#15124](https://github.com/Azure/azure-sdk-tools/issues/15124). +All 9 deleted scenarios have been ported as Vally `tool-calls` evals (presence +checks). 
Items marked with **(stub)** have known gaps documented inline in the +eval file: + +- [x] `check-public-repo` +- [x] `check-public-repo-then-validate` +- [x] `validate-typespec` +- [x] `typespec-generation-step02` +- [x] `get-modified-typespec-projects` **(stub — needs git-repo fixture / setup hook)** +- [x] `add-arm-resource` **(stub — needs fixtures + `npx tsp compile` post-check)** +- [x] `create-release-plan` +- [x] `link-namespace-approval-issue` +- [x] `get-pr-link-current-branch` +- [x] `check-sdk-generation-status` + +### Known gaps vs. the original benchmark + +The current `tool-calls` grader only checks tool *names*. The deleted +benchmark's `ToolCallValidator` additionally asserted: + +1. **Argument values** (e.g. `serviceTreeId`, `buildId`, `typeSpecProjectPath`). +2. **Forbidden tools** (e.g. "must NOT call `azsdk_verify_setup`"). +3. **Call order** (e.g. validate before check-public-repo). +4. **Optional tools** (calls that are allowed but not required). + +Recovering 1–4 requires either upstream grader support in +`@microsoft/vally-cli` or a custom .NET grader under `Graders/`. Until then +those constraints are captured in prompt text and inline `TODO:` comments. + +### Follow-ups + - [ ] Port `Evaluate_PromptToToolMatch` + `Evaluate_ToolDescriptionSimilarity` - from `Azure.Sdk.Tools.Cli.Evaluations` (uses Copilot-SDK evaluator today). + from `Azure.Sdk.Tools.Cli.Evaluations` (still uses Copilot-SDK evaluator). +- [ ] File upstream issue against `@microsoft/vally-cli` to add `forbidden`, + `optional`, argument-matching, and ordering to the built-in `tool-calls` + grader (or accept that those gaps need custom graders). +- [ ] Wire a `vally eval` CI job (current `.github/workflows/skill-eval.yml` + runs `vally lint` only). See [#15126](https://github.com/Azure/azure-sdk-tools/issues/15126) + and [#15127](https://github.com/Azure/azure-sdk-tools/issues/15127). 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml new file mode 100644 index 00000000000..c1fd9156c26 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml @@ -0,0 +1,55 @@ +name: azsdk-mcp-tool-scenarios +description: | + Add-arm-resource: end-to-end scenario for authoring a new ARM resource + via TypeSpec. This is a complex, file-producing scenario (not a single + tool-call check) that needs a real fixture + tsp compile verification. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: add-arm-resource + # Ported (stub) from deleted AddArmResourceScenario (#15697). + # + # TODO: this scenario is heavier than the others — the original asserted: + # - main.tsp still exists and imports the new asset.tsp + # - asset.tsp contains an ARM resource model with @armResourceOperations, + # interface Assets, ArmResourceRead/CreateOrReplace/Update/Delete, + # listByResourceGroup, listBySubscription + # - `npx tsp compile` succeeds against the modified project + # To port faithfully we need: + # 1. Fixtures under fixtures/add-arm-resource/ that mirror + # specification/widget/resource-manager/Microsoft.Widget/Widget + # (the Microsoft.Widget fixture already exists under + # .github/skills/azure-typespec-author/evaluate/fixtures/Microsoft.Widget + # and could be reused / symlinked). + # 2. A Vally environment hook (or custom grader) that runs + # `npm ci` + `npx tsp compile` after the agent finishes. + # 3. file-matches graders for the asset.tsp content patterns. + # For now this eval only checks that the agent makes at least one edit + # and invokes the authoring-plan tool — it does NOT verify the produced + # TypeSpec compiles. 
+ prompt: | + In the specification/widget/resource-manager/Microsoft.Widget/Widget project, + add an ARM resource named 'Asset' with CRUD operations. + constraints: + max_turns: 20 + max_tokens: 50000 + graders: + - type: tool-calls + config: + required: + - edit + - azure-sdk-mcp-azsdk_typespec_generate_authoring_plan + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml new file mode 100644 index 00000000000..fa93ad2f311 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml @@ -0,0 +1,39 @@ +name: azsdk-mcp-tool-scenarios +description: | + Validate-then-check-public-repo: the agent should run TypeSpec validation, + then check if the project is in the public repo. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: validate-then-check-public-repo + # Ported from deleted CheckPublicRepoThenValidateScenario (#15697). + # Original also asserted ordering (validate before check) and forbade + # azsdk_verify_setup. Vally's tool-calls grader currently checks presence + # only, not order, and has no `forbidden` field — captured via prompt. + prompt: | + Run TypeSpec validation, then check if the project is in the public repo. + Project path: specification/contosowidgetmanager/Contoso.WidgetManager. + My setup has already been verified, do not run azsdk_verify_setup. 
+ constraints: + max_turns: 8 + max_tokens: 8000 + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_run_typespec_validation + - azure-sdk-mcp-azsdk_typespec_check_project_in_public_repo + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml new file mode 100644 index 00000000000..75c321f201a --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml @@ -0,0 +1,36 @@ +name: azsdk-mcp-tool-scenarios +description: | + Check-sdk-generation-status: the agent should call azsdk_get_pipeline_status + to check the SDK generation pipeline status. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: check-sdk-generation-status + # Ported from deleted CheckSdkGenerationStatusScenario (#15697). + # Original also asserted buildId=5513110 was passed to the tool. + # Argument assertions require a custom grader (not yet built). + prompt: | + Check the SDK generation pipeline status for build ID 5513110. + My setup has already been verified, do not run azsdk_verify_setup. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_get_pipeline_status + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml new file mode 100644 index 00000000000..31a51b6dd50 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml @@ -0,0 +1,45 @@ +name: azsdk-mcp-tool-scenarios +description: | + Create-release-plan: the agent should call azsdk_create_release_plan with + the supplied service-tree / product-tree / spec PR context. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: create-release-plan + # Ported from deleted CreateReleasePlanScenario (#15697). + # Original also asserted exact tool arguments (serviceTreeId, productTreeId, + # specApiVersion, specPullRequestUrl, sdkReleaseType, typeSpecProjectPath). + # Vally's built-in tool-calls grader checks tool *name* only; argument + # assertions would need a custom grader under Graders/. + prompt: | + Create a release plan for the Contoso Widget Manager, no need to get it afterwards only create. + My setup has already been verified, do not run azsdk_verify_setup. Here is all the context you need: + TypeSpec project located at "specification/contosowidgetmanager/Contoso.WidgetManager". + Use service tree ID "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f", + product tree ID "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e", + target release timeline "December 2025", + API version "2022-11-01-preview", + SDK release type "beta", + and link it to the spec pull request "https://github.com/Azure/azure-rest-api-specs/pull/38387". 
+ constraints: + max_turns: 8 + max_tokens: 8000 + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_create_release_plan + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml new file mode 100644 index 00000000000..74aa5cd9a63 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml @@ -0,0 +1,51 @@ +name: azsdk-mcp-tool-scenarios +description: | + Get-modified-typespec-projects: the agent should call + azsdk_get_modified_typespec_projects to list TypeSpec projects modified + in the current branch. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: get-modified-typespec-projects + # Ported from deleted GetModifiedTypespecProjectsScenario (#15697). + # + # TODO: the original scenario had a SetupAsync hook that sparse-checked + # out Azure/azure-rest-api-specs, modified + # specification/contosowidgetmanager/Contoso.WidgetManager/tspconfig.yaml, + # and committed the change so `git merge-base HEAD main` had a divergence + # point to report. Vally's `environment.files` only seeds files — it does + # not init a git repo or run commands. To exercise the tool's underlying + # git logic this eval needs either: + # 1. A new Vally environment hook that runs setup commands, OR + # 2. A pre-built fixture committed under fixtures/ that is itself a git + # repo (committed as a tarball / unpacked at setup), OR + # 3. A custom .NET grader under Graders/ that drives the tool with a + # fabricated workspace. + # For now this eval only verifies the tool *name* is selected — the + # actual diff result is not asserted. + prompt: | + List the TypeSpec projects modified in my current branch compared to main. 
+ My setup has already been verified, do not run azsdk_verify_setup. + The repository root is the relative path ./azure-rest-api-specs. + constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_get_modified_typespec_projects + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml new file mode 100644 index 00000000000..4f4320714b2 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml @@ -0,0 +1,36 @@ +name: azsdk-mcp-tool-scenarios +description: | + Get-pr-link-current-branch: the agent should call + azsdk_get_pull_request_link_for_current_branch when asked about the + status of the spec PR on the current branch. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: get-pr-link-current-branch + # Ported from deleted GetPrLinkCurrentBranchScenario (#15697). + prompt: | + What's the status of the spec PR in my current branch? Only check the status once. + My setup has already been verified, do not run azsdk_verify_setup. + The repository root is the relative path ./azure-rest-api-specs. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_get_pull_request_link_for_current_branch + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml new file mode 100644 index 00000000000..efb593f8787 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml @@ -0,0 +1,37 @@ +name: azsdk-mcp-tool-scenarios +description: | + Link-namespace-approval-issue: the agent should call + azsdk_link_namespace_approval_issue to link an issue to a release plan. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: link-namespace-approval-issue + # Ported from deleted LinkNamespaceApprovalIssueScenario (#15697). + # Original also asserted releasePlanWorkItemId=12345 and + # namespaceApprovalIssue=https://github.com/Azure/azure-sdk/issues/1234. + # Argument assertions require a custom grader (not yet built). + prompt: | + Link namespace approval issue https://github.com/Azure/azure-sdk/issues/1234 to release plan 12345. + My setup has already been verified, do not run azsdk_verify_setup. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_link_namespace_approval_issue + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml new file mode 100644 index 00000000000..c1af5286811 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml @@ -0,0 +1,38 @@ +name: azsdk-mcp-tool-scenarios +description: | + TypeSpec generation workflow step 2: the agent should check whether the + project is in the public repo as part of the validation step. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: typespec-generation-step02-validation + # Ported from deleted TypespecGenerationStep02Scenario (#15697). + # The original benchmark replaced a mid-conversation JSON trace with a + # standalone prompt; same intent preserved here. + prompt: | + I'm working on the TypeSpec generation workflow. I need to validate my TypeSpec project + as part of step 2. Please check if my TypeSpec project is in the public repo. + The project is at specification/contosowidgetmanager/Contoso.WidgetManager. + My setup has already been verified, do not run azsdk_verify_setup. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_typespec_check_project_in_public_repo + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml new file mode 100644 index 00000000000..50d20f05f43 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml @@ -0,0 +1,37 @@ +name: azsdk-mcp-tool-scenarios +description: | + Validate-typespec: the agent should run TypeSpec validation when asked to + validate a TypeSpec project. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: validate-typespec + # Ported from deleted ValidateTypespecScenario (#15697). + # The original allowed `azsdk_typespec_check_project_in_public_repo` and + # `azsdk_verify_setup` as optional tool calls. Vally's current grader has + # no `optional` concept — extra calls are not penalised by `required:`. + prompt: | + Validate my typespec project. It is already confirmed we are in a public repository. + The path to my typespec is specification/contosowidgetmanager/Contoso.WidgetManager/main.tsp. + constraints: + max_turns: 8 + max_tokens: 8000 + graders: + - type: tool-calls + config: + required: + - azure-sdk-mcp-azsdk_run_typespec_validation + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 From 26cc6efb26290bf527fba8fd028848d062bcf21d Mon Sep 17 00:00:00 2001 From: helen229 Date: Mon, 1 Jun 2026 10:09:27 -0700 Subject: [PATCH 03/24] Add rename-client-property stub eval to Vally suite (#15124) Ports the deleted RenameClientPropertyScenario as a tool-calls-only stub. Full expected-diff grading + sparse-clone setup hook are tracked as follow-ups in the README. 
--- .../Azure.Sdk.Tools.Vally/.vally.yaml | 1 + .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 7 +++ .../evals/rename-client-property.eval.yaml | 54 +++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/rename-client-property.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index 577e3de3a79..81b8eb23de1 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -31,6 +31,7 @@ suites: - "evals/typespec-generation-step02.eval.yaml" - "evals/get-modified-typespec-projects.eval.yaml" - "evals/add-arm-resource.eval.yaml" + - "evals/rename-client-property.eval.yaml" release-plan: evals: - "evals/create-release-plan.eval.yaml" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index 1f929a3e876..ce8bcc696ba 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -94,6 +94,7 @@ eval file: - [x] `link-namespace-approval-issue` - [x] `get-pr-link-current-branch` - [x] `check-sdk-generation-status` +- [x] `rename-client-property` **(stub — needs `expected-diff` grader + sparse-clone of `azure-rest-api-specs`)** ### Known gaps vs. the original benchmark @@ -111,6 +112,12 @@ those constraints are captured in prompt text and inline `TODO:` comments. ### Follow-ups +- [ ] Port the data-driven `AuthoringScenario` suite (29 TypeSpec versioning / + ARM / data-plane authoring cases from `TestData/TypeSpec/TestCases.json`). + Tracked in [#15767](https://github.com/Azure/azure-sdk-tools/issues/15767). + Blocked on: an AI-rubric grader, a `tsp compile` post-check grader, and a + fixture-copy setup hook (each case ships its own `.tsp` files + examples + and expects the agent to call the `azure-typespec-author` skill). 
- [ ] Port `Evaluate_PromptToToolMatch` + `Evaluate_ToolDescriptionSimilarity` from `Azure.Sdk.Tools.Cli.Evaluations` (still uses Copilot-SDK evaluator). - [ ] File upstream issue against `@microsoft/vally-cli` to add `forbidden`, diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/rename-client-property.eval.yaml new file mode 100644 index 00000000000..76b16d568f6 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/rename-client-property.eval.yaml @@ -0,0 +1,54 @@ +name: azsdk-mcp-tool-scenarios +description: | + Rename-client-property: the agent should rename @clientName("uri", "csharp") + to @clientName("imageUri", "csharp") on the AddFaceFromUrlRequest.url + property in specification/ai/Face/models.common.tsp. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: claude-opus-4.6 + executor: copilot-sdk + +stimuli: + - name: rename-client-property + # Ported from deleted RenameClientPropertyScenario (#15697). + # STUB: the original benchmark asserted an exact unified diff against + # specification/ai/Face/models.common.tsp via ExpectedDiffValidator, plus + # FileExistsValidator + ContainsValidator. None of those graders exist in + # Vally today. This eval currently only checks that the agent reached for + # the file-edit tool; the actual correctness of the rename is not graded. 
+ # TODO: add a custom "expected-diff" grader (or upstream a file-diff + # grader to @microsoft/vally-cli) to validate the unified diff: + # diff --git a/specification/ai/Face/models.common.tsp b/specification/ai/Face/models.common.tsp + # @@ -155,7 +155,7 @@ + # @doc("Add face from url request.") + # model AddFaceFromUrlRequest is AddFaceOptions { + # - @clientName("uri", "csharp") + # + @clientName("imageUri", "csharp") + # @doc("URL of input image.") + # url: url; + # } + # TODO: needs a setup hook to sparse-clone Azure/azure-rest-api-specs at + # specification/ai/Face into the workspace before the agent runs. + prompt: | + In the specification/ai/Face project, find the AddFaceFromUrlRequest model. + It has a property called 'url' that's been renamed to "uri" in c#. + Change that to imageUri for c#. + constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - edit + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 From 8e4f524b1262a01f50e35ea56b74e0d9d6dc55ea Mon Sep 17 00:00:00 2001 From: helen229 Date: Tue, 2 Jun 2026 11:48:42 -0700 Subject: [PATCH 04/24] Fix tool name prefix in graders, timeout format, expand README --- .../Azure.Sdk.Tools.Vally/.vally.yaml | 2 +- .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 98 +++++++++++++++---- .../evals/add-arm-resource.eval.yaml | 2 +- .../check-public-repo-then-validate.eval.yaml | 4 +- .../evals/check-public-repo.eval.yaml | 2 +- .../check-sdk-generation-status.eval.yaml | 2 +- .../evals/create-release-plan.eval.yaml | 2 +- .../get-modified-typespec-projects.eval.yaml | 2 +- .../get-pr-link-current-branch.eval.yaml | 2 +- .../link-namespace-approval-issue.eval.yaml | 2 +- .../typespec-generation-step02.eval.yaml | 2 +- .../evals/validate-typespec.eval.yaml | 2 +- 12 files changed, 91 insertions(+), 31 deletions(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index 81b8eb23de1..b052cbf44fc 100644 
--- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -17,7 +17,7 @@ environments: type: stdio command: dotnet args: ["run", "--project", "../Azure.Sdk.Tools.Cli", "--", "start"] - timeout: 300000 + timeout: "5m" env: AZSDKTOOLS_AGENT_TESTING: "false" AZSDKTOOLS_COLLECT_TELEMETRY: "false" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index ce8bcc696ba..e6ef107c77f 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -3,16 +3,73 @@ MCP-tool / end-to-end scenario evaluations for the `azsdk` MCP server, run via [`@microsoft/vally-cli`](https://www.npmjs.com/package/@microsoft/vally-cli). -These evals are **distinct from the skill evals under `.github/skills/`**: - -- **Skill evals** test that the agent picks and follows a specific skill - (routing + skill capability). -- **Tool-scenario evals here** test that, given a user prompt, the agent - invokes the right MCP tool(s) end-to-end — independent of any one skill. - This includes single-tool checks (e.g. "agent invokes - `azsdk_typespec_check_project_in_public_repo`") as well as multi-step - scenarios that span several MCP calls (release-plan, SDK generation, - release status, etc.). +## Tool-scenario evals vs. skill evals + +The repo runs **two complementary eval surfaces**, both via the same +`@microsoft/vally-cli` binary. They answer different questions and live in +different folders. A full end-to-end gate runs *both*. + +| | **Tool-scenario evals** (this project) | **Skill evals** | +|---|---|---| +| **Question** | Given a user prompt, does the agent invoke the right MCP tool(s) with the right shape? | Given a user prompt, does the agent route to the right skill and follow its instructions? 
| +| **Catches** | Tool name / description / parameter regressions; multi-tool ordering; tool-catalog conflicts | Skill frontmatter / `description` / instruction regressions; skill-routing collisions | +| **Path** | [`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/*.eval.yaml`](evals/) | [`.github/skills//evals/*.eval.yaml`](../../../.github/skills/) (and `evaluate/evals/` for capability suites) | +| **Loaded subject** | Production MCP server (`Azure.Sdk.Tools.Cli`) over stdio — real tools, real network calls | Skill's `SKILL.md` + frontmatter; the agent picks tools itself | +| **Primary grader** | `tool-calls` — checks the recorded trajectory for required tool names | Trigger / routing graders + per-skill rubric | +| **Run command** | `vally eval --eval-spec evals/.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/` *from repo root* | +| **CI status** | Not wired yet (see follow-ups) | `vally lint` runs in [.github/workflows/skill-eval.yml](../../../.github/workflows/skill-eval.yml); full `eval` job pending | +| **Cost profile** | Higher — each run spins up the MCP server, real LLM turns (~5–15), real tool calls | Variable — trigger evals are cheap; capability evals (e.g. `azure-typespec-author`) are expensive | + +### Why both? + +A skill *uses* tools, but a tool can be invoked **without** any skill +(Copilot picks it directly from the catalog when the user prompt doesn't +trigger a skill — which is most prompts in practice). Concretely: + +- Drop tool-scenario evals → you stop catching regressions when someone + renames a tool, edits its description, or adds an overlapping tool that + the model now prefers. +- Drop skill evals → you stop catching regressions when someone edits a + skill's `description`, frontmatter, or instruction body and the router + stops invoking it for the right prompts. + +For workflows where a skill is a thin wrapper around one tool, the two +evals have meaningful overlap and you may keep just one. 
For workflows +where the skill does real orchestration (multi-tool sequencing, +conditional branches, recovery), both matter independently. + +### Scenarios checked in today + +**Tool-scenario evals (this project)** — 11 scenarios under [`evals/`](evals/): + +| Scenario | Shape | +|---|---| +| [`check-public-repo`](evals/check-public-repo.eval.yaml) | Single-tool: is a TypeSpec project published in `azure-rest-api-specs`? | +| [`check-public-repo-then-validate`](evals/check-public-repo-then-validate.eval.yaml) | Multi-tool, ordered: validate then check | +| [`validate-typespec`](evals/validate-typespec.eval.yaml) | Single-tool: run `tsp` linter/validation | +| [`typespec-generation-step02`](evals/typespec-generation-step02.eval.yaml) | Step in the spec-PR generation flow | +| [`get-modified-typespec-projects`](evals/get-modified-typespec-projects.eval.yaml) | Git-aware tool against current branch | +| [`add-arm-resource`](evals/add-arm-resource.eval.yaml) | Calls `azsdk_typespec_generate_authoring_plan` for an ARM resource | +| [`create-release-plan`](evals/create-release-plan.eval.yaml) | Single-tool: create a release-plan work item | +| [`link-namespace-approval-issue`](evals/link-namespace-approval-issue.eval.yaml) | Link an existing approval issue to a release plan | +| [`get-pr-link-current-branch`](evals/get-pr-link-current-branch.eval.yaml) | Resolve the PR for the active git branch | +| [`check-sdk-generation-status`](evals/check-sdk-generation-status.eval.yaml) | Pipeline status lookup | +| [`rename-client-property`](evals/rename-client-property.eval.yaml) | Stub — needs `expected-diff` grader | + +**Skill evals (already in repo, *not* part of this PR)** — for reference: + +- **Trigger evals** (one per skill, verify routing): see e.g. 
+ [`.github/skills/azsdk-common-prepare-release-plan/evals/trigger.eval.yaml`](../../../.github/skills/azsdk-common-prepare-release-plan/evals/trigger.eval.yaml), + plus `azsdk-common-sdk-release`, `azsdk-common-pipeline-troubleshooting`, + `azsdk-common-apiview-feedback-resolution`, `sensei`, + `skill-authoring`, `markdown-token-optimizer`. +- **Capability suite** for [`azure-typespec-author`](../../../.github/skills/azure-typespec-author/) — + 29 numbered cases under + [`.github/skills/azure-typespec-author/evaluate/evals/`](../../../.github/skills/azure-typespec-author/evaluate/evals/) + (`001001.eval.yaml` … `005001.eval.yaml`). These are the data-driven + TypeSpec authoring scenarios that *would* have been our follow-up #1 + here — they're already covered as skill evals, so this project doesn't + re-port them. This project supersedes the deleted `Azure.Sdk.Tools.Cli.Benchmarks` project (removed in [#15697](https://github.com/Azure/azure-sdk-tools/pull/15697)) and @@ -112,17 +169,20 @@ those constraints are captured in prompt text and inline `TODO:` comments. ### Follow-ups -- [ ] Port the data-driven `AuthoringScenario` suite (29 TypeSpec versioning / - ARM / data-plane authoring cases from `TestData/TypeSpec/TestCases.json`). - Tracked in [#15767](https://github.com/Azure/azure-sdk-tools/issues/15767). - Blocked on: an AI-rubric grader, a `tsp compile` post-check grader, and a - fixture-copy setup hook (each case ships its own `.tsp` files + examples - and expects the agent to call the `azure-typespec-author` skill). - [ ] Port `Evaluate_PromptToToolMatch` + `Evaluate_ToolDescriptionSimilarity` from `Azure.Sdk.Tools.Cli.Evaluations` (still uses Copilot-SDK evaluator). - [ ] File upstream issue against `@microsoft/vally-cli` to add `forbidden`, `optional`, argument-matching, and ordering to the built-in `tool-calls` grader (or accept that those gaps need custom graders). 
-- [ ] Wire a `vally eval` CI job (current `.github/workflows/skill-eval.yml` - runs `vally lint` only). See [#15126](https://github.com/Azure/azure-sdk-tools/issues/15126) - and [#15127](https://github.com/Azure/azure-sdk-tools/issues/15127). +- [ ] Wire a `vally eval` CI job for this project (current + [`.github/workflows/skill-eval.yml`](../../../.github/workflows/skill-eval.yml) + runs `vally lint` only and is scoped to skills). See + [#15126](https://github.com/Azure/azure-sdk-tools/issues/15126) and + [#15127](https://github.com/Azure/azure-sdk-tools/issues/15127). +- [ ] Decide on `AuthoringScenario` parity: the 29 TypeSpec authoring cases + are already covered as **skill evals** under + [`.github/skills/azure-typespec-author/evaluate/evals/`](../../../.github/skills/azure-typespec-author/evaluate/evals/). + Tracked as [#15767](https://github.com/Azure/azure-sdk-tools/issues/15767) — + likely close as duplicate unless we also want tool-level coverage of the + same prompts (catches catalog regressions even when the skill isn't + triggered). 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml index c1fd9156c26..596bb4a3111 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml @@ -47,7 +47,7 @@ stimuli: config: required: - edit - - azure-sdk-mcp-azsdk_typespec_generate_authoring_plan + - azsdk_typespec_generate_authoring_plan scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml index fa93ad2f311..0f82553728c 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml @@ -30,8 +30,8 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_run_typespec_validation - - azure-sdk-mcp-azsdk_typespec_check_project_in_public_repo + - azsdk_run_typespec_validation + - azsdk_typespec_check_project_in_public_repo scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml index 60c269444a7..c0deb25bc8a 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml @@ -38,7 +38,7 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_typespec_check_project_in_public_repo + - azsdk_typespec_check_project_in_public_repo scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml index 75c321f201a..ef4617ea0c0 100644 --- 
a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml @@ -28,7 +28,7 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_get_pipeline_status + - azsdk_get_pipeline_status scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml index 31a51b6dd50..068e4f54b19 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml @@ -37,7 +37,7 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_create_release_plan + - azsdk_create_release_plan scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml index 74aa5cd9a63..b5cb73c5b07 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml @@ -43,7 +43,7 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_get_modified_typespec_projects + - azsdk_get_modified_typespec_projects scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml index 4f4320714b2..ac957a97391 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml @@ -28,7 +28,7 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_get_pull_request_link_for_current_branch + - azsdk_get_pull_request_link_for_current_branch 
scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml index efb593f8787..b3e9cac0576 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml @@ -29,7 +29,7 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_link_namespace_approval_issue + - azsdk_link_namespace_approval_issue scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml index c1af5286811..7a7717824a0 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml @@ -30,7 +30,7 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_typespec_check_project_in_public_repo + - azsdk_typespec_check_project_in_public_repo scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml index 50d20f05f43..0a66b39a715 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml @@ -29,7 +29,7 @@ stimuli: - type: tool-calls config: required: - - azure-sdk-mcp-azsdk_run_typespec_validation + - azsdk_run_typespec_validation scoring: weights: From d9ea3e4e6ed9d2821bc33f6c71ae50cc9c058ac9 Mon Sep 17 00:00:00 2001 From: helen229 Date: Tue, 2 Jun 2026 11:56:45 -0700 Subject: [PATCH 05/24] Reorganize evals into scenarios/ and triggers/; port trigger evals from #15183 - Move 11 multi-step scenario evals to evals/scenarios/ - Port 9 per-tool trigger 
evals from jeo02/migrate-evaluations-to-vally (PR #15183) to evals/triggers/, stripped azure-sdk-mcp- prefix from graders to match bare MCP tool names - Port Validate-EvalTools.ps1 to scripts/, retargeted at evals/triggers/ with bare-name regex - Update .vally.yaml suites for new layout (scenarios, triggers, all) - Update README to document the split and per-trigger-file tool coverage - Add .gitignore for vally-results/ and results/ --- .../Azure.Sdk.Tools.Vally/.gitignore | 2 + .../Azure.Sdk.Tools.Vally/.vally.yaml | 35 +- .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 48 ++- .../add-arm-resource.eval.yaml | 0 .../check-public-repo-then-validate.eval.yaml | 0 .../check-public-repo.eval.yaml | 0 .../check-sdk-generation-status.eval.yaml | 0 .../create-release-plan.eval.yaml | 0 .../get-modified-typespec-projects.eval.yaml | 0 .../get-pr-link-current-branch.eval.yaml | 0 .../link-namespace-approval-issue.eval.yaml | 0 .../rename-client-property.eval.yaml | 0 .../typespec-generation-step02.eval.yaml | 0 .../validate-typespec.eval.yaml | 0 .../evals/triggers/apiview.eval.yaml | 112 +++++++ .../evals/triggers/config.eval.yaml | 51 +++ .../evals/triggers/engsys.eval.yaml | 99 ++++++ .../evals/triggers/github.eval.yaml | 76 +++++ .../evals/triggers/package.eval.yaml | 230 +++++++++++++ .../evals/triggers/pipeline.eval.yaml | 74 +++++ .../evals/triggers/releaseplan.eval.yaml | 314 ++++++++++++++++++ .../evals/triggers/typespec.eval.yaml | 187 +++++++++++ .../evals/triggers/verify.eval.yaml | 42 +++ .../scripts/Validate-EvalTools.ps1 | 160 +++++++++ 24 files changed, 1404 insertions(+), 26 deletions(-) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/add-arm-resource.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/check-public-repo-then-validate.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => 
scenarios}/check-public-repo.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/check-sdk-generation-status.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/create-release-plan.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/get-modified-typespec-projects.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/get-pr-link-current-branch.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/link-namespace-approval-issue.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/rename-client-property.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/typespec-generation-step02.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{ => scenarios}/validate-typespec.eval.yaml (100%) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/apiview.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/config.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/engsys.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/github.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/package.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/pipeline.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/releaseplan.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/typespec.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/verify.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore new file mode 100644 index 00000000000..80a68f12750 --- 
/dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore @@ -0,0 +1,2 @@ +vally-results/ +results/ diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index b052cbf44fc..a5bb1d43da4 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -22,25 +22,36 @@ environments: AZSDKTOOLS_AGENT_TESTING: "false" AZSDKTOOLS_COLLECT_TELEMETRY: "false" +# Suites group evals for selective execution. +# - scenarios/* = multi-step workflow evals (this PR) +# - triggers/* = per-tool single-prompt invocation evals (ported from PR #15183) suites: + # Scenario suites (multi-step workflows) typespec: evals: - - "evals/check-public-repo.eval.yaml" - - "evals/check-public-repo-then-validate.eval.yaml" - - "evals/validate-typespec.eval.yaml" - - "evals/typespec-generation-step02.eval.yaml" - - "evals/get-modified-typespec-projects.eval.yaml" - - "evals/add-arm-resource.eval.yaml" - - "evals/rename-client-property.eval.yaml" + - "evals/scenarios/check-public-repo.eval.yaml" + - "evals/scenarios/check-public-repo-then-validate.eval.yaml" + - "evals/scenarios/validate-typespec.eval.yaml" + - "evals/scenarios/typespec-generation-step02.eval.yaml" + - "evals/scenarios/get-modified-typespec-projects.eval.yaml" + - "evals/scenarios/add-arm-resource.eval.yaml" + - "evals/scenarios/rename-client-property.eval.yaml" release-plan: evals: - - "evals/create-release-plan.eval.yaml" - - "evals/link-namespace-approval-issue.eval.yaml" + - "evals/scenarios/create-release-plan.eval.yaml" + - "evals/scenarios/link-namespace-approval-issue.eval.yaml" github: evals: - - "evals/get-pr-link-current-branch.eval.yaml" + - "evals/scenarios/get-pr-link-current-branch.eval.yaml" pipeline: evals: - - "evals/check-sdk-generation-status.eval.yaml" + - "evals/scenarios/check-sdk-generation-status.eval.yaml" + scenarios: + evals: ["evals/scenarios/*.eval.yaml"] + + # Trigger suite 
(per-tool prompt → tool invocation coverage) + triggers: + evals: ["evals/triggers/*.eval.yaml"] + all: - evals: ["evals/*.eval.yaml"] + evals: ["evals/**/*.eval.yaml"] diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index e6ef107c77f..34b89ffc069 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -13,10 +13,10 @@ different folders. A full end-to-end gate runs *both*. |---|---|---| | **Question** | Given a user prompt, does the agent invoke the right MCP tool(s) with the right shape? | Given a user prompt, does the agent route to the right skill and follow its instructions? | | **Catches** | Tool name / description / parameter regressions; multi-tool ordering; tool-catalog conflicts | Skill frontmatter / `description` / instruction regressions; skill-routing collisions | -| **Path** | [`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/*.eval.yaml`](evals/) | [`.github/skills//evals/*.eval.yaml`](../../../.github/skills/) (and `evaluate/evals/` for capability suites) | +| **Path** | [`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/`](evals/) (`scenarios/` + `triggers/`) | [`.github/skills/<skill>/evals/*.eval.yaml`](../../../.github/skills/) (and `evaluate/evals/` for capability suites) | | **Loaded subject** | Production MCP server (`Azure.Sdk.Tools.Cli`) over stdio — real tools, real network calls | Skill's `SKILL.md` + frontmatter; the agent picks tools itself | | **Primary grader** | `tool-calls` — checks the recorded trajectory for required tool names | Trigger / routing graders + per-skill rubric | -| **Run command** | `vally eval --eval-spec evals/.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/` *from repo root* | +| **Run command** | `vally eval --eval-spec evals/scenarios/<name>.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/<skill>` *from repo root* | | **CI status** | Not wired yet (see follow-ups) | 
`vally lint` runs in [.github/workflows/skill-eval.yml](../../../.github/workflows/skill-eval.yml); full `eval` job pending | | **Cost profile** | Higher — each run spins up the MCP server, real LLM turns (~5–15), real tool calls | Variable — trigger evals are cheap; capability evals (e.g. `azure-typespec-author`) are expensive | @@ -40,21 +40,41 @@ conditional branches, recovery), both matter independently. ### Scenarios checked in today -**Tool-scenario evals (this project)** — 11 scenarios under [`evals/`](evals/): +**Tool-scenario evals (this project)** — split into two suites under [`evals/`](evals/): + +#### `evals/scenarios/` — multi-step workflow evals (11) | Scenario | Shape | |---|---| -| [`check-public-repo`](evals/check-public-repo.eval.yaml) | Single-tool: is a TypeSpec project published in `azure-rest-api-specs`? | -| [`check-public-repo-then-validate`](evals/check-public-repo-then-validate.eval.yaml) | Multi-tool, ordered: validate then check | -| [`validate-typespec`](evals/validate-typespec.eval.yaml) | Single-tool: run `tsp` linter/validation | -| [`typespec-generation-step02`](evals/typespec-generation-step02.eval.yaml) | Step in the spec-PR generation flow | -| [`get-modified-typespec-projects`](evals/get-modified-typespec-projects.eval.yaml) | Git-aware tool against current branch | -| [`add-arm-resource`](evals/add-arm-resource.eval.yaml) | Calls `azsdk_typespec_generate_authoring_plan` for an ARM resource | -| [`create-release-plan`](evals/create-release-plan.eval.yaml) | Single-tool: create a release-plan work item | -| [`link-namespace-approval-issue`](evals/link-namespace-approval-issue.eval.yaml) | Link an existing approval issue to a release plan | -| [`get-pr-link-current-branch`](evals/get-pr-link-current-branch.eval.yaml) | Resolve the PR for the active git branch | -| [`check-sdk-generation-status`](evals/check-sdk-generation-status.eval.yaml) | Pipeline status lookup | -| 
[`rename-client-property`](evals/rename-client-property.eval.yaml) | Stub — needs `expected-diff` grader | +| [`check-public-repo`](evals/scenarios/check-public-repo.eval.yaml) | Single-tool: is a TypeSpec project published in `azure-rest-api-specs`? | +| [`check-public-repo-then-validate`](evals/scenarios/check-public-repo-then-validate.eval.yaml) | Multi-tool, ordered: validate then check | +| [`validate-typespec`](evals/scenarios/validate-typespec.eval.yaml) | Single-tool: run `tsp` linter/validation | +| [`typespec-generation-step02`](evals/scenarios/typespec-generation-step02.eval.yaml) | Step in the spec-PR generation flow | +| [`get-modified-typespec-projects`](evals/scenarios/get-modified-typespec-projects.eval.yaml) | Git-aware tool against current branch | +| [`add-arm-resource`](evals/scenarios/add-arm-resource.eval.yaml) | Calls `azsdk_typespec_generate_authoring_plan` for an ARM resource | +| [`create-release-plan`](evals/scenarios/create-release-plan.eval.yaml) | Single-tool: create a release-plan work item | +| [`link-namespace-approval-issue`](evals/scenarios/link-namespace-approval-issue.eval.yaml) | Link an existing approval issue to a release plan | +| [`get-pr-link-current-branch`](evals/scenarios/get-pr-link-current-branch.eval.yaml) | Resolve the PR for the active git branch | +| [`check-sdk-generation-status`](evals/scenarios/check-sdk-generation-status.eval.yaml) | Pipeline status lookup | +| [`rename-client-property`](evals/scenarios/rename-client-property.eval.yaml) | Stub — needs `expected-diff` grader | + +#### `evals/triggers/` — per-tool prompt → tool invocation coverage (ported from [#15183](https://github.com/Azure/azure-sdk-tools/pull/15183)) + +One YAML per tool category; each stimulus is a single prompt expected to invoke a single MCP tool. Used to catch tool-rename / description-drift regressions. 
+ +| File | Tools covered | +|---|---| +| [`apiview.eval.yaml`](evals/triggers/apiview.eval.yaml) | `azsdk_apiview_*` | +| [`config.eval.yaml`](evals/triggers/config.eval.yaml) | `azsdk_check_service_label`, `azsdk_create_service_label` | +| [`engsys.eval.yaml`](evals/triggers/engsys.eval.yaml) | `azsdk_analyze_log_file`, failed-test tools, codeowner-cache | +| [`github.eval.yaml`](evals/triggers/github.eval.yaml) | `azsdk_create_pull_request`, `azsdk_get_pull_request*`, `azsdk_get_github_user_details` | +| [`package.eval.yaml`](evals/triggers/package.eval.yaml) | `azsdk_package_*`, `azsdk_release_sdk` | +| [`pipeline.eval.yaml`](evals/triggers/pipeline.eval.yaml) | `azsdk_analyze_pipeline`, `azsdk_get_pipeline_*` | +| [`releaseplan.eval.yaml`](evals/triggers/releaseplan.eval.yaml) | `azsdk_*_release_plan*`, `azsdk_run_generate_sdk`, `azsdk_link_*` | +| [`typespec.eval.yaml`](evals/triggers/typespec.eval.yaml) | `azsdk_typespec_*`, `azsdk_convert_swagger_to_typespec`, `azsdk_customized_code_update`, `azsdk_run_typespec_validation` | +| [`verify.eval.yaml`](evals/triggers/verify.eval.yaml) | `azsdk_verify_setup` | + +The companion [`scripts/Validate-EvalTools.ps1`](scripts/Validate-EvalTools.ps1) cross-checks that every tool referenced in `evals/triggers/` exists on the running MCP server, and every server tool has at least one trigger. 
**Skill evals (already in repo, *not* part of this PR)** — for reference: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/add-arm-resource.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo-then-validate.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-public-repo.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/check-sdk-generation-status.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/create-release-plan.eval.yaml rename to 
tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-modified-typespec-projects.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/get-pr-link-current-branch.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/link-namespace-approval-issue.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/rename-client-property.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml similarity index 100% rename from 
tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/typespec-generation-step02.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/validate-typespec.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/apiview.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/apiview.eval.yaml new file mode 100644 index 00000000000..afeea4f8ee2 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/apiview.eval.yaml @@ -0,0 +1,112 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. +version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + priority: p0 + +stimuli: + + # ==== azsdk_apiview_get_comments triggers ==== + - name: invoke-azsdk-apiview-get-comments-1 + prompt: "Get all the APIView comments for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_comments" + - name: invoke-azsdk-apiview-get-comments-2 + prompt: "Show me the API review feedback for this package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_comments" + - name: invoke-azsdk-apiview-get-comments-3 + prompt: "What comments did the API reviewers leave?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_comments" + + # ==== azsdk_apiview_get_copilot_review triggers ==== + - name: invoke-azsdk-apiview-get-copilot-review-1 + prompt: "Check if my Copilot review is done" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_copilot_review" + - name: invoke-azsdk-apiview-get-copilot-review-2 + prompt: "Get the results of my automated API review job" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_copilot_review" + - name: invoke-azsdk-apiview-get-copilot-review-3 + prompt: "What comments did the Copilot generate for my API review?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_copilot_review" + + # ==== azsdk_apiview_get_review_url triggers ==== + - name: invoke-azsdk-apiview-get-review-url-1 + prompt: "Get the APIView review link for the Azure.Storage.Blobs C# package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_review_url" + - name: invoke-azsdk-apiview-get-review-url-2 + prompt: "What is the APIView URL for the azure-core Python package?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_review_url" + - name: invoke-azsdk-apiview-get-review-url-3 + prompt: > + Give me the link to the API review page for the Java storage blob package version 12.32.0 + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_get_review_url" + + # ==== azsdk_apiview_request_copilot_review triggers ==== + - name: invoke-azsdk-apiview-request-copilot-review-1 + prompt: "Request a Copilot review for this API" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_request_copilot_review" + - name: invoke-azsdk-apiview-request-copilot-review-2 + prompt: "Run an automated review on my package's API surface" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_request_copilot_review" + - name: invoke-azsdk-apiview-request-copilot-review-3 + prompt: "Submit this APIView URL for an automated Copilot review" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_apiview_request_copilot_review" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/config.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/config.eval.yaml new file mode 100644 index 00000000000..29dad5ba24c --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/config.eval.yaml @@ -0,0 +1,51 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + priority: p0 + +stimuli: + + # ==== azsdk_check_service_label triggers ==== + - name: invoke-azsdk-check-service-label-1 + prompt: "Check if a service label exists for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_check_service_label" + - name: invoke-azsdk-check-service-label-2 + prompt: "Does the service label for Contoso already exist?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_check_service_label" + + # ==== azsdk_create_service_label triggers ==== + - name: invoke-azsdk-create-service-label-1 + prompt: "Create a new service label for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_service_label" + - name: invoke-azsdk-create-service-label-2 + prompt: "Add a service label for Contoso Widget Manager" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_service_label" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/engsys.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/engsys.eval.yaml new file mode 100644 index 00000000000..1ad3ca5b5d2 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/engsys.eval.yaml @@ -0,0 +1,99 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + priority: p0 + +stimuli: + + # ==== azsdk_analyze_log_file triggers ==== + - name: invoke-azsdk-analyze-log-file-1 + prompt: "Analyze this log file for errors" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_analyze_log_file" + - name: invoke-azsdk-analyze-log-file-2 + prompt: "What errors are in this build log?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_analyze_log_file" + + # ==== azsdk_cleanup_ai_agents triggers ==== + - name: invoke-azsdk-cleanup-ai-agents-1 + prompt: "Clean up AI agents in my project" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_cleanup_ai_agents" + + # ==== azsdk_get_failed_test_case_data triggers ==== + - name: invoke-azsdk-get-failed-test-case-data-1 + prompt: "Get detailed information about a specific failed test" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_case_data" + - name: invoke-azsdk-get-failed-test-case-data-2 + prompt: "Show me the error message and stack trace for the failed test TestAuthentication" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_case_data" + + # ==== azsdk_get_failed_test_cases triggers ==== + - name: invoke-azsdk-get-failed-test-cases-1 + prompt: "Get the list of failed test cases from my test run" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_cases" + - name: invoke-azsdk-get-failed-test-cases-2 + prompt: "What tests failed in this TRX file?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_cases" + - name: invoke-azsdk-get-failed-test-cases-3 + prompt: "Show me which tests failed" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_cases" + + # ==== azsdk_get_failed_test_run_data triggers ==== + - name: invoke-azsdk-get-failed-test-run-data-1 + prompt: "Get complete details for all failed tests" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_run_data" + - name: invoke-azsdk-get-failed-test-run-data-2 + prompt: "Show me full information about all test failures including stack traces" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_failed_test_run_data" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/github.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/github.eval.yaml new file mode 100644 index 00000000000..047571f9c70 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/github.eval.yaml @@ -0,0 +1,76 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp + +tags: + priority: p0 + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +stimuli: + + # ==== azsdk_create_pull_request triggers ==== + - name: invoke-azsdk-create-pull-request-1 + prompt: "Create a pull request for my changes" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_pull_request" + + # ==== azsdk_get_github_user_details triggers ==== + - name: invoke-azsdk-get-github-user-details-1 + prompt: "Get details for GitHub user octocat" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_github_user_details" + - name: invoke-azsdk-get-github-user-details-2 + prompt: "Who is the GitHub user johndoe?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_github_user_details" + + # ==== azsdk_get_pull_request triggers ==== + - name: invoke-azsdk-get-pull-request-1 + prompt: "Get the details of my pull request" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pull_request" + - name: invoke-azsdk-get-pull-request-2 + prompt: "Show me the status and comments on PR #1234" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pull_request" + + # ==== azsdk_get_pull_request_link_for_current_branch triggers ==== + - name: invoke-azsdk-get-pull-request-link-for-current-branch-1 + prompt: "Get the PR link for my current branch" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pull_request_link_for_current_branch" + - name: invoke-azsdk-get-pull-request-link-for-current-branch-2 + prompt: "What's the pull request URL for this branch?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pull_request_link_for_current_branch" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/package.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/package.eval.yaml new file mode 100644 index 00000000000..dd4430062b0 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/package.eval.yaml @@ -0,0 +1,230 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. +version: "1.0" +type: capability + +environment: azsdk-mcp + +tags: + priority: p0 + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +stimuli: + + # ==== azsdk_package_build_code triggers ==== + - name: invoke-azsdk-package-build-code-1 + prompt: "Build my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_build_code" + - name: invoke-azsdk-package-build-code-2 + prompt: "Compile the code for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_build_code" + + # ==== azsdk_package_generate_code triggers ==== + - name: invoke-azsdk-package-generate-code-1 + prompt: "Generate SDK code from my TypeSpec" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_code" + - name: invoke-azsdk-package-generate-code-2 + prompt: "Run code generation for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_code" + + # ==== azsdk_package_generate_samples triggers ==== + - name: invoke-azsdk-package-generate-samples-1 + prompt: "Generate sample code for my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_samples" + - name: invoke-azsdk-package-generate-samples-2 + prompt: "Create sample code for my package based 
on these scenarios" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_samples" + - name: invoke-azsdk-package-generate-samples-3 + prompt: "Generate samples for my package using this prompt" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_generate_samples" + + # ==== azsdk_package_pack triggers ==== + - name: invoke-azsdk-package-pack-1 + prompt: "Pack my SDK package into a distributable artifact" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_pack" + - name: invoke-azsdk-package-pack-2 + prompt: "Create distributable package artifacts for my SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_pack" + - name: invoke-azsdk-package-pack-3 + prompt: "Generate package artifacts for my SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_pack" + + # ==== azsdk_package_run_check triggers ==== + - name: invoke-azsdk-package-run-check-1 + prompt: "Run the azsdk package check command to validate my SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_check" + - name: invoke-azsdk-package-run-check-2 + prompt: "Run validation checks on my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_check" + - name: invoke-azsdk-package-run-check-3 + prompt: "Validate the changelog and dependencies for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_check" + + # ==== azsdk_package_run_tests triggers ==== + - name: invoke-azsdk-package-run-tests-1 + prompt: "Run tests for my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_tests" + - name: invoke-azsdk-package-run-tests-2 + prompt: "Run the tests for my specified SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_run_tests" + + # ==== 
azsdk_package_translate_samples triggers ==== + - name: invoke-azsdk-package-translate-samples-1 + prompt: "Translate the sample code from the Python package to the Java package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_translate_samples" + - name: invoke-azsdk-package-translate-samples-2 + prompt: "Convert samples from the source package to the target language package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_translate_samples" + - name: invoke-azsdk-package-translate-samples-3 + prompt: "Translate SDK samples from one language to another" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_translate_samples" + + # ==== azsdk_package_update_changelog_content triggers ==== + - name: invoke-azsdk-package-update-changelog-content-1 + prompt: "Update the changelog for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_changelog_content" + - name: invoke-azsdk-package-update-changelog-content-2 + prompt: "Update the changelog content for my package with new release notes" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_changelog_content" + + # ==== azsdk_package_update_metadata triggers ==== + - name: invoke-azsdk-package-update-metadata-1 + prompt: "Update the package metadata" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_metadata" + - name: invoke-azsdk-package-update-metadata-2 + prompt: "Update the package metadata for my SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_metadata" + + # ==== azsdk_package_update_version triggers ==== + - name: invoke-azsdk-package-update-version-1 + prompt: "Update my package version" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_version" + - name: invoke-azsdk-package-update-version-2 + prompt: "Bump the version to 1.2.0" + 
graders: + - type: tool-calls + config: + required: + - name: "azsdk_package_update_version" + + # ==== azsdk_release_sdk triggers ==== + - name: invoke-azsdk-release-sdk-1 + prompt: "Release my SDK package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_release_sdk" + - name: invoke-azsdk-release-sdk-2 + prompt: "Trigger the release pipeline for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_release_sdk" + - name: invoke-azsdk-release-sdk-3 + prompt: "Start the SDK release process for my package" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_release_sdk" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/pipeline.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/pipeline.eval.yaml new file mode 100644 index 00000000000..74ea1342082 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/pipeline.eval.yaml @@ -0,0 +1,74 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. +version: "1.0" +type: capability + +environment: azsdk-mcp + +tags: + priority: p0 + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +stimuli: + + # ==== azsdk_analyze_pipeline triggers ==== + - name: invoke-azsdk-analyze-pipeline-1 + prompt: "Analyze my pipeline run" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_analyze_pipeline" + - name: invoke-azsdk-analyze-pipeline-2 + prompt: "What happened in this pipeline build?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_analyze_pipeline" + + # ==== azsdk_get_pipeline_llm_artifacts triggers ==== + - name: invoke-azsdk-get-pipeline-llm-artifacts-1 + prompt: "Get the LLM artifacts from my pipeline" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_llm_artifacts" + - name: invoke-azsdk-get-pipeline-llm-artifacts-2 + prompt: "Download the analysis artifacts from the pipeline run" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_llm_artifacts" + + # ==== azsdk_get_pipeline_status triggers ==== + - name: invoke-azsdk-get-pipeline-status-1 + prompt: "Check the status of my Azure pipeline build 12345678" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_status" + - name: invoke-azsdk-get-pipeline-status-2 + prompt: "Get the pipeline build status for run 9876543" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_status" + - name: invoke-azsdk-get-pipeline-status-3 + prompt: "Get the pipeline build status for my CI run" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_pipeline_status" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/releaseplan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/releaseplan.eval.yaml new file mode 100644 index 00000000000..b99e0bda5bf --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/releaseplan.eval.yaml @@ -0,0 +1,314 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp + +tags: + priority: p0 + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +stimuli: + + # ==== azsdk_abandon_release_plan triggers ==== + - name: invoke-azsdk-abandon-release-plan-1 + prompt: "Abandon the release plan for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + - name: invoke-azsdk-abandon-release-plan-2 + prompt: "Cancel and abandon my release plan for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + - name: invoke-azsdk-abandon-release-plan-3 + prompt: "Mark the release plan for work item 12345 as abandoned" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + - name: invoke-azsdk-abandon-release-plan-4 + prompt: "Abandon my release plan for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + - name: invoke-azsdk-abandon-release-plan-5 + prompt: "Cancel the release plan for my service for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_abandon_release_plan" + + # ==== azsdk_check_api_spec_ready_for_sdk triggers ==== + - name: invoke-azsdk-check-api-spec-ready-for-sdk-1 + prompt: "Check if my API spec is ready to generate SDK" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_check_api_spec_ready_for_sdk" + - name: invoke-azsdk-check-api-spec-ready-for-sdk-2 + prompt: "Is my TypeSpec ready for SDK generation?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_check_api_spec_ready_for_sdk" + + # ==== azsdk_create_release_plan triggers ==== + - name: invoke-azsdk-create-release-plan-1 + prompt: "Create a release plan for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_release_plan" + - name: invoke-azsdk-create-release-plan-2 + prompt: "Create a release plan for Contoso Widget Manager service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_create_release_plan" + + # ==== azsdk_get_release_plan triggers ==== + - name: invoke-azsdk-get-release-plan-1 + prompt: "Get the release plan for work item 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_release_plan" + - name: invoke-azsdk-get-release-plan-2 + prompt: "Show me the release plan details" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_release_plan" + + # ==== azsdk_get_release_plan_for_spec_pr triggers ==== + - name: invoke-azsdk-get-release-plan-for-spec-pr-1 + prompt: "Get the release plan for my spec PR" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_release_plan_for_spec_pr" + - name: invoke-azsdk-get-release-plan-for-spec-pr-2 + prompt: "What release plan is associated with this spec pull request?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_release_plan_for_spec_pr" + + # ==== azsdk_get_sdk_pull_request_link triggers ==== + - name: invoke-azsdk-get-sdk-pull-request-link-1 + prompt: "Get the SDK pull request link from the generation pipeline" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_sdk_pull_request_link" + - name: invoke-azsdk-get-sdk-pull-request-link-2 + prompt: "Where is the PR created by SDK generation?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_sdk_pull_request_link" + + # ==== azsdk_get_service_details_by_typespec_path triggers ==== + - name: invoke-azsdk-get-service-details-by-typespec-path-1 + prompt: "Get the service tree details for my TypeSpec project path" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-2 + prompt: "Look up the service and product details using the TypeSpec project path" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-3 + prompt: "What service tree ID and product info is associated with this TypeSpec path?" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-4 + prompt: "Find product details for my typespec project" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-5 + prompt: "What service does this TypeSpec project belong to?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + - name: invoke-azsdk-get-service-details-by-typespec-path-6 + prompt: > + Get service and service tree product details for a product using TypeSpec project path + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_service_details_by_typespec_path" + + # ==== azsdk_link_namespace_approval_issue triggers ==== + - name: invoke-azsdk-link-namespace-approval-issue-1 + prompt: "Link namespace approval issue to release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_link_namespace_approval_issue" + - name: invoke-azsdk-link-namespace-approval-issue-2 + prompt: "Associate the namespace approval with my release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_link_namespace_approval_issue" + + # ==== azsdk_link_sdk_pull_request_to_release_plan triggers ==== + - name: invoke-azsdk-link-sdk-pull-request-to-release-plan-1 + prompt: "Link my SDK pull request to the release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_link_sdk_pull_request_to_release_plan" + - name: invoke-azsdk-link-sdk-pull-request-to-release-plan-2 + prompt: "Link SDK pull request #5678 to release plan 12345" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_link_sdk_pull_request_to_release_plan" + + # ==== azsdk_run_generate_sdk triggers ==== + - name: invoke-azsdk-run-generate-sdk-1 + prompt: "Generate SDK from my TypeSpec project using the pipeline" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_run_generate_sdk" + - name: invoke-azsdk-run-generate-sdk-2 + prompt: "Generate SDK for my TypeSpec project using the pipeline" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_run_generate_sdk" + + # ==== azsdk_update_api_spec_pull_request_in_release_plan triggers ==== + - name: 
invoke-azsdk-update-api-spec-pull-request-in-release-plan-1 + prompt: "Update the TypeSpec PR URL in the release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_api_spec_pull_request_in_release_plan" + - name: invoke-azsdk-update-api-spec-pull-request-in-release-plan-2 + prompt: "Update the TypeSpec pull request URL in my release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_api_spec_pull_request_in_release_plan" + + # ==== azsdk_update_language_exclusion_justification triggers ==== + - name: invoke-azsdk-update-language-exclusion-justification-1 + prompt: "Update the language exclusion justification" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_language_exclusion_justification" + - name: invoke-azsdk-update-language-exclusion-justification-2 + prompt: "Explain why Python is excluded from this release" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_language_exclusion_justification" + + # ==== azsdk_update_release_plan triggers ==== + - name: invoke-azsdk-update-release-plan-1 + prompt: "Update the release plan with my TypeSpec project path and API version" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + - name: invoke-azsdk-update-release-plan-2 + prompt: "Update the spec PR URL and SDK release type in my release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + - name: invoke-azsdk-update-release-plan-3 + prompt: "Update the existing release plan for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + - name: invoke-azsdk-update-release-plan-4 + prompt: "Update my release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + - name: invoke-azsdk-update-release-plan-5 + prompt: "Update TypeSpec project in release plan" + graders: + - 
type: tool-calls + config: + required: + - name: "azsdk_update_release_plan" + + # ==== azsdk_update_sdk_details_in_release_plan triggers ==== + - name: invoke-azsdk-update-sdk-details-in-release-plan-1 + prompt: "Update SDK details in the release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_sdk_details_in_release_plan" + - name: invoke-azsdk-update-sdk-details-in-release-plan-2 + prompt: "Change the SDK package name in the release plan" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_update_sdk_details_in_release_plan" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/typespec.eval.yaml new file mode 100644 index 00000000000..c87605ef9bc --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/typespec.eval.yaml @@ -0,0 +1,187 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + priority: p0 + +stimuli: + + # ==== azsdk_convert_swagger_to_typespec triggers ==== + - name: invoke-azsdk-convert-swagger-to-typespec-1 + prompt: "Convert my swagger to TypeSpec" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_convert_swagger_to_typespec" + - name: invoke-azsdk-convert-swagger-to-typespec-2 + prompt: "Migrate my API from swagger to TypeSpec" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_convert_swagger_to_typespec" + + # ==== azsdk_customized_code_update triggers ==== + - name: invoke-azsdk-customized-code-update-1 + prompt: "Update customized code with patches to fix build errors" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_customized_code_update" + - name: invoke-azsdk-customized-code-update-2 + prompt: "Apply customized code patches and rebuild to fix errors" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_customized_code_update" + + # ==== azsdk_get_modified_typespec_projects triggers ==== + - name: invoke-azsdk-get-modified-typespec-projects-1 + prompt: "What TypeSpec projects were modified in my branch?" 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_modified_typespec_projects" + - name: invoke-azsdk-get-modified-typespec-projects-2 + prompt: "List the changed TypeSpec projects" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_get_modified_typespec_projects" + + # ==== azsdk_run_typespec_validation triggers ==== + - name: invoke-azsdk-run-typespec-validation-1 + prompt: "Run TypeSpec validation on my project" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_run_typespec_validation" + - name: invoke-azsdk-run-typespec-validation-2 + prompt: "Run TypeSpec configuration validation for my project root path" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_run_typespec_validation" + + # ==== azsdk_typespec_check_project_in_public_repo triggers ==== + - name: invoke-azsdk-typespec-check-project-in-public-repo-1 + prompt: "Check if my TypeSpec project is in the public repo" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_check_project_in_public_repo" + - name: invoke-azsdk-typespec-check-project-in-public-repo-2 + prompt: "Check if my TypeSpec project is in the public spec repo" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_check_project_in_public_repo" + + # ==== azsdk_typespec_delegate_apiview_feedback triggers ==== + - name: invoke-azsdk-typespec-delegate-apiview-feedback-1 + prompt: "Delegate the APIView feedback to Copilot for resolution" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-2 + prompt: "Address the APIView comments by creating a GitHub issue and assigning Copilot" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-3 + prompt: "Resolve the APIView reviewer feedback from 
this URL" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-4 + prompt: > + Help me fix these comments: https://spa.apiview.dev/review/c375391d5ab9419f83e3bdsfas9asdfadf2e?activeApiRevisionId=fc2a4adfasdfasdagae3w3hhtd + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-5 + prompt: > + Fix this feedback: https://spa.apiview.dev/review/c375391d5ab9419f83e3bdsfas9asdfadf2e?activeApiRevisionId=fc2a4adfasdfasdagae3w3hhtd + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + - name: invoke-azsdk-typespec-delegate-apiview-feedback-6 + prompt: > + Create an issue and assign to copilot to fix this: https://spa.apiview.dev/review/adfaset5391d5ab9419f83e3bds9asdfadf2e?activeApiRevisionId=adf34adfasadastasdagae3w3hhtd + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_delegate_apiview_feedback" + + # ==== azsdk_typespec_generate_authoring_plan triggers ==== + - name: invoke-azsdk-typespec-generate-authoring-plan-1 + prompt: > + Generate a solution to add a new resource 'asset' for service widget with TypeSpec. + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_generate_authoring_plan" + - name: invoke-azsdk-typespec-generate-authoring-plan-2 + prompt: "Generate a solution to add a new api version for service widget with TypeSpec." + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_generate_authoring_plan" + - name: invoke-azsdk-typespec-generate-authoring-plan-3 + prompt: > + Generate a solution to set a default value `21` for property `age` in model EmployeeProperties from a api version say 2025-11-01 with TypeSpec. 
+ graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_generate_authoring_plan" + + # ==== azsdk_typespec_init_project triggers ==== + - name: invoke-azsdk-typespec-init-project-1 + prompt: "Initialize a new TypeSpec project" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_init_project" + - name: invoke-azsdk-typespec-init-project-2 + prompt: "Initialize a new TypeSpec project for my service" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_typespec_init_project" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/verify.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/verify.eval.yaml new file mode 100644 index 00000000000..9c6c24c3cb5 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/verify.eval.yaml @@ -0,0 +1,42 @@ +name: azsdk-mcp-tool-invocation-eval +description: | + Verify that prompts correctly invoke the expected Azure SDK MCP tools. + Each stimulus is a single user prompt that should trigger a specific tool. 
+version: "1.0" +type: capability + +environment: azsdk-mcp + +config: + runs: 5 + timeout: 120 + executor: copilot-sdk + model: claude-opus-4.6 + +tags: + priority: p0 + +stimuli: + + # ==== azsdk_verify_setup triggers ==== + - name: invoke-azsdk-verify-setup-1 + prompt: "Verify my environment setup" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_verify_setup" + - name: invoke-azsdk-verify-setup-2 + prompt: "Verify my developer environment setup for MCP tools" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_verify_setup" + - name: invoke-azsdk-verify-setup-3 + prompt: "Verify my MCP release tool setup" + graders: + - type: tool-calls + config: + required: + - name: "azsdk_verify_setup" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 new file mode 100644 index 00000000000..bba3c94dda3 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 @@ -0,0 +1,160 @@ +<# +.SYNOPSIS + Validates that all tool names referenced in tool-trigger eval files exist in the MCP server. + +.DESCRIPTION + This script: + 1. Runs `azsdk list` to get all registered MCP tool names from the server. + 2. Parses all *.eval.yaml files under the triggers/ directory. + 3. Reports any eval tool references that don't exist on the server, + and any server tools that are missing eval coverage. + +.PARAMETER ProjectPath + Path to the Azure.Sdk.Tools.Cli project. Defaults to ../Azure.Sdk.Tools.Cli relative to this script. + +.PARAMETER EvalPath + Path to the triggers/ directory containing *.eval.yaml files. + Defaults to ../evals/triggers relative to this script. + +.PARAMETER SkipBuild + If set, passes --no-build to dotnet run (requires a prior build). 
+#> +[CmdletBinding()] +param( + [string]$ProjectPath, + [string]$EvalPath, + [switch]$SkipBuild +) + +Set-StrictMode -Version 4 +$ErrorActionPreference = 'Stop' + +$scriptDir = $PSScriptRoot +$vallyRoot = (Resolve-Path (Join-Path $scriptDir "..")).Path +$cliParent = (Resolve-Path (Join-Path $vallyRoot "..")).Path + +if (-not $ProjectPath) { + $ProjectPath = Join-Path $cliParent "Azure.Sdk.Tools.Cli" +} +if (-not $EvalPath) { + $EvalPath = Join-Path $vallyRoot "evals/triggers" +} + +if (-not (Test-Path $ProjectPath)) { + Write-Error "CLI project not found at: $ProjectPath" + return 1 +} +if (-not (Test-Path $EvalPath)) { + Write-Error "Evaluations directory not found at: $EvalPath" + return 1 +} + +# Step 1: Get tool names from the MCP server via `azsdk list` +Write-Host "Running 'azsdk list' to get registered MCP tools..." -ForegroundColor Cyan + +$dotnetArgs = @("run", "--project", $ProjectPath) +if ($SkipBuild) { + $dotnetArgs += "--no-build" +} +$dotnetArgs += @("--", "list", "--output", "json") + +$listOutput = & dotnet @dotnetArgs 2>&1 +$jsonLines = $listOutput | Where-Object { $_ -is [string] -and $_ -notmatch "^Using launch settings" } +$jsonText = $jsonLines -join "`n" + +try { + $parsed = $jsonText | ConvertFrom-Json + [string[]]$serverTools = @($parsed.Tools | ForEach-Object { $_.McpToolName } | Where-Object { $_ } | Sort-Object -Unique) +} catch { + Write-Error "Failed to parse 'azsdk list --output json'. 
Error: $_" + return 1 +} + +# Filter out tools that are excluded from eval coverage (example, test, and utility tools) +$excludedTools = @( + "azsdk_hello_world", + "azsdk_hello_world_fail", + "azsdk_example_process_execution", + "azsdk_example_powershell_execution", + "azsdk_example_azure_service", + "azsdk_example_ai_service", + "azsdk_example_error_handling", + "azsdk_example_agent_fibonacci", + "azsdk_example_github_service", + "azsdk_example_devops_service", + "azsdk_upgrade", + "azsdk_engsys_codeowner_view", + "azsdk_engsys_codeowner_add_label_owner", + "azsdk_engsys_codeowner_remove_label_owner", + "azsdk_engsys_codeowner_add_package_owner", + "azsdk_engsys_codeowner_remove_package_owner", + "azsdk_engsys_codeowner_add_package_label", + "azsdk_engsys_codeowner_remove_package_label" +) + +[string[]]$serverTools = @($serverTools | Where-Object { $_ -notin $excludedTools }) + +if ($serverTools.Count -eq 0) { + Write-Error "No tools found from 'azsdk list'. Check that the CLI project builds and runs correctly." 
+ return 1 +} + +Write-Host "Found $($serverTools.Count) tools registered on the MCP server ($($excludedTools.Count) excluded).`n" -ForegroundColor Green + +# Step 2: Parse all *.eval.yaml files in the triggers directory for tool name references +$evalFiles = Get-ChildItem -Path $EvalPath -Filter "*.eval.yaml" + +if ($evalFiles.Count -eq 0) { + Write-Error "No *.eval.yaml files found in: $EvalPath" + return 1 +} + +$evalToolsByFile = @{} +$allEvalTools = [System.Collections.Generic.HashSet[string]]::new() + +foreach ($file in $evalFiles) { + $key = $file.BaseName + $matchResults = Select-String -Path $file.FullName -Pattern 'name:\s*"(azsdk_[^"]+)"' + [string[]]$tools = @($matchResults | ForEach-Object { $_.Matches[0].Groups[1].Value } | Sort-Object -Unique) + $evalToolsByFile[$key] = $tools + foreach ($t in $tools) { + [void]$allEvalTools.Add($t) + } +} + +Write-Host "Found $($allEvalTools.Count) unique tools across $($evalFiles.Count) eval files.`n" -ForegroundColor Green + +# Step 3: Compare +[string[]]$missingFromServer = @($allEvalTools | Where-Object { $_ -notin $serverTools } | Sort-Object) +[string[]]$missingFromEvals = @($serverTools | Where-Object { $_ -notin $allEvalTools } | Sort-Object) + +$hasErrors = $false + +if ($missingFromServer.Count -gt 0) { + $hasErrors = $true + Write-Host "ERROR: Eval references tools NOT found on the MCP server:" -ForegroundColor Red + foreach ($tool in $missingFromServer) { + # Find which eval file references it + $sources = $evalToolsByFile.GetEnumerator() | Where-Object { $_.Value -contains $tool } | ForEach-Object { $_.Key } + Write-Host " - $tool (referenced in: $($sources -join ', '))" -ForegroundColor Red + } + Write-Host "" +} + +if ($missingFromEvals.Count -gt 0) { + $hasErrors = $true + Write-Host "ERROR: Server tools with no eval coverage:" -ForegroundColor Red + foreach ($tool in $missingFromEvals) { + Write-Host " - $tool" -ForegroundColor Red + } + Write-Host "" +} + +Write-Host "" +if ($hasErrors) { + 
Write-Host "RESULT: FAIL - Eval tools and MCP server tools are out of sync." -ForegroundColor Red + exit 1 +} else { + Write-Host "RESULT: PASS - All eval tools exist on the MCP server." -ForegroundColor Green + exit 0 +} From 02aee348b3c77ea906e812d0bfed195e489f936a Mon Sep 17 00:00:00 2001 From: helen229 Date: Tue, 2 Jun 2026 12:20:46 -0700 Subject: [PATCH 06/24] update the config and use gpt-5.4 model --- .../Azure.Sdk.Tools.Vally/.vally.yaml | 5 ++++- .../scenarios/add-arm-resource.eval.yaml | 22 +------------------ .../check-public-repo-then-validate.eval.yaml | 6 +---- .../scenarios/check-public-repo.eval.yaml | 13 +---------- .../check-sdk-generation-status.eval.yaml | 5 +---- .../scenarios/create-release-plan.eval.yaml | 7 +----- .../get-modified-typespec-projects.eval.yaml | 18 +-------------- .../get-pr-link-current-branch.eval.yaml | 3 +-- .../link-namespace-approval-issue.eval.yaml | 6 +---- .../rename-client-property.eval.yaml | 21 +----------------- .../typespec-generation-step02.eval.yaml | 5 +---- .../scenarios/validate-typespec.eval.yaml | 6 +---- 12 files changed, 15 insertions(+), 102 deletions(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index a5bb1d43da4..771b210183d 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -19,7 +19,10 @@ environments: args: ["run", "--project", "../Azure.Sdk.Tools.Cli", "--", "start"] timeout: "5m" env: - AZSDKTOOLS_AGENT_TESTING: "false" + # Test mode: tools that would create real ADO work items / external + # resources (e.g. azsdk_create_release_plan) short-circuit into a + # test variant so evals are safe to re-run. + AZSDKTOOLS_AGENT_TESTING: "true" AZSDKTOOLS_COLLECT_TELEMETRY: "false" # Suites group evals for selective execution. 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml index 596bb4a3111..e7e2b6c5b00 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml @@ -11,31 +11,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: add-arm-resource - # Ported (stub) from deleted AddArmResourceScenario (#15697). - # - # TODO: this scenario is heavier than the others — the original asserted: - # - main.tsp still exists and imports the new asset.tsp - # - asset.tsp contains an ARM resource model with @armResourceOperations, - # interface Assets, ArmResourceRead/CreateOrReplace/Update/Delete, - # listByResourceGroup, listBySubscription - # - `npx tsp compile` succeeds against the modified project - # To port faithfully we need: - # 1. Fixtures under fixtures/add-arm-resource/ that mirror - # specification/widget/resource-manager/Microsoft.Widget/Widget - # (the Microsoft.Widget fixture already exists under - # .github/skills/azure-typespec-author/evaluate/fixtures/Microsoft.Widget - # and could be reused / symlinked). - # 2. A Vally environment hook (or custom grader) that runs - # `npm ci` + `npx tsp compile` after the agent finishes. - # 3. file-matches graders for the asset.tsp content patterns. - # For now this eval only checks that the agent makes at least one edit - # and invokes the authoring-plan tool — it does NOT verify the produced - # TypeSpec compiles. prompt: | In the specification/widget/resource-manager/Microsoft.Widget/Widget project, add an ARM resource named 'Asset' with CRUD operations. 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml index 0f82553728c..9dda24bb440 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml @@ -10,15 +10,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: validate-then-check-public-repo - # Ported from deleted CheckPublicRepoThenValidateScenario (#15697). - # Original also asserted ordering (validate before check) and forbade - # azsdk_verify_setup. Vally's tool-calls grader currently checks presence - # only, not order, and has no `forbidden` field — captured via prompt. prompt: | Run TypeSpec validation, then check if the project is in the public repo. Project path: specification/contosowidgetmanager/Contoso.WidgetManager. diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml index c0deb25bc8a..4e64d0dc46f 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml @@ -11,17 +11,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk -# Test cases stimuli: - name: check-public-repo - # Migrated from the deleted Azure.Sdk.Tools.Cli.Benchmarks CheckPublicRepoScenario - # (see PR #15697 for deletion, PR #14507 for the original benchmark form). 
- # Validates that the agent invokes `azsdk_typespec_check_project_in_public_repo` - # when asked whether a TypeSpec project is in the public repo, and that it - # does NOT call `azsdk_verify_setup` after being told setup is verified. prompt: | Check if my TypeSpec project is in the public repo. My setup has already been verified, do not run azsdk_verify_setup. @@ -29,11 +23,6 @@ stimuli: constraints: max_turns: 5 max_tokens: 5000 - # NOTE: the deleted benchmark also asserted that `azsdk_verify_setup` - # was NOT called (the prompt explicitly tells the agent skip it). The - # current `tool-calls` grader does not support a `forbidden` list, so we - # rely on the prompt for now. Revisit once Vally supports negative - # tool-call assertions, or add a custom grader under `Graders/`. graders: - type: tool-calls config: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml index ef4617ea0c0..95749a9f0fb 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml @@ -10,14 +10,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: check-sdk-generation-status - # Ported from deleted CheckSdkGenerationStatusScenario (#15697). - # Original also asserted buildId=5513110 was passed to the tool. - # Argument assertions require a custom grader (not yet built). prompt: | Check the SDK generation pipeline status for build ID 5513110. My setup has already been verified, do not run azsdk_verify_setup. 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml index 068e4f54b19..f18be8ae011 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml @@ -10,16 +10,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: create-release-plan - # Ported from deleted CreateReleasePlanScenario (#15697). - # Original also asserted exact tool arguments (serviceTreeId, productTreeId, - # specApiVersion, specPullRequestUrl, sdkReleaseType, typeSpecProjectPath). - # Vally's built-in tool-calls grader checks tool *name* only; argument - # assertions would need a custom grader under Graders/. prompt: | Create a release plan for the Contoso Widget Manager, no need to get it afterwards only create. My setup has already been verified, do not run azsdk_verify_setup. Here is all the context you need: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml index b5cb73c5b07..c679c03dcf6 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml @@ -11,27 +11,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: get-modified-typespec-projects - # Ported from deleted GetModifiedTypespecProjectsScenario (#15697). 
- # - # TODO: the original scenario had a SetupAsync hook that sparse-checked - # out Azure/azure-rest-api-specs, modified - # specification/contosowidgetmanager/Contoso.WidgetManager/tspconfig.yaml, - # and committed the change so `git merge-base HEAD main` had a divergence - # point to report. Vally's `environment.files` only seeds files — it does - # not init a git repo or run commands. To exercise the tool's underlying - # git logic this eval needs either: - # 1. A new Vally environment hook that runs setup commands, OR - # 2. A pre-built fixture committed under fixtures/ that is itself a git - # repo (committed as a tarball / unpacked at setup), OR - # 3. A custom .NET grader under Graders/ that drives the tool with a - # fabricated workspace. - # For now this eval only verifies the tool *name* is selected — the - # actual diff result is not asserted. prompt: | List the TypeSpec projects modified in my current branch compared to main. My setup has already been verified, do not run azsdk_verify_setup. diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml index ac957a97391..d1b3d21e067 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml @@ -11,12 +11,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: get-pr-link-current-branch - # Ported from deleted GetPrLinkCurrentBranchScenario (#15697). prompt: | What's the status of the spec PR in my current branch? Only check the status once. My setup has already been verified, do not run azsdk_verify_setup. 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml index b3e9cac0576..58d90e6c585 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml @@ -10,15 +10,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: link-namespace-approval-issue - # Ported from deleted LinkNamespaceApprovalIssueScenario (#15697). - # Original also asserted releasePlanWorkItemId=12345 and - # namespaceApprovalIssue=https://github.com/Azure/azure-sdk/issues/1234. - # Argument assertions require a custom grader (not yet built). prompt: | Link namespace approval issue https://github.com/Azure/azure-sdk/issues/1234 to release plan 12345. My setup has already been verified, do not run azsdk_verify_setup. diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml index 76b16d568f6..da80d06c585 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml @@ -11,30 +11,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: rename-client-property - # Ported from deleted RenameClientPropertyScenario (#15697). - # STUB: the original benchmark asserted an exact unified diff against - # specification/ai/Face/models.common.tsp via ExpectedDiffValidator, plus - # FileExistsValidator + ContainsValidator. None of those graders exist in - # Vally today. 
This eval currently only checks that the agent reached for - # the file-edit tool; the actual correctness of the rename is not graded. - # TODO: add a custom "expected-diff" grader (or upstream a file-diff - # grader to @microsoft/vally-cli) to validate the unified diff: - # diff --git a/specification/ai/Face/models.common.tsp b/specification/ai/Face/models.common.tsp - # @@ -155,7 +155,7 @@ - # @doc("Add face from url request.") - # model AddFaceFromUrlRequest is AddFaceOptions { - # - @clientName("uri", "csharp") - # + @clientName("imageUri", "csharp") - # @doc("URL of input image.") - # url: url; - # } - # TODO: needs a setup hook to sparse-clone Azure/azure-rest-api-specs at - # specification/ai/Face into the workspace before the agent runs. prompt: | In the specification/ai/Face project, find the AddFaceFromUrlRequest model. It has a property called 'url' that's been renamed to "uri" in c#. diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml index 7a7717824a0..49f7616d360 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml @@ -10,14 +10,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: typespec-generation-step02-validation - # Ported from deleted TypespecGenerationStep02Scenario (#15697). - # The original benchmark replaced a mid-conversation JSON trace with a - # standalone prompt; same intent preserved here. prompt: | I'm working on the TypeSpec generation workflow. I need to validate my TypeSpec project as part of step 2. Please check if my TypeSpec project is in the public repo. 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml index 0a66b39a715..0e2b0f5610a 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml @@ -10,15 +10,11 @@ environment: azsdk-mcp config: runs: 1 timeout: 30m - model: claude-opus-4.6 + model: gpt-5.4 executor: copilot-sdk stimuli: - name: validate-typespec - # Ported from deleted ValidateTypespecScenario (#15697). - # The original allowed `azsdk_typespec_check_project_in_public_repo` and - # `azsdk_verify_setup` as optional tool calls. Vally's current grader has - # no `optional` concept — extra calls are not penalised by `required:`. prompt: | Validate my typespec project. It is already confirmed we are in a public repository. The path to my typespec is specification/contosowidgetmanager/Contoso.WidgetManager/main.tsp. 
From d1f212f1b2c570099130c2695f0cb90e07e7d109 Mon Sep 17 00:00:00 2001 From: helen229 Date: Tue, 2 Jun 2026 13:05:07 -0700 Subject: [PATCH 07/24] add disallowed --- .../evals/scenarios/add-arm-resource.eval.yaml | 3 +++ .../evals/scenarios/check-public-repo-then-validate.eval.yaml | 3 +++ .../evals/scenarios/check-public-repo.eval.yaml | 2 ++ .../evals/scenarios/check-sdk-generation-status.eval.yaml | 3 +++ .../evals/scenarios/create-release-plan.eval.yaml | 3 +++ .../evals/scenarios/get-modified-typespec-projects.eval.yaml | 4 ++++ .../evals/scenarios/get-pr-link-current-branch.eval.yaml | 2 ++ .../evals/scenarios/link-namespace-approval-issue.eval.yaml | 3 +++ .../evals/scenarios/rename-client-property.eval.yaml | 3 +++ .../evals/scenarios/typespec-generation-step02.eval.yaml | 2 ++ 10 files changed, 28 insertions(+) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml index e7e2b6c5b00..7ec341ed11b 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml @@ -22,6 +22,9 @@ stimuli: constraints: max_turns: 20 max_tokens: 50000 + # TODO: seed a fixture (environment.files or git) for the Microsoft.Widget + # project, add `file-exists` + `file-contains` graders on the produced + # asset.tsp, and a `run-command` grader to verify `npx tsp compile`. 
graders: - type: tool-calls config: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml index 9dda24bb440..3f16aa6cf3f 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml @@ -22,12 +22,15 @@ stimuli: constraints: max_turns: 8 max_tokens: 8000 + # TODO: assert ordering (validate before check) — blocked on https://github.com/Azure/azure-sdk-tools/issues/15832 (Vally tool-calls grader needs sequence:). graders: - type: tool-calls config: required: - azsdk_run_typespec_validation - azsdk_typespec_check_project_in_public_repo + disallowed: + - azsdk_verify_setup scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml index 4e64d0dc46f..47944016a9c 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml @@ -28,6 +28,8 @@ stimuli: config: required: - azsdk_typespec_check_project_in_public_repo + disallowed: + - azsdk_verify_setup scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml index 95749a9f0fb..211479c9225 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml @@ -21,11 +21,14 @@ stimuli: constraints: max_turns: 5 max_tokens: 5000 + # TODO: assert buildId=5513110 — blocked on 
https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). graders: - type: tool-calls config: required: - azsdk_get_pipeline_status + disallowed: + - azsdk_verify_setup scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml index f18be8ae011..d58e5f7fef2 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml @@ -28,11 +28,14 @@ stimuli: constraints: max_turns: 8 max_tokens: 8000 + # TODO: assert serviceTreeId / productTreeId / specApiVersion / specPullRequestUrl / sdkReleaseType / typeSpecProjectPath — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). graders: - type: tool-calls config: required: - azsdk_create_release_plan + disallowed: + - azsdk_verify_setup scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml index c679c03dcf6..db4ed40c09e 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml @@ -23,11 +23,15 @@ stimuli: constraints: max_turns: 5 max_tokens: 5000 + # TODO: seed a git worktree fixture (environment.git) with a modified + # tspconfig.yaml so the tool actually has a diff to report. 
graders: - type: tool-calls config: required: - azsdk_get_modified_typespec_projects + disallowed: + - azsdk_verify_setup scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml index d1b3d21e067..4eb8538f623 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml @@ -28,6 +28,8 @@ stimuli: config: required: - azsdk_get_pull_request_link_for_current_branch + disallowed: + - azsdk_verify_setup scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml index 58d90e6c585..79a27314a86 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml @@ -21,11 +21,14 @@ stimuli: constraints: max_turns: 5 max_tokens: 5000 + # TODO: assert releasePlanWorkItemId=12345 and namespaceApprovalIssue URL — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). 
graders: - type: tool-calls config: required: - azsdk_link_namespace_approval_issue + disallowed: + - azsdk_verify_setup scoring: weights: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml index da80d06c585..5b8035945b2 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml @@ -23,6 +23,9 @@ stimuli: constraints: max_turns: 5 max_tokens: 5000 + # TODO: seed a git worktree (environment.git) at specification/ai/Face and + # add a `file-matches` grader on models.common.tsp to verify the + # @clientName("uri", "csharp") → @clientName("imageUri", "csharp") rename. graders: - type: tool-calls config: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml index 49f7616d360..d00c6e95bd6 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml @@ -28,6 +28,8 @@ stimuli: config: required: - azsdk_typespec_check_project_in_public_repo + disallowed: + - azsdk_verify_setup scoring: weights: From fd4eaf8398a062ad8b6ea6b0e4f8c37054fd9c47 Mon Sep 17 00:00:00 2001 From: helen229 Date: Tue, 2 Jun 2026 15:34:11 -0700 Subject: [PATCH 08/24] Vally: restructure evals into unit/integration/e2e test pyramid Replace per-area folders (scenarios/, triggers/) with tier-based folders. Feature area moves to a YAML tag, enabling tag-filtered suites. Add composite suites (pr-gate, nightly) and area-filtered suites in .vally.yaml. Update Validate-EvalTools.ps1 to scan evals/unit for triggers-*.eval.yaml. 
Refresh README and Run-LiveEvals.ps1 paths. --- .../Azure.Sdk.Tools.Vally/.vally.yaml | 70 +++++--- .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 159 ++++++++++++------ .../Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 | 56 ++++++ .../evals/e2e/release-planner-e2e.eval.yaml | 83 +++++++++ .../check-public-repo-then-validate.eval.yaml | 6 + .../rename-client-property.eval.yaml | 6 + .../typespec-generation-step02.eval.yaml | 6 + .../scenarios/create-release-plan.eval.yaml | 43 ----- .../evals/setup/ensure-specs-clone.ps1 | 70 ++++++++ .../add-arm-resource.eval.yaml | 6 + .../check-public-repo.eval.yaml | 6 + .../check-sdk-generation-status.eval.yaml | 6 + .../evals/unit/create-release-plan.eval.yaml | 112 ++++++++++++ .../get-modified-typespec-projects.eval.yaml | 6 + .../get-pr-link-current-branch.eval.yaml | 6 + .../link-namespace-approval-issue.eval.yaml | 6 + .../triggers-apiview.eval.yaml} | 5 +- .../triggers-config.eval.yaml} | 5 +- .../triggers-engsys.eval.yaml} | 5 +- .../triggers-github.eval.yaml} | 5 +- .../triggers-package.eval.yaml} | 5 +- .../triggers-pipeline.eval.yaml} | 5 +- .../triggers-releaseplan.eval.yaml} | 5 +- .../triggers-typespec.eval.yaml} | 5 +- .../triggers-verify.eval.yaml} | 5 +- .../validate-typespec.eval.yaml | 6 + .../scripts/Validate-EvalTools.ps1 | 14 +- 27 files changed, 572 insertions(+), 140 deletions(-) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => integration}/check-public-repo-then-validate.eval.yaml (96%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => integration}/rename-client-property.eval.yaml (95%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => integration}/typespec-generation-step02.eval.yaml (95%) delete mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml 
create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => unit}/add-arm-resource.eval.yaml (96%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => unit}/check-public-repo.eval.yaml (95%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => unit}/check-sdk-generation-status.eval.yaml (96%) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => unit}/get-modified-typespec-projects.eval.yaml (96%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => unit}/get-pr-link-current-branch.eval.yaml (96%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => unit}/link-namespace-approval-issue.eval.yaml (96%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/apiview.eval.yaml => unit/triggers-apiview.eval.yaml} (98%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/config.eval.yaml => unit/triggers-config.eval.yaml} (97%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/engsys.eval.yaml => unit/triggers-engsys.eval.yaml} (98%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/github.eval.yaml => unit/triggers-github.eval.yaml} (98%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/package.eval.yaml => unit/triggers-package.eval.yaml} (99%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/pipeline.eval.yaml => unit/triggers-pipeline.eval.yaml} (98%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/releaseplan.eval.yaml => unit/triggers-releaseplan.eval.yaml} (99%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/typespec.eval.yaml => unit/triggers-typespec.eval.yaml} (99%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{triggers/verify.eval.yaml => unit/triggers-verify.eval.yaml} (96%) rename 
tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => unit}/validate-typespec.eval.yaml (95%) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index 771b210183d..4cbba44eb09 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -26,35 +26,51 @@ environments: AZSDKTOOLS_COLLECT_TELEMETRY: "false" # Suites group evals for selective execution. -# - scenarios/* = multi-step workflow evals (this PR) -# - triggers/* = per-tool single-prompt invocation evals (ported from PR #15183) +# +# Folder layout mirrors the standard test pyramid: +# evals/unit/ — single-tool, hermetic, fast (incl. per-tool triggers) +# evals/integration/ — multi-tool chains, still hermetic +# evals/e2e/ — live MCP + real git env + skills loaded (slow) +# +# Folder = tier (cost / cadence). Tags inside each YAML carry the feature +# area (release-plan, typespec, pipeline, github, …) so cross-cuts are +# selected via `filter:` below or `vally eval --tag area=`. suites: - # Scenario suites (multi-step workflows) - typespec: + # ---- by tier ---- + unit: + description: Hermetic single-tool evals (incl. per-tool trigger coverage). Fast; safe for PR gate. + evals: ["evals/unit/*.eval.yaml"] + integration: + description: Multi-tool chained evals; hermetic. Still suitable for PR gate. + evals: ["evals/integration/*.eval.yaml"] + e2e: + description: Live end-to-end against real MCP + real azure-rest-api-specs clone. Use Run-LiveEvals.ps1. + evals: ["evals/e2e/*.eval.yaml"] + + # ---- composite suites ---- + pr-gate: + description: Fast tiers only (unit + integration). Target for CI PR check. 
evals: - - "evals/scenarios/check-public-repo.eval.yaml" - - "evals/scenarios/check-public-repo-then-validate.eval.yaml" - - "evals/scenarios/validate-typespec.eval.yaml" - - "evals/scenarios/typespec-generation-step02.eval.yaml" - - "evals/scenarios/get-modified-typespec-projects.eval.yaml" - - "evals/scenarios/add-arm-resource.eval.yaml" - - "evals/scenarios/rename-client-property.eval.yaml" + - "evals/unit/*.eval.yaml" + - "evals/integration/*.eval.yaml" + nightly: + description: All tiers including live e2e. + evals: ["evals/**/*.eval.yaml"] + + # ---- by feature area (tag-filtered) ---- release-plan: - evals: - - "evals/scenarios/create-release-plan.eval.yaml" - - "evals/scenarios/link-namespace-approval-issue.eval.yaml" - github: - evals: - - "evals/scenarios/get-pr-link-current-branch.eval.yaml" + description: All evals tagged area=release-plan. + filter: { area: release-plan } + evals: ["evals/**/*.eval.yaml"] + typespec: + description: All evals tagged area=typespec. + filter: { area: typespec } + evals: ["evals/**/*.eval.yaml"] pipeline: - evals: - - "evals/scenarios/check-sdk-generation-status.eval.yaml" - scenarios: - evals: ["evals/scenarios/*.eval.yaml"] - - # Trigger suite (per-tool prompt → tool invocation coverage) - triggers: - evals: ["evals/triggers/*.eval.yaml"] - - all: + description: All evals tagged area=pipeline. + filter: { area: pipeline } + evals: ["evals/**/*.eval.yaml"] + github: + description: All evals tagged area=github. + filter: { area: github } evals: ["evals/**/*.eval.yaml"] diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index 34b89ffc069..1dd36742f78 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -16,7 +16,7 @@ different folders. A full end-to-end gate runs *both*. 
| **Path** | [`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/`](evals/) (`scenarios/` + `triggers/`) | [`.github/skills//evals/*.eval.yaml`](../../../.github/skills/) (and `evaluate/evals/` for capability suites) | | **Loaded subject** | Production MCP server (`Azure.Sdk.Tools.Cli`) over stdio — real tools, real network calls | Skill's `SKILL.md` + frontmatter; the agent picks tools itself | | **Primary grader** | `tool-calls` — checks the recorded trajectory for required tool names | Trigger / routing graders + per-skill rubric | -| **Run command** | `vally eval --eval-spec evals/scenarios/.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/` *from repo root* | +| **Run command** | `vally eval --eval-spec evals/unit/.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/` *from repo root* | | **CI status** | Not wired yet (see follow-ups) | `vally lint` runs in [.github/workflows/skill-eval.yml](../../../.github/workflows/skill-eval.yml); full `eval` job pending | | **Cost profile** | Higher — each run spins up the MCP server, real LLM turns (~5–15), real tool calls | Variable — trigger evals are cheap; capability evals (e.g. `azure-typespec-author`) are expensive | @@ -40,41 +40,51 @@ conditional branches, recovery), both matter independently. ### Scenarios checked in today -**Tool-scenario evals (this project)** — split into two suites under [`evals/`](evals/): - -#### `evals/scenarios/` — multi-step workflow evals (11) - -| Scenario | Shape | -|---|---| -| [`check-public-repo`](evals/scenarios/check-public-repo.eval.yaml) | Single-tool: is a TypeSpec project published in `azure-rest-api-specs`? 
| -| [`check-public-repo-then-validate`](evals/scenarios/check-public-repo-then-validate.eval.yaml) | Multi-tool, ordered: validate then check | -| [`validate-typespec`](evals/scenarios/validate-typespec.eval.yaml) | Single-tool: run `tsp` linter/validation | -| [`typespec-generation-step02`](evals/scenarios/typespec-generation-step02.eval.yaml) | Step in the spec-PR generation flow | -| [`get-modified-typespec-projects`](evals/scenarios/get-modified-typespec-projects.eval.yaml) | Git-aware tool against current branch | -| [`add-arm-resource`](evals/scenarios/add-arm-resource.eval.yaml) | Calls `azsdk_typespec_generate_authoring_plan` for an ARM resource | -| [`create-release-plan`](evals/scenarios/create-release-plan.eval.yaml) | Single-tool: create a release-plan work item | -| [`link-namespace-approval-issue`](evals/scenarios/link-namespace-approval-issue.eval.yaml) | Link an existing approval issue to a release plan | -| [`get-pr-link-current-branch`](evals/scenarios/get-pr-link-current-branch.eval.yaml) | Resolve the PR for the active git branch | -| [`check-sdk-generation-status`](evals/scenarios/check-sdk-generation-status.eval.yaml) | Pipeline status lookup | -| [`rename-client-property`](evals/scenarios/rename-client-property.eval.yaml) | Stub — needs `expected-diff` grader | - -#### `evals/triggers/` — per-tool prompt → tool invocation coverage (ported from [#15183](https://github.com/Azure/azure-sdk-tools/pull/15183)) - -One YAML per tool category; each stimulus is a single prompt expected to invoke a single MCP tool. Used to catch tool-rename / description-drift regressions. 
- -| File | Tools covered | -|---|---| -| [`apiview.eval.yaml`](evals/triggers/apiview.eval.yaml) | `azsdk_apiview_*` | -| [`config.eval.yaml`](evals/triggers/config.eval.yaml) | `azsdk_check_service_label`, `azsdk_create_service_label` | -| [`engsys.eval.yaml`](evals/triggers/engsys.eval.yaml) | `azsdk_analyze_log_file`, failed-test tools, codeowner-cache | -| [`github.eval.yaml`](evals/triggers/github.eval.yaml) | `azsdk_create_pull_request`, `azsdk_get_pull_request*`, `azsdk_get_github_user_details` | -| [`package.eval.yaml`](evals/triggers/package.eval.yaml) | `azsdk_package_*`, `azsdk_release_sdk` | -| [`pipeline.eval.yaml`](evals/triggers/pipeline.eval.yaml) | `azsdk_analyze_pipeline`, `azsdk_get_pipeline_*` | -| [`releaseplan.eval.yaml`](evals/triggers/releaseplan.eval.yaml) | `azsdk_*_release_plan*`, `azsdk_run_generate_sdk`, `azsdk_link_*` | -| [`typespec.eval.yaml`](evals/triggers/typespec.eval.yaml) | `azsdk_typespec_*`, `azsdk_convert_swagger_to_typespec`, `azsdk_customized_code_update`, `azsdk_run_typespec_validation` | -| [`verify.eval.yaml`](evals/triggers/verify.eval.yaml) | `azsdk_verify_setup` | - -The companion [`scripts/Validate-EvalTools.ps1`](scripts/Validate-EvalTools.ps1) cross-checks that every tool referenced in `evals/triggers/` exists on the running MCP server, and every server tool has at least one trigger. +**Tool-scenario evals (this project)** — organised by the standard test pyramid under [`evals/`](evals/). The folder is the **cost tier** (and CI cadence); the feature **area** is a tag inside each YAML so cross-cuts work via `.vally.yaml` suite filters. + +#### `evals/unit/` — hermetic single-tool evals (17) + +One prompt → one expected MCP tool. No `environment.git`, no fixtures. Fast; safe to run on every PR. Includes the per-tool **trigger** coverage ported from [#15183](https://github.com/Azure/azure-sdk-tools/pull/15183) (`triggers-*.eval.yaml`).
+ +| Scenario | Area | Shape | +|---|---|---| +| [`check-public-repo`](evals/unit/check-public-repo.eval.yaml) | typespec | Is a TypeSpec project published in `azure-rest-api-specs`? | +| [`validate-typespec`](evals/unit/validate-typespec.eval.yaml) | typespec | Run `tsp` linter/validation | +| [`get-modified-typespec-projects`](evals/unit/get-modified-typespec-projects.eval.yaml) | typespec | Git-aware tool against current branch | +| [`add-arm-resource`](evals/unit/add-arm-resource.eval.yaml) | typespec | Calls `azsdk_typespec_generate_authoring_plan` for an ARM resource | +| [`create-release-plan`](evals/unit/create-release-plan.eval.yaml) | release-plan | Create a release-plan work item | +| [`link-namespace-approval-issue`](evals/unit/link-namespace-approval-issue.eval.yaml) | release-plan | Link an existing approval issue to a release plan | +| [`get-pr-link-current-branch`](evals/unit/get-pr-link-current-branch.eval.yaml) | github | Resolve the PR for the active git branch | +| [`check-sdk-generation-status`](evals/unit/check-sdk-generation-status.eval.yaml) | pipeline | Pipeline status lookup | +| [`triggers-apiview`](evals/unit/triggers-apiview.eval.yaml) | apiview | `azsdk_apiview_*` | +| [`triggers-config`](evals/unit/triggers-config.eval.yaml) | engsys | `azsdk_check_service_label`, `azsdk_create_service_label` | +| [`triggers-engsys`](evals/unit/triggers-engsys.eval.yaml) | engsys | `azsdk_analyze_log_file`, failed-test tools, codeowner-cache | +| [`triggers-github`](evals/unit/triggers-github.eval.yaml) | github | `azsdk_create_pull_request`, `azsdk_get_pull_request*`, `azsdk_get_github_user_details` | +| [`triggers-package`](evals/unit/triggers-package.eval.yaml) | package | `azsdk_package_*`, `azsdk_release_sdk` | +| [`triggers-pipeline`](evals/unit/triggers-pipeline.eval.yaml) | pipeline | `azsdk_analyze_pipeline`, `azsdk_get_pipeline_*` | +| [`triggers-releaseplan`](evals/unit/triggers-releaseplan.eval.yaml) | release-plan | 
`azsdk_*_release_plan*`, `azsdk_run_generate_sdk`, `azsdk_link_*` | +| [`triggers-typespec`](evals/unit/triggers-typespec.eval.yaml) | typespec | `azsdk_typespec_*`, `azsdk_convert_swagger_to_typespec`, `azsdk_customized_code_update`, `azsdk_run_typespec_validation` | +| [`triggers-verify`](evals/unit/triggers-verify.eval.yaml) | engsys | `azsdk_verify_setup` | + +The companion [`scripts/Validate-EvalTools.ps1`](scripts/Validate-EvalTools.ps1) cross-checks that every tool referenced in `evals/unit/triggers-*.eval.yaml` exists on the running MCP server, and every server tool has at least one trigger. + +#### `evals/integration/` — multi-tool chained evals (3) + +Still hermetic (no `environment.git`), but the agent must invoke 2+ MCP tools in sequence. + +| Scenario | Area | Shape | +|---|---|---| +| [`check-public-repo-then-validate`](evals/integration/check-public-repo-then-validate.eval.yaml) | typespec | Validate, then check public-repo presence | +| [`typespec-generation-step02`](evals/integration/typespec-generation-step02.eval.yaml) | typespec | Step in the spec-PR generation flow | +| [`rename-client-property`](evals/integration/rename-client-property.eval.yaml) | typespec | Stub — needs `expected-diff` grader + sparse clone | + +#### `evals/e2e/` — live end-to-end (1) + +Drives the real MCP server inside a real `azure-rest-api-specs` worktree. Slow; run via [`Run-LiveEvals.ps1`](Run-LiveEvals.ps1) (auto-primes a per-user cache via [`evals/setup/ensure-specs-clone.ps1`](evals/setup/ensure-specs-clone.ps1)). 
+ +| Scenario | Area | Shape | +|---|---|---| +| [`release-planner-e2e`](evals/e2e/release-planner-e2e.eval.yaml) | release-plan | Create then re-fetch a release plan; real DevOps test-area writes | **Skill evals (already in repo, *not* part of this PR)** — for reference: @@ -101,14 +111,23 @@ tracks the migration in ``` Azure.Sdk.Tools.Vally/ ├── .vally.yaml # Vally config (environments + suites) -├── evals/ # Scenario eval YAML files -│ └── *.eval.yaml -├── fixtures/ # Per-scenario file fixtures +├── Run-LiveEvals.ps1 # Wrapper for the e2e tier (primes spec-repo cache) +├── evals/ +│ ├── unit/ # tier 1: single-tool, hermetic, fast +│ ├── integration/ # tier 2: multi-tool chains, hermetic +│ ├── e2e/ # tier 3: live MCP + real azure-rest-api-specs +│ └── setup/ # helper scripts (e.g. ensure-specs-clone.ps1) +├── fixtures/ # Per-scenario static input files (env.files) │ └── /... +├── scripts/ # Repo-side helpers (Validate-EvalTools.ps1, …) └── Graders/ # (future) Custom .NET graders └── Azure.Sdk.Tools.Vally.csproj # added when first custom grader lands ``` +Folder = test pyramid tier (cost / CI cadence). Feature **area** lives as a +`tags:` entry inside each YAML so cross-cuts (e.g. “all release-plan evals”) +select via [`.vally.yaml`](.vally.yaml) suite filters or `vally eval --tag`. + ## Running locally Prereqs: @@ -122,37 +141,69 @@ Prereqs: npm ci ``` -Run all tool-scenario evals from this directory: +Run a suite (recommended): ```powershell cd tools/azsdk-cli/Azure.Sdk.Tools.Vally -../../../eng/skill-eval/node_modules/.bin/vally run . 
+$vally = '../../../eng/skill-eval/node_modules/.bin/vally.cmd' + +# Fast tiers only — PR-gate candidate +& $vally eval --suite pr-gate + +# A single tier +& $vally eval --suite unit +& $vally eval --suite integration + +# By feature area (cross-cuts tiers via tag filter) +& $vally eval --suite release-plan +& $vally eval --suite typespec ``` Run a single eval: ```powershell -../../../eng/skill-eval/node_modules/.bin/vally run evals/check-public-repo.eval.yaml +& $vally eval --eval-spec evals/unit/check-public-repo.eval.yaml +``` + +Run the live e2e tier (auto-primes a per-user clone of +`azure-rest-api-specs`; refreshes every 24h): + +```powershell +./Run-LiveEvals.ps1 # default: release-planner-e2e +./Run-LiveEvals.ps1 -VallyVerbose # with verbose output +./Run-LiveEvals.ps1 -EvalSpecs evals/e2e/foo.eval.yaml,evals/e2e/bar.eval.yaml ``` ## Adding a new scenario -1. Pick a short, kebab-case name (e.g. `create-release-plan`). -2. Create `evals/.eval.yaml`. Start from - [`evals/check-public-repo.eval.yaml`](evals/check-public-repo.eval.yaml) as - a template. -3. If the scenario needs input files, add them under +1. **Pick a tier** — the folder you drop the YAML into: + - `evals/unit/` — one prompt, one MCP tool, no environment hooks. + - `evals/integration/` — multi-tool flow, still hermetic. + - `evals/e2e/` — needs live MCP + a real git env / external service. +2. Pick a short, kebab-case name (e.g. `create-release-plan`). +3. Create `evals/<tier>/<name>.eval.yaml`. Start from a sibling in the same + tier as a template. +4. **Tag it** so suite filters pick it up: + ```yaml + tags: + tier: unit # or integration / e2e + area: release-plan # or typespec / pipeline / github / engsys / apiview / package + ``` +5. If the scenario needs input files, add them under `fixtures//...` and reference them via `environment.files` in the eval (relative paths from the eval file). -4. Pick graders: +6.
Pick graders — they’re a **list**, stack as many as you need: - `tool-calls` — verify the agent invoked the expected MCP tool(s). - - `file-matches` — verify the agent produced/modified files correctly. + - `skill-invocation` — verify the right skill routed (e2e only). + - `tool-call-count` / `token-budget` / `turn-count` — chattiness / budget guards. + - `output-matches` / `output-contains` — assert final-message shape. + - `file-matches` / `file-exists` — verify produced/modified files. - `prompt` — LLM-as-judge for free-form quality checks. - - Custom (`Graders/`) — add a .NET grader when none of the built-ins fit - (and add the `Azure.Sdk.Tools.Vally.csproj` when the first one lands). -5. Add the new eval path to the relevant `suites:` entry in - [`.vally.yaml`](.vally.yaml). -6. Run locally to confirm it passes, then open a PR. + - Custom (`Graders/`) — add a .NET grader when no built-in fits. +7. The suite picks it up automatically (folders are globbed). Add a new + tag-filtered suite to [`.vally.yaml`](.vally.yaml) only if you’re + introducing a brand-new feature area. +8. Run locally to confirm it passes, then open a PR. ## Recovery checklist (from deleted benchmark) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 new file mode 100644 index 00000000000..d80cfe24401 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 @@ -0,0 +1,56 @@ +<# +.SYNOPSIS + Runs Vally live-tier evals locally with shared spec-repo setup. + +.DESCRIPTION + - Calls evals/setup/ensure-specs-clone.ps1 once to prime the azure-rest-api-specs + cache (idempotent, refreshes if >24h old). + - Then runs the given eval spec(s) via the locally-installed Vally CLI. + + Defaults to the release-planner-e2e demo. Pass -EvalSpecs to run others. 
+ +.EXAMPLE + ./Run-LiveEvals.ps1 + +.EXAMPLE + ./Run-LiveEvals.ps1 -EvalSpecs evals/e2e/release-planner-e2e.eval.yaml,evals/e2e/foo.eval.yaml +#> +[CmdletBinding()] +param( + [string[]] $EvalSpecs = @('evals/e2e/release-planner-e2e.eval.yaml'), + [switch] $VallyVerbose +) + +$ErrorActionPreference = 'Stop' +Set-StrictMode -Version 4 + +$repoRoot = Resolve-Path (Join-Path $PSScriptRoot '..\..\..') +$vallyCli = Join-Path $repoRoot 'eng\skill-eval\node_modules\.bin\vally.cmd' +$setupScript = Join-Path $PSScriptRoot 'evals\setup\ensure-specs-clone.ps1' + +if (-not (Test-Path $vallyCli)) { + throw "Vally CLI not found at $vallyCli. Run 'npm install' in eng/skill-eval first." +} + +Write-Host "==> Ensuring azure-rest-api-specs cache" +& pwsh -NoProfile -File $setupScript | Out-Host + +Push-Location $PSScriptRoot +try { + foreach ($spec in $EvalSpecs) { + Write-Host "" + Write-Host "==> Running $spec" + $args = @('eval', '--eval-spec', $spec) + if ($VallyVerbose) { $args += '--verbose' } + & $vallyCli @args + if ($LASTEXITCODE -ne 0) { + throw "Eval failed: $spec (exit $LASTEXITCODE)" + } + } +} +finally { + Pop-Location +} + +Write-Host "" +Write-Host "==> All evals passed." diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml new file mode 100644 index 00000000000..a16b68cf4d6 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml @@ -0,0 +1,83 @@ +name: azsdk-mcp-tool-scenarios +description: | + Live end-to-end demo for the release-planner flow. + + Drives the *real* azsdk-cli MCP server against real DevOps APIs, inside a + real git worktree of azure-rest-api-specs. The MCP server runs with + AZSDKTOOLS_AGENT_TESTING=true (set globally in .vally.yaml), so work items + route to the DevOps test area path and are safe to leave around / re-run. 
+ + Demonstrates Vally's environment.git fixture hook + live MCP + Copilot SDK + executor + real DevOps in one shot. + + Prerequisite: a clone of Azure/azure-rest-api-specs at the path referenced + by environment.git.source below. Locally, use the Run-LiveEvals.ps1 + wrapper at the package root — it primes a per-user cache via + evals/setup/ensure-specs-clone.ps1 (auto-refresh every 24h) and points + this source path at it. CI should clone the repo as a pipeline checkout + step instead. + +version: "1.0" +type: capability + +# Tagged "tier: e2e" so a future PR-gate run can skip live evals if needed. +tags: + tier: e2e + area: release-plan + +environment: azsdk-mcp + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: release-planner-e2e + environment: + # Source is the per-user cache populated by Run-LiveEvals.ps1 + # (idempotent shallow+sparse clone, auto-refresh every 24h). + # NOTE: hardcoded absolute path — Vally does not currently expand + # ${USERPROFILE} / env vars in env.git.source. Adjust per machine + # or replace with a CI-provided path. See upstream issue: + # https://github.com/microsoft/vally/issues (TODO: file env-var expansion) + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main + prompt: | + I'm in a checkout of azure-rest-api-specs. Walk through a release plan + for the Contoso Widget Manager end-to-end: + + 1. Create a release plan using: + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f" + - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e" + - target release timeline: "December 2026" + - API version: "2022-11-01-preview" + - SDK release type: "beta" + - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" + + 2. Fetch the release plan you just created back from DevOps to confirm + it was saved. 
+ + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 10 + max_tokens: 10000 + # TODO: assert ordering create -> get — blocked on https://github.com/Azure/azure-sdk-tools/issues/15832 (Vally tool-calls grader needs sequence:). + # TODO: assert serviceTreeId / productTreeId / typeSpecProjectPath args — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). + graders: + - type: tool-calls + config: + required: + - azsdk_create_release_plan + - azsdk_get_release_plan + disallowed: + - azsdk_verify_setup + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/check-public-repo-then-validate.eval.yaml similarity index 96% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/check-public-repo-then-validate.eval.yaml index 3f16aa6cf3f..bac7328999c 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/check-public-repo-then-validate.eval.yaml @@ -5,6 +5,11 @@ description: | version: "1.0" type: capability + +tags: + tier: integration + area: typespec + environment: azsdk-mcp config: @@ -36,3 +41,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/rename-client-property.eval.yaml similarity index 95% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/rename-client-property.eval.yaml index 
5b8035945b2..f07a2615f90 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/rename-client-property.eval.yaml @@ -6,6 +6,11 @@ description: | version: "1.0" type: capability + +tags: + tier: integration + area: typespec + environment: azsdk-mcp config: @@ -36,3 +41,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/typespec-generation-step02.eval.yaml similarity index 95% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/typespec-generation-step02.eval.yaml index d00c6e95bd6..5f149d3c1a1 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/typespec-generation-step02.eval.yaml @@ -5,6 +5,11 @@ description: | version: "1.0" type: capability + +tags: + tier: integration + area: typespec + environment: azsdk-mcp config: @@ -35,3 +40,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml deleted file mode 100644 index d58e5f7fef2..00000000000 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/create-release-plan.eval.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: azsdk-mcp-tool-scenarios -description: | - Create-release-plan: the agent should call azsdk_create_release_plan with - the supplied service-tree / product-tree / spec PR context. 
-version: "1.0" -type: capability - -environment: azsdk-mcp - -config: - runs: 1 - timeout: 30m - model: gpt-5.4 - executor: copilot-sdk - -stimuli: - - name: create-release-plan - prompt: | - Create a release plan for the Contoso Widget Manager, no need to get it afterwards only create. - My setup has already been verified, do not run azsdk_verify_setup. Here is all the context you need: - TypeSpec project located at "specification/contosowidgetmanager/Contoso.WidgetManager". - Use service tree ID "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f", - product tree ID "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e", - target release timeline "December 2025", - API version "2022-11-01-preview", - SDK release type "beta", - and link it to the spec pull request "https://github.com/Azure/azure-rest-api-specs/pull/38387". - constraints: - max_turns: 8 - max_tokens: 8000 - # TODO: assert serviceTreeId / productTreeId / specApiVersion / specPullRequestUrl / sdkReleaseType / typeSpecProjectPath — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). - graders: - - type: tool-calls - config: - required: - - azsdk_create_release_plan - disallowed: - - azsdk_verify_setup - -scoring: - weights: - tool-calls: 1 - threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 new file mode 100644 index 00000000000..57eab6ac92f --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 @@ -0,0 +1,70 @@ +<# +.SYNOPSIS + Ensures a per-user shallow+sparse cache clone of Azure/azure-rest-api-specs + exists and is reasonably fresh. + +.DESCRIPTION + Used as a pre-run step by the Vally live-eval wrapper (Run-LiveEvals.ps1). + Maintains a cache clone that Vally's `environment.git.source` points at, + so individual eval YAMLs don't need a pre-existing checkout. 
+ + - First run: shallow + blobless + cone-sparse clone (only + specification/contosowidgetmanager/ to keep size minimal). + - Subsequent runs within -MaxAgeHours: noop. + - Subsequent runs past -MaxAgeHours: `git fetch --depth 1 origin main` and + hard-reset the cache to `origin/main`. + + Cache lives at: + Windows: $env:USERPROFILE\.vally-cache\azure-rest-api-specs + *nix: $HOME/.vally-cache/azure-rest-api-specs + +.PARAMETER MaxAgeHours + Skip the `git fetch` if the cache was last refreshed within this many + hours. Default: 24. + +.PARAMETER SparseCheckoutPaths + Cone-sparse paths to include. Default: specification/contosowidgetmanager. + Pass @() to disable sparse-checkout (full tree). +#> +[CmdletBinding()] +param( + [int] $MaxAgeHours = 24, + [string[]] $SparseCheckoutPaths = @('specification/contosowidgetmanager') +) + +$ErrorActionPreference = 'Stop' +Set-StrictMode -Version 4 + +$cacheRoot = if ($env:USERPROFILE) { Join-Path $env:USERPROFILE '.vally-cache' } else { Join-Path $HOME '.vally-cache' } +$cache = Join-Path $cacheRoot 'azure-rest-api-specs' +$stamp = Join-Path $cache '.vally-last-fetch' + +if (-not (Test-Path (Join-Path $cache '.git'))) { + Write-Host "[ensure-specs-clone] Cloning azure-rest-api-specs into cache: $cache" + New-Item -ItemType Directory -Force -Path $cacheRoot | Out-Null + git clone --depth 1 --filter=blob:none --no-checkout ` + https://github.com/Azure/azure-rest-api-specs.git $cache | Out-Null + if ($SparseCheckoutPaths.Count -gt 0) { + git -C $cache sparse-checkout init --cone | Out-Null + git -C $cache sparse-checkout set @SparseCheckoutPaths | Out-Null + } + git -C $cache checkout main | Out-Null + Set-Content -Path $stamp -Value (Get-Date -Format o) +} else { + $isStale = $true + if (Test-Path $stamp) { + $age = (Get-Date) - (Get-Item $stamp).LastWriteTime + $isStale = $age.TotalHours -gt $MaxAgeHours + } + if ($isStale) { + Write-Host "[ensure-specs-clone] Refreshing cache (>$MaxAgeHours h old): $cache" + git -C $cache fetch --depth 1 origin 
main | Out-Null + git -C $cache reset --hard origin/main | Out-Null + Set-Content -Path $stamp -Value (Get-Date -Format o) + } else { + Write-Host "[ensure-specs-clone] Cache is fresh (<$MaxAgeHours h): $cache" + } +} + +# Echo the cache path so the wrapper can capture it. +Write-Output $cache diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml similarity index 96% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml index 7ec341ed11b..13485b27443 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/add-arm-resource.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml @@ -6,6 +6,11 @@ description: | version: "1.0" type: capability + +tags: + tier: unit + area: typespec + environment: azsdk-mcp config: @@ -36,3 +41,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml similarity index 95% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml index 47944016a9c..02cff88da23 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml @@ -6,6 +6,11 @@ description: | version: "1.0" type: capability + +tags: + tier: unit + area: typespec + environment: azsdk-mcp config: @@ -35,3 +40,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml 
b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml similarity index 96% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml index 211479c9225..3a4e9ef2998 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-sdk-generation-status.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml @@ -5,6 +5,11 @@ description: | version: "1.0" type: capability + +tags: + tier: unit + area: pipeline + environment: azsdk-mcp config: @@ -34,3 +39,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml new file mode 100644 index 00000000000..df24807e604 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml @@ -0,0 +1,112 @@ +# ============================================================================= +# Scenario: create-release-plan +# ----------------------------------------------------------------------------- +# Purpose: +# Tier-1 "tool-call" eval. Verify that, given a fully-specified prompt with +# all required context, the agent invokes `azsdk_create_release_plan` exactly +# once and does NOT redundantly call `azsdk_verify_setup` (the prompt already +# states setup is verified). +# +# What this eval is NOT: +# - Not an end-to-end flow (see release-planner-e2e.eval.yaml for that). +# - Does not validate argument values yet — see TODO below + #15833. +# - Does not need azure-rest-api-specs cloned; runs against the live MCP +# server in agent-testing mode (AZSDKTOOLS_AGENT_TESTING=true, set in +# ../../.vally.yaml). 
+# +# How to run locally: +# cd tools/azsdk-cli/Azure.Sdk.Tools.Vally +# ../../../eng/skill-eval/node_modules/.bin/vally.cmd eval \ +# --eval-spec evals/unit/create-release-plan.eval.yaml --verbose +# ============================================================================= + +name: azsdk-mcp-tool-scenarios +description: | + Create-release-plan: the agent should call azsdk_create_release_plan with + the supplied service-tree / product-tree / spec PR context. +version: "1.0" +type: capability + + +tags: + tier: unit + area: release-plan + +# `environment: azsdk-mcp` refers to the named environment defined in +# ../../.vally.yaml (configures the azsdk-cli MCP server + env vars). +environment: azsdk-mcp + +config: + runs: 1 # bump for flakiness sampling (e.g. runs: 5) + timeout: 30m # total wall-clock budget for ALL stimuli in this file + model: gpt-5.4 # model alias — see .vally.yaml `models:` map + executor: copilot-sdk + +stimuli: + - name: create-release-plan + prompt: | + Create a release plan for the Contoso Widget Manager, no need to get it afterwards only create. + My setup has already been verified, do not run azsdk_verify_setup. Here is all the context you need: + TypeSpec project located at "specification/contosowidgetmanager/Contoso.WidgetManager". + Use service tree ID "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f", + product tree ID "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e", + target release timeline "December 2025", + API version "2022-11-01-preview", + SDK release type "beta", + and link it to the spec pull request "https://github.com/Azure/azure-rest-api-specs/pull/38387". + + # Per-stimulus guardrails. Anything beyond these fails the run. 
+ constraints: + max_turns: 8 # agent loop iterations + max_tokens: 8000 # cumulative token spend + + # TODO: assert serviceTreeId / productTreeId / specApiVersion / specPullRequestUrl / sdkReleaseType / typeSpecProjectPath — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). + # + # `graders:` is a LIST — stack as many as you want. Each grader produces a + # score in [0,1]; the `scoring.weights` block below combines them into the + # final scenario score. Available grader `type:` values include: + # + # static (deterministic, free): + # tool-calls, skill-invocation, has-output, no-errors, turn-completed, + # token-budget, tool-call-count, turn-count, error-count, wall-time, + # program, run-command, stdout-contains, stdout-matches, + # stderr-contains, exit-code, file-exists, file-contains, + # file-matches, output-contains, output-matches + # llm (model-judged, costs tokens): + # prompt, pairwise + # + # Example of stacking multiple graders (uncomment to use): + # + # graders: + # - type: tool-calls + # config: + # required: [azsdk_create_release_plan] + # disallowed: [azsdk_verify_setup] + # - type: skill-invocation # was a specific skill invoked? + # config: + # required: [release-planner] + # - type: tool-call-count # cap chattiness + # config: + # max: 5 + # - type: prompt # llm-judged correctness + # config: + # model: gpt-5.4 + # rubric: | + # Did the final assistant message confirm the release plan was + # created and surface its ID? Answer "pass" or "fail". + graders: + - type: tool-calls + config: + required: + - azsdk_create_release_plan + disallowed: + - azsdk_verify_setup + +# Combine grader scores into the final scenario score. +# Keys must match the grader `type:` (or its `name:` if you set one). +# `threshold` is the minimum weighted score for the scenario to PASS. 
+scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml similarity index 96% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml index db4ed40c09e..6eb37e87fd3 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-modified-typespec-projects.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml @@ -6,6 +6,11 @@ description: | version: "1.0" type: capability + +tags: + tier: unit + area: typespec + environment: azsdk-mcp config: @@ -37,3 +42,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml similarity index 96% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml index 4eb8538f623..254a59f9897 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/get-pr-link-current-branch.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml @@ -6,6 +6,11 @@ description: | version: "1.0" type: capability + +tags: + tier: unit + area: github + environment: azsdk-mcp config: @@ -35,3 +40,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml similarity index 96% rename from 
tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml index 79a27314a86..b9c869f6f2b 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/link-namespace-approval-issue.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml @@ -5,6 +5,11 @@ description: | version: "1.0" type: capability + +tags: + tier: unit + area: release-plan + environment: azsdk-mcp config: @@ -34,3 +39,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/apiview.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml similarity index 98% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/apiview.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml index afeea4f8ee2..7c89923f269 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/apiview.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml @@ -13,7 +13,9 @@ config: executor: copilot-sdk model: claude-opus-4.6 -tags: +tags: + tier: unit + area: apiview priority: p0 stimuli: @@ -110,3 +112,4 @@ stimuli: config: required: - name: "azsdk_apiview_request_copilot_review" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/config.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml similarity index 97% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/config.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml index 29dad5ba24c..1a3e9d0af04 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/config.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml @@ -13,7 +13,9 @@ config: executor: 
copilot-sdk model: claude-opus-4.6 -tags: +tags: + tier: unit + area: engsys priority: p0 stimuli: @@ -49,3 +51,4 @@ stimuli: config: required: - name: "azsdk_create_service_label" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/engsys.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml similarity index 98% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/engsys.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml index 1ad3ca5b5d2..74a70bb1285 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/engsys.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml @@ -13,7 +13,9 @@ config: executor: copilot-sdk model: claude-opus-4.6 -tags: +tags: + tier: unit + area: engsys priority: p0 stimuli: @@ -97,3 +99,4 @@ stimuli: config: required: - name: "azsdk_get_failed_test_run_data" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/github.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml similarity index 98% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/github.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml index 047571f9c70..7d43f138e56 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/github.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml @@ -7,7 +7,9 @@ type: capability environment: azsdk-mcp -tags: +tags: + tier: unit + area: github priority: p0 config: @@ -74,3 +76,4 @@ stimuli: config: required: - name: "azsdk_get_pull_request_link_for_current_branch" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/package.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml similarity index 99% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/package.eval.yaml rename to 
tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml index dd4430062b0..834ea8ac45d 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/package.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml @@ -7,7 +7,9 @@ type: capability environment: azsdk-mcp -tags: +tags: + tier: unit + area: package priority: p0 config: @@ -228,3 +230,4 @@ stimuli: config: required: - name: "azsdk_release_sdk" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/pipeline.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml similarity index 98% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/pipeline.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml index 74ea1342082..196adea711c 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/pipeline.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml @@ -7,7 +7,9 @@ type: capability environment: azsdk-mcp -tags: +tags: + tier: unit + area: pipeline priority: p0 config: @@ -72,3 +74,4 @@ stimuli: config: required: - name: "azsdk_get_pipeline_status" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/releaseplan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml similarity index 99% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/releaseplan.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml index b99e0bda5bf..635c99e4740 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/releaseplan.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml @@ -7,7 +7,9 @@ type: capability environment: azsdk-mcp -tags: +tags: + tier: unit + area: release-plan priority: p0 config: @@ -312,3 +314,4 @@ stimuli: config: required: - name: 
"azsdk_update_sdk_details_in_release_plan" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml similarity index 99% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/typespec.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml index c87605ef9bc..f596187b8bb 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/typespec.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml @@ -13,7 +13,9 @@ config: executor: copilot-sdk model: claude-opus-4.6 -tags: +tags: + tier: unit + area: typespec priority: p0 stimuli: @@ -185,3 +187,4 @@ stimuli: config: required: - name: "azsdk_typespec_init_project" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/verify.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml similarity index 96% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/verify.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml index 9c6c24c3cb5..a5a62ba0e33 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/triggers/verify.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml @@ -13,7 +13,9 @@ config: executor: copilot-sdk model: claude-opus-4.6 -tags: +tags: + tier: unit + area: engsys priority: p0 stimuli: @@ -40,3 +42,4 @@ stimuli: config: required: - name: "azsdk_verify_setup" + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml similarity index 95% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml index 0e2b0f5610a..3649a637745 100644 --- 
a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/validate-typespec.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml @@ -5,6 +5,11 @@ description: | version: "1.0" type: capability + +tags: + tier: unit + area: typespec + environment: azsdk-mcp config: @@ -31,3 +36,4 @@ scoring: weights: tool-calls: 1 threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 index bba3c94dda3..fcd257e4cc8 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 @@ -5,7 +5,7 @@ .DESCRIPTION This script: 1. Runs `azsdk list` to get all registered MCP tool names from the server. - 2. Parses all *.eval.yaml files under the triggers/ directory. + 2. Parses all `triggers-*.eval.yaml` files under the unit/ directory. 3. Reports any eval tool references that don't exist on the server, and any server tools that are missing eval coverage. @@ -13,8 +13,8 @@ Path to the Azure.Sdk.Tools.Cli project. Defaults to ../Azure.Sdk.Tools.Cli relative to this script. .PARAMETER EvalPath - Path to the triggers/ directory containing *.eval.yaml files. - Defaults to ../evals/triggers relative to this script. + Path to the directory containing `triggers-*.eval.yaml` files. + Defaults to ../evals/unit relative to this script. .PARAMETER SkipBuild If set, passes --no-build to dotnet run (requires a prior build). 
@@ -37,7 +37,7 @@ if (-not $ProjectPath) { $ProjectPath = Join-Path $cliParent "Azure.Sdk.Tools.Cli" } if (-not $EvalPath) { - $EvalPath = Join-Path $vallyRoot "evals/triggers" + $EvalPath = Join-Path $vallyRoot "evals/unit" } if (-not (Test-Path $ProjectPath)) { @@ -101,11 +101,11 @@ if ($serverTools.Count -eq 0) { Write-Host "Found $($serverTools.Count) tools registered on the MCP server ($($excludedTools.Count) excluded).`n" -ForegroundColor Green -# Step 2: Parse all *.eval.yaml files in the triggers directory for tool name references -$evalFiles = Get-ChildItem -Path $EvalPath -Filter "*.eval.yaml" +# Step 2: Parse all triggers-*.eval.yaml files in the unit directory for tool name references +$evalFiles = Get-ChildItem -Path $EvalPath -Filter "triggers-*.eval.yaml" if ($evalFiles.Count -eq 0) { - Write-Error "No *.eval.yaml files found in: $EvalPath" + Write-Error "No triggers-*.eval.yaml files found in: $EvalPath" return 1 } From a88ae11fa08d751aa746a320f8e8a1b2161712f6 Mon Sep 17 00:00:00 2001 From: helen229 Date: Tue, 2 Jun 2026 15:43:24 -0700 Subject: [PATCH 09/24] Vally: remove Run-LiveEvals.ps1 (local-only test wrapper) Drop the local-only convenience wrapper and refer directly to evals/setup/ensure-specs-clone.ps1 in docs and YAML comments. Users prime the spec clone manually and invoke 'vally eval --suite e2e'. 
--- .../Azure.Sdk.Tools.Vally/.vally.yaml | 2 +- .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 12 ++-- .../Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 | 56 ------------------- .../evals/e2e/release-planner-e2e.eval.yaml | 11 ++-- .../evals/setup/ensure-specs-clone.ps1 | 2 +- 5 files changed, 12 insertions(+), 71 deletions(-) delete mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index 4cbba44eb09..e4e9150f15d 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -44,7 +44,7 @@ suites: description: Multi-tool chained evals; hermetic. Still suitable for PR gate. evals: ["evals/integration/*.eval.yaml"] e2e: - description: Live end-to-end against real MCP + real azure-rest-api-specs clone. Use Run-LiveEvals.ps1. + description: Live end-to-end against real MCP + real azure-rest-api-specs clone. Prime the clone first with evals/setup/ensure-specs-clone.ps1. evals: ["evals/e2e/*.eval.yaml"] # ---- composite suites ---- diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index 1dd36742f78..651f2d031d0 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -80,7 +80,7 @@ Still hermetic (no `environment.git`), but the agent must invoke 2+ MCP tools in #### `evals/e2e/` — live end-to-end (1) -Drives the real MCP server inside a real `azure-rest-api-specs` worktree. Slow; run via [`Run-LiveEvals.ps1`](Run-LiveEvals.ps1) (auto-primes a per-user cache via [`evals/setup/ensure-specs-clone.ps1`](evals/setup/ensure-specs-clone.ps1)). +Drives the real MCP server inside a real `azure-rest-api-specs` worktree. Slow; prime a per-user clone first via [`evals/setup/ensure-specs-clone.ps1`](evals/setup/ensure-specs-clone.ps1) (auto-refreshes every 24h). 
| Scenario | Area | Shape | |---|---|---| @@ -111,7 +111,6 @@ tracks the migration in ``` Azure.Sdk.Tools.Vally/ ├── .vally.yaml # Vally config (environments + suites) -├── Run-LiveEvals.ps1 # Wrapper for the e2e tier (primes spec-repo cache) ├── evals/ │ ├── unit/ # tier 1: single-tool, hermetic, fast │ ├── integration/ # tier 2: multi-tool chains, hermetic @@ -165,13 +164,12 @@ Run a single eval: & $vally eval --eval-spec evals/unit/check-public-repo.eval.yaml ``` -Run the live e2e tier (auto-primes a per-user clone of -`azure-rest-api-specs`; refreshes every 24h): +Run the live e2e tier (first, prime a per-user clone of +`azure-rest-api-specs`; the helper refreshes it every 24h): ```powershell -./Run-LiveEvals.ps1 # default: release-planner-e2e -./Run-LiveEvals.ps1 -VallyVerbose # with verbose output -./Run-LiveEvals.ps1 -EvalSpecs evals/e2e/foo.eval.yaml,evals/e2e/bar.eval.yaml +./evals/setup/ensure-specs-clone.ps1 +& $vally eval --suite e2e ``` ## Adding a new scenario diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 deleted file mode 100644 index d80cfe24401..00000000000 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/Run-LiveEvals.ps1 +++ /dev/null @@ -1,56 +0,0 @@ -<# -.SYNOPSIS - Runs Vally live-tier evals locally with shared spec-repo setup. - -.DESCRIPTION - - Calls evals/setup/ensure-specs-clone.ps1 once to prime the azure-rest-api-specs - cache (idempotent, refreshes if >24h old). - - Then runs the given eval spec(s) via the locally-installed Vally CLI. - - Defaults to the release-planner-e2e demo. Pass -EvalSpecs to run others. 
- -.EXAMPLE - ./Run-LiveEvals.ps1 - -.EXAMPLE - ./Run-LiveEvals.ps1 -EvalSpecs evals/e2e/release-planner-e2e.eval.yaml,evals/e2e/foo.eval.yaml -#> -[CmdletBinding()] -param( - [string[]] $EvalSpecs = @('evals/e2e/release-planner-e2e.eval.yaml'), - [switch] $VallyVerbose -) - -$ErrorActionPreference = 'Stop' -Set-StrictMode -Version 4 - -$repoRoot = Resolve-Path (Join-Path $PSScriptRoot '..\..\..') -$vallyCli = Join-Path $repoRoot 'eng\skill-eval\node_modules\.bin\vally.cmd' -$setupScript = Join-Path $PSScriptRoot 'evals\setup\ensure-specs-clone.ps1' - -if (-not (Test-Path $vallyCli)) { - throw "Vally CLI not found at $vallyCli. Run 'npm install' in eng/skill-eval first." -} - -Write-Host "==> Ensuring azure-rest-api-specs cache" -& pwsh -NoProfile -File $setupScript | Out-Host - -Push-Location $PSScriptRoot -try { - foreach ($spec in $EvalSpecs) { - Write-Host "" - Write-Host "==> Running $spec" - $args = @('eval', '--eval-spec', $spec) - if ($VallyVerbose) { $args += '--verbose' } - & $vallyCli @args - if ($LASTEXITCODE -ne 0) { - throw "Eval failed: $spec (exit $LASTEXITCODE)" - } - } -} -finally { - Pop-Location -} - -Write-Host "" -Write-Host "==> All evals passed." diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml index a16b68cf4d6..b542f073e30 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml @@ -11,11 +11,10 @@ description: | executor + real DevOps in one shot. Prerequisite: a clone of Azure/azure-rest-api-specs at the path referenced - by environment.git.source below. Locally, use the Run-LiveEvals.ps1 - wrapper at the package root — it primes a per-user cache via - evals/setup/ensure-specs-clone.ps1 (auto-refresh every 24h) and points - this source path at it. 
CI should clone the repo as a pipeline checkout - step instead. + by environment.git.source below. Locally, run + evals/setup/ensure-specs-clone.ps1 to prime a per-user cache + (auto-refresh every 24h) at the path this source points at. CI should + clone the repo as a pipeline checkout step instead. version: "1.0" type: capability @@ -36,7 +35,7 @@ config: stimuli: - name: release-planner-e2e environment: - # Source is the per-user cache populated by Run-LiveEvals.ps1 + # Source is the per-user cache populated by evals/setup/ensure-specs-clone.ps1 # (idempotent shallow+sparse clone, auto-refresh every 24h). # NOTE: hardcoded absolute path — Vally does not currently expand # ${USERPROFILE} / env vars in env.git.source. Adjust per machine diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 index 57eab6ac92f..918d544edf8 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 @@ -4,7 +4,7 @@ exists and is reasonably fresh. .DESCRIPTION - Used as a pre-run step by the Vally live-eval wrapper (Run-LiveEvals.ps1). + Run this before invoking the e2e suite (vally eval --suite e2e). Maintains a cache clone that Vally's `environment.git.source` points at, so individual eval YAMLs don't need a pre-existing checkout. 
From bb47139454f77cec85201f0834640f4c1ce6971d Mon Sep 17 00:00:00 2001 From: helen229 Date: Wed, 3 Jun 2026 07:33:23 -0700 Subject: [PATCH 10/24] some docs and test e2e one --- .../azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md | 539 ++++++++++++++++++ .../Azure.Sdk.Tools.Vally/REQUIREMENTS.md | 165 ++++++ .../evals/e2e/release-planner-e2e.eval.yaml | 81 ++- 3 files changed, 777 insertions(+), 8 deletions(-) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md new file mode 100644 index 00000000000..3a16536f477 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md @@ -0,0 +1,539 @@ +# Vally Tool-Scenario Evaluation — Design + +> Companion to [REQUIREMENTS.md](./REQUIREMENTS.md). Where REQUIREMENTS says +> *what* and *why*, this doc says *how*. + +--- + +## 0. Scope + +This design covers the eval framework that lives in +[`tools/azsdk-cli/Azure.Sdk.Tools.Vally/`](./) — i.e. the scenarios that +verify the Azure SDK agent picks the right skill, calls the right MCP tools, +in the right order, with the right arguments, and returns the right answer. + +--- + +## 1. Layering + +### 1.1 How many layers, and why + +Two layers, in one place. They are **not** two separate projects — they are +two sub-folders under [`evals/`](./evals/) plus matching suites in +[`.vally.yaml`](./.vally.yaml). Skill-only dispatch ("does prompt X route to +skill Y?") is **not** a tier here — it lives next to the skill in +`.github/skills/<skill>/evals/` (see §1.2). + +``` +evals/ +├── unit/ tier 1: cross-skill tool triggers + single-tool happy path +├── scenarios/ tier 2: multi-tool agent flows (mock OR live, same YAML) +├── setup/ shared fixture scripts (specs clone, etc.) +└── fixtures/ pinned SHAs + per-eval mocks +``` + +| Tier | Folder | Agent? 
| What it proves | Wall time | Failure semantics | +|---|---|---|---|---|---| +| 1 unit | `evals/unit/` | none | "Tool X exists and returns the right shape for these inputs." Cross-skill trigger tables (tools used by ≥2 skills). | < 30s each | **required**, every PR | +| 2 scenarios | `evals/scenarios/` | **live (gpt-5.x)** | "Agent picks the right skills, calls the right tools in the right order with the right args, and returns the right answer" for a multi-step ask. | depends on env (see below) | depends on env (see below) | + +**The key insight: scenarios are environment-agnostic.** A scenario YAML +declares the prompt, expected skills, expected tool sequence, and graders +— nothing about whether MCP is mock or live. The MCP backend is picked at +run time: + +| Run mode | MCP | Repos? | When | Coverage | Cost | +|---|---|---|---|---|---| +| `scenarios` + `azsdk-mcp-mock` | mock | none | **every PR** | every scenario | ~1m / scenario, ~0 tokens beyond agent | +| `scenarios` + `azsdk-mcp-live` | live | shallow + sparse | **nightly** | scenarios tagged `live-safe` (curated subset) | 10-20m / scenario, ~2M tokens | + +Same file, same graders — just a different environment binding. A scenario +like `release-planner` runs on **mock** every PR (catches tool-sequence +regressions cheaply) **and** on **live** nightly (catches real DevOps +drift). When the live and mock results disagree, you have an exact bisect: +the mock lied. That's also how mock coverage gaps surface automatically — +every scenario that runs on mock forces the mock to grow handlers for the +tools it exercises (see §4). 
+ +### 1.2 Relationship to `.github/skills/*/evals/` — split by ownership + +Two homes, split on a simple rule: + +| What it tests | Lives in | Owned by | +|---|---|---| +| **One skill** (does *this* skill route + call its tools + return a sensible answer) | `.github/skills/<skill>/evals/` | Skill author | +| **Cross-skill / cross-tool** (multi-step chains, e2e flows, mock-server integration, anything that doesn't belong to one skill) | `tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/` | Eval-framework owner | + +Per-skill evals stay **next to `SKILL.md`** — that's the convention skill +authors expect, and it keeps "everything about my skill in one folder." +Today's per-skill `eval.yaml` files don't move. + +This project owns: + +- **The runner config** ([`.vally.yaml`](./.vally.yaml)): environments + (`azsdk-mcp-live`, `azsdk-mcp-mock`), suites, MCP server definitions. + Per-skill evals reference these environments by name. +- **Shared fixtures** ([`evals/setup/`](./evals/setup/), + [`evals/fixtures/`](./evals/fixtures/)): the specs-clone hook, SHA locks, + language-repo cache scripts. Per-skill evals can reuse them via `setup:`. +- **Cross-skill scenarios**: `evals/scenarios/` — multi-step flows like + release-planner that span release-plan + generate-sdk. These have no + single skill owner, so they live here. +- **Tier-1 `unit/` tool-trigger + tool-shape evals** that aren't owned by + any one skill (e.g. `triggers-pipeline.eval.yaml` covers tools used by + three different skills). + +The runner picks up both: + +``` +vally eval \ + --eval-spec '.github/skills/**/evals/*.eval.yaml' \ + --eval-spec 'tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/**/*.eval.yaml' \ + --skill-dir .github/skills +``` + +Or, equivalently, suites in `.vally.yaml` glob both paths. + +#### What skill authors get without moving anything + +Per-skill evals already use Vally graders. 
To unlock the §4.6 trifecta +(skill + tool-calls + correctness in one scenario), a skill author edits +their existing `evals/eval.yaml` to **add** the missing graders — they +don't relocate the file: + +```yaml +# .github/skills/<skill>/evals/eval.yaml +environment: azsdk-mcp-mock # references env defined in our .vally.yaml +graders: + - type: skill-invocation + config: { required: [<skill>] } + - type: tool-calls + config: { required: [...], disallowed: [...] } + - type: prompt + config: { rubric: ... } +``` + +#### Why this split (not "move everything here") + +| Concern | Skill evals here (move) | Skill evals stay + cross-cuts here (this proposal) | +|---|---|---| +| Skill author finds their evals | filtered by tag, ~5 dirs away | next to SKILL.md ✓ | +| Skill + tool-calls + correctness in same scenario | ✓ | ✓ (add graders to existing file) | +| Cross-skill chains have a clear home | ✓ | ✓ (`evals/scenarios/`) | +| New skill author understands the layout | needs to learn tag filtering | "evals go next to your SKILL.md, like other skills" | +| Per-skill CI workflow unchanged | needs rewrite | ✓ | +| Mock vs. live opt-in works for skill evals | ✓ | ✓ (env defined here, referenced from skill eval) | +| Shanghai team adding cross-skill scenario | unclear (which tag?) | `evals/scenarios/` here | + + + +### 1.3 Folder → suite → trigger mapping + +Suites in `.vally.yaml`: + +| Suite | Globs | Env | Used by | +|---|---|---|---| +| `unit` | `evals/unit/**/*.eval.yaml` | `azsdk-mcp-mock` | PR + nightly | +| `scenarios-mock` | `evals/scenarios/**/*.eval.yaml` | `azsdk-mcp-mock` | PR + nightly | +| `scenarios-live` | `evals/scenarios/**/*.eval.yaml` (filtered by tag `live-safe`) | `azsdk-mcp-live` | nightly + label | +| `pr-gate` | `unit` + `scenarios-mock` | mock | every PR | +| `nightly` | `unit` + `scenarios-mock` + `scenarios-live` | mixed | nightly + label | + +Live runs are tag-gated (`--tag live-safe`) so destructive / production-only +scenarios stay opted out by default. 
+ +### 1.4 Decision tree for "where does my new eval go?" + +``` +Does it test ONE skill's routing + tools + answer? +└── yes → .github/skills/<skill>/evals/ (not this project) + +Is it a single-tool shape test or a trigger table covering tools used by ≥2 skills? +└── yes → evals/unit/ + +Is it a multi-step / multi-tool agent flow? +└── yes → evals/scenarios/ + ├── default: runs against mock on every PR + └── add `tags: { live-safe: "true" }` to also run against live nightly +``` + +--- + +## 2. CI + +### 2.1 Today + +- The skill evals (`.github/skills/**/evals/`) run via + [`.github/workflows/skill-eval.yml`](../../../.github/workflows/skill-eval.yml). +- The tool-scenario evals in this project: **run nowhere in CI**. Helen runs + them by hand. This is the gap [#15829](https://github.com/Azure/azure-sdk-tools/issues/15829) + closes. + +### 2.2 Next (issue #15829) + +Extend `.github/workflows/skill-eval.yml` — **do not** create a parallel +workflow. Two new jobs join the existing per-skill matrix: + +```yaml +jobs: + # existing: skill-evals (matrix per skill, unchanged) + + tool-scenarios-pr: + # PR-gate: unit + scenarios-mock. Hermetic: no live MCP, no repo clones. + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup-dotnet-and-node + - run: dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock + - working-directory: tools/azsdk-cli/Azure.Sdk.Tools.Vally + run: | + npx --yes @microsoft/vally-cli eval \ + --suite pr-gate \ + --skill-dir ../../../.github/skills \ + --junit --output-dir vally-results + - uses: actions/upload-artifact@v4 + with: { name: vally-pr, path: tools/azsdk-cli/Azure.Sdk.Tools.Vally/vally-results } + + tool-scenarios-nightly: + # Nightly: full suite incl. scenarios-live (tag-gated by `live-safe`). 
+ if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup-dotnet-and-node + - name: Restore repo cache + uses: actions/cache@v4 + with: + path: ~/.vally-cache/repos + key: vally-repos-${{ hashFiles('tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/*.yaml') }} + - name: Prime repo cache (clones any missing live-safe deps) + run: pwsh tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-repos.ps1 + env: { VALLY_REPO_CACHE: ~/.vally-cache/repos } + - run: dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli + - run: dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock + - working-directory: tools/azsdk-cli/Azure.Sdk.Tools.Vally + run: | + npx --yes @microsoft/vally-cli eval \ + --suite nightly \ + --skill-dir ../../../.github/skills \ + --junit --output-dir vally-results + env: + AZSDKTOOLS_AGENT_TESTING: "true" + AZURE_DEVOPS_PAT: ${{ secrets.AZURE_DEVOPS_PAT }} + VALLY_REPO_CACHE: ~/.vally-cache/repos + continue-on-error: true # advisory for first week + - uses: actions/upload-artifact@v4 + with: { name: vally-nightly, path: tools/azsdk-cli/Azure.Sdk.Tools.Vally/vally-results } +``` + +Triggers: + +| Trigger | What runs | +|---|---| +| `pull_request` | `pr-gate` (unit + scenarios-mock) | +| `schedule:` nightly | `nightly` (adds scenarios-live) | +| `workflow_dispatch` | manual escape hatch (suite picker) | + +### 2.3 Repo-caching strategy (issue #15831) + +**Problem.** Live-env scenarios call real tools that read files from +`azure-rest-api-specs` (and sometimes a language repo like +`azure-sdk-for-python`). Cloning from scratch on every run is slow +(~30s sparse spec clone, minutes for a full SDK repo). We need them +available without paying that cost per eval. + +**Who needs this:** only the `scenarios-live` nightly job. PR-gate +(`unit` + `scenarios-mock`) runs entirely against the mock server — no +clones, no network, no cache. 
+ +**Constraint.** Vally itself does not clone repos. Its +`environment.git.source` field expects a worktree to already exist at +the given path. So cloning must happen as a pre-step *before* +`vally eval` runs. + +**Solution — pre-step script, scoped to live scenarios only.** Each +scenario tagged `live-safe` declares its repo deps in a sidecar block. +(Vally's `tags:` is a mapping, not an array; `metadata:` is accepted by +the linter as a passthrough — we use it for our own bookkeeping.) + +```yaml +# evals/scenarios/release-planner.eval.yaml +tags: + live-safe: "true" +metadata: + repos: + - name: Azure/azure-rest-api-specs + - name: Azure/azure-sdk-for-python +stimuli: + - environment: + git: + source: ${VALLY_REPO_CACHE}/Azure/azure-rest-api-specs # filled by pre-step + ref: main +``` + +Live runs select these scenarios via `--tag live-safe=true`. + +One generic script — [`evals/setup/ensure-repos.ps1`](./evals/setup/ensure-repos.ps1) +— walks `evals/scenarios/*.yaml`, **filters to scenarios tagged +`live-safe`**, collects the union of `metadata.repos`, and ensures each +listed repo is cloned into `$VALLY_REPO_CACHE/<owner>/<repo>/`. +Idempotent: existing checkouts are skipped. Scenarios without the +`live-safe` tag are not scanned — their repos (if any are declared) +are never cloned, because they never run live. + +- **Local dev:** `$VALLY_REPO_CACHE` defaults to + `$env:USERPROFILE\.vally-cache\repos`. Reused across local runs. +- **CI:** `$VALLY_REPO_CACHE` points at whatever `actions/cache` mounts. + Cache key = hash of the collected `metadata.repos` list across all + `live-safe` scenarios, so the cache only invalidates when a scenario + adds/removes a repo dependency. + +**On pinning (optional for v1).** By default the script clones `main`, +which means live-env evals can flake if upstream merges a breaking change +between nightly runs. 
Scenarios that want reproducibility can opt in per +repo by adding a `ref:` field under `metadata.repos`: + +```yaml +metadata: + repos: + - name: Azure/azure-rest-api-specs + ref: <commit-sha> # optional; default = main + - name: Azure/azure-sdk-for-python +``` + +No central lock file, no bot PR — just an optional field on the repo +entry. "What version did this run use?" is always recoverable from the +`git rev-parse HEAD` recorded in `results.jsonl`. If per-scenario `ref:` +entries get unwieldy we can promote them to a shared lock file later. + +**Upstream wish.** A native Vally `fixtures.git:` block that clones for +us would let us drop the pre-step script and the `metadata` sidecar. +Filed separately; until then, the pre-step is the pragmatic v1. + +--- + +## 3. Live vs. mock — what runs where + +### 3.1 Decision matrix + +| What | Env | Why | +|---|---|---| +| Tool triggers (`evals/unit/triggers-*.eval.yaml`) | **mock** | No real tools called, just verifies prompt → tool name mapping. | +| Single-tool shape (`evals/unit/<tool>.eval.yaml`) | **mock** | Hermetic, fast, deterministic. | +| Scenario, default PR run (`evals/scenarios/*.eval.yaml`) | **mock** | Cheap, hermetic, no repo clones, safe for write tools (mocked). Catches tool-sequence regressions on every PR. | +| Scenario tagged `live-safe`, nightly run | **live** w/ `AZSDKTOOLS_AGENT_TESTING=true` | Catches real DevOps / GitHub drift. Work items route to test area path so re-runs are safe. PR creation hits real `azure-sdk-for-*` (proven 2026-06-02). | +| Scenario touching **production** systems (e.g. `azsdk_release_sdk` shipping to NuGet) | **mock only** — no `live-safe` tag | Never run live in CI. | + +The `live-safe` tag is the opt-in: by default a new scenario runs only on +mock. To also have it run live nightly, the author adds +`tags: { live-safe: "true" }` +and confirms the scenario is safe to repeat against the real systems. 
+ +### 3.2 How it's expressed in YAML + +`.vally.yaml` declares two environments: + +```yaml +environments: + azsdk-mcp-live: + mcpServers: + azure-sdk-mcp: + command: dotnet + args: ["run", "--project", "../Azure.Sdk.Tools.Cli", "--", "start"] + env: + AZSDKTOOLS_AGENT_TESTING: "true" # safe-mode for write tools + azsdk-mcp-mock: + mcpServers: + azure-sdk-mcp: + command: dotnet + args: ["run", "--project", "../Azure.Sdk.Tools.Mock", "--", "start"] +``` + +Each eval is environment-agnostic; the env is bound at run time by the +suite (see §1.3). Default suite uses **mock**; `scenarios-live` swaps in +`azsdk-mcp-live` and filters via `--tag live-safe=true`. + +`AZSDKTOOLS_AGENT_TESTING=true` is the safety net — even on the live MCP, +write operations route to test work-item area paths. This is what made +today's release-planner scenario safe to re-run. + +--- + +## 4. Mock MCP server status + +### 4.1 What's there today + +[`Azure.Sdk.Tools.Mock`](../Azure.Sdk.Tools.Mock/) has handlers for **10 tools**: + +| Tool | Handler | +|---|---| +| `azsdk_create_release_plan` | [`Handlers/ReleasePlan/CreateReleasePlanHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/CreateReleasePlanHandler.cs) | +| `azsdk_get_release_plan` | [`GetReleasePlanHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/GetReleasePlanHandler.cs) | +| `azsdk_update_sdk_details_in_release_plan` | [`UpdateSdkDetailsHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/UpdateSdkDetailsHandler.cs) | +| `azsdk_update_release_plan` | [`UpdateReleasePlanTargetHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/UpdateReleasePlanTargetHandler.cs) | +| `azsdk_run_generate_sdk` | [`RunGenerateSdkHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/RunGenerateSdkHandler.cs) | +| `azsdk_link_sdk_pull_request_to_release_plan` | [`LinkSdkPrToReleasePlanHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/LinkSdkPrToReleasePlanHandler.cs) | +| `azsdk_link_namespace_approval_issue` | 
[`LinkNamespaceApprovalHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/LinkNamespaceApprovalHandler.cs) | +| `azsdk_get_sdk_pull_request_link` | [`GetSdkPullRequestLinkHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/GetSdkPullRequestLinkHandler.cs) | +| `azsdk_get_pipeline_status` | [`GetPipelineStatusHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/Pipeline/GetPipelineStatusHandler.cs) | +| `azsdk_release_sdk` | [`ReleaseSdkHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/Package/ReleaseSdkHandler.cs) | + +Today's release-planner scenario used **15 distinct tools** when run live. +So at minimum the mock is missing handlers for: +`azsdk_get_release_plan_for_spec_pr`, `azsdk_run_typespec_validation`, +`azsdk_check_api_spec_ready_for_sdk`, `azsdk_typespec_generate_authoring_plan`, +plus other `azsdk_*` tools referenced by the `unit/` evals. + +### 4.2 Is it up to date? + +No — there's no mechanism to detect drift. The mock is a hand-authored +allowlist; if `Azure.Sdk.Tools.Cli` adds a tool, no one knows the mock is +missing it until an eval fails. + +### 4.3 How we keep it up to date + +Two layers: + +1. **Inventory diff check** (lightweight, lands first): + - New script `eng/scripts/Get-McpToolInventory.ps1` enumerates tools + advertised by both `Azure.Sdk.Tools.Cli` and `Azure.Sdk.Tools.Mock` over + stdio (both already expose `tools/list` via MCP). + - Writes `tools/azsdk-cli/Azure.Sdk.Tools.Mock/COVERAGE.md` (checked in) + with three columns: tool, live ✓, mock ✓. + - CI job `mock-coverage-check` runs the script and fails if `COVERAGE.md` + is stale (regenerate → `git diff --exit-code`). + +2. **Per-eval enforcement** (already free via the runner): + - Any eval with `environment: azsdk-mcp-mock` that calls a tool the mock + doesn't handle will fail at runtime ("tool not found"). This is the + functional backstop — once an eval references a missing tool, CI red. + +--- + +## 5. 
Results UX — beyond "pass / fail" + +### 5.1 What we have today + +Per run, Vally writes: + +- `results.jsonl` — full trajectory: every tool call, args, return values, + events, metrics. +- `eval-results.md` — markdown summary table (one row per stimulus, grader + scores, links to details). +- JUnit XML (with `--junit`) — for CI test-results widgets. + +Both produced today for the release-planner-e2e run; see +[`vally-results/2026-06-03T03-06-41-076Z/`](./vally-results/2026-06-03T03-06-41-076Z/). + +The gap: `results.jsonl` is great for engineers but useless for Laurent or +anyone wanting to *see* why a run failed (or what the agent actually did). + +### 5.2 What Laurent / non-engineers actually want + +From the meeting: a way to slice/filter results across many runs ("how often +does the release-planner skill fire in the last 30 nightlies?") and to drill +into a single failing run without parsing JSON. + +Two artifacts cover that: + +#### (a) CSV export — the spreadsheet layer + +Thin post-processor: `eng/scripts/Export-VallyResultsCsv.ps1`. Reads +`results.jsonl`, emits one row per stimulus with columns: + +``` +timestamp, suite, scenario, tier, model, verdict, score, +skill-invocation, tool-calls, prompt, +skills_used, tool_call_count, turns, tokens, duration_s, +trajectory_url, eval_results_url +``` + +`trajectory_url` is a link to the rendered HTML (see (b) below). Append-only +file at `vally-results/history.csv` (committed to a separate `vally-history` +branch or pushed to an Azure Storage container — TBD). + +This is the artifact Laurent gets — one file, pivot-table friendly. + +#### (b) Trajectory viewer — the "what did the agent actually do" layer + +`results.jsonl` already contains the full event stream: + +``` +skill → tool_call → tool_result → assistant_message → tool_call → ... +``` + +We render it as a single static HTML page per stimulus: + +- Timeline view: vertical events with timestamps and durations. 
+- Each tool_call collapsible: arguments + return value side-by-side. +- Skill changes highlighted as section headers. +- Final assistant message at the bottom with the grader rubric + judge verdict. +- Graders shown as a pill row at the top (✅/❌ with hover for details). + +Implementation: `eng/scripts/Render-VallyTrajectory.ps1` (or a tiny Node +script) that templates a single self-contained HTML. CI uploads the directory +as an artifact; the CSV links into it. + +This is essentially what `agentviz` (referenced in +`--keep-executor-session-logs`) does, but standalone — no extra tool to +install. + +#### (c) Future: shared dashboard + +Once (a) and (b) are stable, the CSV can feed a Power BI / Kusto dashboard +that lives outside this repo. Out of scope for v1. + +### 5.3 Pipeline + +``` +vally eval ──> results.jsonl ─┬─> Export-VallyResultsCsv.ps1 ──> history.csv ──> dashboard + │ + └─> Render-VallyTrajectory.ps1 ──> *.html ──> artifact + link from csv +``` + +Both scripts read `results.jsonl` only — no Vally-side changes required. +If/when upstream Vally adds native CSV / HTML output, drop the scripts. + +--- + +## 6. Open design questions + +1. **Mock auto-generation.** Replace hand-written handlers with a single + generic handler that synthesizes a response from each tool's JSON + schema. + + **How.** `Azure.Sdk.Tools.Mock` starts up, calls the live MCP's + `tools/list` once at boot (or reads a checked-in snapshot of it), and + for every tool registers a fallback handler that: + + 1. Validates incoming args against `inputSchema`. + 2. Walks `outputSchema` (or the `result` JSON Schema) and emits a + default-value tree: `string` → `"mock-"`, `integer` → `0`, + `array` → `[]`, `object` → recurse, `$ref` → resolve. For ID-shaped + fields (e.g. `workItemId`), return a deterministic counter so + multi-step scenarios can chain (`create` returns `1538`, next + `get(1538)` returns the same shape). + 3. 
Hand-written handlers in `Handlers/` still win when present — they + override the generated default for the few tools whose realistic + response shape matters (e.g. `azsdk_get_pipeline_status` needs + a believable build-status sequence). + + **Trade-off.** Solves drift (any new tool gets a mock for free) but + default-value responses miss domain quirks (real pipeline status + transitions, real PR URLs). Mitigation: keep hand-written handlers + for the ≥5 tools whose responses scenarios actually assert on. + + **Defer until.** §4 manual coverage gap is closed (so we know which + tools actually need realistic shapes vs. which can take defaults). +2. **CSV storage.** Per-branch artifact (cheap, no infra), commit to a + `vally-history` branch (versioned, awkward), or push to Azure Storage + (best UX, needs infra). Default plan: artifact + Storage upload from + nightly only. +3. **Cross-org repo cache in CI.** `actions/cache` keyed on the hash of + `metadata.repos` across live-safe scenarios is fine for + `azure-rest-api-specs`, but the pull from GitHub still costs ~30s on + cache miss. For 5 language repos + specs, cold-start could approach 3 + min. Worth it vs. an Azure-hosted pre-baked image? Defer until we have + data. + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md new file mode 100644 index 00000000000..7924ab03f66 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md @@ -0,0 +1,165 @@ +# Vally Tool-Scenario Evaluation — Requirements + + +--- + +## 1. Context + +PR [#15811](https://github.com/Azure/azure-sdk-tools/pull/15811) ported the +deleted `Azure.Sdk.Tools.Cli.Benchmarks` tool-scenarios into +[`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/`](./evals/) as +`@microsoft/vally-cli` evals (11 scenarios, 10 fully-graded + 1 stub). 
They run +locally but are not yet wired into CI, have no shared environments, and cannot +yet assert *skill choice + tool-call shape + ordering* in a single scenario. + + +--- + +## 2. Goals + +1. A single eval can express **what skill was picked, what tools were called, in + what order, with what arguments, and whether the final answer is correct.** +2. Evals are **reproducible**: same SHA, same inputs ⇒ same trajectory. +3. Evals are **safe by default**: nothing destructive runs against live ADO / + GitHub on a nightly schedule unless the author opted in. +4. Evals are **portable**: a new contributor (Shanghai team, Laurent's reviewers) + can clone the repo and run any scenario without hand-editing paths. +5. Evals are **observable**: results are exportable (CSV / JUnit / markdown) + and consumable by non-engineers. + +--- + +## 3. Non-goals (for this round) + +- Authoring new eval scenarios beyond the 11 already ported (tracked separately). +- Schema-parity tests between `Azure.Sdk.Tools.Cli` and `Azure.Sdk.Tools.Mock` + responses — a separate concern, file against the mock project if needed. +- Replacing Vally as the eval runner. +- Building a UI on top of CSV exports. + +--- + +## 4. Functional requirements + +### 4.1 Unified scenario file (skill + tool + e2e in one place) + +A single `.eval.yaml` must be able to declare: + +- one or more `skill-invocation` graders (which `.github/skills/*` were picked), +- one or more `tool-calls` graders (which MCP tools fired, with arg matching), +- an optional `prompt` / `output-contains` / `output-matches` grader for the + final answer, +- arbitrary tags (`tier`, `scenario`, `skills`, `tools`, `owner`). + +Today these flows are split across two pipelines because Vally's per-scenario +grader set is limited. Removing that limitation is the #1 ask from the meeting. 
+ +**Upstream dependencies**: +- [microsoft/vally#453](https://github.com/microsoft/vally/issues/453) — `tool-calls` grader: support strict call ordering (`sequence:`). +- [microsoft/vally#454](https://github.com/microsoft/vally/issues/454) — `tool-calls` grader: open `ToolMatch` for generic argument matching. + +### 4.2 Tiered evaluation taxonomy + +Scenarios fall into two tiers. Trigger cadence and failure semantics follow +from the tier, not from per-scenario configuration. + +| Tier | Agent | MCP | Trigger | Failure semantics | +|---|---|---|---|---| +| unit (tool-only) | none | mock | per PR | required | +| scenario (default) | live | mock | per PR | required | +| scenario + live opt-in | live | live | nightly | advisory → required | + +Skill-only routing evals ("does prompt X route to skill Y?") are out of scope +for this project; they live next to the skill. + +### 4.3 Mock vs. live MCP — opt-in per eval + +Tracked in [#15831](https://github.com/Azure/azure-sdk-tools/issues/15831). + +- Both a mock MCP server and the real MCP server must be selectable as the + scenario's MCP environment. +- **Default is mock.** Running against the live MCP server is per-scenario + opt-in. +- Scenarios touching **production** systems (e.g. shipping packages) must + remain mock-only and must not be opt-in-able. + +### 4.4 Workspace setup hooks (repo cloning for live scenarios) + +Tracked in [#15831](https://github.com/Azure/azure-sdk-tools/issues/15831). + +- The PR gate (unit + mock scenarios) must be fully hermetic: no clones, + no outbound network. +- Live-tier scenarios that need external repos (e.g. `azure-rest-api-specs`) + must declare those dependencies inside the scenario file. Adding a new + repo dependency must be a YAML-only change. +- Repo provisioning runs once per CI job and is shared across scenarios. +- Pinning a repo to a specific ref is supported but optional in v1. 
+ +### 4.5 Configuration via environment variables, not hard-coded paths + +- Scenario YAMLs must not hard-code absolute paths. +- Repo locations must be resolved through configuration that works the + same way locally and in CI. + +### 4.6 Skill + tool-call grading must be enforced together + +For each prompt, the grader must verify both: + +1. The agent picked the **right skill** (`skill-invocation` grader). +2. The agent fired the **right MCP tool calls**, in the right order, with the + right arguments (`tool-calls` grader with the upstream extensions in §4.1). + +A scenario that asserts only the final answer text is incomplete. + +### 4.7 End-to-end multi-step scenarios + +Vally must be able to grade chains such as: + +- *validate TypeSpec project* → *create release plan* → *generate SDK*. + +This requires: + +- Ordering (vally#453). +- Argument matching (vally#454). +- Tier-appropriate environment (mock for destructive steps, live elsewhere). + +Initial e2e targets: `release-planner-e2e`, `create-release-plan`, +`generate-sdk`. Each must include the full tool-call chain in its graders. + +### 4.8 Result export + +- Native: `results.jsonl`, `eval-results.md`, JUnit XML (already supported). +- **New**: CSV export for Laurent's projection use case. Either a Vally + feature request or a thin post-processor script that consumes `results.jsonl`. + +### 4.9 Mock MCP tool coverage + +- Inventory the tools `Azure.Sdk.Tools.Mock` currently implements. +- For every tool referenced by an eval that runs on `azsdk-mcp-mock`, the mock + must have a handler (returning realistic shape, not necessarily real data). +- Track gaps in a checklist in the mock project's README. + +--- + +## 5. CI / pipeline requirements + +Tracked in [#15829](https://github.com/Azure/azure-sdk-tools/issues/15829). + +- A PR-gate job runs unit + mock scenarios on every PR. Hermetic; required + from day one. +- A nightly job additionally runs the live-tier scenarios against the real + MCP server. 
Starts advisory (does not block); flipped to required once + the baseline is stable. +- A manual trigger is available for ad-hoc runs. +- Results are published as build artifacts (markdown summary + JUnit XML + at minimum). +- Model and ADO credentials must not leak into logs. + +--- + +## 6. Authoring requirements (so non-Helen humans can extend the suite) + +- The authoring pattern (graders, tiers, mock-vs-live decision) is documented + outside this file and linked from the Vally project's README. +- A new contributor can add a scenario without editing CI configuration or + shared scripts. diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml index b542f073e30..fe18eb9d814 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml @@ -1,12 +1,25 @@ name: azsdk-mcp-tool-scenarios description: | - Live end-to-end demo for the release-planner flow. + Live end-to-end demo for the full release-planner -> generate-SDK flow. Drives the *real* azsdk-cli MCP server against real DevOps APIs, inside a real git worktree of azure-rest-api-specs. The MCP server runs with AZSDKTOOLS_AGENT_TESTING=true (set globally in .vally.yaml), so work items route to the DevOps test area path and are safe to leave around / re-run. + This scenario walks the agent through a multi-step chain that exercises + multiple skills back-to-back in a single conversation: + + 1. Release-plan skill -> azsdk_create_release_plan, azsdk_get_release_plan + 2. Generate-SDK skill -> azsdk_run_generate_sdk + 3. 
Release-plan skill -> azsdk_link_sdk_pull_request_to_release_plan + + The goal is to verify Vally end-to-end (live agent + live MCP + live DevOps) + can: + - route each turn to the correct skill, + - call the correct tool on that skill, + - and do so in the expected order across multiple steps. + Demonstrates Vally's environment.git fixture hook + live MCP + Copilot SDK executor + real DevOps in one shot. @@ -46,8 +59,10 @@ stimuli: source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs ref: main prompt: | - I'm in a checkout of azure-rest-api-specs. Walk through a release plan - for the Contoso Widget Manager end-to-end: + I'm in a checkout of azure-rest-api-specs. Walk me through the full + release-plan + SDK-generation flow for the Contoso Widget Manager + end-to-end. Do every step below, in order, and use real tools (no + dry-run, no simulation): 1. Create a release plan using: - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" @@ -59,24 +74,74 @@ stimuli: - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" 2. Fetch the release plan you just created back from DevOps to confirm - it was saved. + it was saved, and tell me its work-item ID. + + 3. Kick off SDK generation for that same TypeSpec project via the + generation pipeline (Python SDK is fine). Use the work-item ID + from step 2. + + 4. Once the generation pipeline reports a pull request URL, link + that SDK pull request back to the release plan from step 2. My setup has already been verified, do not run azsdk_verify_setup. constraints: - max_turns: 10 - max_tokens: 10000 - # TODO: assert ordering create -> get — blocked on https://github.com/Azure/azure-sdk-tools/issues/15832 (Vally tool-calls grader needs sequence:). - # TODO: assert serviceTreeId / productTreeId / typeSpecProjectPath args — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher). 
+ max_turns: 20 + max_tokens: 30000 + # TODO: assert strict ordering create -> get -> generate -> link + # — blocked on https://github.com/microsoft/vally/issues/453 (tool-calls grader sequence:). + # TODO: assert args (serviceTreeId / productTreeId / typeSpecProjectPath / workItemId) + # — blocked on https://github.com/microsoft/vally/issues/454 (tool-calls grader generic args:). + # TODO: add `azsdk-common-generate-sdk-locally` (or the equivalent pipeline- + # driven skill) to skill-invocation `required` once a skill that owns + # `azsdk_run_generate_sdk` is registered. Today the only skill that + # declares any of the tools in this scenario is azsdk-common-prepare-release-plan. graders: + # 1. Skill-routing check (FIRST — fast, deterministic, free): did the + # agent dispatch to the right skill at all? If this fails, the + # tool-calls grader below is meaningless. + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + # 2. Tool-call check: given the right skill was loaded, did it call + # the right MCP tools? Each tool here is owned by the skill above + # except azsdk_run_generate_sdk (see TODO). - type: tool-calls config: required: - azsdk_create_release_plan - azsdk_get_release_plan + - azsdk_run_generate_sdk + - azsdk_link_sdk_pull_request_to_release_plan disallowed: - azsdk_verify_setup + # 3. Final-answer correctness (LLM-judged): the deterministic graders + # above only verify the agent *did* the right things, not that it + # *reported* them back to the user correctly. Tools can fire + # successfully while the final message hallucinates IDs / URLs. + # This grader uses gpt-5.4 as judge against a free-form rubric so + # minor wording variants (`WI 29262`, `work-item #29262`) all pass. + - type: prompt + config: + model: gpt-5.4 + rubric: | + Did the final assistant message clearly state BOTH of the + following, consistent with the tools that were actually called? + + 1. 
A numeric DevOps work-item ID for the release plan that was + created (or confirmed). Any unambiguous format is fine + (e.g. "work item 29262", "WI #29262", "/_workitems/edit/29262"). + + 2. A GitHub pull request URL on + github.com/Azure/azure-sdk-for-* that was linked back to + that release plan. + + Answer "pass" only if BOTH are present. Otherwise answer "fail" + and briefly say which one is missing. scoring: weights: + skill-invocation: 1 tool-calls: 1 + prompt: 1 threshold: 1.0 From 4d89bac8a7f627c275530ff034bd3a8b75d61521 Mon Sep 17 00:00:00 2001 From: helen229 Date: Wed, 3 Jun 2026 10:28:31 -0700 Subject: [PATCH 11/24] update docs --- .../azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md | 225 +++++++++++++++--- .../Azure.Sdk.Tools.Vally/REQUIREMENTS.md | 52 ++-- 2 files changed, 223 insertions(+), 54 deletions(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md index 3a16536f477..4fa2e8f25f4 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md @@ -16,44 +16,66 @@ in the right order, with the right arguments, and returns the right answer. ## 1. Layering -### 1.1 How many layers, and why +### 1.1 Three levels of testing -Two layers, in one place. They are **not** two separate projects — they are -two sub-folders under [`evals/`](./evals/) plus matching suites in -[`.vally.yaml`](./.vally.yaml). Skill-only dispatch ("does prompt X route to -skill Y?") is **not** a tier here — it lives next to the skill in -`.github/skills//evals/` (see §1.2). +Aligned with the 2026-06 design review. 
Three named levels, +differentiated by **what they exercise** and **what backend they hit**: + +| Level | Name | What it proves | Agent | MCP | Lives in | +|---|---|---|---|---|---| +| 0 | **Routing evals** | Prompt X routes to skill Y | live | none (no MCP server) | `.github/skills/<skill>/evals/` | +| 1 | **Workflow scenarios (mock)** | Agent picks the right skills, calls the right tools in the right order with the right args, returns the right answer | live | **mock** | `evals/scenarios/` *(default)* | +| 2 | **Live scenarios** | Same as level 1, but against the real backend — catches drift the mock can't see (TypeSpec ordering, real codegen output, real DevOps state) | live | **live** | `evals/scenarios/` + `tags: { live-safe: "true" }` | + +Plus a hermetic tool-shape layer that isn't agent-driven: + +| | Name | What it proves | Lives in | +|---|---|---|---| +| — | **Unit evals** | "Tool X exists and returns the right shape for these inputs." Cross-skill trigger tables. | `evals/unit/` | + +**Mock is the default. Live is the exception.** Both modes drive the +same live agent (LLM), so **both incur agent token cost**; the mock +itself is a deterministic C# stub with no LLM inside it. The cost delta +between mock and live is on three other axes: + +1. **Wall time.** Real backends (DevOps, codegen pipelines, GitHub) add + seconds-to-minutes per tool call; the mock returns instantly. +2. **Backend side effects + quota.** Live hits real ADO work items, + real pipeline runs, real PRs. Mock does none of that. +3. **Agent turn count (indirect token cost).** Real tool responses are + larger and more variable, which expands per-turn input and provokes + more retry / polling turns. The headline 1.78M tokens on the live + release-planner-e2e run is mostly this effect, not the mock saving + tokens directly. 
+ +Reviewer framing, paraphrased: *live MCP incurs significant token cost, so +most testing — including release plan and SDK generation — should use +mock; live is reserved for scenarios mock can't deterministically cover.* +The "token cost" pointed at there is (3) above plus the wall-time fan-out, +not a claim that the mock is free. ``` evals/ -├── unit/ tier 1: cross-skill tool triggers + single-tool happy path -├── scenarios/ tier 2: multi-tool agent flows (mock OR live, same YAML) -├── setup/ shared fixture scripts (specs clone, etc.) +├── unit/ tool-shape + cross-skill triggers (hermetic) +├── scenarios/ level 1 by default; level 2 when tagged live-safe +├── setup/ shared fixture scripts (repo clone, etc.) └── fixtures/ pinned SHAs + per-eval mocks ``` -| Tier | Folder | Agent? | What it proves | Wall time | Failure semantics | -|---|---|---|---|---|---| -| 1 unit | `evals/unit/` | none | "Tool X exists and returns the right shape for these inputs." Cross-skill trigger tables (tools used by ≥2 skills). | < 30s each | **required**, every PR | -| 2 scenarios | `evals/scenarios/` | **live (gpt-5.x)** | "Agent picks the right skills, calls the right tools in the right order with the right args, and returns the right answer" for a multi-step ask. | depends on env (see below) | depends on env (see below) | - -**The key insight: scenarios are environment-agnostic.** A scenario YAML +**Key property: scenarios are environment-agnostic.** A scenario YAML declares the prompt, expected skills, expected tool sequence, and graders -— nothing about whether MCP is mock or live. The MCP backend is picked at -run time: +— nothing about whether MCP is mock or live. Same file, same graders; +the MCP backend is picked at run time. | Run mode | MCP | Repos? 
| When | Coverage | Cost | |---|---|---|---|---|---| -| `scenarios` + `azsdk-mcp-mock` | mock | none | **every PR** | every scenario | ~1m / scenario, ~0 tokens beyond agent | -| `scenarios` + `azsdk-mcp-live` | live | shallow + sparse | **nightly** | scenarios tagged `live-safe` (curated subset) | 10-20m / scenario, ~2M tokens | +| Level 1 (workflow / mock) | mock (deterministic stub, no LLM) | none | **every PR** | every scenario | agent tokens only; ~1m / scenario | +| Level 2 (live) | live (real backends) | shallow + sparse | **nightly** | scenarios tagged `live-safe` (curated subset) | agent tokens + real backend latency + more turns from real responses; 10-20m / scenario, ~2M agent tokens observed | -Same file, same graders — just a different environment binding. A scenario -like `release-planner` runs on **mock** every PR (catches tool-sequence -regressions cheaply) **and** on **live** nightly (catches real DevOps -drift). When the live and mock results disagree, you have an exact bisect: -the mock lied. That's also how mock coverage gaps surface automatically — -every scenario that runs on mock forces the mock to grow handlers for the -tools it exercises (see §4). +When the live and mock results disagree, the mock lied — exact bisect. +That's also how mock coverage gaps surface: every scenario that runs on +mock forces the mock to grow handlers for the tools it exercises +(see §4). ### 1.2 Relationship to `.github/skills/*/evals/` — split by ownership @@ -123,7 +145,7 @@ graders: | New skill author understands the layout | needs to learn tag filtering | "evals go next to your SKILL.md, like other skills" | | Per-skill CI workflow unchanged | needs rewrite | ✓ | | Mock vs. live opt-in works for skill evals | ✓ | ✓ (env defined here, referenced from skill eval) | -| Shanghai team adding cross-skill scenario | unclear (which tag?) | `evals/scenarios/` here | +| New contributor adding cross-skill scenario | unclear (which tag?) 
| `evals/scenarios/` here | @@ -145,16 +167,20 @@ scenarios stay opted out by default. ### 1.4 Decision tree for "where does my new eval go?" ``` -Does it test ONE skill's routing + tools + answer? -└── yes → .github/skills//evals/ (not this project) +Does it only test that the right skill is picked (no tool calls)? +└── yes → Level 0: .github/skills/<skill>/evals/ (not this project) Is it a single-tool shape test or a trigger table covering tools used by ≥2 skills? └── yes → evals/unit/ Is it a multi-step / multi-tool agent flow? └── yes → evals/scenarios/ - ├── default: runs against mock on every PR - └── add `tags: { live-safe: "true" }` to also run against live nightly + ├── Level 1 by default: runs against MOCK on every PR. + │ *Use this unless the mock can't faithfully cover the behavior.* + └── Level 2: add `tags: { live-safe: "true" }` to ALSO run nightly + against live MCP. Reserve for cases where the real backend's + behavior matters (TypeSpec ordering, real codegen output, + real DevOps state). ``` --- @@ -165,8 +191,8 @@ Is it a multi-step / multi-tool agent flow? - The skill evals (`.github/skills/**/evals/`) run via [`.github/workflows/skill-eval.yml`](../../../.github/workflows/skill-eval.yml). -- The tool-scenario evals in this project: **run nowhere in CI**. Helen runs - them by hand. This is the gap [#15829](https://github.com/Azure/azure-sdk-tools/issues/15829) +- The tool-scenario evals in this project: **run nowhere in CI**. They are run + by hand today. This is the gap [#15829](https://github.com/Azure/azure-sdk-tools/issues/15829) closes. ### 2.2 Next (issue #15829) @@ -426,10 +452,10 @@ Per run, Vally writes: Both produced today for the release-planner-e2e run; see [`vally-results/2026-06-03T03-06-41-076Z/`](./vally-results/2026-06-03T03-06-41-076Z/). -The gap: `results.jsonl` is great for engineers but useless for Laurent or -anyone wanting to *see* why a run failed (or what the agent actually did). 
+The gap: `results.jsonl` is great for engineers but useless for non-engineer +stakeholders who want to *see* why a run failed (or what the agent actually did). -### 5.2 What Laurent / non-engineers actually want +### 5.2 What non-engineer stakeholders actually want From the meeting: a way to slice/filter results across many runs ("how often does the release-planner skill fire in the last 30 nightlies?") and to drill @@ -453,7 +479,7 @@ trajectory_url, eval_results_url file at `vally-results/history.csv` (committed to a separate `vally-history` branch or pushed to an Azure Storage container — TBD). -This is the artifact Laurent gets — one file, pivot-table friendly. +This is the artifact non-engineer stakeholders get — one file, pivot-table friendly. #### (b) Trajectory viewer — the "what did the agent actually do" layer @@ -497,7 +523,130 @@ If/when upstream Vally adds native CSV / HTML output, drop the scripts. --- -## 6. Open design questions +## 6. Performance & cost controls + +### 6.1 Principle + +The framework must make expensive evals **fail loudly**, not silently bleed +CI minutes and tokens. An author writing a new scenario should not have to +know in advance how much it costs; the runner tells them, and refuses to +keep running it if it crosses policy. Polishing individual scenarios is +not a substitute for this — it doesn't scale to the next ten authors. + +The release-planner e2e run (17 min wall / 1.78M tokens / 41 turns) is the +existence proof: nothing in the framework today would have stopped it +landing as a "passing" scenario that quietly costs a full hour of CI per +nightly trigger. + +### 6.2 Budgets and enforcement + +Every scenario carries a budget. 
The runner measures actual cost and +enforces the budget in three bands: + +| Band | Trigger | Effect | +|---|---|---| +| Soft (warn) | actual ≥ 50% of budget | Logged + surfaced in `eval-results.md` | +| Hard (fail) | actual > 100% of budget | Scenario marked **failed**, CI job fails | +| Kill (abort) | actual > 200% of budget | Run aborted mid-flight, partial trajectory saved | + +Budgeted dimensions, in declining order of importance: + +1. **`maxTurns`** — single best proxy for cost; bounds the agent loop. +2. **`maxWallSec`** — protects CI minutes regardless of where time goes. +3. **`maxBillableTokens`** — input (uncached) + output. Cache hits don't + count, so the number tracks real $. +4. **`maxToolCalls`** — catches exploration spirals. + +Defaults are set globally in `.vally.yaml`, overridable per scenario: + +```yaml +# .vally.yaml +defaults: + limits: + maxTurns: 20 + maxWallSec: 120 + maxBillableTokens: 100_000 + maxToolCalls: 30 +``` + +A scenario that *needs* more must opt in explicitly with a comment +explaining why. The opt-in itself is reviewable in code: + +```yaml +# evals/scenarios/release-planner.eval.yaml +limits: + maxTurns: 60 # multi-step chain; see DESIGN §6.4 + maxWallSec: 600 # waits on real ADO pipeline status + maxBillableTokens: 250_000 +``` + +If the opt-in budget gets reviewed and rejected, the author's recourse +is to **switch to mock**, not to widen the budget. This is the lever +that pushes cost-blind scenarios off the live path. + +### 6.3 Tiered policy: PR vs nightly + +Budgets differ by tier. The PR gate is the strict one because it runs on +every push; nightly can be looser because it runs once. 
+ +| Tier | maxTurns | maxWallSec | maxBillableTokens | Opt-out | +|---|---|---|---|---| +| PR gate (unit + mock) | 20 | 120 | 100k | not allowed | +| Nightly mock | 30 | 300 | 200k | reviewable | +| Nightly live | 60 | 600 | 500k | reviewable, requires justification comment | + +A scenario that wants to exceed the PR-gate ceiling **must** drop down +to nightly. The runner refuses to load over-budget scenarios into the +PR-gate suite. No way to silently land a slow scenario. + +### 6.4 General guardrails (framework-level, not per-scenario) + +These apply to every scenario the runner executes. None require the +scenario author to know about them. + +| # | Guardrail | Layer | What it prevents | +|---|---|---|---| +| G1 | **Hard turn / wall / token / tool-call caps** (§6.2) | runner | Runaway scenarios | +| G2 | **Virtual clock**: executor intercepts `Start-Sleep` / `Wait-*` / `sleep` and fast-forwards | executor adapter | Wall-time waste on polling loops | +| G3 | **Tool-result truncation** above N tokens with `…[truncated]` marker | executor adapter | Context blow-up from chatty tool responses | +| G4 | **Narration / meta-tool suppression**: tools that only echo intent (`report_intent` etc.) 
stripped from the tool list the model sees | executor config | Doubled turn count from pure-narration calls | +| G5 | **Polling tools default to terminal state under `AZSDKTOOLS_AGENT_TESTING=true`** (`*_get_*_status` returns `Succeeded` on first poll) | mock MCP policy | Any future polling tool inherits the fix | +| G6 | **Cheaper judge model** — LLM-judge graders default to a smaller model than the agent | runner config | Judge tokens dominating output cost | +| G7 | **CI concurrency cancel** — superseded PR runs killed immediately | CI workflow | Wasted compute on rapid pushes | +| G8 | **Honest cost reporting** — `eval-results.md` splits cached vs billable input and wall time into LLM / tool / wait | results renderer | Headline-token illusions hiding real cost | +| G9 | **Suite-level cost ceiling** — if any single scenario exceeds 25% of its suite's total budget, suite run fails with a "rebalance" error | runner | One scenario silently dominating the suite | + +G2, G3, G4, G6 also have the effect of making per-scenario budgets +*achievable*. Without them, an honest scenario can blow past `maxTurns` +just by being routed through a chatty executor. + +### 6.5 Where each guardrail lives + +| Guardrail | Owner | +|---|---| +| G1, G2, G3, G8, G9 | **Upstream Vally** — file as feature requests | +| G4, G6 | Copilot SDK executor / `.vally.yaml` runner config in this repo | +| G5 | `Azure.Sdk.Tools.Mock` in this repo | +| G7 | `.github/workflows/skill-eval.yml` in this repo | + +The local guardrails (G4–G7) can land immediately. The upstream +guardrails (G1–G3, G8, G9) are blocked on Vally; until they ship, we +approximate G1 with a thin post-run check that reads `results.jsonl` +and fails the CI step if any scenario exceeded its declared budget. 
+ +### 6.6 Author-facing rule of thumb + +> **If the agent's loop talks to a real backend that takes more than a +> few seconds to respond, mock it.** The runner will let you know when +> you've crossed the line — you don't have to guess. + +The budget machinery exists so authors don't need to read this document +to write a cheap eval. They write the scenario; the runner fails it if +it costs too much; the CI message points them at the mock path. + +--- + +## 7. Open design questions 1. **Mock auto-generation.** Replace hand-written handlers with a single generic handler that synthesizes a response from each tool's JSON diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md index 7924ab03f66..109cb5cd8ae 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md @@ -22,8 +22,8 @@ yet assert *skill choice + tool-call shape + ordering* in a single scenario. 2. Evals are **reproducible**: same SHA, same inputs ⇒ same trajectory. 3. Evals are **safe by default**: nothing destructive runs against live ADO / GitHub on a nightly schedule unless the author opted in. -4. Evals are **portable**: a new contributor (Shanghai team, Laurent's reviewers) - can clone the repo and run any scenario without hand-editing paths. +4. Evals are **portable**: a new contributor can clone the repo and run any + scenario without hand-editing paths. 5. Evals are **observable**: results are exportable (CSV / JUnit / markdown) and consumable by non-engineers. @@ -58,19 +58,36 @@ grader set is limited. Removing that limitation is the #1 ask from the meeting. - [microsoft/vally#453](https://github.com/microsoft/vally/issues/453) — `tool-calls` grader: support strict call ordering (`sequence:`). - [microsoft/vally#454](https://github.com/microsoft/vally/issues/454) — `tool-calls` grader: open `ToolMatch` for generic argument matching. 
-### 4.2 Tiered evaluation taxonomy +### 4.2 Three levels of evaluation -Scenarios fall into two tiers. Trigger cadence and failure semantics follow -from the tier, not from per-scenario configuration. +Aligned with the 2026-06 design review. Three named levels; +the folder a scenario lives in (and an opt-in tag for level 2) determines +which one runs when. -| Tier | Agent | MCP | Trigger | Failure semantics | -|---|---|---|---|---| -| unit (tool-only) | none | mock | per PR | required | -| scenario (default) | live | mock | per PR | required | -| scenario + live opt-in | live | live | nightly | advisory → required | +| Level | Name | Agent | MCP | Trigger | Failure semantics | +|---|---|---|---|---|---| +| 0 | Routing evals (per-skill, prompt-to-skill matching) | live | none | per PR | required | +| 1 | **Workflow scenarios** (mock MCP — default) | live | mock | per PR | required | +| 2 | **Live scenarios** (live MCP — narrow opt-in) | live | live | nightly | advisory → required | -Skill-only routing evals ("does prompt X route to skill Y?") are out of scope -for this project; they live next to the skill. +Plus a hermetic tool-shape layer that isn't agent-driven: + +| Layer | Name | Agent | MCP | Trigger | Failure semantics | +|---|---|---|---|---|---| +| — | Unit evals (tool-shape + cross-skill triggers) | none | mock | per PR | required | + +**Mock is the default; live is the exception.** Both modes drive the +same live agent, so **both incur LLM token cost** — the mock MCP server +itself is a deterministic stub with no LLM in it. The cost delta is +backend latency + the larger / chattier responses live tools produce, +which expand per-turn input and provoke more polling/retry turns. +Level 2 is therefore reserved for scenarios the mock can't deterministically +cover (e.g. TypeSpec ordering, real codegen output, real DevOps state). +Most multi-step work — including release plan and SDK generation — +stays at level 1. 
+ +Level 0 lives next to its skill and is out of scope for this project's +folder layout; this project owns the runner config it references. ### 4.3 Mock vs. live MCP — opt-in per eval @@ -79,7 +96,9 @@ Tracked in [#15831](https://github.com/Azure/azure-sdk-tools/issues/15831). - Both a mock MCP server and the real MCP server must be selectable as the scenario's MCP environment. - **Default is mock.** Running against the live MCP server is per-scenario - opt-in. + opt-in and must be justified — live MCP carries real token + wall-time + cost (see [DESIGN.md §6](./DESIGN.md)), so it is reserved for behavior + the mock can't faithfully reproduce. - Scenarios touching **production** systems (e.g. shipping packages) must remain mock-only and must not be opt-in-able. @@ -129,8 +148,9 @@ Initial e2e targets: `release-planner-e2e`, `create-release-plan`, ### 4.8 Result export - Native: `results.jsonl`, `eval-results.md`, JUnit XML (already supported). -- **New**: CSV export for Laurent's projection use case. Either a Vally - feature request or a thin post-processor script that consumes `results.jsonl`. +- **New**: CSV export for the cross-run projection / dashboard use case. + Either a Vally feature request or a thin post-processor script that + consumes `results.jsonl`. ### 4.9 Mock MCP tool coverage @@ -157,7 +177,7 @@ Tracked in [#15829](https://github.com/Azure/azure-sdk-tools/issues/15829). --- -## 6. Authoring requirements (so non-Helen humans can extend the suite) +## 6. Authoring requirements (so new contributors can extend the suite) - The authoring pattern (graders, tiers, mock-vs-live decision) is documented outside this file and linked from the Vally project's README. 
From f6f5c80a7fcda0e74c5abe136d9e192c17c90153 Mon Sep 17 00:00:00 2001 From: helen229 Date: Wed, 3 Jun 2026 13:35:15 -0700 Subject: [PATCH 12/24] update design --- .../azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md | 263 +++++++----------- 1 file changed, 108 insertions(+), 155 deletions(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md index 4fa2e8f25f4..e97e95d54e6 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md @@ -388,57 +388,55 @@ today's release-planner scenario safe to re-run. ## 4. Mock MCP server status -### 4.1 What's there today - -[`Azure.Sdk.Tools.Mock`](../Azure.Sdk.Tools.Mock/) has handlers for **10 tools**: - -| Tool | Handler | -|---|---| -| `azsdk_create_release_plan` | [`Handlers/ReleasePlan/CreateReleasePlanHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/CreateReleasePlanHandler.cs) | -| `azsdk_get_release_plan` | [`GetReleasePlanHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/GetReleasePlanHandler.cs) | -| `azsdk_update_sdk_details_in_release_plan` | [`UpdateSdkDetailsHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/UpdateSdkDetailsHandler.cs) | -| `azsdk_update_release_plan` | [`UpdateReleasePlanTargetHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/UpdateReleasePlanTargetHandler.cs) | -| `azsdk_run_generate_sdk` | [`RunGenerateSdkHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/RunGenerateSdkHandler.cs) | -| `azsdk_link_sdk_pull_request_to_release_plan` | [`LinkSdkPrToReleasePlanHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/LinkSdkPrToReleasePlanHandler.cs) | -| `azsdk_link_namespace_approval_issue` | [`LinkNamespaceApprovalHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/LinkNamespaceApprovalHandler.cs) | -| `azsdk_get_sdk_pull_request_link` | 
[`GetSdkPullRequestLinkHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/GetSdkPullRequestLinkHandler.cs) | -| `azsdk_get_pipeline_status` | [`GetPipelineStatusHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/Pipeline/GetPipelineStatusHandler.cs) | -| `azsdk_release_sdk` | [`ReleaseSdkHandler.cs`](../Azure.Sdk.Tools.Mock/Handlers/Package/ReleaseSdkHandler.cs) | - -Today's release-planner scenario used **15 distinct tools** when run live. -So at minimum the mock is missing handlers for: -`azsdk_get_release_plan_for_spec_pr`, `azsdk_run_typespec_validation`, -`azsdk_check_api_spec_ready_for_sdk`, `azsdk_typespec_generate_authoring_plan`, -plus other `azsdk_*` tools referenced by the `unit/` evals. - -### 4.2 Is it up to date? - -No — there's no mechanism to detect drift. The mock is a hand-authored -allowlist; if `Azure.Sdk.Tools.Cli` adds a tool, no one knows the mock is -missing it until an eval fails. - -### 4.3 How we keep it up to date - -Three layers: - -1. **Inventory diff check** (lightweight, lands first): - - New script `eng/scripts/Get-McpToolInventory.ps1` enumerates tools - advertised by both `Azure.Sdk.Tools.Cli` and `Azure.Sdk.Tools.Mock` over - stdio (both already expose `tools/list` via MCP). - - Writes `tools/azsdk-cli/Azure.Sdk.Tools.Mock/COVERAGE.md` (checked in) - with three columns: tool, live ✓, mock ✓. - - CI job `mock-coverage-check` runs the script and fails if `COVERAGE.md` - is stale (regenerate → `git diff --exit-code`). - -2. **Per-eval enforcement** (already free via the runner): - - Any eval with `environment: azsdk-mcp-mock` that calls a tool the mock - doesn't handle will fail at runtime ("tool not found"). This is the - functional backstop — once an eval references a missing tool, CI red. 
### 4.1 How it works + +[`Azure.Sdk.Tools.Mock`](../Azure.Sdk.Tools.Mock/) reflects over +`SharedOptions.ToolsList` at boot and registers a mock proxy for **every** +tool the real `Azure.Sdk.Tools.Cli` advertises, preserving each tool's +name, description, and input schema +([`MockToolRegistrations.cs`](../Azure.Sdk.Tools.Mock/MockToolRegistrations.cs)). +At call time the proxy looks up an +[`IMockToolHandler`](../Azure.Sdk.Tools.Mock/Handlers/IMockToolHandler.cs) +by tool name: + +- **Custom handler exists** → scripted, type-correct response. +- **No custom handler** → fallback `DefaultCommandResponse { Message = "Success" }`. + +### 4.2 Why PR [#15854](https://github.com/Azure/azure-sdk-tools/pull/15854) + +Before #15854, only ~10 of 74 live tools had custom handlers. The other +~64 returned the generic success payload, which **breaks chained +scenarios** that need to thread a returned id into a follow-up call +(e.g. `create_release_plan` → `update_sdk_details_in_release_plan` +referencing the new `release_plan_id`). #15854 adds handlers for the +remaining tools so every mock-tier scenario gets a chainable, +type-correct response by default. Steady state: a new MCP tool ships +with its handler. + +### 4.3 No CI drift check needed + +The eval suite is the drift check. + +- **Tool added upstream, no handler yet** → auto-registered via + reflection; falls back to the success default. A scenario that actually + asserts on its response will fail in the next eval run — that failure + *is* the signal to add a handler. +- **Tool's response shape changes upstream** → scenarios asserting on the + changed field fail. Same signal, same fix. +- **Tool no eval ever touches** → no scenario fails, because the gap is + invisible to the test surface. We deliberately don't pay to plug it. + +Since every PR runs `pr-gate` (unit + scenarios-mock) and a failing +scenario fails the workflow, drift that matters surfaces immediately. 
+A separate `mock-coverage` inventory job, `COVERAGE.md` snapshot, or +scheduled diff script would be duplicate enforcement. --- ## 5. Results UX — beyond "pass / fail" +Tracked by parent issue [#15861](https://github.com/Azure/azure-sdk-tools/issues/15861). + ### 5.1 What we have today Per run, Vally writes: @@ -449,73 +447,78 @@ Per run, Vally writes: scores, links to details). - JUnit XML (with `--junit`) — for CI test-results widgets. -Both produced today for the release-planner-e2e run; see -[`vally-results/2026-06-03T03-06-41-076Z/`](./vally-results/2026-06-03T03-06-41-076Z/). +Two gaps: `results.jsonl` is a 300+ event JSON wall — usable by engineers +with a `jq` reflex, useless for everyone else trying to debug a failure; +and there's no way to slice across many runs ("how often did +release-planner fire green in the last 30 nightlies?"). -The gap: `results.jsonl` is great for engineers but useless for non-engineer -stakeholders who want to *see* why a run failed (or what the agent actually did). +### 5.2 Two artifacts, ordered by where they pay off first -### 5.2 What non-engineer stakeholders actually want +#### (a) Trajectory HTML — local debug first, CI artifact second -From the meeting: a way to slice/filter results across many runs ("how often -does the release-planner skill fire in the last 30 nightlies?") and to drill -into a single failing run without parsing JSON. +Sub-issue [#15862](https://github.com/Azure/azure-sdk-tools/issues/15862). -Two artifacts cover that: +[`eng/scripts/Render-VallyTrajectory.ps1`](../../../eng/scripts/Render-VallyTrajectory.ps1) +reads one `results.jsonl` and emits one **self-contained** HTML page per +stimulus into a sibling `trajectories/` directory. No external assets, no +network, opens via `file://`. Local loop: `vally eval` → +`Render-VallyTrajectory.ps1` → open the HTML. -#### (a) CSV export — the spreadsheet layer - -Thin post-processor: `eng/scripts/Export-VallyResultsCsv.ps1`. 
Reads -`results.jsonl`, emits one row per stimulus with columns: - -``` -timestamp, suite, scenario, tier, model, verdict, score, -skill-invocation, tool-calls, prompt, -skills_used, tool_call_count, turns, tokens, duration_s, -trajectory_url, eval_results_url -``` +Page layout: header (totals), grader pills (✅/❌ with rubric on hover), +stimulus, vertical event timeline with collapsible `tool_call` rows +(args ↔ result), skill switches as section headers, footer with final +assistant message + judge verdict + raw JSON link. Tool-result truncation +mirrors the §6 policy so the viewer reflects what the agent actually saw. -`trajectory_url` is a link to the rendered HTML (see (b) below). Append-only -file at `vally-results/history.csv` (committed to a separate `vally-history` -branch or pushed to an Azure Storage container — TBD). +This is the daily driver and lands first because it's usable the day it +merges — no CI required. -This is the artifact non-engineer stakeholders get — one file, pivot-table friendly. +#### (b) CSV export — cross-run analytics -#### (b) Trajectory viewer — the "what did the agent actually do" layer +Sub-issue [#15863](https://github.com/Azure/azure-sdk-tools/issues/15863). -`results.jsonl` already contains the full event stream: +[`eng/scripts/Export-VallyResultsCsv.ps1`](../../../eng/scripts/Export-VallyResultsCsv.ps1) +reads one or more `results.jsonl`, appends one row per stimulus to +`vally-results/history.csv`. Append-only; idempotent on +`(run_id, scenario)`. Columns cover verdict, grader scores, skills used, +turns, `tokens_billable` / `tokens_cached_read` (matching the §6.2 +`maxBillableTokens` definition), duration, and links back to the HTML + +`eval-results.md`. Local runs produce the row but **do not push** to +shared history — that's a CI concern. -``` -skill → tool_call → tool_result → assistant_message → tool_call → ... 
-``` +Marginal value locally (one run = one row), real value across many runs +in pivot tables / future dashboards. -We render it as a single static HTML page per stimulus: +#### (c) Hosting & CI wiring -- Timeline view: vertical events with timestamps and durations. -- Each tool_call collapsible: arguments + return value side-by-side. -- Skill changes highlighted as section headers. -- Final assistant message at the bottom with the grader rubric + judge verdict. -- Graders shown as a pill row at the top (✅/❌ with hover for details). +Sub-issue [#15866](https://github.com/Azure/azure-sdk-tools/issues/15866). +Decision: **GitHub Actions artifacts** for trajectories, **orphan +`vally-history` branch** for the CSV. -Implementation: `eng/scripts/Render-VallyTrajectory.ps1` (or a tiny Node -script) that templates a single self-contained HTML. CI uploads the directory -as an artifact; the CSV links into it. +| Artifact | Where | Retention | Auth | +|---|---|---|---| +| `trajectories/*.html` | `actions/upload-artifact@v4`, per run | 90 days | GH login | +| `results.jsonl` + `eval-results.md` | same artifact | 90 days | GH login | +| `history.csv` | orphan `vally-history` branch, force-pushed by nightly only | indefinite | repo read | -This is essentially what `agentviz` (referenced in -`--keep-executor-session-logs`) does, but standalone — no extra tool to -install. +Picked because it's zero infra (no Azure Storage, no SAS, no public +endpoint), the URL is one click from the Actions UI, and the +`vally-history` branch is plain git history a dashboard can poll later. +PR-gate uploads artifacts but skips the CSV append + history push — +history is a nightly concern. Promote to Azure Storage static site only +if the artifact-hop UX becomes a blocker. 
-#### (c) Future: shared dashboard +#### (d) Future: shared dashboard -Once (a) and (b) are stable, the CSV can feed a Power BI / Kusto dashboard +CSV in the `vally-history` branch can feed a Power BI / Kusto dashboard that lives outside this repo. Out of scope for v1. ### 5.3 Pipeline ``` -vally eval ──> results.jsonl ─┬─> Export-VallyResultsCsv.ps1 ──> history.csv ──> dashboard +vally eval ──> results.jsonl ─┬─> Render-VallyTrajectory.ps1 ──> *.html ──> artifact │ - └─> Render-VallyTrajectory.ps1 ──> *.html ──> artifact + link from csv + └─> Export-VallyResultsCsv.ps1 ──> history.csv ──> vally-history branch ──> (future) dashboard ``` Both scripts read `results.jsonl` only — no Vally-side changes required. @@ -602,37 +605,29 @@ PR-gate suite. No way to silently land a slow scenario. ### 6.4 General guardrails (framework-level, not per-scenario) These apply to every scenario the runner executes. None require the -scenario author to know about them. +scenario author to know about them. Scoped to what we own in this repo — +deeper runner-level controls (hard caps, virtual clock, tool-result +truncation, cost-split reporting) need upstream Vally support and are +out of scope here. | # | Guardrail | Layer | What it prevents | |---|---|---|---| -| G1 | **Hard turn / wall / token / tool-call caps** (§6.2) | runner | Runaway scenarios | -| G2 | **Virtual clock**: executor intercepts `Start-Sleep` / `Wait-*` / `sleep` and fast-forwards | executor adapter | Wall-time waste on polling loops | -| G3 | **Tool-result truncation** above N tokens with `…[truncated]` marker | executor adapter | Context blow-up from chatty tool responses | -| G4 | **Narration / meta-tool suppression**: tools that only echo intent (`report_intent` etc.) 
stripped from the tool list the model sees | executor config | Doubled turn count from pure-narration calls | -| G5 | **Polling tools default to terminal state under `AZSDKTOOLS_AGENT_TESTING=true`** (`*_get_*_status` returns `Succeeded` on first poll) | mock MCP policy | Any future polling tool inherits the fix | -| G6 | **Cheaper judge model** — LLM-judge graders default to a smaller model than the agent | runner config | Judge tokens dominating output cost | -| G7 | **CI concurrency cancel** — superseded PR runs killed immediately | CI workflow | Wasted compute on rapid pushes | -| G8 | **Honest cost reporting** — `eval-results.md` splits cached vs billable input and wall time into LLM / tool / wait | results renderer | Headline-token illusions hiding real cost | -| G9 | **Suite-level cost ceiling** — if any single scenario exceeds 25% of its suite's total budget, suite run fails with a "rebalance" error | runner | One scenario silently dominating the suite | - -G2, G3, G4, G6 also have the effect of making per-scenario budgets -*achievable*. Without them, an honest scenario can blow past `maxTurns` -just by being routed through a chatty executor. 
+| G1 | **Polling tools default to terminal state under `AZSDKTOOLS_AGENT_TESTING=true`** (`*_get_*_status` returns `Succeeded` on first poll) | mock MCP policy | Wall-time waste on polling loops; any future polling tool inherits the fix | +| G2 | **Cheaper judge model** — LLM-judge graders default to a smaller model than the agent | runner config (`.vally.yaml`) | Judge tokens dominating output cost | +| G3 | **CI concurrency cancel** — superseded PR runs killed immediately | CI workflow | Wasted compute on rapid pushes | ### 6.5 Where each guardrail lives | Guardrail | Owner | |---|---| -| G1, G2, G3, G8, G9 | **Upstream Vally** — file as feature requests | -| G4, G6 | Copilot SDK executor / `.vally.yaml` runner config in this repo | -| G5 | `Azure.Sdk.Tools.Mock` in this repo | -| G7 | `.github/workflows/skill-eval.yml` in this repo | +| G1 | `Azure.Sdk.Tools.Mock` in this repo | +| G2 | `.vally.yaml` runner config in this repo | +| G3 | `.github/workflows/skill-eval.yml` in this repo | -The local guardrails (G4–G7) can land immediately. The upstream -guardrails (G1–G3, G8, G9) are blocked on Vally; until they ship, we -approximate G1 with a thin post-run check that reads `results.jsonl` -and fails the CI step if any scenario exceeded its declared budget. +All three can land immediately — no upstream Vally dependency. Per-scenario +budgets (§6.2) are enforced today via a thin post-run check that reads +`results.jsonl` and fails the CI step if any scenario exceeded its +declared limits. ### 6.6 Author-facing rule of thumb @@ -644,45 +639,3 @@ The budget machinery exists so authors don't need to read this document to write a cheap eval. They write the scenario; the runner fails it if it costs too much; the CI message points them at the mock path. ---- - -## 7. Open design questions - -1. **Mock auto-generation.** Replace hand-written handlers with a single - generic handler that synthesizes a response from each tool's JSON - schema. 
- - **How.** `Azure.Sdk.Tools.Mock` starts up, calls the live MCP's - `tools/list` once at boot (or reads a checked-in snapshot of it), and - for every tool registers a fallback handler that: - - 1. Validates incoming args against `inputSchema`. - 2. Walks `outputSchema` (or the `result` JSON Schema) and emits a - default-value tree: `string` → `"mock-"`, `integer` → `0`, - `array` → `[]`, `object` → recurse, `$ref` → resolve. For ID-shaped - fields (e.g. `workItemId`), return a deterministic counter so - multi-step scenarios can chain (`create` returns `1538`, next - `get(1538)` returns the same shape). - 3. Hand-written handlers in `Handlers/` still win when present — they - override the generated default for the few tools whose realistic - response shape matters (e.g. `azsdk_get_pipeline_status` needs - a believable build-status sequence). - - **Trade-off.** Solves drift (any new tool gets a mock for free) but - default-value responses miss domain quirks (real pipeline status - transitions, real PR URLs). Mitigation: keep hand-written handlers - for the ≥5 tools whose responses scenarios actually assert on. - - **Defer until.** §4 manual coverage gap is closed (so we know which - tools actually need realistic shapes vs. which can take defaults). -2. **CSV storage.** Per-branch artifact (cheap, no infra), commit to a - `vally-history` branch (versioned, awkward), or push to Azure Storage - (best UX, needs infra). Default plan: artifact + Storage upload from - nightly only. -3. **Cross-org repo cache in CI.** `actions/cache` keyed on the hash of - `metadata.repos` across live-safe scenarios is fine for - `azure-rest-api-specs`, but the pull from GitHub still costs ~30s on - cache miss. For 5 language repos + specs, cold-start could approach 3 - min. Worth it vs. an Azure-hosted pre-baked image? Defer until we have - data. 
- From 3a8d6090bb576475a56d05885fa4207adf5b3910 Mon Sep 17 00:00:00 2001 From: helen229 Date: Wed, 3 Jun 2026 13:55:11 -0700 Subject: [PATCH 13/24] update with skill evals --- .../azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md index e97e95d54e6..394462db76a 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md @@ -147,7 +147,27 @@ graders: | Mock vs. live opt-in works for skill evals | ✓ | ✓ (env defined here, referenced from skill eval) | | New contributor adding cross-skill scenario | unclear (which tag?) | `evals/scenarios/` here | - +#### Per-skill eval suite — current state & direction + +The per-skill suite at `.github/skills/<skill>/evals/` (triggered by +[`skill-eval.yml`](../../../.github/workflows/skill-eval.yml)) predates +this project. Captured here so the relationship is explicit. + +*Today.* Roughly a dozen skills have eval files; `azsdk-common-api-review` +has none, `sensei` is missing `scoring.threshold` (passes vacuously), and +two skills ship capability tests without a trigger file. Layout and env +wiring are correct, but most capability stimuli are graded only by a +single `output-contains` substring — they pass whether the agent called +the right tool, the wrong tool, or just echoed the prompt. CI is green +by construction. + +*Direction.* Raise the bar on what counts as a per-skill eval: adopt the +four-layer pattern — `skill-invocation` + `tool-calls` + `output-matches` +(structural regex, not single substrings) + optional `prompt` LLM-judge — +as the required shape for every capability stimulus. A +`skill-eval-authoring` skill packages the pattern, grader catalog, and +anti-patterns so other Azure SDK teams adopt without re-learning the +gotchas.
### 1.3 Folder → suite → trigger mapping From b7005b2d1c4a9025eced821208e929ec91ee65e7 Mon Sep 17 00:00:00 2001 From: helen229 Date: Wed, 3 Jun 2026 15:07:28 -0700 Subject: [PATCH 14/24] reorg based on the design --- .../Azure.Sdk.Tools.Vally/.vally.yaml | 67 +++++--- .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 60 ++++--- .../check-public-repo-then-validate.eval.yaml | 3 +- .../live/release-planner.eval.yaml} | 12 +- .../check-public-repo-then-validate.eval.yaml | 43 +++++ .../mock}/rename-client-property.eval.yaml | 3 +- .../typespec-generation-step02.eval.yaml | 3 +- .../evals/scenarios/release-planner.eval.yaml | 154 ++++++++++++++++++ .../rename-client-property.eval.yaml | 43 +++++ .../typespec-generation-step02.eval.yaml | 42 +++++ .../evals/unit/add-arm-resource.eval.yaml | 2 +- .../evals/unit/check-public-repo.eval.yaml | 2 +- .../check-sdk-generation-status.eval.yaml | 2 +- .../evals/unit/create-release-plan.eval.yaml | 2 +- .../get-modified-typespec-projects.eval.yaml | 2 +- .../unit/get-pr-link-current-branch.eval.yaml | 2 +- .../link-namespace-approval-issue.eval.yaml | 2 +- .../evals/unit/triggers-apiview.eval.yaml | 4 +- .../evals/unit/triggers-config.eval.yaml | 4 +- .../evals/unit/triggers-engsys.eval.yaml | 4 +- .../evals/unit/triggers-github.eval.yaml | 4 +- .../evals/unit/triggers-package.eval.yaml | 4 +- .../evals/unit/triggers-pipeline.eval.yaml | 4 +- .../evals/unit/triggers-releaseplan.eval.yaml | 4 +- .../evals/unit/triggers-typespec.eval.yaml | 4 +- .../evals/unit/triggers-verify.eval.yaml | 4 +- .../evals/unit/validate-typespec.eval.yaml | 2 +- 27 files changed, 397 insertions(+), 85 deletions(-) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{integration => scenarios}/check-public-repo-then-validate.eval.yaml (96%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{e2e/release-planner-e2e.eval.yaml => scenarios/live/release-planner.eval.yaml} (95%) create mode 100644 
tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{integration => scenarios/mock}/rename-client-property.eval.yaml (96%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{integration => scenarios/mock}/typespec-generation-step02.eval.yaml (95%) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/release-planner.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index e4e9150f15d..10b71f0a969 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -11,7 +11,28 @@ paths: results: results/ environments: - azsdk-mcp: + # Default for unit + mock scenarios. Runs the dedicated Azure.Sdk.Tools.Mock + # MCP server — a separate process whose tool surface mirrors the real CLI + # but with deterministic in-memory responses. + # + # Relative `--project` paths are resolved by `dotnet` against the cwd of + # the vally invocation. Always run vally from this directory: + # cd tools/azsdk-cli/Azure.Sdk.Tools.Vally && vally eval ... + # Same convention as .github/skills/.vally.yaml. + azsdk-mcp-mock: + mcpServers: + azure-sdk-mcp: + type: stdio + command: dotnet + args: ["run", "--project", "../Azure.Sdk.Tools.Mock"] + timeout: "60s" + + # Live MCP — real Azure.Sdk.Tools.Cli against real DevOps (test area path), + # real GitHub, real pipelines. AZSDKTOOLS_AGENT_TESTING=true keeps the + # handful of write tools (e.g. create_release_plan) inside the test area. + # Bound only by scenarios under evals/scenarios/live/ and selected by the + # `scenarios-live` / `nightly` suites. 
+ azsdk-mcp-live: mcpServers: azure-sdk-mcp: type: stdio @@ -19,42 +40,44 @@ environments: args: ["run", "--project", "../Azure.Sdk.Tools.Cli", "--", "start"] timeout: "5m" env: - # Test mode: tools that would create real ADO work items / external - # resources (e.g. azsdk_create_release_plan) short-circuit into a - # test variant so evals are safe to re-run. AZSDKTOOLS_AGENT_TESTING: "true" AZSDKTOOLS_COLLECT_TELEMETRY: "false" # Suites group evals for selective execution. # -# Folder layout mirrors the standard test pyramid: -# evals/unit/ — single-tool, hermetic, fast (incl. per-tool triggers) -# evals/integration/ — multi-tool chains, still hermetic -# evals/e2e/ — live MCP + real git env + skills loaded (slow) -# -# Folder = tier (cost / cadence). Tags inside each YAML carry the feature -# area (release-plan, typespec, pipeline, github, …) so cross-cuts are -# selected via `filter:` below or `vally eval --tag area=`. +# Layout maps directly to suites — no tag-based mock/live filtering. Vally's +# suite filter is positive-match only (AND across keys, OR within values), +# so subfolders are the cleanest way to split mock vs live. See +# https://github.com/microsoft/vally suite-filter source. suites: # ---- by tier ---- unit: - description: Hermetic single-tool evals (incl. per-tool trigger coverage). Fast; safe for PR gate. + description: | + Hermetic single-tool / trigger evals. No external I/O. Fast; the + foundation of the PR gate. evals: ["evals/unit/*.eval.yaml"] - integration: - description: Multi-tool chained evals; hermetic. Still suitable for PR gate. - evals: ["evals/integration/*.eval.yaml"] - e2e: - description: Live end-to-end against real MCP + real azure-rest-api-specs clone. Prime the clone first with evals/setup/ensure-specs-clone.ps1. - evals: ["evals/e2e/*.eval.yaml"] + + scenarios-mock: + description: | + Multi-tool scenarios against the mock MCP environment. Hermetic; safe + for PR gate. 
+ evals: ["evals/scenarios/mock/*.eval.yaml"] + + scenarios-live: + description: | + Scenarios against live MCP — real DevOps / GitHub / pipelines. Slow; + nightly only. Prime any required clones first via + `evals/setup/ensure-specs-clone.ps1`. + evals: ["evals/scenarios/live/*.eval.yaml"] # ---- composite suites ---- pr-gate: - description: Fast tiers only (unit + integration). Target for CI PR check. + description: Hermetic tiers only (unit + scenarios-mock). Target for CI PR check. evals: - "evals/unit/*.eval.yaml" - - "evals/integration/*.eval.yaml" + - "evals/scenarios/mock/*.eval.yaml" nightly: - description: All tiers including live e2e. + description: All tiers including live scenarios. evals: ["evals/**/*.eval.yaml"] # ---- by feature area (tag-filtered) ---- diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index 651f2d031d0..0af1fa8cfd3 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -68,23 +68,22 @@ One prompt → one expected MCP tool. No `environment.git`, no fixtures. Fast; s The companion [`scripts/Validate-EvalTools.ps1`](scripts/Validate-EvalTools.ps1) cross-checks that every tool referenced in `evals/unit/triggers-*.eval.yaml` exists on the running MCP server, and every server tool has at least one trigger. -#### `evals/integration/` — multi-tool chained evals (3) +#### `evals/scenarios/` — multi-tool scenarios (4) -Still hermetic (no `environment.git`), but the agent must invoke 2+ MCP tools in sequence. +Multi-step prompts that exercise 2+ MCP tools end-to-end. Split into +`mock/` (hermetic, runs on PR gate) and `live/` (real DevOps / GitHub / +pipelines, runs nightly). 
-| Scenario | Area | Shape | -|---|---|---| -| [`check-public-repo-then-validate`](evals/integration/check-public-repo-then-validate.eval.yaml) | typespec | Validate, then check public-repo presence | -| [`typespec-generation-step02`](evals/integration/typespec-generation-step02.eval.yaml) | typespec | Step in the spec-PR generation flow | -| [`rename-client-property`](evals/integration/rename-client-property.eval.yaml) | typespec | Stub — needs `expected-diff` grader + sparse clone | - -#### `evals/e2e/` — live end-to-end (1) +| Scenario | Area | Mode | Shape | +|---|---|---|---| +| [`check-public-repo-then-validate`](evals/scenarios/mock/check-public-repo-then-validate.eval.yaml) | typespec | mock | Validate, then check public-repo presence | +| [`typespec-generation-step02`](evals/scenarios/mock/typespec-generation-step02.eval.yaml) | typespec | mock | Step in the spec-PR generation flow | +| [`rename-client-property`](evals/scenarios/mock/rename-client-property.eval.yaml) | typespec | mock | Stub — needs `expected-diff` grader + sparse clone | +| [`release-planner`](evals/scenarios/live/release-planner.eval.yaml) | release-plan | **live** | Create + re-fetch a release plan, kick off SDK gen, link PR back — real DevOps test-area writes | -Drives the real MCP server inside a real `azure-rest-api-specs` worktree. Slow; prime a per-user clone first via [`evals/setup/ensure-specs-clone.ps1`](evals/setup/ensure-specs-clone.ps1) (auto-refreshes every 24h). - -| Scenario | Area | Shape | -|---|---|---| -| [`release-planner-e2e`](evals/e2e/release-planner-e2e.eval.yaml) | release-plan | Create then re-fetch a release plan; real DevOps test-area writes | +Live scenarios need a primed `azure-rest-api-specs` clone — run +[`evals/setup/ensure-specs-clone.ps1`](evals/setup/ensure-specs-clone.ps1) +(auto-refreshes every 24h) before invoking the `scenarios-live` / `nightly` suite. 
**Skill evals (already in repo, *not* part of this PR)** — for reference: @@ -112,10 +111,12 @@ tracks the migration in Azure.Sdk.Tools.Vally/ ├── .vally.yaml # Vally config (environments + suites) ├── evals/ -│ ├── unit/ # tier 1: single-tool, hermetic, fast -│ ├── integration/ # tier 2: multi-tool chains, hermetic -│ ├── e2e/ # tier 3: live MCP + real azure-rest-api-specs -│ └── setup/ # helper scripts (e.g. ensure-specs-clone.ps1) +│ ├── unit/ # tool-shape + per-skill trigger evals, hermetic +│ ├── scenarios/ +│ │ ├── mock/ # multi-tool scenarios, hermetic (PR gate) +│ │ └── live/ # multi-tool scenarios, live MCP (nightly) +│ ├── setup/ # helper scripts (e.g. ensure-specs-clone.ps1) +│ └── fixtures/ # (future) pinned SHAs + per-eval mocks ├── fixtures/ # Per-scenario static input files (env.files) │ └── <scenario>/... ├── scripts/ # Repo-side helpers (Validate-EvalTools.ps1, …) @@ -123,9 +124,13 @@ Azure.Sdk.Tools.Vally/ └── Azure.Sdk.Tools.Vally.csproj # added when first custom grader lands ``` -Folder = test pyramid tier (cost / CI cadence). Feature **area** lives as a -`tags:` entry inside each YAML so cross-cuts (e.g. “all release-plan evals”) -select via [`.vally.yaml`](.vally.yaml) suite filters or `vally eval --tag`. +Folder = tier (cost / CI cadence): `unit/` is hermetic + fast, +`scenarios/mock/` is multi-tool hermetic, `scenarios/live/` is multi-tool +against real services. Vally's suite filter is positive-match only, so the +mock-vs-live split lives on disk rather than in tags. Feature **area** still +lives as a `tags:` entry inside each YAML so cross-cuts (e.g. "all +release-plan evals") select via [`.vally.yaml`](.vally.yaml) suite filters +or `vally eval --tag`.
## Running locally @@ -151,7 +156,7 @@ $vally = '../../../eng/skill-eval/node_modules/.bin/vally.cmd' # A single tier & $vally eval --suite unit -& $vally eval --suite integration +& $vally eval --suite scenarios-mock # By feature area (cross-cuts tiers via tag filter) & $vally eval --suite release-plan @@ -164,27 +169,28 @@ Run a single eval: & $vally eval --eval-spec evals/unit/check-public-repo.eval.yaml ``` -Run the live e2e tier (first, prime a per-user clone of +Run the live scenarios tier (first, prime a per-user clone of `azure-rest-api-specs`; the helper refreshes it every 24h): ```powershell ./evals/setup/ensure-specs-clone.ps1 -& $vally eval --suite e2e +& $vally eval --suite scenarios-live ``` ## Adding a new scenario 1. **Pick a tier** — the folder you drop the YAML into: - `evals/unit/` — one prompt, one MCP tool, no environment hooks. - - `evals/integration/` — multi-tool flow, still hermetic. - - `evals/e2e/` — needs live MCP + a real git env / external service. + - `evals/scenarios/mock/` — multi-tool flow against `azsdk-mcp-mock`. + Hermetic; runs on PR gate. + - `evals/scenarios/live/` — needs real DevOps / GitHub / pipelines; + bind `environment: azsdk-mcp-live`. Nightly only. 2. Pick a short, kebab-case name (e.g. `create-release-plan`). 3. Create `evals/<tier>/<name>.eval.yaml`. Start from a sibling in the same tier as a template. 4. **Tag it** so suite filters pick it up: ```yaml tags: - tier: unit # or integration / e2e area: release-plan # or typespec / pipeline / github / engsys / apiview / package ``` 5.
If the scenario needs input files, add them under diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml similarity index 96% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/check-public-repo-then-validate.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml index bac7328999c..26948f38aa5 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/check-public-repo-then-validate.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml @@ -7,10 +7,9 @@ type: capability tags: - tier: integration area: typespec -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/live/release-planner.eval.yaml similarity index 95% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/live/release-planner.eval.yaml index fe18eb9d814..1c709aa5bbb 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/e2e/release-planner-e2e.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/live/release-planner.eval.yaml @@ -32,12 +32,16 @@ description: | version: "1.0" type: capability -# Tagged "tier: e2e" so a future PR-gate run can skip live evals if needed. tags: - tier: e2e area: release-plan - -environment: azsdk-mcp +metadata: + repos: + - name: Azure/azure-rest-api-specs + +# Bound to the live env because the scenario asserts real DevOps writes +# (test-area path) + real generation pipeline. Mock can't satisfy those +# graders. Picked up by the `scenarios-live` / `nightly` suite via folder. 
+environment: azsdk-mcp-live config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml new file mode 100644 index 00000000000..26948f38aa5 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml @@ -0,0 +1,43 @@ +name: azsdk-mcp-tool-scenarios +description: | + Validate-then-check-public-repo: the agent should run TypeSpec validation, + then check if the project is in the public repo. +version: "1.0" +type: capability + + +tags: + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: validate-then-check-public-repo + prompt: | + Run TypeSpec validation, then check if the project is in the public repo. + Project path: specification/contosowidgetmanager/Contoso.WidgetManager. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 8 + max_tokens: 8000 + # TODO: assert ordering (validate before check) — blocked on https://github.com/Azure/azure-sdk-tools/issues/15832 (Vally tool-calls grader needs sequence:). 
+ graders: + - type: tool-calls + config: + required: + - azsdk_run_typespec_validation + - azsdk_typespec_check_project_in_public_repo + disallowed: + - azsdk_verify_setup + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml similarity index 96% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/rename-client-property.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml index f07a2615f90..8b17d0ae161 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/rename-client-property.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml @@ -8,10 +8,9 @@ type: capability tags: - tier: integration area: typespec -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml similarity index 95% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/typespec-generation-step02.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml index 5f149d3c1a1..37ea805d290 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/integration/typespec-generation-step02.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml @@ -7,10 +7,9 @@ type: capability tags: - tier: integration area: typespec -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/release-planner.eval.yaml 
b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/release-planner.eval.yaml new file mode 100644 index 00000000000..dd2c42bb2be --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/release-planner.eval.yaml @@ -0,0 +1,154 @@ +name: azsdk-mcp-tool-scenarios +description: | + Live end-to-end demo for the full release-planner -> generate-SDK flow. + + Drives the *real* azsdk-cli MCP server against real DevOps APIs, inside a + real git worktree of azure-rest-api-specs. The MCP server runs with + AZSDKTOOLS_AGENT_TESTING=true (set globally in .vally.yaml), so work items + route to the DevOps test area path and are safe to leave around / re-run. + + This scenario walks the agent through a multi-step chain that exercises + multiple skills back-to-back in a single conversation: + + 1. Release-plan skill -> azsdk_create_release_plan, azsdk_get_release_plan + 2. Generate-SDK skill -> azsdk_run_generate_sdk + 3. Release-plan skill -> azsdk_link_sdk_pull_request_to_release_plan + + The goal is to verify Vally end-to-end (live agent + live MCP + live DevOps) + can: + - route each turn to the correct skill, + - call the correct tool on that skill, + - and do so in the expected order across multiple steps. + + Demonstrates Vally's environment.git fixture hook + live MCP + Copilot SDK + executor + real DevOps in one shot. + + Prerequisite: a clone of Azure/azure-rest-api-specs at the path referenced + by environment.git.source below. Locally, run + evals/setup/ensure-specs-clone.ps1 to prime a per-user cache + (auto-refresh every 24h) at the path this source points at. CI should + clone the repo as a pipeline checkout step instead. + +version: "1.0" +type: capability + +# Tagged live-safe so this scenario ALSO runs against live MCP nightly +# (see DESIGN.md §3). Without the tag it would only run on mock. 
+tags: + live-safe: "true" + area: release-plan +metadata: + repos: + - name: Azure/azure-rest-api-specs + +# Bound to the live env because the scenario asserts real DevOps writes +# (test-area path) + real generation pipeline. Mock can't satisfy those +# graders. Pick this up via the `scenarios-live` / `nightly` suite. +environment: azsdk-mcp-live + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: release-planner-e2e + environment: + # Source is the per-user cache populated by evals/setup/ensure-specs-clone.ps1 + # (idempotent shallow+sparse clone, auto-refresh every 24h). + # NOTE: hardcoded absolute path — Vally does not currently expand + # ${USERPROFILE} / env vars in env.git.source. Adjust per machine + # or replace with a CI-provided path. See upstream issue: + # https://github.com/microsoft/vally/issues (TODO: file env-var expansion) + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main + prompt: | + I'm in a checkout of azure-rest-api-specs. Walk me through the full + release-plan + SDK-generation flow for the Contoso Widget Manager + end-to-end. Do every step below, in order, and use real tools (no + dry-run, no simulation): + + 1. Create a release plan using: + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f" + - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e" + - target release timeline: "December 2026" + - API version: "2022-11-01-preview" + - SDK release type: "beta" + - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" + + 2. Fetch the release plan you just created back from DevOps to confirm + it was saved, and tell me its work-item ID. + + 3. Kick off SDK generation for that same TypeSpec project via the + generation pipeline (Python SDK is fine). Use the work-item ID + from step 2. + + 4. 
Once the generation pipeline reports a pull request URL, link + that SDK pull request back to the release plan from step 2. + + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 20 + max_tokens: 30000 + # TODO: assert strict ordering create -> get -> generate -> link + # — blocked on https://github.com/microsoft/vally/issues/453 (tool-calls grader sequence:). + # TODO: assert args (serviceTreeId / productTreeId / typeSpecProjectPath / workItemId) + # — blocked on https://github.com/microsoft/vally/issues/454 (tool-calls grader generic args:). + # TODO: add `azsdk-common-generate-sdk-locally` (or the equivalent pipeline- + # driven skill) to skill-invocation `required` once a skill that owns + # `azsdk_run_generate_sdk` is registered. Today the only skill that + # declares any of the tools in this scenario is azsdk-common-prepare-release-plan. + graders: + # 1. Skill-routing check (FIRST — fast, deterministic, free): did the + # agent dispatch to the right skill at all? If this fails, the + # tool-calls grader below is meaningless. + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + # 2. Tool-call check: given the right skill was loaded, did it call + # the right MCP tools? Each tool here is owned by the skill above + # except azsdk_run_generate_sdk (see TODO). + - type: tool-calls + config: + required: + - azsdk_create_release_plan + - azsdk_get_release_plan + - azsdk_run_generate_sdk + - azsdk_link_sdk_pull_request_to_release_plan + disallowed: + - azsdk_verify_setup + # 3. Final-answer correctness (LLM-judged): the deterministic graders + # above only verify the agent *did* the right things, not that it + # *reported* them back to the user correctly. Tools can fire + # successfully while the final message hallucinates IDs / URLs. + # This grader uses gpt-5.4 as judge against a free-form rubric so + # minor wording variants (`WI 29262`, `work-item #29262`) all pass. 
+ - type: prompt + config: + model: gpt-5.4 + rubric: | + Did the final assistant message clearly state BOTH of the + following, consistent with the tools that were actually called? + + 1. A numeric DevOps work-item ID for the release plan that was + created (or confirmed). Any unambiguous format is fine + (e.g. "work item 29262", "WI #29262", "/_workitems/edit/29262"). + + 2. A GitHub pull request URL on + github.com/Azure/azure-sdk-for-* that was linked back to + that release plan. + + Answer "pass" only if BOTH are present. Otherwise answer "fail" + and briefly say which one is missing. + +scoring: + weights: + skill-invocation: 1 + tool-calls: 1 + prompt: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml new file mode 100644 index 00000000000..8b17d0ae161 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml @@ -0,0 +1,43 @@ +name: azsdk-mcp-tool-scenarios +description: | + Rename-client-property: the agent should rename @clientName("uri", "csharp") + to @clientName("imageUri", "csharp") on the AddFaceFromUrlRequest.url + property in specification/ai/Face/models.common.tsp. +version: "1.0" +type: capability + + +tags: + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: rename-client-property + prompt: | + In the specification/ai/Face project, find the AddFaceFromUrlRequest model. + It has a property called 'url' that's been renamed to "uri" in c#. + Change that to imageUri for c#. + constraints: + max_turns: 5 + max_tokens: 5000 + # TODO: seed a git worktree (environment.git) at specification/ai/Face and + # add a `file-matches` grader on models.common.tsp to verify the + # @clientName("uri", "csharp") → @clientName("imageUri", "csharp") rename. 
+ graders: + - type: tool-calls + config: + required: + - edit + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml new file mode 100644 index 00000000000..37ea805d290 --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml @@ -0,0 +1,42 @@ +name: azsdk-mcp-tool-scenarios +description: | + TypeSpec generation workflow step 2: the agent should check whether the + project is in the public repo as part of the validation step. +version: "1.0" +type: capability + + +tags: + area: typespec + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + - name: typespec-generation-step02-validation + prompt: | + I'm working on the TypeSpec generation workflow. I need to validate my TypeSpec project + as part of step 2. Please check if my TypeSpec project is in the public repo. + The project is at specification/contosowidgetmanager/Contoso.WidgetManager. + My setup has already been verified, do not run azsdk_verify_setup. 
+ constraints: + max_turns: 5 + max_tokens: 5000 + graders: + - type: tool-calls + config: + required: + - azsdk_typespec_check_project_in_public_repo + disallowed: + - azsdk_verify_setup + +scoring: + weights: + tool-calls: 1 + threshold: 1.0 + diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml index 13485b27443..fc96cc2be73 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml @@ -11,7 +11,7 @@ tags: tier: unit area: typespec -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml index 02cff88da23..51e2ec3d129 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml @@ -11,7 +11,7 @@ tags: tier: unit area: typespec -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml index 3a4e9ef2998..714d31cd732 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml @@ -10,7 +10,7 @@ tags: tier: unit area: pipeline -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml index df24807e604..7b4a25f0725 100644 --- 
a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml @@ -34,7 +34,7 @@ tags: # `environment: azsdk-mcp` refers to the named environment defined in # ../../.vally.yaml (configures the azsdk-cli MCP server + env vars). -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 # bump for flakiness sampling (e.g. runs: 5) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml index 6eb37e87fd3..b62e3536c8b 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml @@ -11,7 +11,7 @@ tags: tier: unit area: typespec -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml index 254a59f9897..bb61cd1f5c3 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml @@ -11,7 +11,7 @@ tags: tier: unit area: github -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml index b9c869f6f2b..aeddc254dd0 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml @@ -10,7 +10,7 @@ tags: tier: unit area: release-plan 
-environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml index 7c89923f269..a1de2a6541f 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml @@ -5,11 +5,11 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml index 1a3e9d0af04..d03ba74ad0a 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml @@ -5,11 +5,11 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml index 74a70bb1285..5ee6b9c584b 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml @@ -5,11 +5,11 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml index 7d43f138e56..50ed5954c62 
100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml @@ -5,7 +5,7 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock tags: tier: unit @@ -14,7 +14,7 @@ tags: config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml index 834ea8ac45d..e2d3217175a 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml @@ -5,7 +5,7 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock tags: tier: unit @@ -14,7 +14,7 @@ tags: config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml index 196adea711c..5e283686437 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml @@ -5,7 +5,7 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock tags: tier: unit @@ -14,7 +14,7 @@ tags: config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml index 635c99e4740..1c8f9277ede 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml +++ 
b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml @@ -5,7 +5,7 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock tags: tier: unit @@ -14,7 +14,7 @@ tags: config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml index f596187b8bb..63e4b9182f3 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml @@ -5,11 +5,11 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml index a5a62ba0e33..24e43a38451 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml @@ -5,11 +5,11 @@ description: | version: "1.0" type: capability -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 5 - timeout: 120 + timeout: "120s" executor: copilot-sdk model: claude-opus-4.6 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml index 3649a637745..75993329a60 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml @@ -10,7 +10,7 @@ tags: tier: unit area: typespec -environment: azsdk-mcp +environment: azsdk-mcp-mock config: runs: 1 From 
6db7c5fc1f7a83291049ac366b60524a386178dc Mon Sep 17 00:00:00 2001 From: helen229 Date: Wed, 3 Jun 2026 15:14:08 -0700 Subject: [PATCH 15/24] remove the duplicates --- .../check-public-repo-then-validate.eval.yaml | 43 ----- .../evals/scenarios/release-planner.eval.yaml | 154 ------------------ .../rename-client-property.eval.yaml | 43 ----- .../typespec-generation-step02.eval.yaml | 42 ----- 4 files changed, 282 deletions(-) delete mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml delete mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/release-planner.eval.yaml delete mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml delete mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml deleted file mode 100644 index 26948f38aa5..00000000000 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/check-public-repo-then-validate.eval.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: azsdk-mcp-tool-scenarios -description: | - Validate-then-check-public-repo: the agent should run TypeSpec validation, - then check if the project is in the public repo. -version: "1.0" -type: capability - - -tags: - area: typespec - -environment: azsdk-mcp-mock - -config: - runs: 1 - timeout: 30m - model: gpt-5.4 - executor: copilot-sdk - -stimuli: - - name: validate-then-check-public-repo - prompt: | - Run TypeSpec validation, then check if the project is in the public repo. - Project path: specification/contosowidgetmanager/Contoso.WidgetManager. - My setup has already been verified, do not run azsdk_verify_setup. 
- constraints: - max_turns: 8 - max_tokens: 8000 - # TODO: assert ordering (validate before check) — blocked on https://github.com/Azure/azure-sdk-tools/issues/15832 (Vally tool-calls grader needs sequence:). - graders: - - type: tool-calls - config: - required: - - azsdk_run_typespec_validation - - azsdk_typespec_check_project_in_public_repo - disallowed: - - azsdk_verify_setup - -scoring: - weights: - tool-calls: 1 - threshold: 1.0 - diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/release-planner.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/release-planner.eval.yaml deleted file mode 100644 index dd2c42bb2be..00000000000 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/release-planner.eval.yaml +++ /dev/null @@ -1,154 +0,0 @@ -name: azsdk-mcp-tool-scenarios -description: | - Live end-to-end demo for the full release-planner -> generate-SDK flow. - - Drives the *real* azsdk-cli MCP server against real DevOps APIs, inside a - real git worktree of azure-rest-api-specs. The MCP server runs with - AZSDKTOOLS_AGENT_TESTING=true (set globally in .vally.yaml), so work items - route to the DevOps test area path and are safe to leave around / re-run. - - This scenario walks the agent through a multi-step chain that exercises - multiple skills back-to-back in a single conversation: - - 1. Release-plan skill -> azsdk_create_release_plan, azsdk_get_release_plan - 2. Generate-SDK skill -> azsdk_run_generate_sdk - 3. Release-plan skill -> azsdk_link_sdk_pull_request_to_release_plan - - The goal is to verify Vally end-to-end (live agent + live MCP + live DevOps) - can: - - route each turn to the correct skill, - - call the correct tool on that skill, - - and do so in the expected order across multiple steps. - - Demonstrates Vally's environment.git fixture hook + live MCP + Copilot SDK - executor + real DevOps in one shot. 
- - Prerequisite: a clone of Azure/azure-rest-api-specs at the path referenced - by environment.git.source below. Locally, run - evals/setup/ensure-specs-clone.ps1 to prime a per-user cache - (auto-refresh every 24h) at the path this source points at. CI should - clone the repo as a pipeline checkout step instead. - -version: "1.0" -type: capability - -# Tagged live-safe so this scenario ALSO runs against live MCP nightly -# (see DESIGN.md §3). Without the tag it would only run on mock. -tags: - live-safe: "true" - area: release-plan -metadata: - repos: - - name: Azure/azure-rest-api-specs - -# Bound to the live env because the scenario asserts real DevOps writes -# (test-area path) + real generation pipeline. Mock can't satisfy those -# graders. Pick this up via the `scenarios-live` / `nightly` suite. -environment: azsdk-mcp-live - -config: - runs: 1 - timeout: 30m - model: gpt-5.4 - executor: copilot-sdk - -stimuli: - - name: release-planner-e2e - environment: - # Source is the per-user cache populated by evals/setup/ensure-specs-clone.ps1 - # (idempotent shallow+sparse clone, auto-refresh every 24h). - # NOTE: hardcoded absolute path — Vally does not currently expand - # ${USERPROFILE} / env vars in env.git.source. Adjust per machine - # or replace with a CI-provided path. See upstream issue: - # https://github.com/microsoft/vally/issues (TODO: file env-var expansion) - git: - type: worktree - source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs - ref: main - prompt: | - I'm in a checkout of azure-rest-api-specs. Walk me through the full - release-plan + SDK-generation flow for the Contoso Widget Manager - end-to-end. Do every step below, in order, and use real tools (no - dry-run, no simulation): - - 1. 
Create a release plan using: - - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" - - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f" - - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e" - - target release timeline: "December 2026" - - API version: "2022-11-01-preview" - - SDK release type: "beta" - - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" - - 2. Fetch the release plan you just created back from DevOps to confirm - it was saved, and tell me its work-item ID. - - 3. Kick off SDK generation for that same TypeSpec project via the - generation pipeline (Python SDK is fine). Use the work-item ID - from step 2. - - 4. Once the generation pipeline reports a pull request URL, link - that SDK pull request back to the release plan from step 2. - - My setup has already been verified, do not run azsdk_verify_setup. - constraints: - max_turns: 20 - max_tokens: 30000 - # TODO: assert strict ordering create -> get -> generate -> link - # — blocked on https://github.com/microsoft/vally/issues/453 (tool-calls grader sequence:). - # TODO: assert args (serviceTreeId / productTreeId / typeSpecProjectPath / workItemId) - # — blocked on https://github.com/microsoft/vally/issues/454 (tool-calls grader generic args:). - # TODO: add `azsdk-common-generate-sdk-locally` (or the equivalent pipeline- - # driven skill) to skill-invocation `required` once a skill that owns - # `azsdk_run_generate_sdk` is registered. Today the only skill that - # declares any of the tools in this scenario is azsdk-common-prepare-release-plan. - graders: - # 1. Skill-routing check (FIRST — fast, deterministic, free): did the - # agent dispatch to the right skill at all? If this fails, the - # tool-calls grader below is meaningless. - - type: skill-invocation - config: - required: - - azsdk-common-prepare-release-plan - # 2. Tool-call check: given the right skill was loaded, did it call - # the right MCP tools? 
Each tool here is owned by the skill above - # except azsdk_run_generate_sdk (see TODO). - - type: tool-calls - config: - required: - - azsdk_create_release_plan - - azsdk_get_release_plan - - azsdk_run_generate_sdk - - azsdk_link_sdk_pull_request_to_release_plan - disallowed: - - azsdk_verify_setup - # 3. Final-answer correctness (LLM-judged): the deterministic graders - # above only verify the agent *did* the right things, not that it - # *reported* them back to the user correctly. Tools can fire - # successfully while the final message hallucinates IDs / URLs. - # This grader uses gpt-5.4 as judge against a free-form rubric so - # minor wording variants (`WI 29262`, `work-item #29262`) all pass. - - type: prompt - config: - model: gpt-5.4 - rubric: | - Did the final assistant message clearly state BOTH of the - following, consistent with the tools that were actually called? - - 1. A numeric DevOps work-item ID for the release plan that was - created (or confirmed). Any unambiguous format is fine - (e.g. "work item 29262", "WI #29262", "/_workitems/edit/29262"). - - 2. A GitHub pull request URL on - github.com/Azure/azure-sdk-for-* that was linked back to - that release plan. - - Answer "pass" only if BOTH are present. Otherwise answer "fail" - and briefly say which one is missing. 
- -scoring: - weights: - skill-invocation: 1 - tool-calls: 1 - prompt: 1 - threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml deleted file mode 100644 index 8b17d0ae161..00000000000 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/rename-client-property.eval.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: azsdk-mcp-tool-scenarios -description: | - Rename-client-property: the agent should rename @clientName("uri", "csharp") - to @clientName("imageUri", "csharp") on the AddFaceFromUrlRequest.url - property in specification/ai/Face/models.common.tsp. -version: "1.0" -type: capability - - -tags: - area: typespec - -environment: azsdk-mcp-mock - -config: - runs: 1 - timeout: 30m - model: gpt-5.4 - executor: copilot-sdk - -stimuli: - - name: rename-client-property - prompt: | - In the specification/ai/Face project, find the AddFaceFromUrlRequest model. - It has a property called 'url' that's been renamed to "uri" in c#. - Change that to imageUri for c#. - constraints: - max_turns: 5 - max_tokens: 5000 - # TODO: seed a git worktree (environment.git) at specification/ai/Face and - # add a `file-matches` grader on models.common.tsp to verify the - # @clientName("uri", "csharp") → @clientName("imageUri", "csharp") rename. 
- graders: - - type: tool-calls - config: - required: - - edit - -scoring: - weights: - tool-calls: 1 - threshold: 1.0 - diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml deleted file mode 100644 index 37ea805d290..00000000000 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/typespec-generation-step02.eval.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: azsdk-mcp-tool-scenarios -description: | - TypeSpec generation workflow step 2: the agent should check whether the - project is in the public repo as part of the validation step. -version: "1.0" -type: capability - - -tags: - area: typespec - -environment: azsdk-mcp-mock - -config: - runs: 1 - timeout: 30m - model: gpt-5.4 - executor: copilot-sdk - -stimuli: - - name: typespec-generation-step02-validation - prompt: | - I'm working on the TypeSpec generation workflow. I need to validate my TypeSpec project - as part of step 2. Please check if my TypeSpec project is in the public repo. - The project is at specification/contosowidgetmanager/Contoso.WidgetManager. - My setup has already been verified, do not run azsdk_verify_setup. 
- constraints: - max_turns: 5 - max_tokens: 5000 - graders: - - type: tool-calls - config: - required: - - azsdk_typespec_check_project_in_public_repo - disallowed: - - azsdk_verify_setup - -scoring: - weights: - tool-calls: 1 - threshold: 1.0 - From b77dccbda13c37a992ecf5cd02ad79b405c1d50a Mon Sep 17 00:00:00 2001 From: helen229 Date: Thu, 4 Jun 2026 07:33:47 -0700 Subject: [PATCH 16/24] add new scenarios --- .../check-public-repo-then-validate.eval.yaml | 5 + .../mock/release-planner-workflows.eval.yaml | 158 ++++++++++++++++++ .../mock/rename-client-property.eval.yaml | 5 + .../mock/typespec-generation-step02.eval.yaml | 5 + 4 files changed, 173 insertions(+) create mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/release-planner-workflows.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml index 26948f38aa5..95c2ea0f5f2 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml @@ -35,9 +35,14 @@ stimuli: - azsdk_typespec_check_project_in_public_repo disallowed: - azsdk_verify_setup + - type: skill-invocation + config: + required: + - azure-typespec-author scoring: weights: tool-calls: 1 + skill-invocation: 1 threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/release-planner-workflows.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/release-planner-workflows.eval.yaml new file mode 100644 index 00000000000..32e0b61ea5c --- /dev/null +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/release-planner-workflows.eval.yaml @@ -0,0 +1,158 @@ +name: azsdk-mcp-tool-scenarios +description: | + Mock-environment workflow scenarios derived from the 
release-planner + replacement test plan (#15835). Each stimulus mirrors one of the four + high-level scenarios that release-planner-dashboard must hand off to the + agent: + + 1. Create a release plan (private preview / public preview / GA) + 2. Generate SDK for all languages in an existing release plan + 3. Link a different spec PR to an existing release plan + 4. Update SDK details (package names) in a release plan + + Plus an end-to-end "create + generate" flow used as the headline demo + prompt. + + Bound to the mock MCP — these graders only inspect skill routing and tool + selection, not real DevOps writes. The full live e2e flow lives in + evals/scenarios/live/release-planner.eval.yaml. + +version: "1.0" +type: capability + +tags: + area: release-plan + +environment: azsdk-mcp-mock + +config: + runs: 1 + timeout: 30m + model: gpt-5.4 + executor: copilot-sdk + +stimuli: + # --- Scenario 1: Create release plan --------------------------------- + - name: create-public-preview-release-plan + prompt: | + Create a public preview release plan for + specification. + Target release month: June 2026. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 6 + max_tokens: 8000 + graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_create_release_plan + disallowed: + - azsdk_verify_setup + + # --- End-to-end demo prompt: create + generate ----------------------- + - name: create-release-plan-and-generate-sdk + prompt: | + Create a release plan and generate SDK for the TypeSpec project + specification + Target release month: June 2026, SDK release type beta. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 12 + max_tokens: 16000 + # TODO: assert ordering create -> get -> generate -> update-details + # — blocked on Vally tool-calls grader sequence: support. 
+ graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_create_release_plan + - azsdk_run_generate_sdk + - azsdk_update_sdk_details_in_release_plan + disallowed: + - azsdk_verify_setup + + # --- Scenario 2: Generate SDK for an existing release plan ----------- + - name: generate-sdk-for-existing-release-plan + prompt: | + Generate SDK for all languages for release plan 29262. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 8 + max_tokens: 10000 + graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_run_generate_sdk + disallowed: + - azsdk_verify_setup + - azsdk_create_release_plan + + # --- Scenario 3: Link a different spec PR to an existing release plan + - name: link-different-spec-pr-to-release-plan + prompt: | + Update the API spec PR in release plan 29262 to + https://github.com/Azure/azure-rest-api-specs/pull/38500. + My setup has already been verified, do not run azsdk_verify_setup. + constraints: + max_turns: 6 + max_tokens: 8000 + graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_update_api_spec_pull_request_in_release_plan + disallowed: + - azsdk_verify_setup + - azsdk_create_release_plan + + # --- Scenario 4: Update SDK details (package names) ------------------ + - name: update-sdk-details-in-release-plan + prompt: | + Update SDK details / package names in release plan 29262 based on the + TypeSpec emitter configuration in tspconfig.yaml for + specification + My setup has already been verified, do not run azsdk_verify_setup. 
+ constraints: + max_turns: 8 + max_tokens: 10000 + graders: + - type: skill-invocation + config: + required: + - azsdk-common-prepare-release-plan + - type: tool-calls + config: + required: + - azsdk_get_release_plan + - azsdk_update_sdk_details_in_release_plan + disallowed: + - azsdk_verify_setup + - azsdk_create_release_plan + +scoring: + weights: + skill-invocation: 1 + tool-calls: 1 + threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml index 8b17d0ae161..0892601a25e 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml @@ -35,9 +35,14 @@ stimuli: config: required: - edit + - type: skill-invocation + config: + required: + - azure-typespec-author scoring: weights: tool-calls: 1 + skill-invocation: 1 threshold: 1.0 diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml index 37ea805d290..954188d4b5b 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml @@ -34,9 +34,14 @@ stimuli: - azsdk_typespec_check_project_in_public_repo disallowed: - azsdk_verify_setup + - type: skill-invocation + config: + required: + - azure-typespec-author scoring: weights: tool-calls: 1 + skill-invocation: 1 threshold: 1.0 From 1264e9a01de6ac56063974da26ee517b885d1d3b Mon Sep 17 00:00:00 2001 From: helen229 Date: Thu, 4 Jun 2026 10:03:08 -0700 Subject: [PATCH 17/24] update the doc --- .../azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md | 661 ------------------ 
.../Azure.Sdk.Tools.Vally/REQUIREMENTS.md | 185 ----- .../8-operations-agent-eval-strategy.spec.md | 367 ++++++++++ 3 files changed, 367 insertions(+), 846 deletions(-) delete mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md delete mode 100644 tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md create mode 100644 tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md deleted file mode 100644 index 394462db76a..00000000000 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/DESIGN.md +++ /dev/null @@ -1,661 +0,0 @@ -# Vally Tool-Scenario Evaluation — Design - -> Companion to [REQUIREMENTS.md](./REQUIREMENTS.md). Where REQUIREMENTS says -> *what* and *why*, this doc says *how*. - ---- - -## 0. Scope - -This design covers the eval framework that lives in -[`tools/azsdk-cli/Azure.Sdk.Tools.Vally/`](./) — i.e. the scenarios that -verify the Azure SDK agent picks the right skill, calls the right MCP tools, -in the right order, with the right arguments, and returns the right answer. - ---- - -## 1. Layering - -### 1.1 Three levels of testing - -Aligned with the 2026-06 design review. 
Three named levels, -differentiated by **what they exercise** and **what backend they hit**: - -| Level | Name | What it proves | Agent | MCP | Lives in | -|---|---|---|---|---|---| -| 0 | **Routing evals** | Prompt X routes to skill Y | live | none (no MCP server) | `.github/skills/<skill>/evals/` | -| 1 | **Workflow scenarios (mock)** | Agent picks the right skills, calls the right tools in the right order with the right args, returns the right answer | live | **mock** | `evals/scenarios/` *(default)* | -| 2 | **Live scenarios** | Same as level 1, but against the real backend — catches drift the mock can't see (TypeSpec ordering, real codegen output, real DevOps state) | live | **live** | `evals/scenarios/` + `tags: { live-safe: "true" }` | - -Plus a hermetic tool-shape layer that isn't agent-driven: - -| | Name | What it proves | Lives in | -|---|---|---|---| -| — | **Unit evals** | "Tool X exists and returns the right shape for these inputs." Cross-skill trigger tables. | `evals/unit/` | - -**Mock is the default. Live is the exception.** Both modes drive the -same live agent (LLM), so **both incur agent token cost**; the mock -itself is a deterministic C# stub with no LLM inside it. The cost delta -between mock and live is on three other axes: - -1. **Wall time.** Real backends (DevOps, codegen pipelines, GitHub) add - seconds-to-minutes per tool call; the mock returns instantly. -2. **Backend side effects + quota.** Live hits real ADO work items, - real pipeline runs, real PRs. Mock does none of that. -3. **Agent turn count (indirect token cost).** Real tool responses are - larger and more variable, which expands per-turn input and provokes - more retry / polling turns. The headline 1.78M tokens on the live - release-planner-e2e run is mostly this effect, not the mock saving - tokens directly. 
- -Reviewer framing, paraphrased: *live MCP incurs significant token cost, so -most testing — including release plan and SDK generation — should use -mock; live is reserved for scenarios mock can't deterministically cover.* -The "token cost" pointed at there is (3) above plus the wall-time fan-out, -not a claim that the mock is free. - -``` -evals/ -├── unit/ tool-shape + cross-skill triggers (hermetic) -├── scenarios/ level 1 by default; level 2 when tagged live-safe -├── setup/ shared fixture scripts (repo clone, etc.) -└── fixtures/ pinned SHAs + per-eval mocks -``` - -**Key property: scenarios are environment-agnostic.** A scenario YAML -declares the prompt, expected skills, expected tool sequence, and graders -— nothing about whether MCP is mock or live. Same file, same graders; -the MCP backend is picked at run time. - -| Run mode | MCP | Repos? | When | Coverage | Cost | -|---|---|---|---|---|---| -| Level 1 (workflow / mock) | mock (deterministic stub, no LLM) | none | **every PR** | every scenario | agent tokens only; ~1m / scenario | -| Level 2 (live) | live (real backends) | shallow + sparse | **nightly** | scenarios tagged `live-safe` (curated subset) | agent tokens + real backend latency + more turns from real responses; 10-20m / scenario, ~2M agent tokens observed | - -When the live and mock results disagree, the mock lied — exact bisect. -That's also how mock coverage gaps surface: every scenario that runs on -mock forces the mock to grow handlers for the tools it exercises -(see §4). 
- -### 1.2 Relationship to `.github/skills/*/evals/` — split by ownership - -Two homes, split on a simple rule: - -| What it tests | Lives in | Owned by | -|---|---|---| -| **One skill** (does *this* skill route + call its tools + return a sensible answer) | `.github/skills/<skill>/evals/` | Skill author | -| **Cross-skill / cross-tool** (multi-step chains, e2e flows, mock-server integration, anything that doesn't belong to one skill) | `tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/` | Eval-framework owner | - -Per-skill evals stay **next to `SKILL.md`** — that's the convention skill -authors expect, and it keeps "everything about my skill in one folder." -Today's per-skill `eval.yaml` files don't move. - -This project owns: - -- **The runner config** ([`.vally.yaml`](./.vally.yaml)): environments - (`azsdk-mcp-live`, `azsdk-mcp-mock`), suites, MCP server definitions. - Per-skill evals reference these environments by name. -- **Shared fixtures** ([`evals/setup/`](./evals/setup/), - [`evals/fixtures/`](./evals/fixtures/)): the specs-clone hook, SHA locks, - language-repo cache scripts. Per-skill evals can reuse them via `setup:`. -- **Cross-skill scenarios**: `evals/scenarios/` — multi-step flows like - release-planner that span release-plan + generate-sdk. These have no - single skill owner, so they live here. -- **`evals/unit/` tool-trigger + tool-shape evals** that aren't owned by - any one skill (e.g. `triggers-pipeline.eval.yaml` covers tools used by - three different skills). - -The runner picks up both: - -``` -vally eval \ - --eval-spec '.github/skills/**/evals/*.eval.yaml' \ - --eval-spec 'tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/**/*.eval.yaml' \ - --skill-dir .github/skills -``` - -Or, equivalently, suites in `.vally.yaml` glob both paths. - -#### What skill authors get without moving anything - -Per-skill evals already use Vally graders. 
To unlock the §4.6 trifecta -(skill + tool-calls + correctness in one scenario), a skill author edits -their existing `evals/eval.yaml` to **add** the missing graders — they -don't relocate the file: - -```yaml -# .github/skills/<skill>/evals/eval.yaml -environment: azsdk-mcp-mock # references env defined in our .vally.yaml -graders: - - type: skill-invocation - config: { required: [<skill>] } - - type: tool-calls - config: { required: [...], disallowed: [...] } - - type: prompt - config: { rubric: ... } -``` - -#### Why this split (not "move everything here") - -| Concern | Skill evals here (move) | Skill evals stay + cross-cuts here (this proposal) | -|---|---|---| -| Skill author finds their evals | filtered by tag, ~5 dirs away | next to SKILL.md ✓ | -| Skill + tool-calls + correctness in same scenario | ✓ | ✓ (add graders to existing file) | -| Cross-skill chains have a clear home | ✓ | ✓ (`evals/scenarios/`) | -| New skill author understands the layout | needs to learn tag filtering | "evals go next to your SKILL.md, like other skills" | -| Per-skill CI workflow unchanged | needs rewrite | ✓ | -| Mock vs. live opt-in works for skill evals | ✓ | ✓ (env defined here, referenced from skill eval) | -| New contributor adding cross-skill scenario | unclear (which tag?) | `evals/scenarios/` here | - -#### Per-skill eval suite — current state & direction - -The per-skill suite at `.github/skills/<skill>/evals/` (triggered by -[`skill-eval.yml`](../../../.github/workflows/skill-eval.yml)) predates -this project. Captured here so the relationship is explicit. - -*Today.* Roughly a dozen skills have eval files; `azsdk-common-api-review` -has none, `sensei` is missing `scoring.threshold` (passes vacuously), and -two skills ship capability tests without a trigger file. Layout and env -wiring are correct, but most capability stimuli are graded only by a -single `output-contains` substring — they pass whether the agent called -the right tool, the wrong tool, or just echoed the prompt. 
CI is green -by construction. - -*Direction.* Raise the bar on what counts as a per-skill eval: adopt the -four-layer pattern — `skill-invocation` + `tool-calls` + `output-matches` -(structural regex, not single substrings) + optional `prompt` LLM-judge — -as the required shape for every capability stimulus. A -`skill-eval-authoring` skill packages the pattern, grader catalog, and -anti-patterns so other Azure SDK teams can adopt it without re-learning the -gotchas. - -### 1.3 Folder → suite → trigger mapping - -Suites in `.vally.yaml`: - -| Suite | Globs | Env | Used by | -|---|---|---|---| -| `unit` | `evals/unit/**/*.eval.yaml` | `azsdk-mcp-mock` | PR + nightly | -| `scenarios-mock` | `evals/scenarios/**/*.eval.yaml` | `azsdk-mcp-mock` | PR + nightly | -| `scenarios-live` | `evals/scenarios/**/*.eval.yaml` (filtered by tag `live-safe`) | `azsdk-mcp-live` | nightly + label | -| `pr-gate` | `unit` + `scenarios-mock` | mock | every PR | -| `nightly` | `unit` + `scenarios-mock` + `scenarios-live` | mixed | nightly + label | - -Live runs are tag-gated (`--tag live-safe=true`) so destructive / production-only -scenarios stay opted out by default. - -### 1.4 Decision tree for "where does my new eval go?" - -``` -Does it only test that the right skill is picked (no tool calls)? -└── yes → Level 0: .github/skills/<skill>/evals/ (not this project) - -Is it a single-tool shape test or a trigger table covering tools used by ≥2 skills? -└── yes → evals/unit/ - -Is it a multi-step / multi-tool agent flow? -└── yes → evals/scenarios/ - ├── Level 1 by default: runs against MOCK on every PR. - │ *Use this unless the mock can't faithfully cover the behavior.* - └── Level 2: add `tags: { live-safe: "true" }` to ALSO run nightly - against live MCP. Reserve for cases where the real backend's - behavior matters (TypeSpec ordering, real codegen output, - real DevOps state). -``` - ---- - -## 2. 
CI - -### 2.1 Today - -- The skill evals (`.github/skills/**/evals/`) run via - [`.github/workflows/skill-eval.yml`](../../../.github/workflows/skill-eval.yml). -- The tool-scenario evals in this project: **run nowhere in CI**. They run - by hand today. This is the gap [#15829](https://github.com/Azure/azure-sdk-tools/issues/15829) - closes. - -### 2.2 Next (issue #15829) - -Extend `.github/workflows/skill-eval.yml` — **do not** create a parallel -workflow. Two new jobs join the existing per-skill matrix: - -```yaml -jobs: - # existing: skill-evals (matrix per skill, unchanged) - - tool-scenarios-pr: - # PR-gate: unit + scenarios-mock. Hermetic: no live MCP, no repo clones. - if: github.event_name == 'pull_request' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/setup-dotnet-and-node - - run: dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock - - working-directory: tools/azsdk-cli/Azure.Sdk.Tools.Vally - run: | - npx --yes @microsoft/vally-cli eval \ - --suite pr-gate \ - --skill-dir ../../../.github/skills \ - --junit --output-dir vally-results - - uses: actions/upload-artifact@v4 - with: { name: vally-pr, path: tools/azsdk-cli/Azure.Sdk.Tools.Vally/vally-results } - - tool-scenarios-nightly: - # Nightly: full suite incl. scenarios-live (tag-gated by `live-safe`). 
- if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/setup-dotnet-and-node - - name: Restore repo cache - uses: actions/cache@v4 - with: - path: ~/.vally-cache/repos - key: vally-repos-${{ hashFiles('tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/*.yaml') }} - - name: Prime repo cache (clones any missing live-safe deps) - run: pwsh tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-repos.ps1 - env: { VALLY_REPO_CACHE: ~/.vally-cache/repos } - - run: dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli - - run: dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock - - working-directory: tools/azsdk-cli/Azure.Sdk.Tools.Vally - run: | - npx --yes @microsoft/vally-cli eval \ - --suite nightly \ - --skill-dir ../../../.github/skills \ - --junit --output-dir vally-results - env: - AZSDKTOOLS_AGENT_TESTING: "true" - AZURE_DEVOPS_PAT: ${{ secrets.AZURE_DEVOPS_PAT }} - VALLY_REPO_CACHE: ~/.vally-cache/repos - continue-on-error: true # advisory for first week - - uses: actions/upload-artifact@v4 - with: { name: vally-nightly, path: tools/azsdk-cli/Azure.Sdk.Tools.Vally/vally-results } -``` - -Triggers: - -| Trigger | What runs | -|---|---| -| `pull_request` | `pr-gate` (unit + scenarios-mock) | -| `schedule:` nightly | `nightly` (adds scenarios-live) | -| `workflow_dispatch` | manual escape hatch (suite picker) | - -### 2.3 Repo-caching strategy (issue #15831) - -**Problem.** Live-env scenarios call real tools that read files from -`azure-rest-api-specs` (and sometimes a language repo like -`azure-sdk-for-python`). Cloning from scratch on every run is slow -(~30s sparse spec clone, minutes for a full SDK repo). We need them -available without paying that cost per eval. - -**Who needs this:** only the `scenarios-live` nightly job. PR-gate -(`unit` + `scenarios-mock`) runs entirely against the mock server — no -clones, no network, no cache. 
- -**Constraint.** Vally itself does not clone repos. Its -`environment.git.source` field expects a worktree to already exist at -the given path. So cloning must happen as a pre-step *before* -`vally eval` runs. - -**Solution — pre-step script, scoped to live scenarios only.** Each -scenario tagged `live-safe` declares its repo deps in a sidecar block. -(Vally's `tags:` is a mapping, not an array; `metadata:` is accepted by -the linter as a passthrough — we use it for our own bookkeeping.) - -```yaml -# evals/scenarios/release-planner.eval.yaml -tags: - live-safe: "true" -metadata: - repos: - - name: Azure/azure-rest-api-specs - - name: Azure/azure-sdk-for-python -stimuli: - - environment: - git: - source: ${VALLY_REPO_CACHE}/Azure/azure-rest-api-specs # filled by pre-step - ref: main -``` - -Live runs select these scenarios via `--tag live-safe=true`. - -One generic script — [`evals/setup/ensure-repos.ps1`](./evals/setup/ensure-repos.ps1) -— walks `evals/scenarios/*.yaml`, **filters to scenarios tagged -`live-safe`**, collects the union of `metadata.repos`, and ensures each -listed repo is cloned into `$VALLY_REPO_CACHE/<owner>/<repo>/`. -Idempotent: existing checkouts are skipped. Scenarios without the -`live-safe` tag are skipped — their repos (if any are declared) -are never cloned, because they never run live. - -- **Local dev:** `$VALLY_REPO_CACHE` defaults to - `$env:USERPROFILE\.vally-cache\repos`. Reused across local runs. -- **CI:** `$VALLY_REPO_CACHE` points at whatever `actions/cache` mounts. - Cache key = hash of the collected `metadata.repos` list across all - `live-safe` scenarios, so the cache only invalidates when a scenario - adds/removes a repo dependency. - -**On pinning (optional for v1).** By default the script clones `main`, -which means live-env evals can flake if upstream merges a breaking change -between nightly runs. 
Scenarios that want reproducibility can opt in per -repo by adding a `ref:` field under `metadata.repos`: - -```yaml -metadata: - repos: - - name: Azure/azure-rest-api-specs - ref: <sha> # optional; default = main - - name: Azure/azure-sdk-for-python -``` - -No central lock file, no bot PR — just an optional field on the repo -entry. "What version did this run use?" is always recoverable from the -`git rev-parse HEAD` recorded in `results.jsonl`. If per-scenario `ref:` -entries get unwieldy we can promote them to a shared lock file later. - -**Upstream wish.** A native Vally `fixtures.git:` block that clones for -us would let us drop the pre-step script and the `metadata` sidecar. -Filed separately; until then, the pre-step is the pragmatic v1. - ---- - -## 3. Live vs. mock — what runs where - -### 3.1 Decision matrix - -| What | Env | Why | -|---|---|---| -| Tool triggers (`evals/unit/triggers-*.eval.yaml`) | **mock** | No real tools called, just verifies prompt → tool name mapping. | -| Single-tool shape (`evals/unit/<tool>.eval.yaml`) | **mock** | Hermetic, fast, deterministic. | -| Scenario, default PR run (`evals/scenarios/*.eval.yaml`) | **mock** | Cheap, hermetic, no repo clones, safe for write tools (mocked). Catches tool-sequence regressions on every PR. | -| Scenario tagged `live-safe`, nightly run | **live** w/ `AZSDKTOOLS_AGENT_TESTING=true` | Catches real DevOps / GitHub drift. Work items route to test area path so re-runs are safe. PR creation hits real `azure-sdk-for-*` (proven 2026-06-02). | -| Scenario touching **production** systems (e.g. `azsdk_release_sdk` shipping to NuGet) | **mock only** — no `live-safe` tag | Never run live in CI. | - -The `live-safe` tag is the opt-in: by default a new scenario runs only on -mock. To also have it run live nightly, the author adds -`tags: { live-safe: "true" }` -and confirms the scenario is safe to repeat against the real systems. 
- -### 3.2 How it's expressed in YAML - -`.vally.yaml` declares two environments: - -```yaml -environments: - azsdk-mcp-live: - mcpServers: - azure-sdk-mcp: - command: dotnet - args: ["run", "--project", "../Azure.Sdk.Tools.Cli", "--", "start"] - env: - AZSDKTOOLS_AGENT_TESTING: "true" # safe-mode for write tools - azsdk-mcp-mock: - mcpServers: - azure-sdk-mcp: - command: dotnet - args: ["run", "--project", "../Azure.Sdk.Tools.Mock", "--", "start"] -``` - -Each eval is environment-agnostic; the env is bound at run time by the -suite (see §1.3). Default suite uses **mock**; `scenarios-live` swaps in -`azsdk-mcp-live` and filters via `--tag live-safe=true`. - -`AZSDKTOOLS_AGENT_TESTING=true` is the safety net — even on the live MCP, -write operations route to test work-item area paths. This is what made -today's release-planner scenario safe to re-run. - ---- - -## 4. Mock MCP server status - -### 4.1 How it works - -[`Azure.Sdk.Tools.Mock`](../Azure.Sdk.Tools.Mock/) reflects over -`SharedOptions.ToolsList` at boot and registers a mock proxy for **every** -tool the real `Azure.Sdk.Tools.Cli` advertises, preserving each tool's -name, description, and input schema -([`MockToolRegistrations.cs`](../Azure.Sdk.Tools.Mock/MockToolRegistrations.cs)). -At call time the proxy looks up an -[`IMockToolHandler`](../Azure.Sdk.Tools.Mock/Handlers/IMockToolHandler.cs) -by tool name: - -- **Custom handler exists** → scripted, type-correct response. -- **No custom handler** → fallback `DefaultCommandResponse { Message = "Success" }`. - -### 4.2 Why PR [#15854](https://github.com/Azure/azure-sdk-tools/pull/15854) - -Before #15854, only ~10 of 74 live tools had custom handlers. The other -~63 returned the generic success payload, which **breaks chained -scenarios** that need to thread a returned id into a follow-up call -(e.g. `create_release_plan` → `update_sdk_details_in_release_plan` -referencing the new `release_plan_id`). 
#15854 adds handlers for the -remaining tools so every mock-tier scenario gets a chainable, -type-correct response by default. Steady state: a new MCP tool ships -with its handler. - -### 4.3 No CI drift check needed - -The eval suite is the drift check. - -- **Tool added upstream, no handler yet** → auto-registered via - reflection; falls back to the success default. A scenario that actually - asserts on its response will fail in the next eval run — that failure - *is* the signal to add a handler. -- **Tool's response shape changes upstream** → scenarios asserting on the - changed field fail. Same signal, same fix. -- **Tool no eval ever touches** → no scenario fails, because the gap is - invisible to the test surface. We deliberately don't pay to plug it. - -Since every PR runs `pr-gate` (unit + scenarios-mock) and a failing -scenario fails the workflow, drift that matters surfaces immediately. -A separate `mock-coverage` inventory job, `COVERAGE.md` snapshot, or -scheduled diff script would be duplicate enforcement. - ---- - -## 5. Results UX — beyond "pass / fail" - -Tracked by parent issue [#15861](https://github.com/Azure/azure-sdk-tools/issues/15861). - -### 5.1 What we have today - -Per run, Vally writes: - -- `results.jsonl` — full trajectory: every tool call, args, return values, - events, metrics. -- `eval-results.md` — markdown summary table (one row per stimulus, grader - scores, links to details). -- JUnit XML (with `--junit`) — for CI test-results widgets. - -Two gaps: `results.jsonl` is a 300+ event JSON wall — usable by engineers -with a `jq` reflex, useless for everyone else trying to debug a failure; -and there's no way to slice across many runs ("how often did -release-planner fire green in the last 30 nightlies?"). - -### 5.2 Two artifacts, ordered by where they pay off first - -#### (a) Trajectory HTML — local debug first, CI artifact second - -Sub-issue [#15862](https://github.com/Azure/azure-sdk-tools/issues/15862). 
- -[`eng/scripts/Render-VallyTrajectory.ps1`](../../../eng/scripts/Render-VallyTrajectory.ps1) -reads one `results.jsonl` and emits one **self-contained** HTML page per -stimulus into a sibling `trajectories/` directory. No external assets, no -network, opens via `file://`. Local loop: `vally eval` → -`Render-VallyTrajectory.ps1` → open the HTML. - -Page layout: header (totals), grader pills (✅/❌ with rubric on hover), -stimulus, vertical event timeline with collapsible `tool_call` rows -(args ↔ result), skill switches as section headers, footer with final -assistant message + judge verdict + raw JSON link. Tool-result truncation -mirrors the §6 policy so the viewer reflects what the agent actually saw. - -This is the daily driver and lands first because it's usable the day it -merges — no CI required. - -#### (b) CSV export — cross-run analytics - -Sub-issue [#15863](https://github.com/Azure/azure-sdk-tools/issues/15863). - -[`eng/scripts/Export-VallyResultsCsv.ps1`](../../../eng/scripts/Export-VallyResultsCsv.ps1) -reads one or more `results.jsonl`, appends one row per stimulus to -`vally-results/history.csv`. Append-only; idempotent on -`(run_id, scenario)`. Columns cover verdict, grader scores, skills used, -turns, `tokens_billable` / `tokens_cached_read` (matching the §6.2 -`maxBillableTokens` definition), duration, and links back to the HTML + -`eval-results.md`. Local runs produce the row but **do not push** to -shared history — that's a CI concern. - -Marginal value locally (one run = one row), real value across many runs -in pivot tables / future dashboards. - -#### (c) Hosting & CI wiring - -Sub-issue [#15866](https://github.com/Azure/azure-sdk-tools/issues/15866). -Decision: **GitHub Actions artifacts** for trajectories, **orphan -`vally-history` branch** for the CSV. 
- -| Artifact | Where | Retention | Auth | -|---|---|---|---| -| `trajectories/*.html` | `actions/upload-artifact@v4`, per run | 90 days | GH login | -| `results.jsonl` + `eval-results.md` | same artifact | 90 days | GH login | -| `history.csv` | orphan `vally-history` branch, force-pushed by nightly only | indefinite | repo read | - -Picked because it's zero infra (no Azure Storage, no SAS, no public -endpoint), the URL is one click from the Actions UI, and the -`vally-history` branch is plain git history a dashboard can poll later. -PR-gate uploads artifacts but skips the CSV append + history push — -history is a nightly concern. Promote to Azure Storage static site only -if the artifact-hop UX becomes a blocker. - -#### (d) Future: shared dashboard - -CSV in the `vally-history` branch can feed a Power BI / Kusto dashboard -that lives outside this repo. Out of scope for v1. - -### 5.3 Pipeline - -``` -vally eval ──> results.jsonl ─┬─> Render-VallyTrajectory.ps1 ──> *.html ──> artifact - │ - └─> Export-VallyResultsCsv.ps1 ──> history.csv ──> vally-history branch ──> (future) dashboard -``` - -Both scripts read `results.jsonl` only — no Vally-side changes required. -If/when upstream Vally adds native CSV / HTML output, drop the scripts. - ---- - -## 6. Performance & cost controls - -### 6.1 Principle - -The framework must make expensive evals **fail loudly**, not silently bleed -CI minutes and tokens. An author writing a new scenario should not have to -know in advance how much it costs; the runner tells them, and refuses to -keep running it if it crosses policy. Polishing individual scenarios is -not a substitute for this — it doesn't scale to the next ten authors. - -The release-planner e2e run (17 min wall / 1.78M tokens / 41 turns) is the -existence proof: nothing in the framework today would have stopped it -landing as a "passing" scenario that quietly costs a full hour of CI per -nightly trigger. 
- -### 6.2 Budgets and enforcement - -Every scenario carries a budget. The runner measures actual cost and -enforces the budget in three bands: - -| Band | Trigger | Effect | -|---|---|---| -| Soft (warn) | actual ≥ 50% of budget | Logged + surfaced in `eval-results.md` | -| Hard (fail) | actual > 100% of budget | Scenario marked **failed**, CI job fails | -| Kill (abort) | actual > 200% of budget | Run aborted mid-flight, partial trajectory saved | - -Budgeted dimensions, in declining order of importance: - -1. **`maxTurns`** — single best proxy for cost; bounds the agent loop. -2. **`maxWallSec`** — protects CI minutes regardless of where time goes. -3. **`maxBillableTokens`** — input (uncached) + output. Cache hits don't - count, so the number tracks real $. -4. **`maxToolCalls`** — catches exploration spirals. - -Defaults are set globally in `.vally.yaml`, overridable per scenario: - -```yaml -# .vally.yaml -defaults: - limits: - maxTurns: 20 - maxWallSec: 120 - maxBillableTokens: 100_000 - maxToolCalls: 30 -``` - -A scenario that *needs* more must opt in explicitly with a comment -explaining why. The opt-in itself is reviewable in code: - -```yaml -# evals/scenarios/release-planner.eval.yaml -limits: - maxTurns: 60 # multi-step chain; see DESIGN §6.4 - maxWallSec: 600 # waits on real ADO pipeline status - maxBillableTokens: 250_000 -``` - -If the opt-in budget gets reviewed and rejected, the author's recourse -is to **switch to mock**, not to widen the budget. This is the lever -that pushes cost-blind scenarios off the live path. - -### 6.3 Tiered policy: PR vs nightly - -Budgets differ by tier. The PR gate is the strict one because it runs on -every push; nightly can be looser because it runs once. 
- -| Tier | maxTurns | maxWallSec | maxBillableTokens | Opt-out | -|---|---|---|---|---| -| PR gate (unit + mock) | 20 | 120 | 100k | not allowed | -| Nightly mock | 30 | 300 | 200k | reviewable | -| Nightly live | 60 | 600 | 500k | reviewable, requires justification comment | - -A scenario that wants to exceed the PR-gate ceiling **must** drop down -to nightly. The runner refuses to load over-budget scenarios into the -PR-gate suite. No way to silently land a slow scenario. - -### 6.4 General guardrails (framework-level, not per-scenario) - -These apply to every scenario the runner executes. None require the -scenario author to know about them. Scoped to what we own in this repo — -deeper runner-level controls (hard caps, virtual clock, tool-result -truncation, cost-split reporting) need upstream Vally support and are -out of scope here. - -| # | Guardrail | Layer | What it prevents | -|---|---|---|---| -| G1 | **Polling tools default to terminal state under `AZSDKTOOLS_AGENT_TESTING=true`** (`*_get_*_status` returns `Succeeded` on first poll) | mock MCP policy | Wall-time waste on polling loops; any future polling tool inherits the fix | -| G2 | **Cheaper judge model** — LLM-judge graders default to a smaller model than the agent | runner config (`.vally.yaml`) | Judge tokens dominating output cost | -| G3 | **CI concurrency cancel** — superseded PR runs killed immediately | CI workflow | Wasted compute on rapid pushes | - -### 6.5 Where each guardrail lives - -| Guardrail | Owner | -|---|---| -| G1 | `Azure.Sdk.Tools.Mock` in this repo | -| G2 | `.vally.yaml` runner config in this repo | -| G3 | `.github/workflows/skill-eval.yml` in this repo | - -All three can land immediately — no upstream Vally dependency. Per-scenario -budgets (§6.2) are enforced today via a thin post-run check that reads -`results.jsonl` and fails the CI step if any scenario exceeded its -declared limits. 
- -### 6.6 Author-facing rule of thumb - -> **If the agent's loop talks to a real backend that takes more than a -> few seconds to respond, mock it.** The runner will let you know when -> you've crossed the line — you don't have to guess. - -The budget machinery exists so authors don't need to read this document -to write a cheap eval. They write the scenario; the runner fails it if -it costs too much; the CI message points them at the mock path. - diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md deleted file mode 100644 index 109cb5cd8ae..00000000000 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/REQUIREMENTS.md +++ /dev/null @@ -1,185 +0,0 @@ -# Vally Tool-Scenario Evaluation — Requirements - - ---- - -## 1. Context - -PR [#15811](https://github.com/Azure/azure-sdk-tools/pull/15811) ported the -deleted `Azure.Sdk.Tools.Cli.Benchmarks` tool-scenarios into -[`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/`](./evals/) as -`@microsoft/vally-cli` evals (11 scenarios, 10 fully-graded + 1 stub). They run -locally but are not yet wired into CI, have no shared environments, and cannot -yet assert *skill choice + tool-call shape + ordering* in a single scenario. - - ---- - -## 2. Goals - -1. A single eval can express **what skill was picked, what tools were called, in - what order, with what arguments, and whether the final answer is correct.** -2. Evals are **reproducible**: same SHA, same inputs ⇒ same trajectory. -3. Evals are **safe by default**: nothing destructive runs against live ADO / - GitHub on a nightly schedule unless the author opted in. -4. Evals are **portable**: a new contributor can clone the repo and run any - scenario without hand-editing paths. -5. Evals are **observable**: results are exportable (CSV / JUnit / markdown) - and consumable by non-engineers. - ---- - -## 3. Non-goals (for this round) - -- Authoring new eval scenarios beyond the 11 already ported (tracked separately). 
-- Schema-parity tests between `Azure.Sdk.Tools.Cli` and `Azure.Sdk.Tools.Mock` - responses — a separate concern, file against the mock project if needed. -- Replacing Vally as the eval runner. -- Building a UI on top of CSV exports. - ---- - -## 4. Functional requirements - -### 4.1 Unified scenario file (skill + tool + e2e in one place) - -A single `.eval.yaml` must be able to declare: - -- one or more `skill-invocation` graders (which `.github/skills/*` were picked), -- one or more `tool-calls` graders (which MCP tools fired, with arg matching), -- an optional `prompt` / `output-contains` / `output-matches` grader for the - final answer, -- arbitrary tags (`tier`, `scenario`, `skills`, `tools`, `owner`). - -Today these flows are split across two pipelines because Vally's per-scenario -grader set is limited. Removing that limitation is the #1 ask from the meeting. - -**Upstream dependencies**: -- [microsoft/vally#453](https://github.com/microsoft/vally/issues/453) — `tool-calls` grader: support strict call ordering (`sequence:`). -- [microsoft/vally#454](https://github.com/microsoft/vally/issues/454) — `tool-calls` grader: open `ToolMatch` for generic argument matching. - -### 4.2 Three levels of evaluation - -Aligned with the 2026-06 design review. Three named levels; -the folder a scenario lives in (and an opt-in tag for level 2) determines -which one runs when. 
- -| Level | Name | Agent | MCP | Trigger | Failure semantics | -|---|---|---|---|---|---| -| 0 | Routing evals (per-skill, prompt-to-skill matching) | live | none | per PR | required | -| 1 | **Workflow scenarios** (mock MCP — default) | live | mock | per PR | required | -| 2 | **Live scenarios** (live MCP — narrow opt-in) | live | live | nightly | advisory → required | - -Plus a hermetic tool-shape layer that isn't agent-driven: - -| Layer | Name | Agent | MCP | Trigger | Failure semantics | -|---|---|---|---|---|---| -| — | Unit evals (tool-shape + cross-skill triggers) | none | mock | per PR | required | - -**Mock is the default; live is the exception.** Both modes drive the -same live agent, so **both incur LLM token cost** — the mock MCP server -itself is a deterministic stub with no LLM in it. The cost delta is -backend latency + the larger / chattier responses live tools produce, -which expand per-turn input and provoke more polling/retry turns. -Level 2 is therefore reserved for scenarios the mock can't deterministically -cover (e.g. TypeSpec ordering, real codegen output, real DevOps state). -Most multi-step work — including release plan and SDK generation — -stays at level 1. - -Level 0 lives next to its skill and is out of scope for this project's -folder layout; this project owns the runner config it references. - -### 4.3 Mock vs. live MCP — opt-in per eval - -Tracked in [#15831](https://github.com/Azure/azure-sdk-tools/issues/15831). - -- Both a mock MCP server and the real MCP server must be selectable as the - scenario's MCP environment. -- **Default is mock.** Running against the live MCP server is per-scenario - opt-in and must be justified — live MCP carries real token + wall-time - cost (see [DESIGN.md §6](./DESIGN.md)), so it is reserved for behavior - the mock can't faithfully reproduce. -- Scenarios touching **production** systems (e.g. shipping packages) must - remain mock-only and must not be opt-in-able. 
- -### 4.4 Workspace setup hooks (repo cloning for live scenarios) - -Tracked in [#15831](https://github.com/Azure/azure-sdk-tools/issues/15831). - -- The PR gate (unit + mock scenarios) must be fully hermetic: no clones, - no outbound network. -- Live-tier scenarios that need external repos (e.g. `azure-rest-api-specs`) - must declare those dependencies inside the scenario file. Adding a new - repo dependency must be a YAML-only change. -- Repo provisioning runs once per CI job and is shared across scenarios. -- Pinning a repo to a specific ref is supported but optional in v1. - -### 4.5 Configuration via environment variables, not hard-coded paths - -- Scenario YAMLs must not hard-code absolute paths. -- Repo locations must be resolved through configuration that works the - same way locally and in CI. - -### 4.6 Skill + tool-call grading must be enforced together - -For each prompt, the grader must verify both: - -1. The agent picked the **right skill** (`skill-invocation` grader). -2. The agent fired the **right MCP tool calls**, in the right order, with the - right arguments (`tool-calls` grader with the upstream extensions in §4.1). - -A scenario that asserts only the final answer text is incomplete. - -### 4.7 End-to-end multi-step scenarios - -Vally must be able to grade chains such as: - -- *validate TypeSpec project* → *create release plan* → *generate SDK*. - -This requires: - -- Ordering (vally#453). -- Argument matching (vally#454). -- Tier-appropriate environment (mock for destructive steps, live elsewhere). - -Initial e2e targets: `release-planner-e2e`, `create-release-plan`, -`generate-sdk`. Each must include the full tool-call chain in its graders. - -### 4.8 Result export - -- Native: `results.jsonl`, `eval-results.md`, JUnit XML (already supported). -- **New**: CSV export for the cross-run projection / dashboard use case. - Either a Vally feature request or a thin post-processor script that - consumes `results.jsonl`. 
- -### 4.9 Mock MCP tool coverage - -- Inventory the tools `Azure.Sdk.Tools.Mock` currently implements. -- For every tool referenced by an eval that runs on `azsdk-mcp-mock`, the mock - must have a handler (returning realistic shape, not necessarily real data). -- Track gaps in a checklist in the mock project's README. - ---- - -## 5. CI / pipeline requirements - -Tracked in [#15829](https://github.com/Azure/azure-sdk-tools/issues/15829). - -- A PR-gate job runs unit + mock scenarios on every PR. Hermetic; required - from day one. -- A nightly job additionally runs the live-tier scenarios against the real - MCP server. Starts advisory (does not block); flipped to required once - the baseline is stable. -- A manual trigger is available for ad-hoc runs. -- Results are published as build artifacts (markdown summary + JUnit XML - at minimum). -- Model and ADO credentials must not leak into logs. - ---- - -## 6. Authoring requirements (so new contributors can extend the suite) - -- The authoring pattern (graders, tiers, mock-vs-live decision) is documented - outside this file and linked from the Vally project's README. -- A new contributor can add a scenario without editing CI configuration or - shared scripts. 
diff --git a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md new file mode 100644 index 00000000000..0776917c2b6 --- /dev/null +++ b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md @@ -0,0 +1,367 @@ +# Spec: 8 Operations — Agent Evaluation Strategy + +## Table of Contents + +- [Definitions](#definitions) +- [Background / Problem Statement](#background--problem-statement) +- [Goals and Exceptions/Limitations](#goals-and-exceptionslimitations) +- [Design Proposal](#design-proposal) +- [Agent Prompts](#agent-prompts) +- [Success Criteria](#success-criteria) +- [Implementation Plan](#implementation-plan) + +--- + +## Definitions + +- **Agent**: a live LLM conversation driving Azure SDK MCP tools through skills. +- **Skill**: a markdown contract under `.github/skills/<skill-name>/` telling the + agent *when* to engage and *which* tools/workflow to use. +- **MCP tool**: a discrete capability exposed by the Azure SDK MCP server. +- **Workflow scenario**: a user prompt that crosses multiple tools / skills + end-to-end (e.g. *create release plan → generate SDK → link the SDK PR*). +- **Stimulus**: one prompt + its expected behavior — the unit of an eval. +- **Three graders per stimulus**: `skill-invocation` (right skill picked), + `tool-calls` (right tools / order / args), and `prompt` (right final answer). +- **Mock MCP**: an in-memory fake of the Azure SDK MCP server — no network, + no side effects. **Live MCP**: the real server hitting real DevOps / GitHub. + + +--- + +## Background / Problem Statement + +We're shipping agent-driven replacements for manual SDK workflows — starting +with the release planner. When someone +asks *"does the agent actually do what we said it does?"*, today the only +honest answer is "I tried a few prompts on my laptop." That is not good +enough to hand to partner teams or to keep regressions out as more workflows +land.
+ +We need a small, shared set of prompts we promise to support, run regularly, +with a clear pass/fail per prompt — so we can point at the report instead +of re-demoing. + +--- + +## Goals and Exceptions/Limitations + +### Goals + +- [ ] **One file per workflow, three graders per prompt** — skill picked, + tools called, final answer. +- [ ] **Mock MCP by default, live MCP only on opt-in** — no accidental writes + to DevOps / GitHub; release / publish tools stay mock-only. +- [ ] **Mock covers every tool the scenarios call**, with realistic responses. +- [ ] **Anyone can clone and run** — env vars, no hard-coded paths; live + scenarios declare what repos they need. +- [ ] **The run produces a status table** of pass/fail per prompt plus a + trajectory per prompt — readable by non-engineers. +- [ ] **Reports come out in the formats people actually use** — markdown + for humans, JUnit for CI, CSV for spreadsheets and dashboards. +- [ ] **Adding a partner-reported prompt is one new stimulus**, no runner + or CI changes. +- [ ] **Multi-step chains work** (e.g. *validate TypeSpec → create release + plan → generate SDK → link the SDK PR*). + +### Exceptions and Limitations + +- **Some prompts can only be checked against live MCP** — the mock can't + prove a release plan was really created. Those run opt-in only. +- **The agent is not deterministic.** Same prompt, different wording or + turn count each run. We grade shape, not exact strings, and accept some + flake. + +--- + + +## Design Proposal + +### The three eval kinds + +We organize evals around what's actually being tested. No tier numbers — +use the names. The first three columns are the same axis (what does this +prove); the last two say where each lives and what backend it needs. + +| Kind | What it proves | Agent | MCP | Lives in | +|---|---|---|---|---| +| **Skills** | A user prompt routes to the right skill. 
| live | none | `.github/skills//evals/` | +| **Workflows — Mock** | Agent picks the right skills, calls the right tools in the right order with the right args, returns the right answer. | live | **mock** | `evals/scenarios/` (default) | +| **Workflows — Live** | Same as above, but against the real backend — catches drift the mock can't see (TypeSpec ordering, real codegen output, real DevOps state). | live | **live** | `evals/scenarios/` with `tags: { live-safe: "true" }` | + +Plus a hermetic tool-shape layer that isn't agent-driven: + +| Kind | What it proves | Lives in | +|---|---|---| +| **Tools** | Tool X exists and returns the right shape for these inputs. Cross-skill trigger tables. | `evals/tools/` | + + +### Folder layout + +``` +evals/ +├── tools/ tool-shape + cross-skill triggers (hermetic) +├── scenarios/ +│ ├── mock/ workflow scenarios run against the mock MCP +│ └── live/ workflow scenarios run against the live MCP +└── setup/ shared fixture scripts (repo clone, etc.) +``` + +A scenario lives under `mock/` or `live/` based on which backend the +graders are written against, not based on the prompt. A prompt can +have a `mock/` and a `live/` variant (release-planner does). + +**Scenarios are environment-agnostic.** A scenario file declares the +prompt, expected skills, expected tool sequence, and graders — nothing +about whether MCP is mock or live. Same file, same graders; the MCP +backend is picked at run time. + +| Run mode | MCP | Repos? | When | Coverage | +|---|---|---|---|---| +| Workflows — Mock | mock (stub, no LLM) | none | nightly + on demand | every scenario | +| Workflows — Live | live (real backends) | shallow + sparse | weekly | scenarios tagged `live-safe` (curated subset) | + +When live and mock results disagree, the mock is wrong — the divergence +points straight at the missing or stale handler. Every scenario that +runs on mock therefore drives the mock to grow handlers for the tools +it exercises. 
+ +### Where each eval lives — split by ownership + + +| What it tests | Lives in | Owned by | +|---|---|---| +| **One skill** (does this skill route, call its tools, return a sensible answer) | `.github/skills/<skill-name>/evals/` | Skill author | +| **Cross-skill / cross-tool** (multi-step chains, e2e flows, mock-server integration, anything that doesn't belong to one skill) | `tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/` | workflow owner team | + +Skill evals stay next to `SKILL.md` — that's the convention skill +authors expect, and it keeps everything about a skill in one folder. +Existing skill eval files do not move. + +#### Skill eval suite — current state and direction + +The per-skill suite predates this project. Today roughly a dozen skills +have eval files; some are missing thresholds and pass without asserting +anything, and most capability stimuli are graded only by a single +substring check — they pass whether the agent called the right tool, +the wrong tool, or just echoed the prompt. + +*Direction.* Raise the bar on what counts as a per-skill eval: adopt +the four-layer pattern — skill-invocation + tool-calls + structural +output match + optional LLM-judge — as the required shape for every +capability stimulus. A `skill-eval-authoring` skill packages the +pattern, grader catalog, and anti-patterns so other Azure SDK teams +adopt without re-learning the gotchas. + +### Decision tree — where does my new eval go? + +``` +Do you only care that the agent picks the right skill +(you don't care which tools it then calls)? +└── yes → .github/skills/<skill-name>/evals/ (not this project) + +Do you want to check that one MCP tool returns the right shape +for a given input — no agent in the loop? +└── yes → evals/tools/ + +Is it a multi-step / multi-tool agent flow? +└── yes → Workflow scenario + ├── Default → evals/scenarios/mock/ + │ Runs against the mock MCP. Use this unless the mock can't + │ faithfully cover the behavior.
+ └── Also need live coverage → add an evals/scenarios/live/ + Reserve for cases where the real backend's behavior matters (TypeSpec ordering, real codegen output, real DevOps state). +``` + +### CI + +The suite runs on a schedule, not on every pull request. Agent runs +talk to an LLM — they cost money and they flake in ways that have +nothing to do with the code under review. Both are bad properties for a +required check that blocks merges. We'd rather have a reliable +green/red trend you can look at than a noisy gate everyone learns to +ignore. + +| When | What runs | Backend | +|---|---|---| +| Nightly | All workflow scenarios + the hermetic tool layer | mock | +| Weekly | Workflow scenarios marked safe to run live | live (with safe-mode flag on writes) | +| On demand | Any suite, any backend | author's choice | + +#### Pre-run setup for live scenarios + +**The problem.** A real workflow crosses repos. The release planner +reads a TypeSpec project from `azure-rest-api-specs`, generates code +into a language SDK repo, and links a PR back. The tools the agent +calls expect those files on disk. If a repo is missing, the agent +fails for the wrong reason and we learn nothing. + +**The setup step.** Each live scenario declares the repos (and +optionally the commit) it needs. One setup step reads all live +scenarios, takes the union, and makes sure each repo is present at the +requested commit before any eval runs. + +**Locally.** A single script. Run it once; it clones into a cache +folder under your home directory and reuses the clone on subsequent +runs. Same script CI uses. + +**In CI.** The weekly live job runs the same script. The cache folder +is a build-cache artifact keyed on the set of repos the scenarios +declare; it's invalidated only when that set changes. + +**Pinning.** A scenario can pin a commit when reproducibility matters. +Otherwise the setup step takes the default branch and records the +commit it used in the run output. 
+ +The nightly mock job runs no setup — mock evals touch no external repos. + + +### Mock MCP server status + +#### How it works + +`Azure.Sdk.Tools.Mock` reflects over the real CLI's tool list at boot and +registers a mock proxy for **every** tool the real `Azure.Sdk.Tools.Cli` +advertises, preserving each tool's name, description, and input schema. +At call time the proxy looks up a handler by tool name: + +- **Custom handler exists** → scripted, type-correct response. +- **No custom handler** → fallback `{ Message = "Success" }`. + + + +### Results + +The goal: anyone — partner team, manager, the engineer who broke +something — should be able to open a run and understand what passed, +what failed, and why, without help. + +Each run writes three files into the output directory: + +| File | What it is | Who reads it | +|---|---|---| +| `eval-results.md` | Human status table: one row per prompt, pass/fail per grader. | Reviewers, partner teams, anyone scanning a run. | +| `results.jsonl` | The full agent trajectory — every tool call, args, return values, timings. One JSON object per line. | Engineers debugging a failure with tooling. | +| `junit.xml` | Standard test-results format the CI test-results widget already understands. | CI dashboards. | + +The JSONL is rich but hard to read raw. We add two post-processors +on top of it: + +- **Trajectory HTML** — one self-contained web page per prompt, opens + straight from `file://`. Shows the same trajectory as `results.jsonl` + but readable by someone who has never seen JSONL. +- **CSV history** — one row per prompt, appended across runs. Lets us + ask *"how often did release-planner pass in the last 30 nightlies?"* + and feed a dashboard later. + +In CI: trajectories + JSONL are uploaded as build artifacts you can +download from the run page; the CSV gets appended to a long-lived +history branch. + +### Performance and cost controls + +Why this section exists: agent evals are *slow* and *expensive*. 
Every +run talks to a real LLM — every tool call is a round trip, every turn +is tokens billed against our subscription. Without limits, a single +badly-written scenario can sit in a loop for an hour and burn through +the budget while still reporting *"passed"*. + +Concrete example: one real release-planner end-to-end run took **17 +minutes wall time, 1.78M tokens, 41 turns**. + +The framework therefore enforces three things: + +**1. Per-scenario budgets.** Every scenario file declares an upper +bound on: + +- **Turns** — how many times the agent loops. +- **Wall time** — how long the whole run can take. +- **Billable tokens** — input + output tokens we actually pay for. +- **Tool calls** — catches an agent stuck calling the same tool forever. + +The runner warns at 50% of any limit, fails the scenario at 100%, and +kills the whole run at 200% so a runaway can't bleed indefinitely. + +**2. Tiered defaults.** Mock runs nightly against an in-memory fake — +cheap and fast, so the limits are tight. Live runs weekly against real +backends — slower by nature, so the limits are looser. + +| Tier | Turns | Wall (s) | Billable tokens | +|---|---|---|---| +| Nightly mock | 30 | 300 | 200k | +| Weekly live | 60 | 600 | 500k | + +A scenario that needs more must opt in with a justification comment in +the scenario file. If reviewers reject the opt-in, the scenario has to +be rewritten to fit, or moved to mock — budgets don't widen. + +**3. Background guardrails** — things the scenario author never has +to think about, baked into the framework: + +- Polling tools (`*_get_*_status`) return a terminal state on the first poll under safe mode — no agent stuck waiting for *"in progress"* to flip. +- LLM-judge graders default to a cheaper model than the agent itself. +- CI cancels superseded runs when a branch gets a new push. + + +--- + +## Agent Prompts + +The list of prompts the agent is promised to support. 
Each lives as a +stimulus in `evals/scenarios/mock/<workflow>.eval.yaml` (plus a `live/` +counterpart where applicable). Adding a new prompt is one new entry in +the matching file. + +### Release-planner workflow + +Derived from the release-planner replacement test plan +([#15835](https://github.com/Azure/azure-sdk-tools/issues/15835)). All +five route to the `azsdk-common-prepare-release-plan` skill. + +| Prompt | What the agent must do | Required tool calls | +|---|---|---| +| Create a public-preview release plan for a TypeSpec spec, target month June 2026 | Pick the prepare-release-plan skill; check for an existing plan; create one. | `azsdk_get_release_plan`, `azsdk_create_release_plan` | +| Create a release plan **and** generate SDK for a TypeSpec spec, release type beta | End-to-end chain: create, then generate, then back-fill SDK details. | `azsdk_get_release_plan`, `azsdk_create_release_plan`, `azsdk_run_generate_sdk`, `azsdk_update_sdk_details_in_release_plan` | +| Generate SDK for all languages for an existing release plan id | Look up the plan, run generation against the languages it lists. | `azsdk_get_release_plan`, `azsdk_run_generate_sdk` | +| Link a different spec PR (`https://github.com/Azure/azure-rest-api-specs/pull/...`) to an existing release plan | Look up the plan, swap the spec-PR field. | `azsdk_get_release_plan`, `azsdk_update_api_spec_pull_request_in_release_plan` | +| Update SDK details (package names) on an existing release plan from `tspconfig.yaml` | Look up the plan, update the SDK details from emitter config. | `azsdk_get_release_plan`, `azsdk_update_sdk_details_in_release_plan` | + +All five forbid `azsdk_verify_setup` (the setup gate runs once at the +top of the workflow, not per prompt) and forbid the irrelevant +`azsdk_create_release_plan` in the three "existing plan" prompts so we +catch the agent creating a duplicate.
+ +### Other workflows in the first round + +| Workflow | File | Coverage | +|---|---|---| +| Check spec is in public repo then validate TypeSpec | `check-public-repo-then-validate.eval.yaml` | TypeSpec authoring routing + validation tool call. | +| TypeSpec generation — step 2 of the authoring flow | `typespec-generation-step02.eval.yaml` | TypeSpec authoring skill + generate tool. | +| Rename a client property in a generated SDK | `rename-client-property.eval.yaml` | Customization skill + customize-code tool. | + +The live counterpart of release-planner lives at +`evals/scenarios/live/release-planner.eval.yaml` and adds a prompt-grader +that checks the real DevOps response. + +--- + +## Success Criteria + +- A single command runs the full mock suite locally and produces + `eval-results.md`, `results.jsonl`, JUnit XML, the per-prompt + trajectory HTML, and a `history.csv` row. +- Every release-planner prompt above is green in the mock suite. +- Every MCP tool a green scenario calls has a custom mock handler + returning a chainable, type-correct response. +- A new contributor can clone the repo, set the documented env vars, + and reproduce the same `eval-results.md` verdict table on their + machine. +- A partner team reporting *"I tried this prompt and the agent didn't + do anything"* can be answered by pasting their prompt as a new + stimulus and re-running the workflow file — no runner or CI changes. 
+- The status table is what we hand to reviewers (Renhe, Laurent, + partner teams) to answer *"what does the agent currently support?"* + +--- + +- From aa714ab94fe874cd534ebef27ed1afa4e4cf8552 Mon Sep 17 00:00:00 2001 From: helen229 Date: Thu, 4 Jun 2026 10:47:10 -0700 Subject: [PATCH 18/24] update doc --- .../add-arm-resource.eval.yaml | 0 .../check-public-repo.eval.yaml | 0 .../check-sdk-generation-status.eval.yaml | 0 .../create-release-plan.eval.yaml | 0 .../get-modified-typespec-projects.eval.yaml | 0 .../get-pr-link-current-branch.eval.yaml | 0 .../link-namespace-approval-issue.eval.yaml | 0 .../triggers-apiview.eval.yaml | 0 .../{unit => tools}/triggers-config.eval.yaml | 0 .../{unit => tools}/triggers-engsys.eval.yaml | 0 .../{unit => tools}/triggers-github.eval.yaml | 0 .../triggers-package.eval.yaml | 0 .../triggers-pipeline.eval.yaml | 0 .../triggers-releaseplan.eval.yaml | 0 .../triggers-typespec.eval.yaml | 0 .../{unit => tools}/triggers-verify.eval.yaml | 0 .../validate-typespec.eval.yaml | 0 .../live/release-planner.eval.yaml | 0 .../check-public-repo-then-validate.eval.yaml | 0 .../mock/release-planner-workflows.eval.yaml | 0 .../mock/rename-client-property.eval.yaml | 0 .../mock/typespec-generation-step02.eval.yaml | 0 .../8-operations-agent-eval-strategy.spec.md | 78 +++++++++++++++---- 23 files changed, 63 insertions(+), 15 deletions(-) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/add-arm-resource.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/check-public-repo.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/check-sdk-generation-status.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/create-release-plan.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/get-modified-typespec-projects.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => 
tools}/get-pr-link-current-branch.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/link-namespace-approval-issue.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-apiview.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-config.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-engsys.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-github.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-package.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-pipeline.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-releaseplan.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-typespec.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/triggers-verify.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{unit => tools}/validate-typespec.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => workflow-scenarios}/live/release-planner.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => workflow-scenarios}/mock/check-public-repo-then-validate.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => workflow-scenarios}/mock/release-planner-workflows.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => workflow-scenarios}/mock/rename-client-property.eval.yaml (100%) rename tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/{scenarios => workflow-scenarios}/mock/typespec-generation-step02.eval.yaml (100%) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml 
b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/add-arm-resource.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/add-arm-resource.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/add-arm-resource.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-public-repo.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-public-repo.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-public-repo.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-sdk-generation-status.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/check-sdk-generation-status.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-sdk-generation-status.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/create-release-plan.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/create-release-plan.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/create-release-plan.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-modified-typespec-projects.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-modified-typespec-projects.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-modified-typespec-projects.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml 
b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-pr-link-current-branch.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/get-pr-link-current-branch.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-pr-link-current-branch.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/link-namespace-approval-issue.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/link-namespace-approval-issue.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/link-namespace-approval-issue.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-apiview.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-apiview.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-apiview.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-config.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-config.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-config.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-engsys.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-engsys.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-engsys.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-github.eval.yaml similarity index 100% 
rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-github.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-github.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-package.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-package.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-package.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-pipeline.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-pipeline.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-pipeline.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-releaseplan.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-releaseplan.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-releaseplan.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-typespec.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-typespec.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-typespec.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-verify.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/triggers-verify.eval.yaml rename to 
tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-verify.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/validate-typespec.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/unit/validate-typespec.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/validate-typespec.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/live/release-planner.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/live/release-planner.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/check-public-repo-then-validate.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/release-planner-workflows.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/release-planner-workflows.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml 
b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/rename-client-property.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/rename-client-property.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/rename-client-property.eval.yaml diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml similarity index 100% rename from tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/scenarios/mock/typespec-generation-step02.eval.yaml rename to tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml diff --git a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md index 0776917c2b6..6dbe3ea5b0e 100644 --- a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md +++ b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md @@ -8,6 +8,7 @@ - [Design Proposal](#design-proposal) - [Agent Prompts](#agent-prompts) - [Success Criteria](#success-criteria) +- [Open Questions](#open-questions) - [Implementation Plan](#implementation-plan) --- @@ -86,8 +87,8 @@ prove); the last two say where each lives and what backend it needs. | Kind | What it proves | Agent | MCP | Lives in | |---|---|---|---|---| | **Skills** | A user prompt routes to the right skill. | live | none | `.github/skills//evals/` | -| **Workflows — Mock** | Agent picks the right skills, calls the right tools in the right order with the right args, returns the right answer. | live | **mock** | `evals/scenarios/` (default) | -| **Workflows — Live** | Same as above, but against the real backend — catches drift the mock can't see (TypeSpec ordering, real codegen output, real DevOps state). 
| live | **live** | `evals/scenarios/` with `tags: { live-safe: "true" }` | +| **Workflows — Mock** | Agent picks the right skills, calls the right tools in the right order with the right args, returns the right answer. | live | **mock** | `evals/workflow-scenarios/mock/` | +| **Workflows — Live** | Same as above, but against the real backend — catches drift the mock can't see (TypeSpec ordering, real codegen output, real DevOps state). | live | **live** | `evals/workflow-scenarios/live/` | Plus a hermetic tool-shape layer that isn't agent-driven: @@ -100,11 +101,11 @@ Plus a hermetic tool-shape layer that isn't agent-driven: ``` evals/ -├── tools/ tool-shape + cross-skill triggers (hermetic) -├── scenarios/ -│ ├── mock/ workflow scenarios run against the mock MCP -│ └── live/ workflow scenarios run against the live MCP -└── setup/ shared fixture scripts (repo clone, etc.) +├── tools/ tool-shape + cross-skill triggers (hermetic) +├── workflow-scenarios/ +│ ├── mock/ workflow scenarios run against the mock MCP +│ └── live/ workflow scenarios run against the live MCP +└── setup/ shared fixture scripts (repo clone, etc.) ``` A scenario lives under `mock/` or `live/` based on which backend the @@ -166,11 +167,13 @@ for a given input — no agent in the loop? Is it a multi-step / multi-tool agent flow? └── yes → Workflow scenario - ├── Default → evals/scenarios/mock/ + ├── Default → evals/workflow-scenarios/mock/ │ Runs against the mock MCP. Use this unless the mock can't │ faithfully cover the behavior. - └── Also need live coverage → add an evals/scenarios/live/ - Reserve for cases where the real backend's behavior matters (TypeSpec ordering, real codegen output, real DevOps state). + └── Also need live coverage → add an evals/workflow-scenarios/live/ + variant. Reserve for cases where the real backend's behavior + matters (TypeSpec ordering, real codegen output, real DevOps + state). ``` ### CI @@ -188,6 +191,23 @@ ignore. 
| Weekly | Workflow scenarios marked safe to run live | live (with safe-mode flag on writes) | | On demand | Any suite, any backend | author's choice | +#### PR gate for essential workflows (open) + +A case for *narrow* PR gating: a small curated set of mock scenarios +covering the workflows we have already promised to partner teams +(release-planner today; more as they ship) could run on PRs that touch +the agent, skills, or MCP tools — so we catch a regression in the +workflows users actually rely on before merge, instead of the morning +after. + +Unresolved trade-offs: which scenarios count as "essential"; how to +keep the gate from flaking on LLM non-determinism (retries? loose +thresholds? quorum across N runs?); whether the cost of the gated +subset is acceptable for every PR; and which paths actually trigger it +(agent-only? skills? MCP server? all of the above?). + +See [Open Questions](#open-questions). + #### Pre-run setup for live scenarios **The problem.** A real workflow crosses repos. The release planner @@ -308,9 +328,9 @@ to think about, baked into the framework: ## Agent Prompts The list of prompts the agent is promised to support. Each lives as a -stimulus in `evals/scenarios/mock/.eval.yaml` (plus a `live/` -counterpart where applicable). Adding a new prompt is one new entry in -the matching file. +stimulus in `evals/workflow-scenarios/mock/.eval.yaml` (plus a +`live/` counterpart where applicable). Adding a new prompt is one new +entry in the matching file. ### Release-planner workflow @@ -340,8 +360,8 @@ catch the agent creating a duplicate. | Rename a client property in a generated SDK | `rename-client-property.eval.yaml` | Customization skill + customize-code tool. | The live counterpart of release-planner lives at -`evals/scenarios/live/release-planner.eval.yaml` and adds a prompt-grader -that checks the real DevOps response. 
+`evals/workflow-scenarios/live/release-planner.eval.yaml` and adds a +prompt-grader that checks the real DevOps response. --- @@ -364,4 +384,32 @@ that checks the real DevOps response. --- +## Open Questions + +### CI cadence and PR gating + +**Cadence.** Current proposal: nightly mock + weekly live + on-demand. +Open: is nightly the right frequency for mock, or do we want it on +every push to `main`? Is weekly enough for live, given live is the +only thing that catches real-backend drift? + +**PR gate for essential workflows.** Should a curated subset of mock +scenarios block merge on PRs that touch the agent, skills, or MCP +tools? Specifically to answer: + +- *Which workflows are "essential"* — just release-planner today, or + a broader set? Who decides when a new workflow joins or leaves the + gated set? +- *Which paths trigger the gate* — agent code, skill markdown, MCP + tool code, mock handlers, all of the above? Anything else? +- *How do we tame flake* — retries on failure, quorum across N runs, + loose thresholds, or just accept some red and require a human + override? Hard requirement: a green PR must mean *the gated + scenarios passed*, not *we got lucky this run*. +- *What's the cost ceiling* — the gated subset runs on every PR push + to a touched path; what's the per-PR token / wall-time budget we're + willing to spend before we move it back off the PR? + +We need owners' input on all four before turning the gate on. 
+ - From fda9ef962890c5ff6f2bd2754786a3b745c5eb7a Mon Sep 17 00:00:00 2001 From: helen229 Date: Thu, 4 Jun 2026 11:05:25 -0700 Subject: [PATCH 19/24] update names --- tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml | 10 +++++----- .../specs/8-operations-agent-eval-strategy.spec.md | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index 10b71f0a969..ca761951b1c 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -55,27 +55,27 @@ suites: description: | Hermetic single-tool / trigger evals. No external I/O. Fast; the foundation of the PR gate. - evals: ["evals/unit/*.eval.yaml"] + evals: ["evals/tools/*.eval.yaml"] scenarios-mock: description: | Multi-tool scenarios against the mock MCP environment. Hermetic; safe for PR gate. - evals: ["evals/scenarios/mock/*.eval.yaml"] + evals: ["evals/workflow-scenarios/mock/*.eval.yaml"] scenarios-live: description: | Scenarios against live MCP — real DevOps / GitHub / pipelines. Slow; nightly only. Prime any required clones first via `evals/setup/ensure-specs-clone.ps1`. - evals: ["evals/scenarios/live/*.eval.yaml"] + evals: ["evals/workflow-scenarios/live/*.eval.yaml"] # ---- composite suites ---- pr-gate: description: Hermetic tiers only (unit + scenarios-mock). Target for CI PR check. evals: - - "evals/unit/*.eval.yaml" - - "evals/scenarios/mock/*.eval.yaml" + - "evals/tools/*.eval.yaml" + - "evals/workflow-scenarios/mock/*.eval.yaml" nightly: description: All tiers including live scenarios. 
evals: ["evals/**/*.eval.yaml"] diff --git a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md index 6dbe3ea5b0e..55d986a392f 100644 --- a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md +++ b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md @@ -119,8 +119,8 @@ backend is picked at run time. | Run mode | MCP | Repos? | When | Coverage | |---|---|---|---|---| -| Workflows — Mock | mock (stub, no LLM) | none | nightly + on demand | every scenario | -| Workflows — Live | live (real backends) | shallow + sparse | weekly | scenarios tagged `live-safe` (curated subset) | +| Workflows — Mock | mock (stub, no LLM) | azure-sdk-tools only | nightly + on demand | every scenario | +| Workflows — Live | live (real backends) | azure-sdk-tools + shallow/sparse clones of the spec & language SDK repos each scenario declares | weekly | scenarios tagged `live-safe` (curated subset) | When live and mock results disagree, the mock is wrong — the divergence points straight at the missing or stale handler. Every scenario that From 5b4fb6e6f85c21587fe97ffc67690244ca831883 Mon Sep 17 00:00:00 2001 From: helen229 Date: Thu, 4 Jun 2026 14:15:11 -0700 Subject: [PATCH 20/24] Vally: align release-planner mock stimuli with live e2e pattern All 5 release-planner mock stimuli now use environment.git worktree pointing at the per-user azure-rest-api-specs cache (matching the live e2e fixture), plus a structured e2e-style prompt that supplies the Contoso fixture IDs the mock handlers expect (TypeSpec project, service/product tree IDs, work-item ID 29262). Also document the --skill-dir requirement and worker-cap caveat in README, and fix one stale path in .vally.yaml comment. 
--- .../Azure.Sdk.Tools.Vally/.vally.yaml | 4 +- .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 43 ++++++---- .../mock/release-planner-workflows.eval.yaml | 85 ++++++++++++++++--- 3 files changed, 102 insertions(+), 30 deletions(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index ca761951b1c..a0fd3e06e8f 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -30,8 +30,8 @@ environments: # Live MCP — real Azure.Sdk.Tools.Cli against real DevOps (test area path), # real GitHub, real pipelines. AZSDKTOOLS_AGENT_TESTING=true keeps the # handful of write tools (e.g. create_release_plan) inside the test area. - # Bound only by scenarios under evals/scenarios/live/ and selected by the - # `scenarios-live` / `nightly` suites. + # Bound only by scenarios under evals/workflow-scenarios/live/ and selected + # by the `scenarios-live` / `nightly` suites. azsdk-mcp-live: mcpServers: azure-sdk-mcp: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index 0af1fa8cfd3..561acfab421 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -13,10 +13,10 @@ different folders. A full end-to-end gate runs *both*. |---|---|---| | **Question** | Given a user prompt, does the agent invoke the right MCP tool(s) with the right shape? | Given a user prompt, does the agent route to the right skill and follow its instructions? 
| | **Catches** | Tool name / description / parameter regressions; multi-tool ordering; tool-catalog conflicts | Skill frontmatter / `description` / instruction regressions; skill-routing collisions | -| **Path** | [`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/`](evals/) (`scenarios/` + `triggers/`) | [`.github/skills//evals/*.eval.yaml`](../../../.github/skills/) (and `evaluate/evals/` for capability suites) | +| **Path** | [`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/`](evals/) (`tools/` + `workflow-scenarios/`) | [`.github/skills//evals/*.eval.yaml`](../../../.github/skills/) (and `evaluate/evals/` for capability suites) | | **Loaded subject** | Production MCP server (`Azure.Sdk.Tools.Cli`) over stdio — real tools, real network calls | Skill's `SKILL.md` + frontmatter; the agent picks tools itself | | **Primary grader** | `tool-calls` — checks the recorded trajectory for required tool names | Trigger / routing graders + per-skill rubric | -| **Run command** | `vally eval --eval-spec evals/unit/.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/` *from repo root* | +| **Run command** | `vally eval --eval-spec evals/tools/.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/` *from repo root* | | **CI status** | Not wired yet (see follow-ups) | `vally lint` runs in [.github/workflows/skill-eval.yml](../../../.github/workflows/skill-eval.yml); full `eval` job pending | | **Cost profile** | Higher — each run spins up the MCP server, real LLM turns (~5–15), real tool calls | Variable — trigger evals are cheap; capability evals (e.g. 
`azure-typespec-author`) are expensive | @@ -111,8 +111,8 @@ tracks the migration in Azure.Sdk.Tools.Vally/ ├── .vally.yaml # Vally config (environments + suites) ├── evals/ -│ ├── unit/ # tool-shape + per-skill trigger evals, hermetic -│ ├── scenarios/ +│ ├── tools/ # tool-shape + per-skill trigger evals, hermetic +│ ├── workflow-scenarios/ │ │ ├── mock/ # multi-tool scenarios, hermetic (PR gate) │ │ └── live/ # multi-tool scenarios, live MCP (nightly) │ ├── setup/ # helper scripts (e.g. ensure-specs-clone.ps1) @@ -150,23 +150,34 @@ Run a suite (recommended): ```powershell cd tools/azsdk-cli/Azure.Sdk.Tools.Vally $vally = '../../../eng/skill-eval/node_modules/.bin/vally.cmd' +$skills = '../../../.github/skills' # Fast tiers only — PR-gate candidate -& $vally eval --suite pr-gate +& $vally eval --suite pr-gate --skill-dir $skills # A single tier -& $vally eval --suite unit -& $vally eval --suite scenarios-mock +& $vally eval --suite unit --skill-dir $skills +& $vally eval --suite scenarios-mock --skill-dir $skills # By feature area (cross-cuts tiers via tag filter) -& $vally eval --suite release-plan -& $vally eval --suite typespec +& $vally eval --suite release-plan --skill-dir $skills +& $vally eval --suite typespec --skill-dir $skills ``` +> `--skill-dir` is **required** for workflow-scenario evals — without it, +> the agent never loads the project skills and the `skill-invocation` +> grader fails even when the tool calls are correct. +> +> Each agent boots its own MCP child process, so parallel workers compete +> for stdio startup. Keep `--workers` at 1–2 for `scenarios-mock` / live +> runs until we share a single Mock MCP server across workers — higher +> concurrency triggers `MCP server 'azure-sdk-mcp' failed to load: +> Connection closed` on most stimuli. 
+ Run a single eval: ```powershell -& $vally eval --eval-spec evals/unit/check-public-repo.eval.yaml +& $vally eval --eval-spec evals/tools/check-public-repo.eval.yaml --skill-dir $skills ``` Run the live scenarios tier (first, prime a per-user clone of @@ -174,17 +185,17 @@ Run the live scenarios tier (first, prime a per-user clone of ```powershell ./evals/setup/ensure-specs-clone.ps1 -& $vally eval --suite scenarios-live +& $vally eval --suite scenarios-live --skill-dir $skills --workers 1 ``` ## Adding a new scenario 1. **Pick a tier** — the folder you drop the YAML into: - - `evals/unit/` — one prompt, one MCP tool, no environment hooks. - - `evals/scenarios/mock/` — multi-tool flow against `azsdk-mcp-mock`. - Hermetic; runs on PR gate. - - `evals/scenarios/live/` — needs real DevOps / GitHub / pipelines; - bind `environment: azsdk-mcp-live`. Nightly only. + - `evals/tools/` — one prompt, one MCP tool, no environment hooks. + - `evals/workflow-scenarios/mock/` — multi-tool flow against + `azsdk-mcp-mock`. Hermetic; runs on PR gate. + - `evals/workflow-scenarios/live/` — needs real DevOps / GitHub / + pipelines; bind `environment: azsdk-mcp-live`. Nightly only. 2. Pick a short, kebab-case name (e.g. `create-release-plan`). 3. Create `evals//.eval.yaml`. Start from a sibling in the same tier as a template. 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml index 32e0b61ea5c..24a81e496fd 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml @@ -34,10 +34,26 @@ config: stimuli: # --- Scenario 1: Create release plan --------------------------------- - name: create-public-preview-release-plan + environment: + # Per-user cache populated by evals/setup/ensure-specs-clone.ps1 + # (idempotent shallow+sparse clone, auto-refresh every 24h). Same + # source the live e2e uses — keeps the relative TypeSpec path + # resolvable on disk even though the MCP responses are mocked. + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main prompt: | - Create a public preview release plan for - specification. - Target release month: June 2026. + I'm in a checkout of azure-rest-api-specs. Create a public preview + release plan for the Contoso Widget Manager. Here is all the context + you need: + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f" + - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e" + - target release timeline: "June 2026" + - API version: "2022-11-01-preview" + - SDK release type: "beta" + - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" My setup has already been verified, do not run azsdk_verify_setup. 
constraints: max_turns: 6 @@ -57,10 +73,26 @@ stimuli: # --- End-to-end demo prompt: create + generate ----------------------- - name: create-release-plan-and-generate-sdk + environment: + # Same per-user azure-rest-api-specs worktree as the create stimulus + # above so the agent sees a real on-disk spec repo. + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main prompt: | - Create a release plan and generate SDK for the TypeSpec project - specification - Target release month: June 2026, SDK release type beta. + I'm in a checkout of azure-rest-api-specs. Walk me through creating + a release plan and then generating SDK for the Contoso Widget Manager: + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - API release type: "Public Preview" + - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f" + - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e" + - target release timeline: "June 2026" + - API version: "2022-11-01-preview" + - SDK release type: "beta" + - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387" + After the release plan is created, generate SDK for all languages + using the work-item ID from the created release plan. My setup has already been verified, do not run azsdk_verify_setup. constraints: max_turns: 12 @@ -84,8 +116,20 @@ stimuli: # --- Scenario 2: Generate SDK for an existing release plan ----------- - name: generate-sdk-for-existing-release-plan + environment: + # Same per-user azure-rest-api-specs worktree as the create stimuli + # above so the agent can locate the TypeSpec project on disk while + # driving the release-planner flow. + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main prompt: | - Generate SDK for all languages for release plan 29262. + I'm in a checkout of azure-rest-api-specs. 
Using the release-planner + flow, generate SDK for all languages for the Contoso Widget Manager + release plan. Here is the context you need: + - release plan work item ID: "29262" + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" My setup has already been verified, do not run azsdk_verify_setup. constraints: max_turns: 8 @@ -106,9 +150,17 @@ stimuli: # --- Scenario 3: Link a different spec PR to an existing release plan - name: link-different-spec-pr-to-release-plan + environment: + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main prompt: | - Update the API spec PR in release plan 29262 to - https://github.com/Azure/azure-rest-api-specs/pull/38500. + I'm in a checkout of azure-rest-api-specs. Using the release-planner + flow, update the API spec pull request on an existing Contoso Widget + Manager release plan. Here is the context you need: + - release plan work item ID: "29262" + - new spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38500" My setup has already been verified, do not run azsdk_verify_setup. constraints: max_turns: 6 @@ -129,10 +181,19 @@ stimuli: # --- Scenario 4: Update SDK details (package names) ------------------ - name: update-sdk-details-in-release-plan + environment: + git: + type: worktree + source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs + ref: main prompt: | - Update SDK details / package names in release plan 29262 based on the - TypeSpec emitter configuration in tspconfig.yaml for - specification + I'm in a checkout of azure-rest-api-specs. Using the release-planner + flow, refresh the SDK package-name details on an existing Contoso + Widget Manager release plan from the on-disk TypeSpec emitter + configuration. 
Here is the context you need: + - release plan work item ID: "29262" + - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" + - tspconfig path: "specification/contosowidgetmanager/Contoso.WidgetManager/tspconfig.yaml" My setup has already been verified, do not run azsdk_verify_setup. constraints: max_turns: 8 From af3db0c7817baee7b42f91d3c9375731b83046c2 Mon Sep 17 00:00:00 2001 From: helen229 Date: Thu, 4 Jun 2026 15:12:56 -0700 Subject: [PATCH 21/24] update doc --- .../8-operations-agent-eval-strategy.spec.md | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md index 55d986a392f..629f42bfdca 100644 --- a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md +++ b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md @@ -96,6 +96,21 @@ Plus a hermetic tool-shape layer that isn't agent-driven: |---|---|---| | **Tools** | Tool X exists and returns the right shape for these inputs. Cross-skill trigger tables. | `evals/tools/` | +#### Required graders by kind + +Mock and live workflow scenarios share the same scenario format but +differ in which graders are *required* vs *optional*: + +| Kind | `tool-calls` | `skill-invocation` | response grader (`prompt` / LLM-judge) | +|---|---|---|---| +| **Workflows — Mock** | required | optional | not applicable — mock responses are stubbed, so a response grader has nothing meaningful to assert | +| **Workflows — Live** | required | required | required — only live runs produce a real assistant answer worth grading | + +Rationale: the mock backend deterministically replays canned data, so +"the agent said the right thing" reduces to "the agent called the right +tools." Live runs are the only place a free-form response can drift, so +that's where the response grader earns its cost. 
+ ### Folder layout @@ -127,13 +142,12 @@ points straight at the missing or stale handler. Every scenario that runs on mock therefore drives the mock to grow handlers for the tools it exercises. -### Where each eval lives — split by ownership - +### Where each eval lives -| What it tests | Lives in | Owned by | -|---|---|---| -| **One skill** (does this skill route, call its tools, return a sensible answer) | `.github/skills//evals/` | Skill author | -| **Cross-skill / cross-tool** (multi-step chains, e2e flows, mock-server integration, anything that doesn't belong to one skill) | `tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/` | workflow owner team | +| What it tests | Lives in | +|---|---| +| **One skill** (does this skill route, call its tools, return a sensible answer) | `.github/skills//evals/` | +| **Cross-skill / cross-tool** (multi-step chains, e2e flows, mock-server integration, anything that doesn't belong to one skill) | `tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/` | Skill evals stay next to `SKILL.md` — that's the convention skill authors expect, and it keeps everything about a skill in one folder. @@ -180,10 +194,7 @@ Is it a multi-step / multi-tool agent flow? The suite runs on a schedule, not on every pull request. Agent runs talk to an LLM — they cost money and they flake in ways that have -nothing to do with the code under review. Both are bad properties for a -required check that blocks merges. We'd rather have a reliable -green/red trend you can look at than a noisy gate everyone learns to -ignore. +nothing to do with the code under review. 
| When | What runs | Backend | |---|---|---| From 36c58bacae782e0ad802b75382ae5bf68fa7f1dc Mon Sep 17 00:00:00 2001 From: helen229 Date: Fri, 5 Jun 2026 13:26:32 -0700 Subject: [PATCH 22/24] Vally: fix MCP boot race + drop misconfigured grader (#15948) - Launch pre-built DLLs via 'dotnet ' in both .vally.yaml files instead of 'dotnet run', so N parallel workers no longer race on Roslyn's exclusive write lock for the output DLL. - Add 'Build MCP servers' step to eng/pipelines/skill-eval.yml so the CI runner has the DLLs ready before vally starts. - Drop the skill-invocation grader from generate-sdk-for-existing-release-plan (no preflight reasoning step required; tools-only). - Strip 'I'm in a checkout of azure-rest-api-specs.' preamble from prompts; the worktree already provides that context. - Remove stray '// tools skills response' artifact in live release-planner.eval.yaml. - README: document 'dotnet build' as a prereq; rewrite workers warning. Validated: scenarios-mock at --workers 6 -> 5/5 stimuli pass, 0 race hits, ~4 min. --- .github/skills/.vally.yaml | 7 ++++-- eng/pipelines/skill-eval.yml | 8 ++++++ .../Azure.Sdk.Tools.Vally/.vally.yaml | 25 +++++++------------ .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 24 ++++++++++++++---- .../live/release-planner.eval.yaml | 2 +- .../mock/release-planner-workflows.eval.yaml | 14 ++++------- 6 files changed, 47 insertions(+), 33 deletions(-) diff --git a/.github/skills/.vally.yaml b/.github/skills/.vally.yaml index 4ee187c6e8e..eaea06f0450 100644 --- a/.github/skills/.vally.yaml +++ b/.github/skills/.vally.yaml @@ -12,12 +12,15 @@ paths: evalFilenames: ["eval.yaml", "*.eval.yaml"] environments: + # Launch the pre-built DLLs via `dotnet `, NOT `dotnet run` — avoids the + # MSBuild boot race under parallel workers. See issue #15948. + # CI builds the DLLs in the 'Build MCP servers' step of skill-eval.yml. 
azsdk-mcp: mcpServers: azure-sdk-mcp: type: stdio command: dotnet - args: ["run", "--project", "../../tools/azsdk-cli/Azure.Sdk.Tools.Cli", "--", "start"] + args: ["../../artifacts/bin/Azure.Sdk.Tools.Cli/Debug/net8.0/azsdk.dll", "start"] timeout: "60s" env: AZSDKTOOLS_AGENT_TESTING: "true" @@ -27,5 +30,5 @@ environments: azure-sdk-mcp: type: stdio command: dotnet - args: ["run", "--project", "../../tools/azsdk-cli/Azure.Sdk.Tools.Mock"] + args: ["../../artifacts/bin/Azure.Sdk.Tools.Mock/Debug/net8.0/azsdk-mock.dll"] timeout: "60s" diff --git a/eng/pipelines/skill-eval.yml b/eng/pipelines/skill-eval.yml index f0ad05c9895..78fead7090e 100644 --- a/eng/pipelines/skill-eval.yml +++ b/eng/pipelines/skill-eval.yml @@ -41,6 +41,14 @@ jobs: - script: npm install -g @github/copilot-sdk displayName: 'Install Copilot SDK' + # Pre-build the MCP servers so vally launches `dotnet ` instead of + # `dotnet run` — avoids the MSBuild boot race under parallel workers. + # See issue #15948. + - script: | + dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli -c Debug --nologo + dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug --nologo + displayName: 'Build MCP servers' + - script: | input_areas=$(echo "${{ parameters.areas }}" | xargs) if [ -n "$input_areas" ]; then diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml index a0fd3e06e8f..20254a644c8 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml @@ -11,33 +11,26 @@ paths: results: results/ environments: - # Default for unit + mock scenarios. Runs the dedicated Azure.Sdk.Tools.Mock - # MCP server — a separate process whose tool surface mirrors the real CLI - # but with deterministic in-memory responses. - # - # Relative `--project` paths are resolved by `dotnet` against the cwd of - # the vally invocation. 
Always run vally from this directory: - # cd tools/azsdk-cli/Azure.Sdk.Tools.Vally && vally eval ... - # Same convention as .github/skills/.vally.yaml. + # Launch the pre-built DLL via `dotnet `, NOT `dotnet run` — avoids the + # MSBuild boot race under parallel workers. See issue #15948. + # Run `dotnet build ../Azure.Sdk.Tools.Mock -c Debug` once before vally. azsdk-mcp-mock: mcpServers: azure-sdk-mcp: type: stdio command: dotnet - args: ["run", "--project", "../Azure.Sdk.Tools.Mock"] - timeout: "60s" + args: ["../../../artifacts/bin/Azure.Sdk.Tools.Mock/Debug/net8.0/azsdk-mock.dll"] + timeout: "30s" - # Live MCP — real Azure.Sdk.Tools.Cli against real DevOps (test area path), - # real GitHub, real pipelines. AZSDKTOOLS_AGENT_TESTING=true keeps the - # handful of write tools (e.g. create_release_plan) inside the test area. - # Bound only by scenarios under evals/workflow-scenarios/live/ and selected - # by the `scenarios-live` / `nightly` suites. + # Live MCP. AZSDKTOOLS_AGENT_TESTING=true keeps write tools inside the test + # area. Pre-built DLL pattern — see issue #15948. + # Run `dotnet build ../Azure.Sdk.Tools.Cli -c Debug` once before vally. azsdk-mcp-live: mcpServers: azure-sdk-mcp: type: stdio command: dotnet - args: ["run", "--project", "../Azure.Sdk.Tools.Cli", "--", "start"] + args: ["../../../artifacts/bin/Azure.Sdk.Tools.Cli/Debug/net8.0/azsdk.dll", "start"] timeout: "5m" env: AZSDKTOOLS_AGENT_TESTING: "true" diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index 561acfab421..161474b1ed4 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -144,6 +144,19 @@ Prereqs: cd eng/skill-eval npm ci ``` +- **Build the MCP servers once** before running vally. `.vally.yaml` + launches the pre-built DLLs via `dotnet ` to avoid the build-time + race that crashes parallel workers with `MCP error -32000: Connection + closed`. 
See [`primer-vally-mcp-race.html`](https://github.com/Azure/azure-sdk-tools/blob/main/doc/) (notebook) for the full write-up. + + ```powershell + # From repo root — builds both Azure.Sdk.Tools.Mock and (transitively) Cli + dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli -c Debug + dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug + ``` + + Rebuild after editing any tool source. Vally itself does **not** rebuild + the MCP server — it just spawns the existing DLL. Run a suite (recommended): @@ -168,11 +181,12 @@ $skills = '../../../.github/skills' > the agent never loads the project skills and the `skill-invocation` > grader fails even when the tool calls are correct. > -> Each agent boots its own MCP child process, so parallel workers compete -> for stdio startup. Keep `--workers` at 1–2 for `scenarios-mock` / live -> runs until we share a single Mock MCP server across workers — higher -> concurrency triggers `MCP server 'azure-sdk-mcp' failed to load: -> Connection closed` on most stimuli. +> Each agent still boots its own MCP child process, but `.vally.yaml` +> launches the **pre-built** `azsdk-mock.dll` / `azsdk.dll` via +> `dotnet ` (read-only memory-map, no MSBuild on the hot path), so +> `--workers 6+` is safe for `scenarios-mock`. The old MSBuild boot race +> is gone; the only remaining concurrency limit is rate limits on the +> Copilot CLI subprocesses. 
Run a single eval: diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml index 1c709aa5bbb..c3ffb9356ea 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml @@ -63,7 +63,7 @@ stimuli: source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs ref: main prompt: | - I'm in a checkout of azure-rest-api-specs. Walk me through the full + Walk me through the full release-plan + SDK-generation flow for the Contoso Widget Manager end-to-end. Do every step below, in order, and use real tools (no dry-run, no simulation): diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml index 24a81e496fd..d30fc339338 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml @@ -44,7 +44,7 @@ stimuli: source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs ref: main prompt: | - I'm in a checkout of azure-rest-api-specs. Create a public preview + Create a public preview release plan for the Contoso Widget Manager. Here is all the context you need: - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" @@ -81,7 +81,7 @@ stimuli: source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs ref: main prompt: | - I'm in a checkout of azure-rest-api-specs. 
Walk me through creating + Walk me through creating a release plan and then generating SDK for the Contoso Widget Manager: - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager" - API release type: "Public Preview" @@ -125,7 +125,7 @@ stimuli: source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs ref: main prompt: | - I'm in a checkout of azure-rest-api-specs. Using the release-planner + Using the release-planner flow, generate SDK for all languages for the Contoso Widget Manager release plan. Here is the context you need: - release plan work item ID: "29262" @@ -135,10 +135,6 @@ stimuli: max_turns: 8 max_tokens: 10000 graders: - - type: skill-invocation - config: - required: - - azsdk-common-prepare-release-plan - type: tool-calls config: required: @@ -156,7 +152,7 @@ stimuli: source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs ref: main prompt: | - I'm in a checkout of azure-rest-api-specs. Using the release-planner + Using the release-planner flow, update the API spec pull request on an existing Contoso Widget Manager release plan. Here is the context you need: - release plan work item ID: "29262" @@ -187,7 +183,7 @@ stimuli: source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs ref: main prompt: | - I'm in a checkout of azure-rest-api-specs. Using the release-planner + Using the release-planner flow, refresh the SDK package-name details on an existing Contoso Widget Manager release plan from the on-disk TypeSpec emitter configuration. 
Here is the context you need: From 2ce5e7b0ba144e772bad86e2956edd8df2374f06 Mon Sep 17 00:00:00 2001 From: helen229 Date: Fri, 5 Jun 2026 13:35:29 -0700 Subject: [PATCH 23/24] update readme for running steps --- .../azsdk-cli/Azure.Sdk.Tools.Vally/README.md | 103 +++++++++++++----- 1 file changed, 78 insertions(+), 25 deletions(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md index 161474b1ed4..8359c4dc9ba 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md @@ -132,31 +132,84 @@ lives as a `tags:` entry inside each YAML so cross-cuts (e.g. "all release-plan evals") select via [`.vally.yaml`](.vally.yaml) suite filters or `vally eval --tag`. -## Running locally - -Prereqs: - -- Node 22+ -- .NET SDK matching the rest of the repo (see `global.json`) -- `@microsoft/vally-cli` installed via the repo's pinned lockfile: - - ```powershell - cd eng/skill-eval - npm ci - ``` -- **Build the MCP servers once** before running vally. `.vally.yaml` - launches the pre-built DLLs via `dotnet ` to avoid the build-time - race that crashes parallel workers with `MCP error -32000: Connection - closed`. See [`primer-vally-mcp-race.html`](https://github.com/Azure/azure-sdk-tools/blob/main/doc/) (notebook) for the full write-up. - - ```powershell - # From repo root — builds both Azure.Sdk.Tools.Mock and (transitively) Cli - dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli -c Debug - dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug - ``` - - Rebuild after editing any tool source. Vally itself does **not** rebuild - the MCP server — it just spawns the existing DLL. +## Quickstart — run one scenario + +The fastest path from a fresh clone to a green eval. Swap the path after +`-e` for any other `.eval.yaml` to try a different scenario. + +### 1. One-time setup + +```powershell +# From repo root +cd eng/skill-eval; npm ci; cd ../..
+dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli -c Debug +dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug +``` + +Rebuild the MCP servers any time you edit tool source. Vally does **not** +rebuild them — it just spawns the existing DLL. + +### 2. Move into this project and stash the paths + +All commands below run from here: + +```powershell +cd tools/azsdk-cli/Azure.Sdk.Tools.Vally +$vally = '../../../eng/skill-eval/node_modules/.bin/vally.cmd' +$skills = '../../../.github/skills' +``` + +### 3. Run a scenario + +**One trigger eval** (~30 s, hermetic): + +```powershell +& $vally eval -e evals/tools/create-release-plan.eval.yaml --skill-dir $skills +``` + +**The release-planner mock workflow** (~4 min, 5 stimuli, hermetic): + +```powershell +& $vally eval -e evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml --skill-dir $skills +``` + +**The release-planner live workflow** (~15 min, real DevOps writes to the +test area; prime the spec clone once): + +```powershell +./evals/setup/ensure-specs-clone.ps1 +& $vally eval -e evals/workflow-scenarios/live/release-planner.eval.yaml --skill-dir $skills --workers 1 +``` + +### 4. Pick a different scenario + +```powershell +# List everything you can pass to -e +Get-ChildItem evals -Recurse -Filter *.eval.yaml | ForEach-Object FullName +``` + +Common swaps: + +| What you want | Replace `-e` value with | +|---|---| +| A different release-plan trigger | `evals/tools/link-namespace-approval-issue.eval.yaml` | +| A TypeSpec workflow | `evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml` | +| All triggers for one feature | drop `-e` and use `--suite typespec` (or `release-plan`, `github`, …) | +| Everything hermetic | drop `-e` and use `--suite pr-gate` | + +### 5. Read the results + +- Live PASS/FAIL table prints to the terminal. +- Full trajectories land in `results/<run-id>/`.
The most useful file + is `results.jsonl` — one line per stimulus run, with the prompt, every + tool call, and the final agent message. +- Add `--output-dir vally-results/` if you want a stable path + to re-open later. + +## Running locally (advanced) + +Prereqs are the same as the [Quickstart](#quickstart--run-one-scenario) +plus Node 22+ and a .NET SDK matching `global.json`. Run a suite (recommended): From 84379cfe24ceae7da3fbfcb71c3d30b73f444283 Mon Sep 17 00:00:00 2001 From: helen229 Date: Fri, 5 Jun 2026 22:42:42 -0700 Subject: [PATCH 24/24] Vally: align mock release-planner grader with live + deterministic 'not found' lookup The create-release-plan-and-generate-sdk mock stimulus required the agent to call azsdk_update_sdk_details_in_release_plan, but neither the prompt nor the azsdk-common-prepare-release-plan skill's create flow asks for it. The agent correctly skipped the tool, and the grader flapped. The dedicated update-sdk-details-in-release-plan stimulus already covers that tool with an explicit prompt. Drop it from the create+generate grader so mock matches the live release-planner-e2e contract (create / get / generate / link). Also patch GetReleasePlanForSpecPrHandler to return a deterministic 'not found' response (ReleasePlanDetails = null). The mock previously returned a 'plan exists' result for any spec PR, pushing the agent down the update path instead of the create path that the stimulus exercises. Stimuli that target an existing plan pass the work-item ID directly and call azsdk_get_release_plan, so this is safe. 
--- .../Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs | 7 +++++-- .../mock/release-planner-workflows.eval.yaml | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs b/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs index b092008b4db..24acb887424 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs @@ -64,12 +64,15 @@ public class UpdateReleasePlanHandler : IMockToolHandler public class GetReleasePlanForSpecPrHandler : IMockToolHandler { public string ToolName => "azsdk_get_release_plan_for_spec_pr"; + // Deterministic "not found" — keeps the create-release-plan flow honest in + // eval scenarios. Stimuli that target an existing plan pass the work-item + // ID directly and call azsdk_get_release_plan instead. See #15948. public CommandResponse Handle(Dictionary? 
arguments) => new ReleasePlanResponse { TypeSpecProject = "specification/contosowidgetmanager/Contoso.WidgetManager", PackageType = SdkType.Dataplane, - Message = "Release plan found for spec PR (mock)", - ReleasePlanDetails = ReleasePlanMockResponses.ContosoWorkItem() + Message = "No release plan found for the given spec PR (mock)", + ReleasePlanDetails = null }; } diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml index d30fc339338..6a7d1c77230 100644 --- a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml +++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml @@ -97,7 +97,7 @@ stimuli: constraints: max_turns: 12 max_tokens: 16000 - # TODO: assert ordering create -> get -> generate -> update-details + # TODO: assert ordering create -> get -> generate # — blocked on Vally tool-calls grader sequence: support. graders: - type: skill-invocation @@ -110,7 +110,6 @@ stimuli: - azsdk_get_release_plan - azsdk_create_release_plan - azsdk_run_generate_sdk - - azsdk_update_sdk_details_in_release_plan disallowed: - azsdk_verify_setup