diff --git a/.github/skills/.vally.yaml b/.github/skills/.vally.yaml
index 4ee187c6e8e..eaea06f0450 100644
--- a/.github/skills/.vally.yaml
+++ b/.github/skills/.vally.yaml
@@ -12,12 +12,15 @@ paths:
   evalFilenames: ["eval.yaml", "*.eval.yaml"]
 
 environments:
+  # Launch the pre-built DLLs via `dotnet <dll>`, NOT `dotnet run` — avoids the
+  # MSBuild boot race under parallel workers. See issue #15948.
+  # CI builds the DLLs in the 'Build MCP servers' step of eng/pipelines/skill-eval.yml.
   azsdk-mcp:
     mcpServers:
       azure-sdk-mcp:
         type: stdio
         command: dotnet
-        args: ["run", "--project", "../../tools/azsdk-cli/Azure.Sdk.Tools.Cli", "--", "start"]
+        args: ["../../artifacts/bin/Azure.Sdk.Tools.Cli/Debug/net8.0/azsdk.dll", "start"]
         timeout: "60s"
         env:
           AZSDKTOOLS_AGENT_TESTING: "true"
@@ -27,5 +30,5 @@ environments:
       azure-sdk-mcp:
         type: stdio
         command: dotnet
-        args: ["run", "--project", "../../tools/azsdk-cli/Azure.Sdk.Tools.Mock"]
+        args: ["../../artifacts/bin/Azure.Sdk.Tools.Mock/Debug/net8.0/azsdk-mock.dll"]
         timeout: "60s"
diff --git a/eng/pipelines/skill-eval.yml b/eng/pipelines/skill-eval.yml
index c8d5d2e3ebd..24e930d9e0e 100644
--- a/eng/pipelines/skill-eval.yml
+++ b/eng/pipelines/skill-eval.yml
@@ -42,6 +42,14 @@ jobs:
       - script: npm install -g @github/copilot-sdk
         displayName: 'Install Copilot SDK'
 
+      # Pre-build the MCP servers so vally launches `dotnet <dll>` instead of
+      # `dotnet run` (avoids the MSBuild boot race under parallel workers; see
+      # issue #15948). `&&` fails the step if the first build fails.
+      - script: |
+          dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli  -c Debug --nologo &&
+          dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug --nologo
+        displayName: 'Build MCP servers'
+
       - script: |
           input_areas=$(echo "${{ parameters.areas }}" | xargs)
           if [ -n "$input_areas" ]; then
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs b/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs
index b092008b4db..24acb887424 100644
--- a/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Mock/Handlers/ReleasePlan/ReleasePlanRemainingHandlers.cs
@@ -64,12 +64,15 @@ public class UpdateReleasePlanHandler : IMockToolHandler
 public class GetReleasePlanForSpecPrHandler : IMockToolHandler
 {
     public string ToolName => "azsdk_get_release_plan_for_spec_pr";
+    // Deterministic "not found": `arguments` is ignored and no plan details are
+    // returned, so evals always exercise the create-release-plan path. Stimuli for
+    // an existing plan pass its work-item ID to azsdk_get_release_plan. See #15948.
     public CommandResponse Handle(Dictionary<string, object?>? arguments) => new ReleasePlanResponse
     {
         TypeSpecProject = "specification/contosowidgetmanager/Contoso.WidgetManager",
         PackageType = SdkType.Dataplane,
-        Message = "Release plan found for spec PR (mock)",
-        ReleasePlanDetails = ReleasePlanMockResponses.ContosoWorkItem()
+        Message = "No release plan found for the given spec PR (mock)",
+        ReleasePlanDetails = null
     };
 }
 
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore
new file mode 100644
index 00000000000..80a68f12750
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore
@@ -0,0 +1,2 @@
+vally-results/
+results/
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml
new file mode 100644
index 00000000000..20254a644c8
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml
@@ -0,0 +1,92 @@
+# Vally configuration for Azure SDK Tools MCP tool / scenario evaluations.
+# See: https://vally.dev/reference/vally-config
+#
+# These are scenario evals (does the agent invoke the right MCP tool(s) for a
+# given prompt?) and are intentionally separate from the per-skill evals under
+# .github/skills/. See README.md for context.
+
+paths:
+  evals: [evals/]
+  evalFilenames: ["*.eval.yaml"]
+  results: results/
+
+environments:
+  # Launch the pre-built DLL via `dotnet <dll>`, NOT `dotnet run` — avoids the
+  # MSBuild boot race under parallel workers. See issue #15948.
+  # Run `dotnet build ../Azure.Sdk.Tools.Mock -c Debug` before vally, and again after editing tool source.
+  azsdk-mcp-mock:
+    mcpServers:
+      azure-sdk-mcp:
+        type: stdio
+        command: dotnet
+        args: ["../../../artifacts/bin/Azure.Sdk.Tools.Mock/Debug/net8.0/azsdk-mock.dll"]
+        timeout: "30s"
+
+  # Live MCP. AZSDKTOOLS_AGENT_TESTING=true keeps write tools inside the test
+  # area. Pre-built DLL pattern — see issue #15948.
+  # Run `dotnet build ../Azure.Sdk.Tools.Cli -c Debug` before vally, and again after editing tool source.
+  azsdk-mcp-live:
+    mcpServers:
+      azure-sdk-mcp:
+        type: stdio
+        command: dotnet
+        args: ["../../../artifacts/bin/Azure.Sdk.Tools.Cli/Debug/net8.0/azsdk.dll", "start"]
+        timeout: "5m"
+        env:
+          AZSDKTOOLS_AGENT_TESTING: "true"
+          AZSDKTOOLS_COLLECT_TELEMETRY: "false"
+
+# Suites group evals for selective execution.
+#
+# Layout maps directly to suites — no tag-based mock/live filtering. Vally's
+# suite filter is positive-match only (AND across keys, OR within values),
+# so subfolders are the cleanest way to split mock vs live. See
+# https://github.com/microsoft/vally suite-filter source.
+suites:
+  # ---- by tier ----
+  unit:
+    description: |
+      Hermetic single-tool / trigger evals. No external I/O. Fast; the
+      foundation of the PR gate.
+    evals: ["evals/tools/*.eval.yaml"]
+
+  scenarios-mock:
+    description: |
+      Multi-tool scenarios against the mock MCP environment. Hermetic; safe
+      for PR gate.
+    evals: ["evals/workflow-scenarios/mock/*.eval.yaml"]
+
+  scenarios-live:
+    description: |
+      Scenarios against live MCP — real DevOps / GitHub / pipelines. Slow;
+      nightly only. Prime any required clones first via
+      `evals/setup/ensure-specs-clone.ps1`.
+    evals: ["evals/workflow-scenarios/live/*.eval.yaml"]
+
+  # ---- composite suites ----
+  pr-gate:
+    description: Hermetic tiers only (unit + scenarios-mock). Target for CI PR check.
+    evals:
+      - "evals/tools/*.eval.yaml"
+      - "evals/workflow-scenarios/mock/*.eval.yaml"
+  nightly:
+    description: All tiers including live scenarios.
+    evals: ["evals/**/*.eval.yaml"]
+
+  # ---- by feature area (tag-filtered) ----
+  release-plan:
+    description: All evals tagged area=release-plan.
+    filter: { area: release-plan }
+    evals: ["evals/**/*.eval.yaml"]
+  typespec:
+    description: All evals tagged area=typespec.
+    filter: { area: typespec }
+    evals: ["evals/**/*.eval.yaml"]
+  pipeline:
+    description: All evals tagged area=pipeline.
+    filter: { area: pipeline }
+    evals: ["evals/**/*.eval.yaml"]
+  github:
+    description: All evals tagged area=github.
+    filter: { area: github }
+    evals: ["evals/**/*.eval.yaml"]
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md
new file mode 100644
index 00000000000..8359c4dc9ba
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/README.md
@@ -0,0 +1,341 @@
+# Azure.Sdk.Tools.Vally
+
+MCP-tool / end-to-end scenario evaluations for the `azsdk` MCP server, run via
+[`@microsoft/vally-cli`](https://www.npmjs.com/package/@microsoft/vally-cli).
+
+## Tool-scenario evals vs. skill evals
+
+The repo runs **two complementary eval surfaces**, both via the same
+`@microsoft/vally-cli` binary. They answer different questions and live in
+different folders. A full end-to-end gate runs *both*.
+
+| | **Tool-scenario evals** (this project) | **Skill evals** |
+|---|---|---|
+| **Question** | Given a user prompt, does the agent invoke the right MCP tool(s) with the right shape? | Given a user prompt, does the agent route to the right skill and follow its instructions? |
+| **Catches** | Tool name / description / parameter regressions; multi-tool ordering; tool-catalog conflicts | Skill frontmatter / `description` / instruction regressions; skill-routing collisions |
+| **Path** | [`tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/`](evals/) (`tools/` + `workflow-scenarios/`) | [`.github/skills/<skill-name>/evals/*.eval.yaml`](../../../.github/skills/) (and `evaluate/evals/` for capability suites) |
+| **Loaded subject** | MCP server over stdio — mock (`Azure.Sdk.Tools.Mock`) for hermetic tiers, production (`Azure.Sdk.Tools.Cli`, real tools and network calls) for live scenarios | Skill's `SKILL.md` + frontmatter; the agent picks tools itself |
+| **Primary grader** | `tool-calls` — checks the recorded trajectory for required tool names | Trigger / routing graders + per-skill rubric |
+| **Run command** | `vally eval --eval-spec evals/tools/<name>.eval.yaml` *from this directory* | `vally eval --skill-dir .github/skills/<skill-name>` *from repo root* |
+| **CI status** | Not wired yet (see follow-ups) | `vally lint` runs in [.github/workflows/skill-eval.yml](../../../.github/workflows/skill-eval.yml); full `eval` job pending |
+| **Cost profile** | Higher — each run spins up the MCP server, real LLM turns (~5–15), real tool calls | Variable — trigger evals are cheap; capability evals (e.g. `azure-typespec-author`) are expensive |
+
+### Why both?
+
+A skill *uses* tools, but a tool can be invoked **without** any skill
+(Copilot picks it directly from the catalog when the user prompt doesn't
+trigger a skill — which is most prompts in practice). Concretely:
+
+- Drop tool-scenario evals → you stop catching regressions when someone
+  renames a tool, edits its description, or adds an overlapping tool that
+  the model now prefers.
+- Drop skill evals → you stop catching regressions when someone edits a
+  skill's `description`, frontmatter, or instruction body and the router
+  stops invoking it for the right prompts.
+
+For workflows where a skill is a thin wrapper around one tool, the two
+evals have meaningful overlap and you may keep just one. For workflows
+where the skill does real orchestration (multi-tool sequencing,
+conditional branches, recovery), both matter independently.
+
+### Scenarios checked in today
+
+**Tool-scenario evals (this project)** — organised by the standard test pyramid under [`evals/`](evals/). The folder is the **cost tier** (and CI cadence); the feature **area** is a tag inside each YAML so cross-cuts work via `.vally.yaml` suite filters.
+
+#### `evals/tools/` — hermetic single-tool evals (18)
+
+One prompt → one expected MCP tool. No `environment.git`, no fixtures. Fast; safe to run on every PR. Includes the per-tool **trigger** coverage ported from [#15183](https://github.com/Azure/azure-sdk-tools/pull/15183) (`triggers-*.eval.yaml`).
+
+| Scenario | Area | Shape |
+|---|---|---|
+| [`check-public-repo`](evals/tools/check-public-repo.eval.yaml) | typespec | Is a TypeSpec project published in `azure-rest-api-specs`? |
+| [`validate-typespec`](evals/tools/validate-typespec.eval.yaml) | typespec | Run `tsp` linter/validation |
+| [`get-modified-typespec-projects`](evals/tools/get-modified-typespec-projects.eval.yaml) | typespec | Git-aware tool against current branch |
+| [`add-arm-resource`](evals/tools/add-arm-resource.eval.yaml) | typespec | Calls `azsdk_typespec_generate_authoring_plan` for an ARM resource |
+| [`create-release-plan`](evals/tools/create-release-plan.eval.yaml) | release-plan | Create a release-plan work item |
+| [`link-namespace-approval-issue`](evals/tools/link-namespace-approval-issue.eval.yaml) | release-plan | Link an existing approval issue to a release plan |
+| [`get-pr-link-current-branch`](evals/tools/get-pr-link-current-branch.eval.yaml) | github | Resolve the PR for the active git branch |
+| [`check-sdk-generation-status`](evals/tools/check-sdk-generation-status.eval.yaml) | pipeline | Pipeline status lookup |
+| [`triggers-apiview`](evals/tools/triggers-apiview.eval.yaml) | apiview | `azsdk_apiview_*` |
+| [`triggers-config`](evals/tools/triggers-config.eval.yaml) | engsys | `azsdk_check_service_label`, `azsdk_create_service_label` |
+| [`triggers-engsys`](evals/tools/triggers-engsys.eval.yaml) | engsys | `azsdk_analyze_log_file`, failed-test tools, codeowner-cache |
+| [`triggers-github`](evals/tools/triggers-github.eval.yaml) | github | `azsdk_create_pull_request`, `azsdk_get_pull_request*`, `azsdk_get_github_user_details` |
+| [`triggers-package`](evals/tools/triggers-package.eval.yaml) | package | `azsdk_package_*`, `azsdk_release_sdk` |
+| [`triggers-pipeline`](evals/tools/triggers-pipeline.eval.yaml) | pipeline | `azsdk_analyze_pipeline`, `azsdk_get_pipeline_*` |
+| [`triggers-releaseplan`](evals/tools/triggers-releaseplan.eval.yaml) | release-plan | `azsdk_*_release_plan*`, `azsdk_run_generate_sdk`, `azsdk_link_*` |
+| [`triggers-typespec`](evals/tools/triggers-typespec.eval.yaml) | typespec | `azsdk_typespec_*`, `azsdk_convert_swagger_to_typespec`, `azsdk_customized_code_update`, `azsdk_run_typespec_validation` |
+| [`triggers-verify`](evals/tools/triggers-verify.eval.yaml) | engsys | `azsdk_verify_setup` |
+
+The companion [`scripts/Validate-EvalTools.ps1`](scripts/Validate-EvalTools.ps1) cross-checks that every tool referenced in `evals/tools/triggers-*.eval.yaml` exists on the running MCP server, and every server tool has at least one trigger.
+
+#### `evals/workflow-scenarios/` — multi-tool scenarios (4)
+
+Multi-step prompts that exercise 2+ MCP tools end-to-end. Split into
+`mock/` (hermetic, runs on PR gate) and `live/` (real DevOps / GitHub /
+pipelines, runs nightly).
+
+| Scenario | Area | Mode | Shape |
+|---|---|---|---|
+| [`check-public-repo-then-validate`](evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml) | typespec | mock | Validate, then check public-repo presence |
+| [`typespec-generation-step02`](evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml) | typespec | mock | Step in the spec-PR generation flow |
+| [`rename-client-property`](evals/workflow-scenarios/mock/rename-client-property.eval.yaml) | typespec | mock | Stub — needs `expected-diff` grader + sparse clone |
+| [`release-planner`](evals/workflow-scenarios/live/release-planner.eval.yaml) | release-plan | **live** | Create + re-fetch a release plan, kick off SDK gen, link PR back — real DevOps test-area writes |
+
+Live scenarios need a primed `azure-rest-api-specs` clone — run
+[`evals/setup/ensure-specs-clone.ps1`](evals/setup/ensure-specs-clone.ps1)
+(auto-refreshes every 24h) before invoking the `scenarios-live` / `nightly` suite.
+
+**Skill evals (already in repo, *not* part of this PR)** — for reference:
+
+- **Trigger evals** (one per skill, verify routing): see e.g.
+  [`.github/skills/azsdk-common-prepare-release-plan/evals/trigger.eval.yaml`](../../../.github/skills/azsdk-common-prepare-release-plan/evals/trigger.eval.yaml),
+  plus `azsdk-common-sdk-release`, `azsdk-common-pipeline-troubleshooting`,
+  `azsdk-common-apiview-feedback-resolution`, `sensei`,
+  `skill-authoring`, `markdown-token-optimizer`.
+- **Capability suite** for [`azure-typespec-author`](../../../.github/skills/azure-typespec-author/) —
+  29 numbered cases under
+  [`.github/skills/azure-typespec-author/evaluate/evals/`](../../../.github/skills/azure-typespec-author/evaluate/evals/)
+  (`001001.eval.yaml` … `005001.eval.yaml`). These are the data-driven
+  TypeSpec authoring scenarios that *would* have been our follow-up #1
+  here — they're already covered as skill evals, so this project doesn't
+  re-port them.
+
+This project supersedes the deleted `Azure.Sdk.Tools.Cli.Benchmarks` project
+(removed in [#15697](https://github.com/Azure/azure-sdk-tools/pull/15697)) and
+tracks the migration in
+[#15124](https://github.com/Azure/azure-sdk-tools/issues/15124).
+
+## Layout
+
+```
+Azure.Sdk.Tools.Vally/
+├── .vally.yaml                # Vally config (environments + suites)
+├── evals/
+│   ├── tools/                  # tool-shape + per-skill trigger evals, hermetic
+│   ├── workflow-scenarios/
+│   │   ├── mock/              # multi-tool scenarios, hermetic (PR gate)
+│   │   └── live/              # multi-tool scenarios, live MCP (nightly)
+│   ├── setup/                 # helper scripts (e.g. ensure-specs-clone.ps1)
+│   └── fixtures/              # (future) pinned SHAs + per-eval mocks
+├── fixtures/                  # Per-scenario static input files (env.files)
+│   └── <scenario-name>/...
+├── scripts/                   # Repo-side helpers (Validate-EvalTools.ps1, …)
+└── Graders/                   # (future) Custom .NET graders
+    └── Azure.Sdk.Tools.Vally.csproj  # added when first custom grader lands
+```
+
+Folder = tier (cost / CI cadence): `tools/` is hermetic + fast,
+`workflow-scenarios/mock/` is multi-tool hermetic, `workflow-scenarios/live/` is multi-tool
+against real services. Vally's suite filter is positive-match only, so the
+mock-vs-live split lives on disk rather than in tags. Feature **area** still
+lives as a `tags:` entry inside each YAML so cross-cuts (e.g. "all
+release-plan evals") select via [`.vally.yaml`](.vally.yaml) suite filters
+or `vally eval --tag`.
+
+## Quickstart — run one scenario
+
+The fastest path from a fresh clone to a green eval. Swap the path after
+`-e` for any other `.eval.yaml` to try a different scenario.
+
+### 1. One-time setup
+
+```powershell
+# From repo root
+cd eng/skill-eval; npm ci; cd ../..
+dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli  -c Debug
+dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug
+```
+
+Rebuild the MCP servers any time you edit tool source. Vally does **not**
+rebuild them — it just spawns the existing DLL.
+
+### 2. Move into this project and stash the paths
+
+All commands below run from here:
+
+```powershell
+cd tools/azsdk-cli/Azure.Sdk.Tools.Vally
+$vally  = '../../../eng/skill-eval/node_modules/.bin/vally.cmd'
+$skills = '../../../.github/skills'
+```
+
+### 3. Run a scenario
+
+**One trigger eval** (~30 s, hermetic):
+
+```powershell
+& $vally eval -e evals/tools/create-release-plan.eval.yaml --skill-dir $skills
+```
+
+**The release-planner mock workflow** (~4 min, 5 stimuli, hermetic):
+
+```powershell
+& $vally eval -e evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml --skill-dir $skills
+```
+
+**The release-planner live workflow** (~15 min, real DevOps writes to the
+test area; prime the spec clone once):
+
+```powershell
+./evals/setup/ensure-specs-clone.ps1
+& $vally eval -e evals/workflow-scenarios/live/release-planner.eval.yaml --skill-dir $skills --workers 1
+```
+
+### 4. Pick a different scenario
+
+```powershell
+# List everything you can pass to -e
+Get-ChildItem evals -Recurse -Filter *.eval.yaml | ForEach-Object FullName
+```
+
+Common swaps:
+
+| What you want | Replace `-e` value with |
+|---|---|
+| A different release-plan trigger | `evals/tools/link-namespace-approval-issue.eval.yaml` |
+| A TypeSpec workflow | `evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml` |
+| All triggers for one feature | drop `-e` and use `--suite typespec` (or `release-plan`, `github`, …) |
+| Everything hermetic | drop `-e` and use `--suite pr-gate` |
+
+### 5. Read the results
+
+- Live PASS/FAIL table prints to the terminal.
+- Full trajectories land in `results/<timestamp>/`. The most useful file
+  is `results.jsonl` — one line per stimulus run, with the prompt, every
+  tool call, and the final agent message.
+- Add `--output-dir vally-results/<your-tag>` if you want a stable path
+  to re-open later.
+
+## Running locally (advanced)
+
+Prereqs are the same as the [Quickstart](#quickstart--run-one-scenario)
+plus Node 22+ and a .NET SDK matching `global.json`.
+
+Run a suite (recommended):
+
+```powershell
+cd tools/azsdk-cli/Azure.Sdk.Tools.Vally
+$vally = '../../../eng/skill-eval/node_modules/.bin/vally.cmd'
+$skills = '../../../.github/skills'
+
+# Fast tiers only — PR-gate candidate
+& $vally eval --suite pr-gate --skill-dir $skills
+
+# A single tier
+& $vally eval --suite unit --skill-dir $skills
+& $vally eval --suite scenarios-mock --skill-dir $skills
+
+# By feature area (cross-cuts tiers via tag filter)
+& $vally eval --suite release-plan --skill-dir $skills
+& $vally eval --suite typespec --skill-dir $skills
+```
+
+> `--skill-dir` is **required** for workflow-scenario evals — without it,
+> the agent never loads the project skills and the `skill-invocation`
+> grader fails even when the tool calls are correct.
+>
+> Each agent still boots its own MCP child process, but `.vally.yaml`
+> launches the **pre-built** `azsdk-mock.dll` / `azsdk.dll` via
+> `dotnet <dll>` (read-only memory-map, no MSBuild on the hot path), so
+> `--workers 6+` is safe for `scenarios-mock`. The old MSBuild boot race
+> is gone; the only remaining concurrency limit is rate limits on the
+> Copilot CLI subprocesses.
+
+Run a single eval:
+
+```powershell
+& $vally eval --eval-spec evals/tools/check-public-repo.eval.yaml --skill-dir $skills
+```
+
+Run the live scenarios tier (first, prime a per-user clone of
+`azure-rest-api-specs`; the helper refreshes it every 24h):
+
+```powershell
+./evals/setup/ensure-specs-clone.ps1
+& $vally eval --suite scenarios-live --skill-dir $skills --workers 1
+```
+
+## Adding a new scenario
+
+1. **Pick a tier** — the folder you drop the YAML into:
+   - `evals/tools/` — one prompt, one MCP tool, no environment hooks.
+   - `evals/workflow-scenarios/mock/` — multi-tool flow against
+     `azsdk-mcp-mock`. Hermetic; runs on PR gate.
+   - `evals/workflow-scenarios/live/` — needs real DevOps / GitHub /
+     pipelines; bind `environment: azsdk-mcp-live`. Nightly only.
+2. Pick a short, kebab-case name (e.g. `create-release-plan`).
+3. Create `evals/<tier>/<name>.eval.yaml`. Start from a sibling in the same
+   tier as a template.
+4. **Tag it** so suite filters pick it up:
+   ```yaml
+   tags:
+     area: release-plan   # or typespec / pipeline / github / engsys / apiview / package
+   ```
+5. If the scenario needs input files, add them under
+   `fixtures/<name>/...` and reference them via `environment.files` in the
+   eval (relative paths from the eval file).
+6. Pick graders — they’re a **list**, stack as many as you need:
+   - `tool-calls` — verify the agent invoked the expected MCP tool(s).
+   - `skill-invocation` — verify the right skill routed (e2e only).
+   - `tool-call-count` / `token-budget` / `turn-count` — chattiness / budget guards.
+   - `output-matches` / `output-contains` — assert final-message shape.
+   - `file-matches` / `file-exists` — verify produced/modified files.
+   - `prompt` — LLM-as-judge for free-form quality checks.
+   - Custom (`Graders/`) — add a .NET grader when no built-in fits.
+7. The suite picks it up automatically (folders are globbed). Add a new
+   tag-filtered suite to [`.vally.yaml`](.vally.yaml) only if you’re
+   introducing a brand-new feature area.
+8. Run locally to confirm it passes, then open a PR.
+
+## Recovery checklist (from deleted benchmark)
+
+Tracked in [#15124](https://github.com/Azure/azure-sdk-tools/issues/15124).
+All 9 deleted scenarios have been ported as Vally `tool-calls` evals (presence
+checks). Items marked with **(stub)** have known gaps documented inline in the
+eval file:
+
+- [x] `check-public-repo`
+- [x] `check-public-repo-then-validate`
+- [x] `validate-typespec`
+- [x] `typespec-generation-step02`
+- [x] `get-modified-typespec-projects` **(stub — needs git-repo fixture / setup hook)**
+- [x] `add-arm-resource` **(stub — needs fixtures + `npx tsp compile` post-check)**
+- [x] `create-release-plan`
+- [x] `link-namespace-approval-issue`
+- [x] `get-pr-link-current-branch`
+- [x] `check-sdk-generation-status`
+- [x] `rename-client-property` **(stub — needs `expected-diff` grader + sparse-clone of `azure-rest-api-specs`)**
+
+### Known gaps vs. the original benchmark
+
+The current `tool-calls` grader only checks tool *names*. The deleted
+benchmark's `ToolCallValidator` additionally asserted:
+
+1. **Argument values** (e.g. `serviceTreeId`, `buildId`, `typeSpecProjectPath`).
+2. **Forbidden tools** (e.g. "must NOT call `azsdk_verify_setup`").
+3. **Call order** (e.g. validate before check-public-repo).
+4. **Optional tools** (calls that are allowed but not required).
+
+Recovering 1–4 requires either upstream grader support in
+`@microsoft/vally-cli` or a custom .NET grader under `Graders/`. Until then
+those constraints are captured in prompt text and inline `TODO:` comments.
+
+### Follow-ups
+
+- [ ] Port `Evaluate_PromptToToolMatch` + `Evaluate_ToolDescriptionSimilarity`
+      from `Azure.Sdk.Tools.Cli.Evaluations` (still uses Copilot-SDK evaluator).
+- [ ] File upstream issue against `@microsoft/vally-cli` to add `forbidden`,
+      `optional`, argument-matching, and ordering to the built-in `tool-calls`
+      grader (or accept that those gaps need custom graders).
+- [ ] Wire a `vally eval` CI job for this project (current
+      [`.github/workflows/skill-eval.yml`](../../../.github/workflows/skill-eval.yml)
+      runs `vally lint` only and is scoped to skills). See
+      [#15126](https://github.com/Azure/azure-sdk-tools/issues/15126) and
+      [#15127](https://github.com/Azure/azure-sdk-tools/issues/15127).
+- [ ] Decide on `AuthoringScenario` parity: the 29 TypeSpec authoring cases
+      are already covered as **skill evals** under
+      [`.github/skills/azure-typespec-author/evaluate/evals/`](../../../.github/skills/azure-typespec-author/evaluate/evals/).
+      Tracked as [#15767](https://github.com/Azure/azure-sdk-tools/issues/15767) —
+      likely close as duplicate unless we also want tool-level coverage of the
+      same prompts (catches catalog regressions even when the skill isn't
+      triggered).
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1
new file mode 100644
index 00000000000..918d544edf8
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/setup/ensure-specs-clone.ps1
@@ -0,0 +1,90 @@
+<#
+.SYNOPSIS
+  Ensures a per-user shallow+sparse cache clone of Azure/azure-rest-api-specs
+  exists and is reasonably fresh.
+
+.DESCRIPTION
+  Run this before invoking the live scenarios
+  (vally eval --suite scenarios-live, or --suite nightly).
+  Maintains a cache clone that Vally's `environment.git.source` points at,
+  so individual eval YAMLs don't need a pre-existing checkout.
+
+  - First run: shallow + blobless + cone-sparse clone (only
+    specification/contosowidgetmanager/ to keep size minimal).
+  - Subsequent runs within -MaxAgeHours: noop.
+  - Subsequent runs past -MaxAgeHours: `git fetch --depth 1 origin main` and
+    hard-reset the checkout to `origin/main`.
+
+  Any git command that exits non-zero aborts the script, so a failed clone
+  or fetch never refreshes the freshness stamp.
+
+  Cache lives at:
+    Windows: $env:USERPROFILE\.vally-cache\azure-rest-api-specs
+    *nix:    $HOME/.vally-cache/azure-rest-api-specs
+
+.PARAMETER MaxAgeHours
+  Skip the `git fetch` if the cache was last refreshed within this many
+  hours. Default: 24.
+
+.PARAMETER SparseCheckoutPaths
+  Cone-sparse paths to include. Default: specification/contosowidgetmanager.
+  Pass @() to disable sparse-checkout (full tree).
+
+.OUTPUTS
+  System.String. The path of the cache clone.
+#>
+[CmdletBinding()]
+param(
+    [int]      $MaxAgeHours        = 24,
+    [string[]] $SparseCheckoutPaths = @('specification/contosowidgetmanager')
+)
+
+$ErrorActionPreference = 'Stop'
+Set-StrictMode -Version 4
+
+# A non-zero exit code from a native command such as git does not stop the
+# script by default, even with $ErrorActionPreference = 'Stop', so check it
+# explicitly. Stdout is discarded so the script's only output is the cache path.
+function Invoke-Git {
+    param([string[]] $GitArgs)
+    git @GitArgs | Out-Null
+    if ($LASTEXITCODE -ne 0) {
+        throw "[ensure-specs-clone] 'git $($GitArgs -join ' ')' failed with exit code $LASTEXITCODE"
+    }
+}
+
+$cacheRoot = if ($env:USERPROFILE) { Join-Path $env:USERPROFILE '.vally-cache' } else { Join-Path $HOME '.vally-cache' }
+$cache     = Join-Path $cacheRoot 'azure-rest-api-specs'
+$stamp     = Join-Path $cache '.vally-last-fetch'
+
+if (-not (Test-Path (Join-Path $cache '.git'))) {
+    Write-Host "[ensure-specs-clone] Cloning azure-rest-api-specs into cache: $cache"
+    New-Item -ItemType Directory -Force -Path $cacheRoot | Out-Null
+    Invoke-Git @('clone', '--depth', '1', '--filter=blob:none', '--no-checkout',
+        'https://github.com/Azure/azure-rest-api-specs.git', $cache)
+    if ($SparseCheckoutPaths.Count -gt 0) {
+        Invoke-Git @('-C', $cache, 'sparse-checkout', 'init', '--cone')
+        Invoke-Git (@('-C', $cache, 'sparse-checkout', 'set') + $SparseCheckoutPaths)
+    }
+    Invoke-Git @('-C', $cache, 'checkout', 'main')
+    Set-Content -Path $stamp -Value (Get-Date -Format o)
+} else {
+    # A missing stamp file is treated as stale.
+    $isStale = $true
+    if (Test-Path $stamp) {
+        $age = (Get-Date) - (Get-Item $stamp).LastWriteTime
+        $isStale = $age.TotalHours -gt $MaxAgeHours
+    }
+    if ($isStale) {
+        Write-Host "[ensure-specs-clone] Refreshing cache (>$MaxAgeHours h old): $cache"
+        Invoke-Git @('-C', $cache, 'fetch', '--depth', '1', 'origin', 'main')
+        Invoke-Git @('-C', $cache, 'reset', '--hard', 'origin/main')
+        # Only stamp after both git commands succeeded.
+        Set-Content -Path $stamp -Value (Get-Date -Format o)
+    } else {
+        Write-Host "[ensure-specs-clone] Cache is fresh (<$MaxAgeHours h): $cache"
+    }
+}
+
+# Echo the cache path so the wrapper can capture it.
+Write-Output $cache
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/add-arm-resource.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/add-arm-resource.eval.yaml
new file mode 100644
index 00000000000..fc96cc2be73
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/add-arm-resource.eval.yaml
@@ -0,0 +1,44 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Add-arm-resource: end-to-end scenario for authoring a new ARM resource
+  via TypeSpec. This is a complex, file-producing scenario (not a single
+  tool-call check) that needs a real fixture + tsp compile verification.
+version: "1.0"
+type: capability
+
+
+tags:
+  tier: unit
+  area: typespec
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: add-arm-resource
+    prompt: |
+      In the specification/widget/resource-manager/Microsoft.Widget/Widget project,
+      add an ARM resource named 'Asset' with CRUD operations.
+    constraints:
+      max_turns: 20
+      max_tokens: 50000
+    # TODO: seed a fixture (environment.files or git) for the Microsoft.Widget
+    # project, add `file-exists` + `file-contains` graders on the produced
+    # asset.tsp, and a `run-command` grader to verify `npx tsp compile`.
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - edit
+            - azsdk_typespec_generate_authoring_plan
+
+scoring:
+  weights:
+    tool-calls: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-public-repo.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-public-repo.eval.yaml
new file mode 100644
index 00000000000..51e2ec3d129
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-public-repo.eval.yaml
@@ -0,0 +1,43 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Tool-scenario evaluation suite for the azsdk MCP server. Verifies the
+  agent invokes the right MCP tools for given prompts, independent of any
+  specific skill.
+version: "1.0"
+type: capability
+
+
+tags:
+  tier: unit
+  area: typespec
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: check-public-repo
+    prompt: |
+      Check if my TypeSpec project is in the public repo.
+      My setup has already been verified, do not run azsdk_verify_setup.
+      Project root: specification/contosowidgetmanager/Contoso.WidgetManager.
+    constraints:
+      max_turns: 5
+      max_tokens: 5000
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_typespec_check_project_in_public_repo
+          disallowed:
+            - azsdk_verify_setup
+
+scoring:
+  weights:
+    tool-calls: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-sdk-generation-status.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-sdk-generation-status.eval.yaml
new file mode 100644
index 00000000000..714d31cd732
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/check-sdk-generation-status.eval.yaml
@@ -0,0 +1,42 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Check-sdk-generation-status: the agent should call azsdk_get_pipeline_status
+  to check the SDK generation pipeline status.
+version: "1.0"
+type: capability
+
+
+tags:
+  tier: unit
+  area: pipeline
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: check-sdk-generation-status
+    prompt: |
+      Check the SDK generation pipeline status for build ID 5513110.
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 5
+      max_tokens: 5000
+    # TODO: assert buildId=5513110 — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher).
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_get_pipeline_status
+          disallowed:
+            - azsdk_verify_setup
+
+scoring:
+  weights:
+    tool-calls: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/create-release-plan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/create-release-plan.eval.yaml
new file mode 100644
index 00000000000..7b4a25f0725
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/create-release-plan.eval.yaml
@@ -0,0 +1,112 @@
+# =============================================================================
+# Scenario: create-release-plan
+# -----------------------------------------------------------------------------
+# Purpose:
+#   Tier-1 "tool-call" eval. Verify that, given a fully-specified prompt with
+#   all required context, the agent invokes `azsdk_create_release_plan` and
+#   does NOT redundantly call `azsdk_verify_setup` (the prompt already states
+#   setup is verified). The call count is not asserted by the grader below.
+#
+# What this eval is NOT:
+#   - Not an end-to-end flow (see release-planner-e2e.eval.yaml for that).
+#   - Does not validate argument values yet — see TODO below + #15833.
+#   - Does not need azure-rest-api-specs cloned; runs against the mock MCP
+#     server (the `azsdk-mcp-mock` environment selected below, defined in
+#     ../../.vally.yaml).
+#
+# How to run locally:
+#   cd tools/azsdk-cli/Azure.Sdk.Tools.Vally
+#   ../../../eng/skill-eval/node_modules/.bin/vally.cmd eval \
+#     --eval-spec evals/tools/create-release-plan.eval.yaml --verbose
+# =============================================================================
+
+name: azsdk-mcp-tool-scenarios
+description: |
+  Create-release-plan: the agent should call azsdk_create_release_plan with
+  the supplied service-tree / product-tree / spec PR context.
+version: "1.0"
+type: capability
+
+
+tags:
+  tier: unit
+  area: release-plan
+
+# `environment: azsdk-mcp-mock` refers to the named environment defined in
+# ../../.vally.yaml (configures the mock azsdk MCP server).
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1            # bump for flakiness sampling (e.g. runs: 5)
+  timeout: 30m       # total wall-clock budget for ALL stimuli in this file
+  model: gpt-5.4     # model alias — see .vally.yaml `models:` map
+  executor: copilot-sdk
+
+stimuli:
+  - name: create-release-plan
+    prompt: |
+      Create a release plan for the Contoso Widget Manager, no need to get it afterwards only create.
+      My setup has already been verified, do not run azsdk_verify_setup. Here is all the context you need:
+      TypeSpec project located at "specification/contosowidgetmanager/Contoso.WidgetManager".
+      Use service tree ID "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f",
+      product tree ID "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e",
+      target release timeline "December 2025",
+      API version "2022-11-01-preview",
+      SDK release type "beta",
+      and link it to the spec pull request "https://github.com/Azure/azure-rest-api-specs/pull/38387".
+
+    # Per-stimulus guardrails. Anything beyond these fails the run.
+    constraints:
+      max_turns: 8       # agent loop iterations
+      max_tokens: 8000   # cumulative token spend
+
+    # TODO: assert serviceTreeId / productTreeId / specApiVersion / specPullRequestUrl / sdkReleaseType / typeSpecProjectPath — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher).
+    #
+    # `graders:` is a LIST — stack as many as you want. Each grader produces a
+    # score in [0,1]; the `scoring.weights` block below combines them into the
+    # final scenario score. Available grader `type:` values include:
+    #
+    #   static (deterministic, free):
+    #     tool-calls, skill-invocation, has-output, no-errors, turn-completed,
+    #     token-budget, tool-call-count, turn-count, error-count, wall-time,
+    #     program, run-command, stdout-contains, stdout-matches,
+    #     stderr-contains, exit-code, file-exists, file-contains,
+    #     file-matches, output-contains, output-matches
+    #   llm (model-judged, costs tokens):
+    #     prompt, pairwise
+    #
+    # Example of stacking multiple graders (uncomment to use):
+    #
+    #   graders:
+    #     - type: tool-calls
+    #       config:
+    #         required: [azsdk_create_release_plan]
+    #         disallowed: [azsdk_verify_setup]
+    #     - type: skill-invocation           # was a specific skill invoked?
+    #       config:
+    #         required: [release-planner]
+    #     - type: tool-call-count            # cap chattiness
+    #       config:
+    #         max: 5
+    #     - type: prompt                     # llm-judged correctness
+    #       config:
+    #         model: gpt-5.4
+    #         rubric: |
+    #           Did the final assistant message confirm the release plan was
+    #           created and surface its ID? Answer "pass" or "fail".
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_create_release_plan
+          disallowed:
+            - azsdk_verify_setup
+
+# Combine grader scores into the final scenario score.
+# Keys must match the grader `type:` (or its `name:` if you set one).
+# `threshold` is the minimum weighted score for the scenario to PASS.
+scoring:
+  weights:
+    tool-calls: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-modified-typespec-projects.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-modified-typespec-projects.eval.yaml
new file mode 100644
index 00000000000..b62e3536c8b
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-modified-typespec-projects.eval.yaml
@@ -0,0 +1,45 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Get-modified-typespec-projects: the agent should call
+  azsdk_get_modified_typespec_projects to list TypeSpec projects modified
+  in the current branch.
+version: "1.0"
+type: capability
+
+
+tags:
+  tier: unit
+  area: typespec
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: get-modified-typespec-projects
+    prompt: |
+      List the TypeSpec projects modified in my current branch compared to main.
+      My setup has already been verified, do not run azsdk_verify_setup.
+      The repository root is the relative path ./azure-rest-api-specs.
+    constraints:
+      max_turns: 5
+      max_tokens: 5000
+    # TODO: seed a git worktree fixture (environment.git) with a modified
+    # tspconfig.yaml so the tool actually has a diff to report.
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_get_modified_typespec_projects
+          disallowed:
+            - azsdk_verify_setup
+
+scoring:
+  weights:
+    tool-calls: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-pr-link-current-branch.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-pr-link-current-branch.eval.yaml
new file mode 100644
index 00000000000..bb61cd1f5c3
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/get-pr-link-current-branch.eval.yaml
@@ -0,0 +1,43 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Get-pr-link-current-branch: the agent should call
+  azsdk_get_pull_request_link_for_current_branch when asked about the
+  status of the spec PR on the current branch.
+version: "1.0"
+type: capability
+
+
+tags:
+  tier: unit
+  area: github
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: get-pr-link-current-branch
+    prompt: |
+      What's the status of the spec PR in my current branch? Only check the status once.
+      My setup has already been verified, do not run azsdk_verify_setup.
+      The repository root is the relative path ./azure-rest-api-specs.
+    constraints:
+      max_turns: 5
+      max_tokens: 5000
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_get_pull_request_link_for_current_branch
+          disallowed:
+            - azsdk_verify_setup
+
+scoring:
+  weights:
+    tool-calls: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/link-namespace-approval-issue.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/link-namespace-approval-issue.eval.yaml
new file mode 100644
index 00000000000..aeddc254dd0
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/link-namespace-approval-issue.eval.yaml
@@ -0,0 +1,42 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Link-namespace-approval-issue: the agent should call
+  azsdk_link_namespace_approval_issue to link an issue to a release plan.
+version: "1.0"
+type: capability
+
+
+tags:
+  tier: unit
+  area: release-plan
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: link-namespace-approval-issue
+    prompt: |
+      Link namespace approval issue https://github.com/Azure/azure-sdk/issues/1234 to release plan 12345.
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 5
+      max_tokens: 5000
+    # TODO: assert releasePlanWorkItemId=12345 and namespaceApprovalIssue URL — blocked on https://github.com/Azure/azure-sdk-tools/issues/15833 (Vally tool-calls grader needs generic args matcher).
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_link_namespace_approval_issue
+          disallowed:
+            - azsdk_verify_setup
+
+scoring:
+  weights:
+    tool-calls: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-apiview.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-apiview.eval.yaml
new file mode 100644
index 00000000000..a1de2a6541f
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-apiview.eval.yaml
@@ -0,0 +1,115 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+tags:
+  tier: unit
+  area: apiview
+  priority: p0
+
+stimuli:
+
+  # ==== azsdk_apiview_get_comments triggers ====
+  - name: invoke-azsdk-apiview-get-comments-1
+    prompt: "Get all the APIView comments for my package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_comments"
+  - name: invoke-azsdk-apiview-get-comments-2
+    prompt: "Show me the API review feedback for this package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_comments"
+  - name: invoke-azsdk-apiview-get-comments-3
+    prompt: "What comments did the API reviewers leave?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_comments"
+
+  # ==== azsdk_apiview_get_copilot_review triggers ====
+  - name: invoke-azsdk-apiview-get-copilot-review-1
+    prompt: "Check if my Copilot review is done"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_copilot_review"
+  - name: invoke-azsdk-apiview-get-copilot-review-2
+    prompt: "Get the results of my automated API review job"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_copilot_review"
+  - name: invoke-azsdk-apiview-get-copilot-review-3
+    prompt: "What comments did the Copilot generate for my API review?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_copilot_review"
+
+  # ==== azsdk_apiview_get_review_url triggers ====
+  - name: invoke-azsdk-apiview-get-review-url-1
+    prompt: "Get the APIView review link for the Azure.Storage.Blobs C# package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_review_url"
+  - name: invoke-azsdk-apiview-get-review-url-2
+    prompt: "What is the APIView URL for the azure-core Python package?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_review_url"
+  - name: invoke-azsdk-apiview-get-review-url-3
+    prompt: >
+      Give me the link to the API review page for the Java storage blob package version 12.32.0
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_get_review_url"
+
+  # ==== azsdk_apiview_request_copilot_review triggers ====
+  - name: invoke-azsdk-apiview-request-copilot-review-1
+    prompt: "Request a Copilot review for this API"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_request_copilot_review"
+  - name: invoke-azsdk-apiview-request-copilot-review-2
+    prompt: "Run an automated review on my package's API surface"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_request_copilot_review"
+  - name: invoke-azsdk-apiview-request-copilot-review-3
+    prompt: "Submit this APIView URL for an automated Copilot review"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_apiview_request_copilot_review"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-config.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-config.eval.yaml
new file mode 100644
index 00000000000..d03ba74ad0a
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-config.eval.yaml
@@ -0,0 +1,54 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+tags:
+  tier: unit
+  area: engsys
+  priority: p0
+
+stimuli:
+
+  # ==== azsdk_check_service_label triggers ====
+  - name: invoke-azsdk-check-service-label-1
+    prompt: "Check if a service label exists for my service"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_check_service_label"
+  - name: invoke-azsdk-check-service-label-2
+    prompt: "Does the service label for Contoso already exist?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_check_service_label"
+
+  # ==== azsdk_create_service_label triggers ====
+  - name: invoke-azsdk-create-service-label-1
+    prompt: "Create a new service label for my service"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_create_service_label"
+  - name: invoke-azsdk-create-service-label-2
+    prompt: "Add a service label for Contoso Widget Manager"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_create_service_label"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-engsys.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-engsys.eval.yaml
new file mode 100644
index 00000000000..5ee6b9c584b
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-engsys.eval.yaml
@@ -0,0 +1,102 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+tags:
+  tier: unit
+  area: engsys
+  priority: p0
+
+stimuli:
+
+  # ==== azsdk_analyze_log_file triggers ====
+  - name: invoke-azsdk-analyze-log-file-1
+    prompt: "Analyze this log file for errors"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_analyze_log_file"
+  - name: invoke-azsdk-analyze-log-file-2
+    prompt: "What errors are in this build log?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_analyze_log_file"
+
+  # ==== azsdk_cleanup_ai_agents triggers ====
+  - name: invoke-azsdk-cleanup-ai-agents-1
+    prompt: "Clean up AI agents in my project"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_cleanup_ai_agents"
+
+  # ==== azsdk_get_failed_test_case_data triggers ====
+  - name: invoke-azsdk-get-failed-test-case-data-1
+    prompt: "Get detailed information about a specific failed test"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_failed_test_case_data"
+  - name: invoke-azsdk-get-failed-test-case-data-2
+    prompt: "Show me the error message and stack trace for the failed test TestAuthentication"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_failed_test_case_data"
+
+  # ==== azsdk_get_failed_test_cases triggers ====
+  - name: invoke-azsdk-get-failed-test-cases-1
+    prompt: "Get the list of failed test cases from my test run"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_failed_test_cases"
+  - name: invoke-azsdk-get-failed-test-cases-2
+    prompt: "What tests failed in this TRX file?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_failed_test_cases"
+  - name: invoke-azsdk-get-failed-test-cases-3
+    prompt: "Show me which tests failed"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_failed_test_cases"
+
+  # ==== azsdk_get_failed_test_run_data triggers ====
+  - name: invoke-azsdk-get-failed-test-run-data-1
+    prompt: "Get complete details for all failed tests"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_failed_test_run_data"
+  - name: invoke-azsdk-get-failed-test-run-data-2
+    prompt: "Show me full information about all test failures including stack traces"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_failed_test_run_data"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-github.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-github.eval.yaml
new file mode 100644
index 00000000000..50ed5954c62
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-github.eval.yaml
@@ -0,0 +1,79 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+tags:
+  tier: unit
+  area: github
+  priority: p0
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+stimuli:
+
+  # ==== azsdk_create_pull_request triggers ====
+  - name: invoke-azsdk-create-pull-request-1
+    prompt: "Create a pull request for my changes"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_create_pull_request"
+
+  # ==== azsdk_get_github_user_details triggers ====
+  - name: invoke-azsdk-get-github-user-details-1
+    prompt: "Get details for GitHub user octocat"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_github_user_details"
+  - name: invoke-azsdk-get-github-user-details-2
+    prompt: "Who is the GitHub user johndoe?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_github_user_details"
+
+  # ==== azsdk_get_pull_request triggers ====
+  - name: invoke-azsdk-get-pull-request-1
+    prompt: "Get the details of my pull request"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pull_request"
+  - name: invoke-azsdk-get-pull-request-2
+    prompt: "Show me the status and comments on PR #1234"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pull_request"
+
+  # ==== azsdk_get_pull_request_link_for_current_branch triggers ====
+  - name: invoke-azsdk-get-pull-request-link-for-current-branch-1
+    prompt: "Get the PR link for my current branch"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pull_request_link_for_current_branch"
+  - name: invoke-azsdk-get-pull-request-link-for-current-branch-2
+    prompt: "What's the pull request URL for this branch?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pull_request_link_for_current_branch"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-package.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-package.eval.yaml
new file mode 100644
index 00000000000..e2d3217175a
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-package.eval.yaml
@@ -0,0 +1,233 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+tags:
+  tier: unit
+  area: package
+  priority: p0
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+stimuli:
+
+  # ==== azsdk_package_build_code triggers ====
+  - name: invoke-azsdk-package-build-code-1
+    prompt: "Build my SDK package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_build_code"
+  - name: invoke-azsdk-package-build-code-2
+    prompt: "Compile the code for my package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_build_code"
+
+  # ==== azsdk_package_generate_code triggers ====
+  - name: invoke-azsdk-package-generate-code-1
+    prompt: "Generate SDK code from my TypeSpec"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_generate_code"
+  - name: invoke-azsdk-package-generate-code-2
+    prompt: "Run code generation for my package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_generate_code"
+
+  # ==== azsdk_package_generate_samples triggers ====
+  - name: invoke-azsdk-package-generate-samples-1
+    prompt: "Generate sample code for my SDK package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_generate_samples"
+  - name: invoke-azsdk-package-generate-samples-2
+    prompt: "Create sample code for my package based on these scenarios"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_generate_samples"
+  - name: invoke-azsdk-package-generate-samples-3
+    prompt: "Generate samples for my package using this prompt"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_generate_samples"
+
+  # ==== azsdk_package_pack triggers ====
+  - name: invoke-azsdk-package-pack-1
+    prompt: "Pack my SDK package into a distributable artifact"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_pack"
+  - name: invoke-azsdk-package-pack-2
+    prompt: "Create distributable package artifacts for my SDK"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_pack"
+  - name: invoke-azsdk-package-pack-3
+    prompt: "Generate package artifacts for my SDK"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_pack"
+
+  # ==== azsdk_package_run_check triggers ====
+  - name: invoke-azsdk-package-run-check-1
+    prompt: "Run the azsdk package check command to validate my SDK"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_run_check"
+  - name: invoke-azsdk-package-run-check-2
+    prompt: "Run validation checks on my SDK package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_run_check"
+  - name: invoke-azsdk-package-run-check-3
+    prompt: "Validate the changelog and dependencies for my package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_run_check"
+
+  # ==== azsdk_package_run_tests triggers ====
+  - name: invoke-azsdk-package-run-tests-1
+    prompt: "Run tests for my SDK package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_run_tests"
+  - name: invoke-azsdk-package-run-tests-2
+    prompt: "Run the tests for my specified SDK package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_run_tests"
+
+  # ==== azsdk_package_translate_samples triggers ====
+  - name: invoke-azsdk-package-translate-samples-1
+    prompt: "Translate the sample code from the Python package to the Java package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_translate_samples"
+  - name: invoke-azsdk-package-translate-samples-2
+    prompt: "Convert samples from the source package to the target language package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_translate_samples"
+  - name: invoke-azsdk-package-translate-samples-3
+    prompt: "Translate SDK samples from one language to another"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_translate_samples"
+
+  # ==== azsdk_package_update_changelog_content triggers ====
+  - name: invoke-azsdk-package-update-changelog-content-1
+    prompt: "Update the changelog for my package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_update_changelog_content"
+  - name: invoke-azsdk-package-update-changelog-content-2
+    prompt: "Update the changelog content for my package with new release notes"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_update_changelog_content"
+
+  # ==== azsdk_package_update_metadata triggers ====
+  - name: invoke-azsdk-package-update-metadata-1
+    prompt: "Update the package metadata"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_update_metadata"
+  - name: invoke-azsdk-package-update-metadata-2
+    prompt: "Update the package metadata for my SDK"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_update_metadata"
+
+  # ==== azsdk_package_update_version triggers ====
+  - name: invoke-azsdk-package-update-version-1
+    prompt: "Update my package version"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_update_version"
+  - name: invoke-azsdk-package-update-version-2
+    prompt: "Bump the version to 1.2.0"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_package_update_version"
+
+  # ==== azsdk_release_sdk triggers ====
+  - name: invoke-azsdk-release-sdk-1
+    prompt: "Release my SDK package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_release_sdk"
+  - name: invoke-azsdk-release-sdk-2
+    prompt: "Trigger the release pipeline for my package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_release_sdk"
+  - name: invoke-azsdk-release-sdk-3
+    prompt: "Start the SDK release process for my package"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_release_sdk"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-pipeline.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-pipeline.eval.yaml
new file mode 100644
index 00000000000..5e283686437
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-pipeline.eval.yaml
@@ -0,0 +1,77 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+tags:
+  tier: unit
+  area: pipeline
+  priority: p0
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+stimuli:
+
+  # ==== azsdk_analyze_pipeline triggers ====
+  - name: invoke-azsdk-analyze-pipeline-1
+    prompt: "Analyze my pipeline run"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_analyze_pipeline"
+  - name: invoke-azsdk-analyze-pipeline-2
+    prompt: "What happened in this pipeline build?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_analyze_pipeline"
+
+  # ==== azsdk_get_pipeline_llm_artifacts triggers ====
+  - name: invoke-azsdk-get-pipeline-llm-artifacts-1
+    prompt: "Get the LLM artifacts from my pipeline"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pipeline_llm_artifacts"
+  - name: invoke-azsdk-get-pipeline-llm-artifacts-2
+    prompt: "Download the analysis artifacts from the pipeline run"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pipeline_llm_artifacts"
+
+  # ==== azsdk_get_pipeline_status triggers ====
+  - name: invoke-azsdk-get-pipeline-status-1
+    prompt: "Check the status of my Azure pipeline build 12345678"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pipeline_status"
+  - name: invoke-azsdk-get-pipeline-status-2
+    prompt: "Get the pipeline build status for run 9876543"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pipeline_status"
+  - name: invoke-azsdk-get-pipeline-status-3
+    prompt: "Get the pipeline build status for my CI run"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_pipeline_status"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-releaseplan.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-releaseplan.eval.yaml
new file mode 100644
index 00000000000..1c8f9277ede
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-releaseplan.eval.yaml
@@ -0,0 +1,317 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+tags:
+  tier: unit
+  area: release-plan
+  priority: p0
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+stimuli:
+
+  # ==== azsdk_abandon_release_plan triggers ====
+  - name: invoke-azsdk-abandon-release-plan-1
+    prompt: "Abandon the release plan for work item 12345"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_abandon_release_plan"
+  - name: invoke-azsdk-abandon-release-plan-2
+    prompt: "Cancel and abandon my release plan for work item 12345"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_abandon_release_plan"
+  - name: invoke-azsdk-abandon-release-plan-3
+    prompt: "Mark the release plan for work item 12345 as abandoned"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_abandon_release_plan"
+  - name: invoke-azsdk-abandon-release-plan-4
+    prompt: "Abandon my release plan for work item 12345"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_abandon_release_plan"
+  - name: invoke-azsdk-abandon-release-plan-5
+    prompt: "Cancel the release plan for my service for work item 12345"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_abandon_release_plan"
+
+  # ==== azsdk_check_api_spec_ready_for_sdk triggers ====
+  - name: invoke-azsdk-check-api-spec-ready-for-sdk-1
+    prompt: "Check if my API spec is ready to generate SDK"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_check_api_spec_ready_for_sdk"
+  - name: invoke-azsdk-check-api-spec-ready-for-sdk-2
+    prompt: "Is my TypeSpec ready for SDK generation?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_check_api_spec_ready_for_sdk"
+
+  # ==== azsdk_create_release_plan triggers ====
+  - name: invoke-azsdk-create-release-plan-1
+    prompt: "Create a release plan for my service"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_create_release_plan"
+  - name: invoke-azsdk-create-release-plan-2
+    prompt: "Create a release plan for Contoso Widget Manager service"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_create_release_plan"
+
+  # ==== azsdk_get_release_plan triggers ====
+  - name: invoke-azsdk-get-release-plan-1
+    prompt: "Get the release plan for work item 12345"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_release_plan"
+  - name: invoke-azsdk-get-release-plan-2
+    prompt: "Show me the release plan details"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_release_plan"
+
+  # ==== azsdk_get_release_plan_for_spec_pr triggers ====
+  - name: invoke-azsdk-get-release-plan-for-spec-pr-1
+    prompt: "Get the release plan for my spec PR"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_release_plan_for_spec_pr"
+  - name: invoke-azsdk-get-release-plan-for-spec-pr-2
+    prompt: "What release plan is associated with this spec pull request?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_release_plan_for_spec_pr"
+
+  # ==== azsdk_get_sdk_pull_request_link triggers ====
+  - name: invoke-azsdk-get-sdk-pull-request-link-1
+    prompt: "Get the SDK pull request link from the generation pipeline"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_sdk_pull_request_link"
+  - name: invoke-azsdk-get-sdk-pull-request-link-2
+    prompt: "Where is the PR created by SDK generation?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_sdk_pull_request_link"
+
+  # ==== azsdk_get_service_details_by_typespec_path triggers ====
+  - name: invoke-azsdk-get-service-details-by-typespec-path-1
+    prompt: "Get the service tree details for my TypeSpec project path"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_service_details_by_typespec_path"
+  - name: invoke-azsdk-get-service-details-by-typespec-path-2
+    prompt: "Look up the service and product details using the TypeSpec project path"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_service_details_by_typespec_path"
+  - name: invoke-azsdk-get-service-details-by-typespec-path-3
+    prompt: "What service tree ID and product info is associated with this TypeSpec path?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_service_details_by_typespec_path"
+  - name: invoke-azsdk-get-service-details-by-typespec-path-4
+    prompt: "Find product details for my typespec project"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_service_details_by_typespec_path"
+  - name: invoke-azsdk-get-service-details-by-typespec-path-5
+    prompt: "What service does this TypeSpec project belong to?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_service_details_by_typespec_path"
+  - name: invoke-azsdk-get-service-details-by-typespec-path-6
+    prompt: >
+      Get service and service tree product details for a product using TypeSpec project path
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_service_details_by_typespec_path"
+
+  # ==== azsdk_link_namespace_approval_issue triggers ====
+  - name: invoke-azsdk-link-namespace-approval-issue-1
+    prompt: "Link namespace approval issue to release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_link_namespace_approval_issue"
+  - name: invoke-azsdk-link-namespace-approval-issue-2
+    prompt: "Associate the namespace approval with my release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_link_namespace_approval_issue"
+
+  # ==== azsdk_link_sdk_pull_request_to_release_plan triggers ====
+  - name: invoke-azsdk-link-sdk-pull-request-to-release-plan-1
+    prompt: "Link my SDK pull request to the release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_link_sdk_pull_request_to_release_plan"
+  - name: invoke-azsdk-link-sdk-pull-request-to-release-plan-2
+    prompt: "Link SDK pull request #5678 to release plan 12345"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_link_sdk_pull_request_to_release_plan"
+
+  # ==== azsdk_run_generate_sdk triggers ====
+  - name: invoke-azsdk-run-generate-sdk-1
+    prompt: "Generate SDK from my TypeSpec project using the pipeline"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_run_generate_sdk"
+  - name: invoke-azsdk-run-generate-sdk-2
+    prompt: "Generate SDK for my TypeSpec project using the pipeline"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_run_generate_sdk"
+
+  # ==== azsdk_update_api_spec_pull_request_in_release_plan triggers ====
+  - name: invoke-azsdk-update-api-spec-pull-request-in-release-plan-1
+    prompt: "Update the TypeSpec PR URL in the release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_api_spec_pull_request_in_release_plan"
+  - name: invoke-azsdk-update-api-spec-pull-request-in-release-plan-2
+    prompt: "Update the TypeSpec pull request URL in my release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_api_spec_pull_request_in_release_plan"
+
+  # ==== azsdk_update_language_exclusion_justification triggers ====
+  - name: invoke-azsdk-update-language-exclusion-justification-1
+    prompt: "Update the language exclusion justification"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_language_exclusion_justification"
+  - name: invoke-azsdk-update-language-exclusion-justification-2
+    prompt: "Explain why Python is excluded from this release"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_language_exclusion_justification"
+
+  # ==== azsdk_update_release_plan triggers ====
+  - name: invoke-azsdk-update-release-plan-1
+    prompt: "Update the release plan with my TypeSpec project path and API version"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_release_plan"
+  - name: invoke-azsdk-update-release-plan-2
+    prompt: "Update the spec PR URL and SDK release type in my release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_release_plan"
+  - name: invoke-azsdk-update-release-plan-3
+    prompt: "Update the existing release plan for my service"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_release_plan"
+  - name: invoke-azsdk-update-release-plan-4
+    prompt: "Update my release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_release_plan"
+  - name: invoke-azsdk-update-release-plan-5
+    prompt: "Update TypeSpec project in release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_release_plan"
+
+  # ==== azsdk_update_sdk_details_in_release_plan triggers ====
+  - name: invoke-azsdk-update-sdk-details-in-release-plan-1
+    prompt: "Update SDK details in the release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_sdk_details_in_release_plan"
+  - name: invoke-azsdk-update-sdk-details-in-release-plan-2
+    prompt: "Change the SDK package name in the release plan"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_update_sdk_details_in_release_plan"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-typespec.eval.yaml
new file mode 100644
index 00000000000..63e4b9182f3
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-typespec.eval.yaml
@@ -0,0 +1,190 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+tags:
+  tier: unit
+  area: typespec
+  priority: p0
+
+stimuli:
+
+  # ==== azsdk_convert_swagger_to_typespec triggers ====
+  - name: invoke-azsdk-convert-swagger-to-typespec-1
+    prompt: "Convert my swagger to TypeSpec"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_convert_swagger_to_typespec"
+  - name: invoke-azsdk-convert-swagger-to-typespec-2
+    prompt: "Migrate my API from swagger to TypeSpec"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_convert_swagger_to_typespec"
+
+  # ==== azsdk_customized_code_update triggers ====
+  - name: invoke-azsdk-customized-code-update-1
+    prompt: "Update customized code with patches to fix build errors"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_customized_code_update"
+  - name: invoke-azsdk-customized-code-update-2
+    prompt: "Apply customized code patches and rebuild to fix errors"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_customized_code_update"
+
+  # ==== azsdk_get_modified_typespec_projects triggers ====
+  - name: invoke-azsdk-get-modified-typespec-projects-1
+    prompt: "What TypeSpec projects were modified in my branch?"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_modified_typespec_projects"
+  - name: invoke-azsdk-get-modified-typespec-projects-2
+    prompt: "List the changed TypeSpec projects"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_get_modified_typespec_projects"
+
+  # ==== azsdk_run_typespec_validation triggers ====
+  - name: invoke-azsdk-run-typespec-validation-1
+    prompt: "Run TypeSpec validation on my project"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_run_typespec_validation"
+  - name: invoke-azsdk-run-typespec-validation-2
+    prompt: "Run TypeSpec configuration validation for my project root path"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_run_typespec_validation"
+
+  # ==== azsdk_typespec_check_project_in_public_repo triggers ====
+  - name: invoke-azsdk-typespec-check-project-in-public-repo-1
+    prompt: "Check if my TypeSpec project is in the public repo"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_check_project_in_public_repo"
+  - name: invoke-azsdk-typespec-check-project-in-public-repo-2
+    prompt: "Check if my TypeSpec project is in the public spec repo"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_check_project_in_public_repo"
+
+  # ==== azsdk_typespec_delegate_apiview_feedback triggers ====
+  - name: invoke-azsdk-typespec-delegate-apiview-feedback-1
+    prompt: "Delegate the APIView feedback to Copilot for resolution"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_delegate_apiview_feedback"
+  - name: invoke-azsdk-typespec-delegate-apiview-feedback-2
+    prompt: "Address the APIView comments by creating a GitHub issue and assigning Copilot"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_delegate_apiview_feedback"
+  - name: invoke-azsdk-typespec-delegate-apiview-feedback-3
+    prompt: "Resolve the APIView reviewer feedback from this URL"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_delegate_apiview_feedback"
+  - name: invoke-azsdk-typespec-delegate-apiview-feedback-4
+    prompt: >
+      Help me fix these comments: https://spa.apiview.dev/review/c375391d5ab9419f83e3bdsfas9asdfadf2e?activeApiRevisionId=fc2a4adfasdfasdagae3w3hhtd
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_delegate_apiview_feedback"
+  - name: invoke-azsdk-typespec-delegate-apiview-feedback-5
+    prompt: >
+      Fix this feedback: https://spa.apiview.dev/review/c375391d5ab9419f83e3bdsfas9asdfadf2e?activeApiRevisionId=fc2a4adfasdfasdagae3w3hhtd
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_delegate_apiview_feedback"
+  - name: invoke-azsdk-typespec-delegate-apiview-feedback-6
+    prompt: >
+      Create an issue and assign to copilot to fix this: https://spa.apiview.dev/review/adfaset5391d5ab9419f83e3bds9asdfadf2e?activeApiRevisionId=adf34adfasadastasdagae3w3hhtd
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_delegate_apiview_feedback"
+
+  # ==== azsdk_typespec_generate_authoring_plan triggers ====
+  - name: invoke-azsdk-typespec-generate-authoring-plan-1
+    prompt: >
+      Generate a solution to add a new resource 'asset' for service widget with TypeSpec.
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_generate_authoring_plan"
+  - name: invoke-azsdk-typespec-generate-authoring-plan-2
+    prompt: "Generate a solution to add a new api version for service widget with TypeSpec."
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_generate_authoring_plan"
+  - name: invoke-azsdk-typespec-generate-authoring-plan-3
+    prompt: >
+      Generate a solution to set a default value `21` for property `age` in model EmployeeProperties from a api version say 2025-11-01 with TypeSpec.
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_generate_authoring_plan"
+
+  # ==== azsdk_typespec_init_project triggers ====
+  - name: invoke-azsdk-typespec-init-project-1
+    prompt: "Initialize a new TypeSpec project"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_init_project"
+  - name: invoke-azsdk-typespec-init-project-2
+    prompt: "Initialize a new TypeSpec project for my service"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_typespec_init_project"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-verify.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-verify.eval.yaml
new file mode 100644
index 00000000000..24e43a38451
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/triggers-verify.eval.yaml
@@ -0,0 +1,45 @@
+name: azsdk-mcp-tool-invocation-eval
+description: |
+  Verify that prompts correctly invoke the expected Azure SDK MCP tools.
+  Each stimulus is a single user prompt that should trigger a specific tool.
+version: "1.0"
+type: capability
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 5
+  timeout: "120s"
+  executor: copilot-sdk
+  model: claude-opus-4.6
+
+tags:
+  tier: unit
+  area: engsys
+  priority: p0
+
+stimuli:
+
+  # ==== azsdk_verify_setup triggers ====
+  - name: invoke-azsdk-verify-setup-1
+    prompt: "Verify my environment setup"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_verify_setup"
+  - name: invoke-azsdk-verify-setup-2
+    prompt: "Verify my developer environment setup for MCP tools"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_verify_setup"
+  - name: invoke-azsdk-verify-setup-3
+    prompt: "Verify my MCP release tool setup"
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - name: "azsdk_verify_setup"
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/validate-typespec.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/validate-typespec.eval.yaml
new file mode 100644
index 00000000000..75993329a60
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/tools/validate-typespec.eval.yaml
@@ -0,0 +1,39 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Validate-typespec: the agent should run TypeSpec validation when asked to
+  validate a TypeSpec project.
+version: "1.0"
+type: capability
+
+
+tags:
+  tier: unit
+  area: typespec
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: validate-typespec
+    prompt: |
+      Validate my typespec project. It is already confirmed we are in a public repository.
+      The path to my typespec is specification/contosowidgetmanager/Contoso.WidgetManager/main.tsp.
+    constraints:
+      max_turns: 8
+      max_tokens: 8000
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_run_typespec_validation
+
+scoring:
+  weights:
+    tool-calls: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml
new file mode 100644
index 00000000000..c3ffb9356ea
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/live/release-planner.eval.yaml
@@ -0,0 +1,151 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Live end-to-end demo for the full release-planner -> generate-SDK flow.
+
+  Drives the *real* azsdk-cli MCP server against real DevOps APIs, inside a
+  real git worktree of azure-rest-api-specs. The MCP server runs with
+  AZSDKTOOLS_AGENT_TESTING=true (set globally in .vally.yaml), so work items
+  route to the DevOps test area path and are safe to leave around / re-run.
+
+  This scenario walks the agent through a multi-step chain that exercises
+  multiple skills back-to-back in a single conversation:
+
+    1. Release-plan skill   -> azsdk_create_release_plan, azsdk_get_release_plan
+    2. Generate-SDK skill   -> azsdk_run_generate_sdk
+    3. Release-plan skill   -> azsdk_link_sdk_pull_request_to_release_plan
+
+  The goal is to verify Vally end-to-end (live agent + live MCP + live DevOps)
+  can:
+    - route each turn to the correct skill,
+    - call the correct tool on that skill,
+    - and do so in the expected order across multiple steps.
+
+  Demonstrates Vally's environment.git fixture hook + live MCP + Copilot SDK
+  executor + real DevOps in one shot.
+
+  Prerequisite: a clone of Azure/azure-rest-api-specs at the path referenced
+  by environment.git.source below. Locally, run
+  evals/setup/ensure-specs-clone.ps1 to prime a per-user cache
+  (auto-refresh every 24h) at the path this source points at. CI should
+  clone the repo as a pipeline checkout step instead.
+
+version: "1.0"
+type: capability
+
+tags:
+  area: release-plan
+metadata:
+  repos:
+    - name: Azure/azure-rest-api-specs
+
+# Bound to the live env because the scenario asserts real DevOps writes
+# (test-area path) + real generation pipeline. Mock can't satisfy those
+# graders. Picked up by the `scenarios-live` / `nightly` suite via folder.
+environment: azsdk-mcp-live
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: release-planner-e2e
+    environment:
+      # Source is the per-user cache populated by evals/setup/ensure-specs-clone.ps1
+      # (idempotent shallow+sparse clone, auto-refresh every 24h).
+      # NOTE: hardcoded absolute path — Vally does not currently expand
+      # ${USERPROFILE} / env vars in environment.git.source. Adjust per machine
+      # or replace with a CI-provided path. TODO: file an env-var expansion
+      # request upstream at https://github.com/microsoft/vally/issues
+      git:
+        type: worktree
+        source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs
+        ref: main
+    prompt: |
+      Walk me through the full
+      release-plan + SDK-generation flow for the Contoso Widget Manager
+      end-to-end. Do every step below, in order, and use real tools (no
+      dry-run, no simulation):
+
+        1. Create a release plan using:
+             - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager"
+             - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f"
+             - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e"
+             - target release timeline: "December 2026"
+             - API version: "2022-11-01-preview"
+             - SDK release type: "beta"
+             - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387"
+
+        2. Fetch the release plan you just created back from DevOps to confirm
+           it was saved, and tell me its work-item ID.
+
+        3. Kick off SDK generation for that same TypeSpec project via the
+           generation pipeline (Python SDK is fine). Use the work-item ID
+           from step 2.
+
+        4. Once the generation pipeline reports a pull request URL, link
+           that SDK pull request back to the release plan from step 2.
+
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 20
+      max_tokens: 30000
+    # TODO: assert strict ordering create -> get -> generate -> link
+    #   — blocked on https://github.com/microsoft/vally/issues/453 (tool-calls grader sequence:).
+    # TODO: assert args (serviceTreeId / productTreeId / typeSpecProjectPath / workItemId)
+    #   — blocked on https://github.com/microsoft/vally/issues/454 (tool-calls grader generic args:).
+    # TODO: add `azsdk-common-generate-sdk-locally` (or the equivalent pipeline-
+    #   driven skill) to skill-invocation `required` once a skill that owns
+    #   `azsdk_run_generate_sdk` is registered. Today the only skill that
+    #   declares any of the tools in this scenario is azsdk-common-prepare-release-plan.
+    graders:
+      # 1. Skill-routing check (FIRST — fast, deterministic, free): did the
+      #    agent dispatch to the right skill at all? If this fails, the
+      #    tool-calls grader below is meaningless.
+      - type: skill-invocation
+        config:
+          required:
+            - azsdk-common-prepare-release-plan
+      # 2. Tool-call check: given the right skill was loaded, did it call
+      #    the right MCP tools? Each tool here is owned by the skill above
+      #    except azsdk_run_generate_sdk (see TODO).
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_create_release_plan
+            - azsdk_get_release_plan
+            - azsdk_run_generate_sdk
+            - azsdk_link_sdk_pull_request_to_release_plan
+          disallowed:
+            - azsdk_verify_setup
+      # 3. Final-answer correctness (LLM-judged): the deterministic graders
+      #    above only verify the agent *did* the right things, not that it
+      #    *reported* them back to the user correctly. Tools can fire
+      #    successfully while the final message hallucinates IDs / URLs.
+      #    This grader uses gpt-5.4 as judge against a free-form rubric so
+      #    minor wording variants (`WI 29262`, `work-item #29262`) all pass.
+      - type: prompt
+        config:
+          model: gpt-5.4
+          rubric: |
+            Did the final assistant message clearly state BOTH of the
+            following, consistent with the tools that were actually called?
+
+              1. A numeric DevOps work-item ID for the release plan that was
+                 created (or confirmed). Any unambiguous format is fine
+                 (e.g. "work item 29262", "WI #29262", "/_workitems/edit/29262").
+
+              2. A GitHub pull request URL on
+                 github.com/Azure/azure-sdk-for-* that was linked back to
+                 that release plan.
+
+            Answer "pass" only if BOTH are present. Otherwise answer "fail"
+            and briefly say which one is missing.
+
+scoring:
+  weights:
+    skill-invocation: 1
+    tool-calls: 1
+    prompt: 1
+  threshold: 1.0
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml
new file mode 100644
index 00000000000..95c2ea0f5f2
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/check-public-repo-then-validate.eval.yaml
@@ -0,0 +1,48 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Validate-then-check-public-repo: the agent should run TypeSpec validation,
+  then check if the project is in the public repo.
+version: "1.0"
+type: capability
+
+
+tags:
+  area: typespec
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: validate-then-check-public-repo
+    prompt: |
+      Run TypeSpec validation, then check if the project is in the public repo.
+      Project path: specification/contosowidgetmanager/Contoso.WidgetManager.
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 8
+      max_tokens: 8000
+    # TODO: assert ordering (validate before check) — blocked on https://github.com/Azure/azure-sdk-tools/issues/15832 (Vally tool-calls grader needs sequence:).
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_run_typespec_validation
+            - azsdk_typespec_check_project_in_public_repo
+          disallowed:
+            - azsdk_verify_setup
+      - type: skill-invocation
+        config:
+          required:
+            - azure-typespec-author
+
+scoring:
+  weights:
+    tool-calls: 1
+    skill-invocation: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml
new file mode 100644
index 00000000000..6a7d1c77230
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/release-planner-workflows.eval.yaml
@@ -0,0 +1,214 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Mock-environment workflow scenarios derived from the release-planner
+  replacement test plan (#15835). Each stimulus mirrors one of the four
+  high-level scenarios that release-planner-dashboard must hand off to the
+  agent:
+
+    1. Create a release plan (private preview / public preview / GA)
+    2. Generate SDK for all languages in an existing release plan
+    3. Link a different spec PR to an existing release plan
+    4. Update SDK details (package names) in a release plan
+
+  Plus an end-to-end "create + generate" flow used as the headline demo
+  prompt.
+
+  Bound to the mock MCP — these graders only inspect skill routing and tool
+  selection, not real DevOps writes. The full live e2e flow lives in
+  evals/workflow-scenarios/live/release-planner.eval.yaml.
+
+version: "1.0"
+type: capability
+
+tags:
+  area: release-plan
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  # --- Scenario 1: Create release plan ---------------------------------
+  - name: create-public-preview-release-plan
+    environment:
+      # Per-user cache populated by evals/setup/ensure-specs-clone.ps1
+      # (idempotent shallow+sparse clone, auto-refresh every 24h); same source
+      # as the live e2e, so the TypeSpec path resolves on disk under the mock.
+      # NOTE(review): `source` below is one developer's absolute path — not portable.
+      git:
+        type: worktree
+        source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs
+        ref: main
+    prompt: |
+      Create a public preview
+      release plan for the Contoso Widget Manager. Here is all the context
+      you need:
+        - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager"
+        - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f"
+        - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e"
+        - target release timeline: "June 2026"
+        - API version: "2022-11-01-preview"
+        - SDK release type: "beta"
+        - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387"
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 6
+      max_tokens: 8000
+    graders:
+      - type: skill-invocation
+        config:
+          required:
+            - azsdk-common-prepare-release-plan
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_get_release_plan
+            - azsdk_create_release_plan
+          disallowed:
+            - azsdk_verify_setup
+
+  # --- End-to-end demo prompt: create + generate -----------------------
+  - name: create-release-plan-and-generate-sdk
+    environment:
+      # Same per-user azure-rest-api-specs worktree as the create stimulus
+      # above so the agent sees a real on-disk spec repo.
+      git:
+        type: worktree
+        source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs
+        ref: main
+    prompt: |
+      Walk me through creating
+      a release plan and then generating SDK for the Contoso Widget Manager:
+        - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager"
+        - API release type: "Public Preview"
+        - service tree ID: "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f"
+        - product tree ID: "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e"
+        - target release timeline: "June 2026"
+        - API version: "2022-11-01-preview"
+        - SDK release type: "beta"
+        - spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38387"
+      After the release plan is created, generate SDK for all languages
+      using the work-item ID from the created release plan.
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 12
+      max_tokens: 16000
+    # TODO: assert ordering create -> get -> generate
+    #   — blocked on Vally tool-calls grader sequence: support.
+    graders:
+      - type: skill-invocation
+        config:
+          required:
+            - azsdk-common-prepare-release-plan
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_get_release_plan
+            - azsdk_create_release_plan
+            - azsdk_run_generate_sdk
+          disallowed:
+            - azsdk_verify_setup
+
+  # --- Scenario 2: Generate SDK for an existing release plan -----------
+  - name: generate-sdk-for-existing-release-plan
+    environment:
+      # Same per-user azure-rest-api-specs worktree as the create stimuli
+      # above so the agent can locate the TypeSpec project on disk while
+      # driving the release-planner flow.
+      git:
+        type: worktree
+        source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs
+        ref: main
+    prompt: |
+      Using the release-planner
+      flow, generate SDK for all languages for the Contoso Widget Manager
+      release plan. Here is the context you need:
+        - release plan work item ID: "29262"
+        - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager"
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 8
+      max_tokens: 10000
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_get_release_plan
+            - azsdk_run_generate_sdk
+          disallowed:
+            - azsdk_verify_setup
+            - azsdk_create_release_plan
+
+  # --- Scenario 3: Link a different spec PR to an existing release plan
+  - name: link-different-spec-pr-to-release-plan
+    environment:
+      git:
+        type: worktree
+        source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs
+        ref: main
+    prompt: |
+      Using the release-planner
+      flow, update the API spec pull request on an existing Contoso Widget
+      Manager release plan. Here is the context you need:
+        - release plan work item ID: "29262"
+        - new spec pull request: "https://github.com/Azure/azure-rest-api-specs/pull/38500"
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 6
+      max_tokens: 8000
+    graders:
+      - type: skill-invocation
+        config:
+          required:
+            - azsdk-common-prepare-release-plan
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_get_release_plan
+            - azsdk_update_api_spec_pull_request_in_release_plan
+          disallowed:
+            - azsdk_verify_setup
+            - azsdk_create_release_plan
+
+  # --- Scenario 4: Update SDK details (package names) ------------------
+  - name: update-sdk-details-in-release-plan
+    environment:
+      git:
+        type: worktree
+        source: C:/Users/gaoh/.vally-cache/azure-rest-api-specs
+        ref: main
+    prompt: |
+      Using the release-planner
+      flow, refresh the SDK package-name details on an existing Contoso
+      Widget Manager release plan from the on-disk TypeSpec emitter
+      configuration. Here is the context you need:
+        - release plan work item ID: "29262"
+        - TypeSpec project: "specification/contosowidgetmanager/Contoso.WidgetManager"
+        - tspconfig path: "specification/contosowidgetmanager/Contoso.WidgetManager/tspconfig.yaml"
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 8
+      max_tokens: 10000
+    graders:
+      - type: skill-invocation
+        config:
+          required:
+            - azsdk-common-prepare-release-plan
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_get_release_plan
+            - azsdk_update_sdk_details_in_release_plan
+          disallowed:
+            - azsdk_verify_setup
+            - azsdk_create_release_plan
+
+scoring:
+  weights:
+    skill-invocation: 1
+    tool-calls: 1
+  threshold: 1.0
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/rename-client-property.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/rename-client-property.eval.yaml
new file mode 100644
index 00000000000..0892601a25e
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/rename-client-property.eval.yaml
@@ -0,0 +1,48 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  Rename-client-property: the agent should rename @clientName("uri", "csharp")
+  to @clientName("imageUri", "csharp") on the AddFaceFromUrlRequest.url
+  property in specification/ai/Face/models.common.tsp.
+version: "1.0"
+type: capability
+
+
+tags:
+  area: typespec
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: rename-client-property
+    prompt: |
+      In the specification/ai/Face project, find the AddFaceFromUrlRequest model.
+      It has a property called 'url' that's been renamed to "uri" in c#.
+      Change that to imageUri for c#.
+    constraints:
+      max_turns: 5
+      max_tokens: 5000
+    # TODO: seed a git worktree (environment.git) at specification/ai/Face and
+    # add a `file-matches` grader on models.common.tsp to verify the
+    # @clientName("uri", "csharp") → @clientName("imageUri", "csharp") rename.
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - edit
+      - type: skill-invocation
+        config:
+          required:
+            - azure-typespec-author
+
+scoring:
+  weights:
+    tool-calls: 1
+    skill-invocation: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml
new file mode 100644
index 00000000000..954188d4b5b
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/workflow-scenarios/mock/typespec-generation-step02.eval.yaml
@@ -0,0 +1,47 @@
+name: azsdk-mcp-tool-scenarios
+description: |
+  TypeSpec generation workflow step 2: the agent should check whether the
+  project is in the public repo as part of the validation step.
+version: "1.0"
+type: capability
+
+
+tags:
+  area: typespec
+
+environment: azsdk-mcp-mock
+
+config:
+  runs: 1
+  timeout: 30m
+  model: gpt-5.4
+  executor: copilot-sdk
+
+stimuli:
+  - name: typespec-generation-step02-validation
+    prompt: |
+      I'm working on the TypeSpec generation workflow. I need to validate my TypeSpec project
+      as part of step 2. Please check if my TypeSpec project is in the public repo.
+      The project is at specification/contosowidgetmanager/Contoso.WidgetManager.
+      My setup has already been verified, do not run azsdk_verify_setup.
+    constraints:
+      max_turns: 5
+      max_tokens: 5000
+    graders:
+      - type: tool-calls
+        config:
+          required:
+            - azsdk_typespec_check_project_in_public_repo
+          disallowed:
+            - azsdk_verify_setup
+      - type: skill-invocation
+        config:
+          required:
+            - azure-typespec-author
+
+scoring:
+  weights:
+    tool-calls: 1
+    skill-invocation: 1
+  threshold: 1.0
+
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep
new file mode 100644
index 00000000000..6f799cb330a
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/fixtures/.gitkeep
@@ -0,0 +1,8 @@
+# Scenario fixtures live here, one folder per scenario name
+# (matching the `name:` field in the corresponding evals/*.eval.yaml).
+#
+# Reference them from the eval via:
+#   environment:
+#     files:
+#       - src: ../fixtures/<scenario-name>/<file>
+#         dest: <path inside the workspace>
diff --git a/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1 b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1
new file mode 100644
index 00000000000..fcd257e4cc8
--- /dev/null
+++ b/tools/azsdk-cli/Azure.Sdk.Tools.Vally/scripts/Validate-EvalTools.ps1
@@ -0,0 +1,160 @@
+<#
+.SYNOPSIS
+    Validates that all tool names referenced in tool-trigger eval files exist in the MCP server.
+
+.DESCRIPTION
+    This script:
+    1. Runs `azsdk list` to get all registered MCP tool names from the server.
+    2. Parses all `triggers-*.eval.yaml` files under the unit/ directory.
+    3. Reports any eval tool references that don't exist on the server, and any
+       non-excluded server tools that lack eval coverage (exit code 1 on mismatch).
+
+.PARAMETER ProjectPath
+    Path to the Azure.Sdk.Tools.Cli project. Defaults to ../Azure.Sdk.Tools.Cli relative to this script.
+
+.PARAMETER EvalPath
+    Path to the directory containing `triggers-*.eval.yaml` files.
+    Defaults to ../evals/unit relative to this script.
+
+.PARAMETER SkipBuild
+    If set, passes --no-build to dotnet run (requires a prior build).
+#>
+[CmdletBinding()]
+param(
+    [string]$ProjectPath,
+    [string]$EvalPath,
+    [switch]$SkipBuild
+)
+
+# Fail fast: strict-mode checks on, and every error is treated as terminating.
+Set-StrictMode -Version 4
+$ErrorActionPreference = 'Stop'
+
+# The Vally project root is the parent of this script's folder; the CLI sits beside it.
+$vallyRoot = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path
+$cliParent = (Resolve-Path (Join-Path $vallyRoot "..")).Path
+
+# Fall back to the conventional locations for any path the caller omitted.
+if (-not $ProjectPath) { $ProjectPath = Join-Path $cliParent "Azure.Sdk.Tools.Cli" }
+if (-not $EvalPath) { $EvalPath = Join-Path $vallyRoot "evals/unit" }
+
+# Both locations must exist before the CLI is launched.
+# Write-Error is terminating under 'Stop', so the `return 1` lines are never reached.
+if (-not (Test-Path $ProjectPath)) {
+    Write-Error "CLI project not found at: $ProjectPath"
+    return 1
+}
+if (-not (Test-Path $EvalPath)) {
+    Write-Error "Evaluations directory not found at: $EvalPath"
+    return 1
+}
+
+# Step 1: Get tool names from the MCP server via `azsdk list`
+Write-Host "Running 'azsdk list' to get registered MCP tools..." -ForegroundColor Cyan
+
+$dotnetArgs = @("run", "--project", $ProjectPath)
+if ($SkipBuild) { $dotnetArgs += "--no-build" }
+$dotnetArgs += @("--", "list", "--output", "json")
+$listOutput = & dotnet @dotnetArgs 2>&1
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "'azsdk list' failed with exit code $LASTEXITCODE. Output:`n$($listOutput -join "`n")"
+}
+# Keep only stdout text lines, minus the `dotnet run` launch-settings banner.
+$jsonLines = $listOutput | Where-Object { $_ -is [string] -and $_ -notmatch "^Using launch settings" }
+$jsonText = $jsonLines -join "`n"
+
+try {
+    $parsed = $jsonText | ConvertFrom-Json
+    [string[]]$serverTools = @($parsed.Tools | ForEach-Object { $_.McpToolName } | Where-Object { $_ } | Sort-Object -Unique)
+} catch {
+    Write-Error "Failed to parse 'azsdk list --output json'. Error: $_"
+}
+
+# Tools deliberately exempt from eval coverage (examples, upgrade, and codeowner utilities)
+$excludedTools = @(
+    "azsdk_hello_world",
+    "azsdk_hello_world_fail",
+    "azsdk_example_process_execution",
+    "azsdk_example_powershell_execution",
+    "azsdk_example_azure_service",
+    "azsdk_example_ai_service",
+    "azsdk_example_error_handling",
+    "azsdk_example_agent_fibonacci",
+    "azsdk_example_github_service",
+    "azsdk_example_devops_service",
+    "azsdk_upgrade",
+    "azsdk_engsys_codeowner_view",
+    "azsdk_engsys_codeowner_add_label_owner",
+    "azsdk_engsys_codeowner_remove_label_owner",
+    "azsdk_engsys_codeowner_add_package_owner",
+    "azsdk_engsys_codeowner_remove_package_owner",
+    "azsdk_engsys_codeowner_add_package_label",
+    "azsdk_engsys_codeowner_remove_package_label"
+)
+
+[string[]]$serverTools = @($serverTools | Where-Object { $excludedTools -notcontains $_ })
+
+if ($serverTools.Count -lt 1) {
+    Write-Error "No tools found from 'azsdk list'. Check that the CLI project builds and runs correctly."
+    return 1
+}
+
+Write-Host "Found $($serverTools.Count) tools registered on the MCP server ($($excludedTools.Count) excluded).`n" -ForegroundColor Green
+
+# Step 2: Parse all triggers-*.eval.yaml files in the unit directory for tool name references
+# @() forces an array so .Count is always defined under Set-StrictMode, even when nothing matches.
+$evalFiles = @(Get-ChildItem -Path $EvalPath -Filter "triggers-*.eval.yaml")
+
+if ($evalFiles.Count -eq 0) {
+    Write-Error "No triggers-*.eval.yaml files found in: $EvalPath"
+    return 1
+}
+
+$evalToolsByFile = @{}
+$allEvalTools = [System.Collections.Generic.HashSet[string]]::new()
+
+foreach ($file in $evalFiles) {
+    $key = $file.BaseName
+    # Only double-quoted `name: "azsdk_..."` entries are recognized as tool references.
+    $matchResults = Select-String -Path $file.FullName -Pattern 'name:\s*"(azsdk_[^"]+)"'
+    [string[]]$tools = @($matchResults | ForEach-Object { $_.Matches[0].Groups[1].Value } | Sort-Object -Unique)
+    $evalToolsByFile[$key] = $tools
+    foreach ($t in $tools) { [void]$allEvalTools.Add($t) }
+}
+
+Write-Host "Found $($allEvalTools.Count) unique tools across $($evalFiles.Count) eval files.`n" -ForegroundColor Green
+
+# Step 3: Compare the eval references against the server's tool list, in both directions
+[string[]]$missingFromServer = @($allEvalTools | Where-Object { $_ -notin $serverTools } | Sort-Object)
+[string[]]$missingFromEvals = @($serverTools | Where-Object { $_ -notin $allEvalTools } | Sort-Object)
+
+# An entry in either list means the evals and the server are out of sync.
+$hasErrors = ($missingFromServer.Count -gt 0) -or ($missingFromEvals.Count -gt 0)
+
+if ($missingFromServer.Count -gt 0) {
+    Write-Host "ERROR: Eval references tools NOT found on the MCP server:" -ForegroundColor Red
+    foreach ($tool in $missingFromServer) {
+        # Name the eval file(s) that mention the unknown tool
+        $sources = $evalToolsByFile.GetEnumerator() | Where-Object { $_.Value -contains $tool } | ForEach-Object { $_.Key }
+        Write-Host "  - $tool (referenced in: $($sources -join ', '))" -ForegroundColor Red
+    }
+    Write-Host ""
+}
+
+if ($missingFromEvals.Count -gt 0) {
+    Write-Host "ERROR: Server tools with no eval coverage:" -ForegroundColor Red
+    foreach ($tool in $missingFromEvals) {
+        Write-Host "  - $tool" -ForegroundColor Red
+    }
+    Write-Host ""
+}
+
+# Exit 0 when both lists are empty, 1 when any mismatch was reported.
+Write-Host ""
+if (-not $hasErrors) {
+    Write-Host "RESULT: PASS - All eval tools exist on the MCP server." -ForegroundColor Green
+    exit 0
+}
+
+Write-Host "RESULT: FAIL - Eval tools and MCP server tools are out of sync." -ForegroundColor Red
+exit 1
diff --git a/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md
new file mode 100644
index 00000000000..629f42bfdca
--- /dev/null
+++ b/tools/azsdk-cli/docs/specs/8-operations-agent-eval-strategy.spec.md
@@ -0,0 +1,426 @@
+# Spec: 8 Operations — Agent Evaluation Strategy
+
+## Table of Contents
+
+- [Definitions](#definitions)
+- [Background / Problem Statement](#background--problem-statement)
+- [Goals and Exceptions/Limitations](#goals-and-exceptionslimitations)
+- [Design Proposal](#design-proposal)
+- [Agent Prompts](#agent-prompts)
+- [Success Criteria](#success-criteria)
+- [Open Questions](#open-questions)
+- [Implementation Plan](#implementation-plan)
+
+---
+
+## Definitions
+
+- **Agent**: a live LLM conversation driving Azure SDK MCP tools through skills.
+- **Skill**: a markdown contract under `.github/skills/<name>/` telling the
+  agent *when* to engage and *which* tools/workflow to use.
+- **MCP tool**: a discrete capability exposed by the Azure SDK MCP server.
+- **Workflow scenario**: a user prompt that crosses multiple tools / skills
+  end-to-end (e.g. *create release plan → generate SDK → link the SDK PR*).
+- **Stimulus**: one prompt + its expected behavior — the unit of an eval.
+- **Three graders per stimulus**: `skill-invocation` (right skill picked),
+  `tool-calls` (right tools / order / args), and `prompt` (right final answer).
+- **Mock MCP**: an in-memory fake of the Azure SDK MCP server — no network,
+  no side effects. **Live MCP**: the real server hitting real DevOps / GitHub.
+
+
+---
+
+## Background / Problem Statement
+
+We're shipping agent-driven replacements for manual SDK workflows — starting
+with the release planner. When someone
+asks *"does the agent actually do what we said it does?"*, today the only
+honest answer is "I tried a few prompts on my laptop." That is not good
+enough to hand to partner teams or to keep regressions out as more workflows
+land.
+
+We need a small, shared set of prompts we promise to support, run regularly,
+with a clear pass/fail per prompt — so we can point at the report instead
+of re-demoing.
+
+---
+
+## Goals and Exceptions/Limitations
+
+### Goals
+
+- [ ] **One file per workflow, three graders per prompt** — skill picked,
+      tools called, final answer.
+- [ ] **Mock MCP by default, live MCP only on opt-in** — no accidental writes
+      to DevOps / GitHub; release / publish tools stay mock-only.
+- [ ] **Mock covers every tool the scenarios call**, with realistic responses.
+- [ ] **Anyone can clone and run** — env vars, no hard-coded paths; live
+      scenarios declare what repos they need.
+- [ ] **The run produces a status table** of pass/fail per prompt plus a
+      trajectory per prompt — readable by non-engineers.
+- [ ] **Reports come out in the formats people actually use** — markdown
+      for humans, JUnit for CI, CSV for spreadsheets and dashboards.
+- [ ] **Adding a partner-reported prompt is one new stimulus**, no runner
+      or CI changes.
+- [ ] **Multi-step chains work** (e.g. *validate TypeSpec → create release
+      plan → generate SDK → link the SDK PR*).
+
+### Exceptions and Limitations
+
+- **Some prompts can only be checked against live MCP** — the mock can't
+  prove a release plan was really created. Those run opt-in only.
+- **The agent is not deterministic.** Same prompt, different wording or
+  turn count each run. We grade shape, not exact strings, and accept some
+  flake.
+
+---
+
+
+## Design Proposal
+
+### The three eval kinds
+
+We organize evals around what's actually being tested. No tier numbers —
+use the names. In the table, "What it proves" is the axis that separates the kinds;
+"Agent" and "MCP" say what each runs against, and "Lives in" says where its files go.
+
+| Kind | What it proves | Agent | MCP | Lives in |
+|---|---|---|---|---|
+| **Skills** | A user prompt routes to the right skill. | live | none | `.github/skills/<skill>/evals/` |
+| **Workflows — Mock** | Agent picks the right skills, calls the right tools in the right order with the right args, returns the right answer. | live | **mock** | `evals/workflow-scenarios/mock/` |
+| **Workflows — Live** | Same as above, but against the real backend — catches drift the mock can't see (TypeSpec ordering, real codegen output, real DevOps state). | live | **live** | `evals/workflow-scenarios/live/` |
+
+Plus a hermetic tool-shape layer that isn't agent-driven:
+
+| Kind | What it proves | Lives in |
+|---|---|---|
+| **Tools** | Tool X exists and returns the right shape for these inputs. Cross-skill trigger tables. | `evals/tools/` |
+
+#### Required graders by kind
+
+Mock and live workflow scenarios share the same scenario format but
+differ in which graders are *required* vs *optional*:
+
+| Kind | `tool-calls` | `skill-invocation` | response grader (`prompt` / LLM-judge) |
+|---|---|---|---|
+| **Workflows — Mock** | required | optional | not applicable — mock responses are stubbed, so a response grader has nothing meaningful to assert |
+| **Workflows — Live** | required | required | required — only live runs produce a real assistant answer worth grading |
+
+Rationale: the mock backend deterministically replays canned data, so
+"the agent said the right thing" reduces to "the agent called the right
+tools." Live runs are the only place a free-form response can drift, so
+that's where the response grader earns its cost.
+
+
+### Folder layout
+
+```
+evals/
+├── tools/                  tool-shape + cross-skill triggers (hermetic)
+├── workflow-scenarios/
+│   ├── mock/               workflow scenarios run against the mock MCP
+│   └── live/               workflow scenarios run against the live MCP
+└── setup/                  shared fixture scripts (repo clone, etc.)
+```
+
+A scenario lives under `mock/` or `live/` based on which backend the
+graders are written against, not based on the prompt. A prompt can
+have a `mock/` and a `live/` variant (release-planner does).
+
+**Scenarios are environment-agnostic.** A scenario file declares the
+prompt, expected skills, expected tool sequence, and graders — nothing
+about whether MCP is mock or live. Same file, same graders; the MCP
+backend is picked at run time.
+
+| Run mode | MCP | Repos? | When | Coverage |
+|---|---|---|---|---|
+| Workflows — Mock | mock (stub, no LLM) | azure-sdk-tools only | nightly + on demand | every scenario |
+| Workflows — Live | live (real backends) | azure-sdk-tools + shallow/sparse clones of the spec & language SDK repos each scenario declares | weekly | scenarios tagged `live-safe` (curated subset) |
+
+When live and mock results disagree, the mock is wrong — the divergence
+points straight at the missing or stale handler. Every scenario that
+runs on mock therefore drives the mock to grow handlers for the tools
+it exercises.
+
+### Where each eval lives
+
+| What it tests | Lives in |
+|---|---|
+| **One skill** (does this skill route, call its tools, return a sensible answer) | `.github/skills/<skill>/evals/` |
+| **Cross-skill / cross-tool** (multi-step chains, e2e flows, mock-server integration, anything that doesn't belong to one skill) | `tools/azsdk-cli/Azure.Sdk.Tools.Vally/evals/` |
+
+Skill evals stay next to `SKILL.md` — that's the convention skill
+authors expect, and it keeps everything about a skill in one folder.
+Existing skill eval files do not move.
+
+#### Skill eval suite — current state and direction
+
+The per-skill suite predates this project. Today roughly a dozen skills
+have eval files; some are missing thresholds and pass without asserting
+anything, and most capability stimuli are graded only by a single
+substring check — they pass whether the agent called the right tool,
+the wrong tool, or just echoed the prompt.
+
+*Direction.* Raise the bar on what counts as a per-skill eval: adopt
+the four-layer pattern — skill-invocation + tool-calls + structural
+output match + optional LLM-judge — as the required shape for every
+capability stimulus. A `skill-eval-authoring` skill packages the
+pattern, grader catalog, and anti-patterns so other Azure SDK teams
+adopt without re-learning the gotchas.
+
+### Decision tree — where does my new eval go?
+
+```
+Do you only care that the agent picks the right skill
+(you don't care which tools it then calls)?
+└── yes → .github/skills/<skill-name>/evals/   (not this project)
+
+Do you want to check that one MCP tool returns the right shape
+for a given input — no agent in the loop?
+└── yes → evals/tools/
+
+Is it a multi-step / multi-tool agent flow?
+└── yes → Workflow scenario
+        ├── Default → evals/workflow-scenarios/mock/
+        │   Runs against the mock MCP. Use this unless the mock can't
+        │   faithfully cover the behavior.
+        └── Also need live coverage → add an evals/workflow-scenarios/live/
+            variant. Reserve for cases where the real backend's behavior
+            matters (TypeSpec ordering, real codegen output, real DevOps
+            state).
+```
+
+### CI
+
+The suite runs on a schedule, not on every pull request. Agent runs
+talk to an LLM — they cost money and they flake in ways that have
+nothing to do with the code under review.
+
+| When | What runs | Backend |
+|---|---|---|
+| Nightly | All workflow scenarios + the hermetic tool layer | mock |
+| Weekly | Workflow scenarios marked safe to run live | live (with safe-mode flag on writes) |
+| On demand | Any suite, any backend | author's choice |
+
+#### PR gate for essential workflows (open)
+
+A case for *narrow* PR gating: a small curated set of mock scenarios
+covering the workflows we have already promised to partner teams
+(release-planner today; more as they ship) could run on PRs that touch
+the agent, skills, or MCP tools — so we catch a regression in the
+workflows users actually rely on before merge, instead of the morning
+after.
+
+Unresolved trade-offs: which scenarios count as "essential"; how to
+keep the gate from flaking on LLM non-determinism (retries? loose
+thresholds? quorum across N runs?); whether the cost of the gated
+subset is acceptable for every PR; and which paths actually trigger it
+(agent-only? skills? MCP server? all of the above?).
+
+See [Open Questions](#open-questions).
+
+#### Pre-run setup for live scenarios
+
+**The problem.** A real workflow crosses repos. The release planner
+reads a TypeSpec project from `azure-rest-api-specs`, generates code
+into a language SDK repo, and links a PR back. The tools the agent
+calls expect those files on disk. If a repo is missing, the agent
+fails for the wrong reason and we learn nothing.
+
+**The setup step.** Each live scenario declares the repos (and
+optionally the commit) it needs. One setup step reads all live
+scenarios, takes the union, and makes sure each repo is present at the
+requested commit before any eval runs.
+
+**Locally.** A single script. Run it once; it clones into a cache
+folder under your home directory and reuses the clone on subsequent
+runs. Same script CI uses.
+
+**In CI.** The weekly live job runs the same script. The cache folder
+is a build-cache artifact keyed on the set of repos the scenarios
+declare; it's invalidated only when that set changes.
+
+**Pinning.** A scenario can pin a commit when reproducibility matters.
+Otherwise the setup step takes the default branch and records the
+commit it used in the run output.
+
+The nightly mock job runs no setup — mock evals touch no external repos.
+
+
+### Mock MCP server status
+
+#### How it works
+
+`Azure.Sdk.Tools.Mock` reflects over the real CLI's tool list at boot and
+registers a mock proxy for **every** tool the real `Azure.Sdk.Tools.Cli`
+advertises, preserving each tool's name, description, and input schema.
+At call time the proxy looks up a handler by tool name:
+
+- **Custom handler exists** → scripted, type-correct response.
+- **No custom handler** → fallback `{ Message = "Success" }`.
+
+
+
+### Results
+
+The goal: anyone — partner team, manager, the engineer who broke
+something — should be able to open a run and understand what passed,
+what failed, and why, without help.
+
+Each run writes three files into the output directory:
+
+| File | What it is | Who reads it |
+|---|---|---|
+| `eval-results.md` | Human status table: one row per prompt, pass/fail per grader. | Reviewers, partner teams, anyone scanning a run. |
+| `results.jsonl` | The full agent trajectory — every tool call, args, return values, timings. One JSON object per line. | Engineers debugging a failure with tooling. |
+| `junit.xml` | Standard test-results format the CI test-results widget already understands. | CI dashboards. |
+
+The JSONL is rich but hard to read raw. We add two post-processors
+on top of it:
+
+- **Trajectory HTML** — one self-contained web page per prompt, opens
+  straight from `file://`. Shows the same trajectory as `results.jsonl`
+  but readable by someone who has never seen JSONL.
+- **CSV history** — one row per prompt, appended across runs. Lets us
+  ask *"how often did release-planner pass in the last 30 nightlies?"*
+  and feed a dashboard later.
+
+In CI: trajectories + JSONL are uploaded as build artifacts you can
+download from the run page; the CSV gets appended to a long-lived
+history branch.
+
+### Performance and cost controls
+
+Why this section exists: agent evals are *slow* and *expensive*. Every
+run talks to a real LLM — every tool call is a round trip, every turn
+is tokens billed against our subscription. Without limits, a single
+badly-written scenario can sit in a loop for an hour and burn through
+the budget while still reporting *"passed"*.
+
+Concrete example: one real release-planner end-to-end run took **17
+minutes wall time, 1.78M tokens, 41 turns**.
+
+The framework therefore enforces three things:
+
+**1. Per-scenario budgets.** Every scenario file declares an upper
+bound on:
+
+- **Turns** — how many times the agent loops.
+- **Wall time** — how long the whole run can take.
+- **Billable tokens** — input + output tokens we actually pay for.
+- **Tool calls** — catches an agent stuck calling the same tool forever.
+
+The runner warns at 50% of any limit, fails the scenario at 100%, and
+kills the whole run at 200% so a runaway can't bleed indefinitely.
+
+**2. Tiered defaults.** Mock runs nightly against an in-memory fake —
+cheap and fast, so the limits are tight. Live runs weekly against real
+backends — slower by nature, so the limits are looser.
+
+| Tier | Turns | Wall (s) | Billable tokens |
+|---|---|---|---|
+| Nightly mock | 30 | 300 | 200k |
+| Weekly live | 60 | 600 | 500k |
+
+A scenario that needs more must opt in with a justification comment in
+the scenario file. If reviewers reject the opt-in, the scenario has to
+be rewritten to fit, or moved to mock — the budget is not widened.
+
+**3. Background guardrails** — things the scenario author never has
+to think about, baked into the framework:
+
+- Polling tools (`*_get_*_status`) return a terminal state on the first poll under safe mode — no agent stuck waiting for *"in progress"* to flip.
+- LLM-judge graders default to a cheaper model than the agent itself.
+- CI cancels superseded runs when a branch gets a new push.
+
+
+---
+
+## Agent Prompts
+
+These are the prompts the agent is expected to support. Each lives as a
+stimulus in `evals/workflow-scenarios/mock/<workflow>.eval.yaml` (plus a
+`live/` counterpart where applicable). Adding a new prompt means adding
+one new entry to the matching file.
+
+### Release-planner workflow
+
+Derived from the release-planner replacement test plan
+([#15835](https://github.com/Azure/azure-sdk-tools/issues/15835)). All
+five route to the `azsdk-common-prepare-release-plan` skill.
+
+| Prompt | What the agent must do | Required tool calls |
+|---|---|---|
+| Create a public-preview release plan for a TypeSpec spec, target month June 2026 | Pick the prepare-release-plan skill; check for an existing plan; create one. | `azsdk_get_release_plan`, `azsdk_create_release_plan` |
+| Create a release plan **and** generate SDK for a TypeSpec spec, release type beta | End-to-end chain: create, then generate, then back-fill SDK details. | `azsdk_get_release_plan`, `azsdk_create_release_plan`, `azsdk_run_generate_sdk`, `azsdk_update_sdk_details_in_release_plan` |
+| Generate SDK for all languages for an existing release plan id | Look up the plan, run generation against the languages it lists. | `azsdk_get_release_plan`, `azsdk_run_generate_sdk` |
+| Link a different spec PR (`https://github.com/Azure/azure-rest-api-specs/pull/...`) to an existing release plan | Look up the plan, swap the spec-PR field. | `azsdk_get_release_plan`, `azsdk_update_api_spec_pull_request_in_release_plan` |
+| Update SDK details (package names) on an existing release plan from `tspconfig.yaml` | Look up the plan, update the SDK details from emitter config. | `azsdk_get_release_plan`, `azsdk_update_sdk_details_in_release_plan` |
+
+All five forbid `azsdk_verify_setup` (the setup gate runs once at the
+top of the workflow, not per prompt). The three "existing plan" prompts
+also forbid `azsdk_create_release_plan`, so we catch the agent creating
+a duplicate plan.
+
+### Other workflows in the first round
+
+| Workflow | File | Coverage |
+|---|---|---|
+| Check spec is in public repo then validate TypeSpec | `check-public-repo-then-validate.eval.yaml` | TypeSpec authoring routing + validation tool call. |
+| TypeSpec generation — step 2 of the authoring flow | `typespec-generation-step02.eval.yaml` | TypeSpec authoring skill + generate tool. |
+| Rename a client property in a generated SDK | `rename-client-property.eval.yaml` | Customization skill + customize-code tool. |
+
+The live counterpart of release-planner lives at
+`evals/workflow-scenarios/live/release-planner.eval.yaml` and adds a
+prompt-grader that checks the real DevOps response.
+
+---
+
+## Success Criteria
+
+- A single command runs the full mock suite locally and produces
+  `eval-results.md`, `results.jsonl`, JUnit XML, the per-prompt
+  trajectory HTML, and a `history.csv` row.
+- Every release-planner prompt above is green in the mock suite.
+- Every MCP tool a green scenario calls has a custom mock handler
+  returning a chainable, type-correct response.
+- A new contributor can clone the repo, set the documented env vars,
+  and reproduce the same `eval-results.md` verdict table on their
+  machine.
+- A partner team reporting *"I tried this prompt and the agent didn't
+  do anything"* can be answered by pasting their prompt as a new
+  stimulus and re-running the workflow file — no runner or CI changes.
+- The status table is what we hand to reviewers (Renhe, Laurent,
+  partner teams) to answer *"what does the agent currently support?"*
+
+---
+
+## Open Questions
+
+### CI cadence and PR gating
+
+**Cadence.** Current proposal: nightly mock + weekly live + on-demand.
+Open: is nightly the right frequency for mock, or do we want it on
+every push to `main`? Is weekly enough for live, given live is the
+only thing that catches real-backend drift?
+
+**PR gate for essential workflows.** Should a curated subset of mock
+scenarios block merge on PRs that touch the agent, skills, or MCP
+tools? Specifically to answer:
+
+- *Which workflows are "essential"* — just release-planner today, or
+  a broader set? Who decides when a new workflow joins or leaves the
+  gated set?
+- *Which paths trigger the gate* — agent code, skill markdown, MCP
+  tool code, mock handlers, all of the above? Anything else?
+- *How do we tame flake* — retries on failure, quorum across N runs,
+  loose thresholds, or just accept some red and require a human
+  override? Hard requirement: a green PR must mean *the gated
+  scenarios passed*, not *we got lucky this run*.
+- *What's the cost ceiling* — the gated subset runs on every PR push
+  to a touched path; what's the per-PR token / wall-time budget we're
+  willing to spend before we move it back off the PR?
+
+We need owners' input on all four before turning the gate on.
+
+-