Skip to content
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a0f9233
Scaffold Azure.Sdk.Tools.Vally tool-scenario eval suite (#15124)
helen229 May 27, 2026
701b7f8
Port remaining 9 benchmark scenarios to Vally (#15124)
helen229 May 27, 2026
26cc6ef
Add rename-client-property stub eval to Vally suite (#15124)
helen229 Jun 1, 2026
8e4f524
Fix tool name prefix in graders, timeout format, expand README
helen229 Jun 2, 2026
d9ea3e4
Reorganize evals into scenarios/ and triggers/; port trigger evals fr…
helen229 Jun 2, 2026
c10063b
Merge branch 'main' into feat/vally-tool-scenarios-15124
helen229 Jun 2, 2026
02aee34
update the config and use gpt-5.4 model
helen229 Jun 2, 2026
d1f212f
add disallowed
helen229 Jun 2, 2026
fd4eaf8
Vally: restructure evals into unit/integration/e2e test pyramid
helen229 Jun 2, 2026
66216b0
Merge branch 'feat/vally-tool-scenarios-15124' of https://github.com/…
helen229 Jun 2, 2026
a88ae11
Vally: remove Run-LiveEvals.ps1 (local-only test wrapper)
helen229 Jun 2, 2026
bb47139
some docs and test e2e one
helen229 Jun 3, 2026
4d89bac
update docs
helen229 Jun 3, 2026
f6f5c80
update design
helen229 Jun 3, 2026
3a8d609
update with skill evals
helen229 Jun 3, 2026
b7005b2
reorg based on the design
helen229 Jun 3, 2026
6db7c5f
remove the duplicates
helen229 Jun 3, 2026
b77dccb
add new scenarios
helen229 Jun 4, 2026
1264e9a
update the doc
helen229 Jun 4, 2026
aa714ab
update doc
helen229 Jun 4, 2026
f26cf1f
Merge remote-tracking branch 'origin/main' into feat/vally-tool-scena…
helen229 Jun 4, 2026
fda9ef9
update names
helen229 Jun 4, 2026
5b4fb6e
Vally: align release-planner mock stimuli with live e2e pattern
helen229 Jun 4, 2026
af3db0c
update doc
helen229 Jun 4, 2026
36c58ba
Vally: fix MCP boot race + drop misconfigured grader (#15948)
helen229 Jun 5, 2026
12714ae
Merge branch 'main' into feat/vally-tool-scenarios-15124
helen229 Jun 5, 2026
2ce5e7b
update readme for running steps
helen229 Jun 5, 2026
84379cf
Vally: align mock release-planner grader with live + deterministic 'n…
helen229 Jun 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/skills/.vally.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@ paths:
evalFilenames: ["eval.yaml", "*.eval.yaml"]

environments:
# Launch the pre-built DLLs via `dotnet <dll>`, NOT `dotnet run` — avoids the
# MSBuild boot race under parallel workers. See issue #15948.
# CI builds the DLLs in the 'Build MCP servers' step of skill-eval.yml.
azsdk-mcp:
mcpServers:
azure-sdk-mcp:
type: stdio
command: dotnet
args: ["run", "--project", "../../tools/azsdk-cli/Azure.Sdk.Tools.Cli", "--", "start"]
args: ["../../artifacts/bin/Azure.Sdk.Tools.Cli/Debug/net8.0/azsdk.dll", "start"]
timeout: "60s"
env:
AZSDKTOOLS_AGENT_TESTING: "true"
Expand All @@ -27,5 +30,5 @@ environments:
azure-sdk-mcp:
type: stdio
command: dotnet
args: ["run", "--project", "../../tools/azsdk-cli/Azure.Sdk.Tools.Mock"]
args: ["../../artifacts/bin/Azure.Sdk.Tools.Mock/Debug/net8.0/azsdk-mock.dll"]
timeout: "60s"
8 changes: 8 additions & 0 deletions eng/pipelines/skill-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ jobs:
- script: npm install -g @github/copilot-sdk
displayName: 'Install Copilot SDK'

# Pre-build the MCP servers so vally launches `dotnet <dll>` instead of
# `dotnet run` — avoids the MSBuild boot race under parallel workers.
# See issue #15948.
# `set -e` aborts the step on the first failing build. Without it the step's
# result is only the exit code of the last command, so a broken Cli build
# would be masked by a successful Mock build and surface later as a missing
# DLL when the evals start.
- script: |
    set -e
    dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Cli -c Debug --nologo
    dotnet build tools/azsdk-cli/Azure.Sdk.Tools.Mock -c Debug --nologo
  displayName: 'Build MCP servers'

- script: |
input_areas=$(echo "${{ parameters.areas }}" | xargs)
if [ -n "$input_areas" ]; then
Expand Down
2 changes: 2 additions & 0 deletions tools/azsdk-cli/Azure.Sdk.Tools.Vally/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
vally-results/
results/
92 changes: 92 additions & 0 deletions tools/azsdk-cli/Azure.Sdk.Tools.Vally/.vally.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Vally configuration for Azure SDK Tools MCP tool / scenario evaluations.
# See: https://vally.dev/reference/vally-config
#
# These are scenario evals (does the agent invoke the right MCP tool(s) for a
# given prompt?) and are intentionally separate from the per-skill evals under
# .github/skills/. See README.md for context.

# Eval discovery roots, eval filename patterns, and the results directory.
paths:
  evals:
    - evals/
  evalFilenames:
    - "*.eval.yaml"
  results: results/

environments:
  # Mock MCP server. The already-built DLL is started with `dotnet <dll>`
  # rather than `dotnet run`, which sidesteps the MSBuild boot race seen
  # under parallel workers (issue #15948).
  # Prerequisite: run `dotnet build ../Azure.Sdk.Tools.Mock -c Debug` once
  # before invoking vally.
  azsdk-mcp-mock:
    mcpServers:
      azure-sdk-mcp:
        type: stdio
        command: dotnet
        args:
          - "../../../artifacts/bin/Azure.Sdk.Tools.Mock/Debug/net8.0/azsdk-mock.dll"
        timeout: "30s"

  # Live MCP server, using the same pre-built DLL pattern (issue #15948).
  # AZSDKTOOLS_AGENT_TESTING=true keeps write tools inside the test area.
  # Prerequisite: run `dotnet build ../Azure.Sdk.Tools.Cli -c Debug` once
  # before invoking vally.
  azsdk-mcp-live:
    mcpServers:
      azure-sdk-mcp:
        type: stdio
        command: dotnet
        args:
          - "../../../artifacts/bin/Azure.Sdk.Tools.Cli/Debug/net8.0/azsdk.dll"
          - "start"
        timeout: "5m"
        # Values are quoted so they stay strings instead of YAML booleans.
        env:
          AZSDKTOOLS_AGENT_TESTING: "true"
          AZSDKTOOLS_COLLECT_TELEMETRY: "false"

# Suites group evals so that subsets can be run selectively.
#
# The directory layout maps one-to-one onto suites; mock vs live is NOT
# selected by tags. Vally's suite filter is positive-match only (AND across
# keys, OR within values), so subfolders are the cleanest way to separate
# mock from live. See https://github.com/microsoft/vally suite-filter source.
suites:
  # ---- by tier ----
  unit:
    description: |
      Hermetic single-tool / trigger evals. No external I/O. Fast; the
      foundation of the PR gate.
    evals:
      - "evals/tools/*.eval.yaml"

  scenarios-mock:
    description: |
      Multi-tool scenarios against the mock MCP environment. Hermetic; safe
      for PR gate.
    evals:
      - "evals/workflow-scenarios/mock/*.eval.yaml"

  scenarios-live:
    description: |
      Scenarios against live MCP — real DevOps / GitHub / pipelines. Slow;
      nightly only. Prime any required clones first via
      `evals/setup/ensure-specs-clone.ps1`.
    evals:
      - "evals/workflow-scenarios/live/*.eval.yaml"

  # ---- composite suites ----
  # pr-gate lists the same globs as the `unit` and `scenarios-mock` suites.
  pr-gate:
    description: "Hermetic tiers only (unit + scenarios-mock). Target for CI PR check."
    evals:
      - "evals/tools/*.eval.yaml"
      - "evals/workflow-scenarios/mock/*.eval.yaml"
  nightly:
    description: "All tiers including live scenarios."
    evals:
      - "evals/**/*.eval.yaml"

  # ---- by feature area (tag-filtered) ----
  # Each suite globs every eval and narrows the set with an `area` filter.
  release-plan:
    description: "All evals tagged area=release-plan."
    filter:
      area: release-plan
    evals:
      - "evals/**/*.eval.yaml"
  typespec:
    description: "All evals tagged area=typespec."
    filter:
      area: typespec
    evals:
      - "evals/**/*.eval.yaml"
  pipeline:
    description: "All evals tagged area=pipeline."
    filter:
      area: pipeline
    evals:
      - "evals/**/*.eval.yaml"
  github:
    description: "All evals tagged area=github."
    filter:
      area: github
    evals:
      - "evals/**/*.eval.yaml"
Loading
Loading