-
Notifications
You must be signed in to change notification settings - Fork 235
Benchmark Eval Migration #14507
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Benchmark Eval Migration #14507
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
58d8311
first iteration of migration
jeo02 2ddb427
simplify
jeo02 08f9a12
structure based off tools
jeo02 60cf5ba
remove evaluations + remove unused
jeo02 105bf40
path fix + copilot fix
jeo02 a34ba42
organize tags + copilot comment
jeo02 b338595
Merge branch 'main' into benchmark-eval-migration
jeo02 9d2e7b6
test mode
jeo02 e256964
copilot nit
jeo02 65bbfed
copilot nit
jeo02 de7e67c
remove param
jeo02 3bed443
Merge branch 'main' into benchmark-eval-migration
jeo02 e76ad42
dictionary
jeo02 6a3f360
method for setup env
jeo02 62473d2
nit
jeo02 38ff717
Merge branch 'benchmark-eval-migration' of https://github.com/jeo02/a…
jeo02 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
tools/azsdk-cli/Azure.Sdk.Tools.Cli.Benchmarks/Models/ExpectedToolCall.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| namespace Azure.Sdk.Tools.Cli.Benchmarks.Models; | ||
|
|
||
/// <summary>
/// Describes an expected tool call with optional input validation.
/// </summary>
public class ExpectedToolCall
{
    /// <summary>Gets the expected tool name (short name without MCP prefix).</summary>
    public string ToolName { get; }

    /// <summary>
    /// Gets the expected input key-value pairs to validate. This is never null: an empty
    /// dictionary (what the name-only constructor stores) means no inputs are checked.
    /// Keys are parameter names; values are the expected values.
    /// String values use case-insensitive substring matching (to handle variable path prefixes).
    /// Numeric and boolean values use exact matching.
    /// </summary>
    public IReadOnlyDictionary<string, object?> ExpectedInputs { get; }

    /// <summary>
    /// Creates an expected tool call that only validates the tool was called (no input checks).
    /// </summary>
    /// <param name="toolName">The expected tool name.</param>
    /// <exception cref="ArgumentNullException"><paramref name="toolName"/> is null.</exception>
    public ExpectedToolCall(string toolName)
        : this(toolName, new Dictionary<string, object?>())
    {
    }

    /// <summary>
    /// Creates an expected tool call that validates both the tool name and its inputs.
    /// </summary>
    /// <param name="toolName">The expected tool name.</param>
    /// <param name="expectedInputs">
    /// The expected inputs, keyed by parameter name. The instance is stored as-is (not copied),
    /// so later changes made by the caller are visible through <see cref="ExpectedInputs"/>.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="toolName"/> or <paramref name="expectedInputs"/> is null.
    /// </exception>
    public ExpectedToolCall(string toolName, Dictionary<string, object?> expectedInputs)
    {
        // Both members are declared non-nullable; fail here rather than let a null
        // surface later in ToString() or when the inputs are enumerated.
        ArgumentNullException.ThrowIfNull(toolName);
        ArgumentNullException.ThrowIfNull(expectedInputs);

        ToolName = toolName;
        ExpectedInputs = expectedInputs;
    }

    /// <summary>Returns <see cref="ToolName"/>.</summary>
    public override string ToString() => ToolName;
}
54 changes: 54 additions & 0 deletions
54
tools/azsdk-cli/Azure.Sdk.Tools.Cli.Benchmarks/Models/ToolCallRecord.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| using System.Text.Json; | ||
|
|
||
| namespace Azure.Sdk.Tools.Cli.Benchmarks.Models; | ||
|
|
||
/// <summary>
/// A single tool invocation captured while a benchmark was running.
/// </summary>
public class ToolCallRecord
{
    /// <summary>Gets the name of the invoked tool (may include MCP prefix).</summary>
    public required string ToolName { get; init; }

    /// <summary>
    /// Gets the raw argument object handed over by the SDK hook (typically a JsonElement),
    /// or null when no arguments were captured.
    /// </summary>
    public object? ToolArgs { get; init; }

    /// <summary>Gets whatever the tool returned, or null when no result was captured.</summary>
    public object? ToolResult { get; init; }

    /// <summary>Gets how long the call took, in milliseconds, or null when not measured.</summary>
    public double? DurationMs { get; init; }

    /// <summary>
    /// Gets the MCP server name (documented as taken from the tool name prefix), or null.
    /// This type does not derive it; the value is whatever the initializer supplied.
    /// </summary>
    public string? McpServerName { get; init; }

    /// <summary>
    /// Gets the time the call was recorded. Defaults to <see cref="DateTime.UtcNow"/>
    /// evaluated at construction unless the initializer sets it.
    /// </summary>
    public DateTime Timestamp { get; init; } = DateTime.UtcNow;

    /// <summary>
    /// Exposes <see cref="ToolArgs"/> as a dictionary of property name to JsonElement.
    /// </summary>
    /// <returns>
    /// A dictionary with case-insensitive (ordinal) keys when <see cref="ToolArgs"/> is a JSON
    /// object; otherwise (null, non-JsonElement, or a non-object JSON value) an empty dictionary.
    /// </returns>
    public IReadOnlyDictionary<string, JsonElement> GetArgsAsDictionary()
    {
        if (ToolArgs is JsonElement { ValueKind: JsonValueKind.Object } argsObject)
        {
            var argsByName = new Dictionary<string, JsonElement>(StringComparer.OrdinalIgnoreCase);

            // Indexer assignment (not Add): if two property names differ only by case,
            // the later one overwrites the earlier instead of throwing.
            foreach (JsonProperty property in argsObject.EnumerateObject())
            {
                argsByName[property.Name] = property.Value;
            }

            return argsByName;
        }

        return new Dictionary<string, JsonElement>();
    }

    /// <summary>Returns <see cref="ToolName"/>.</summary>
    public override string ToString()
    {
        return ToolName;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
54 changes: 54 additions & 0 deletions
54
...sdk-cli/Azure.Sdk.Tools.Cli.Benchmarks/Scenarios/GitHub/GetPrLinkCurrentBranchScenario.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| using Azure.Sdk.Tools.Cli.Benchmarks.Models; | ||
| using Azure.Sdk.Tools.Cli.Benchmarks.Validation; | ||
| using Azure.Sdk.Tools.Cli.Benchmarks.Validation.Validators; | ||
|
|
||
| namespace Azure.Sdk.Tools.Cli.Benchmarks.Scenarios; | ||
|
|
||
/// <summary>
/// Benchmark scenario for the "status of the spec PR on my current branch" request:
/// the agent is expected to call azsdk_get_pull_request_link_for_current_branch
/// and must not call azsdk_verify_setup.
/// Migrated from evaluation scenario: Evaluate_GetPullRequestLinkForCurrentBranch.
/// </summary>
public class GetPrLinkCurrentBranchScenario : BenchmarkScenario
{
    /// <inheritdoc />
    public override string Name
    {
        get { return "get-pr-link-current-branch"; }
    }

    /// <inheritdoc />
    public override string Description
    {
        get { return "Verify the agent calls azsdk_get_pull_request_link_for_current_branch for PR status."; }
    }

    /// <inheritdoc />
    public override string[] Tags
    {
        get { return new[] { "github" }; }
    }

    /// <inheritdoc />
    public override RepoConfig Repo
    {
        get
        {
            // A new config is built on every access, targeting Azure/azure-rest-api-specs at main.
            // NOTE(review): SparseCheckoutPaths presumably limits the checkout to this folder — confirm in RepoConfig.
            return new RepoConfig
            {
                Owner = "Azure",
                Name = "azure-rest-api-specs",
                Ref = "main",
                SparseCheckoutPaths = ["specification/contosowidgetmanager"]
            };
        }
    }

    /// <inheritdoc />
    public override string Prompt
    {
        get
        {
            // The prompt itself tells the agent to skip azsdk_verify_setup, which the validator forbids.
            return """
                What's the status of the spec PR in my current branch? Only check the status once.
                My setup has already been verified, do not run azsdk_verify_setup.
                The repository root is the relative path ./azure-rest-api-specs.
                """;
        }
    }

    /// <inheritdoc />
    public override IEnumerable<IValidator> Validators
    {
        get
        {
            // Name-only expectation: the tool must be called, its inputs are not inspected.
            var expectedCall = new ExpectedToolCall("azsdk_get_pull_request_link_for_current_branch");

            var toolCallValidator = new ToolCallValidator(
                "Expected tool: azsdk_get_pull_request_link_for_current_branch",
                expectedToolCalls: [expectedCall],
                forbiddenToolNames: ["azsdk_verify_setup"]);

            return [toolCallValidator];
        }
    }
}
59 changes: 59 additions & 0 deletions
59
...cli/Azure.Sdk.Tools.Cli.Benchmarks/Scenarios/Pipeline/CheckSdkGenerationStatusScenario.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| using Azure.Sdk.Tools.Cli.Benchmarks.Models; | ||
| using Azure.Sdk.Tools.Cli.Benchmarks.Validation; | ||
| using Azure.Sdk.Tools.Cli.Benchmarks.Validation.Validators; | ||
|
|
||
| namespace Azure.Sdk.Tools.Cli.Benchmarks.Scenarios; | ||
|
|
||
/// <summary>
/// Benchmark scenario for checking SDK generation pipeline status: the agent is expected
/// to call azsdk_get_pipeline_status with the build ID from the prompt and must not call
/// azsdk_verify_setup.
/// Adapted from evaluation scenario: Evaluate_CheckSDKGenerationStatus.
/// The original scenario loaded a mid-conversation JSON trace; this benchmark uses a
/// standalone prompt capturing the same intent.
/// </summary>
public class CheckSdkGenerationStatusScenario : BenchmarkScenario
{
    /// <inheritdoc />
    public override string Name
    {
        get { return "check-sdk-generation-status"; }
    }

    /// <inheritdoc />
    public override string Description
    {
        get { return "Verify the agent calls azsdk_get_pipeline_status to check SDK generation."; }
    }

    /// <inheritdoc />
    public override string[] Tags
    {
        get { return new[] { "pipeline" }; }
    }

    /// <inheritdoc />
    public override RepoConfig Repo
    {
        get
        {
            // A new config is built on every access, targeting Azure/azure-rest-api-specs at main.
            // NOTE(review): SparseCheckoutPaths presumably limits the checkout to this folder — confirm in RepoConfig.
            return new RepoConfig
            {
                Owner = "Azure",
                Name = "azure-rest-api-specs",
                Ref = "main",
                SparseCheckoutPaths = ["specification/contosowidgetmanager"]
            };
        }
    }

    /// <inheritdoc />
    public override string Prompt
    {
        get
        {
            // The build ID named here is the same value the validator expects in the tool input.
            return """
                Check the SDK generation pipeline status for build ID 5513110.
                My setup has already been verified, do not run azsdk_verify_setup.
                """;
        }
    }

    /// <inheritdoc />
    public override IEnumerable<IValidator> Validators
    {
        get
        {
            // The expected value is an int literal, so it is stored boxed as System.Int32.
            var expectedInputs = new Dictionary<string, object?>
            {
                ["buildId"] = 5513110
            };
            var expectedCall = new ExpectedToolCall("azsdk_get_pipeline_status", expectedInputs);

            var toolCallValidator = new ToolCallValidator(
                "Expected tool: azsdk_get_pipeline_status",
                expectedToolCalls: [expectedCall],
                forbiddenToolNames: ["azsdk_verify_setup"]);

            return [toolCallValidator];
        }
    }
}
68 changes: 68 additions & 0 deletions
68
...sdk-cli/Azure.Sdk.Tools.Cli.Benchmarks/Scenarios/ReleasePlan/CreateReleasePlanScenario.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| using Azure.Sdk.Tools.Cli.Benchmarks.Models; | ||
| using Azure.Sdk.Tools.Cli.Benchmarks.Validation; | ||
| using Azure.Sdk.Tools.Cli.Benchmarks.Validation.Validators; | ||
|
|
||
| namespace Azure.Sdk.Tools.Cli.Benchmarks.Scenarios; | ||
|
|
||
/// <summary>
/// Benchmark scenario for release plan creation: the agent is expected to call
/// azsdk_create_release_plan with the inputs spelled out in the prompt and must not call
/// azsdk_verify_setup.
/// Migrated from evaluation scenario: Evaluate_CreateReleasePlan.
/// </summary>
public class CreateReleasePlanScenario : BenchmarkScenario
{
    /// <inheritdoc />
    public override string Name
    {
        get { return "create-release-plan"; }
    }

    /// <inheritdoc />
    public override string Description
    {
        get { return "Verify the agent calls azsdk_create_release_plan with appropriate context."; }
    }

    /// <inheritdoc />
    public override string[] Tags
    {
        get { return new[] { "release-plan" }; }
    }

    /// <inheritdoc />
    public override RepoConfig Repo
    {
        get
        {
            // A new config is built on every access, targeting Azure/azure-rest-api-specs at main.
            // NOTE(review): SparseCheckoutPaths presumably limits the checkout to this folder — confirm in RepoConfig.
            return new RepoConfig
            {
                Owner = "Azure",
                Name = "azure-rest-api-specs",
                Ref = "main",
                SparseCheckoutPaths = ["specification/contosowidgetmanager"]
            };
        }
    }

    /// <inheritdoc />
    public override string Prompt
    {
        get
        {
            // Every value checked by the validator appears verbatim in this prompt.
            // The target release timeline is given here but is not among the validated inputs.
            return """
                Create a release plan for the Contoso Widget Manager, no need to get it afterwards only create.
                My setup has already been verified, do not run azsdk_verify_setup. Here is all the context you need:
                TypeSpec project located at "specification/contosowidgetmanager/Contoso.WidgetManager".
                Use service tree ID "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f",
                product tree ID "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e",
                target release timeline "December 2025",
                API version "2022-11-01-preview",
                SDK release type "beta",
                and link it to the spec pull request "https://github.com/Azure/azure-rest-api-specs/pull/38387".
                """;
        }
    }

    /// <inheritdoc />
    public override IEnumerable<IValidator> Validators
    {
        get
        {
            // Expected tool inputs, keyed by parameter name; all values are strings.
            var expectedInputs = new Dictionary<string, object?>
            {
                ["typeSpecProjectPath"] = "specification/contosowidgetmanager/Contoso.WidgetManager",
                ["serviceTreeId"] = "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f",
                ["productTreeId"] = "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e",
                ["specApiVersion"] = "2022-11-01-preview",
                ["specPullRequestUrl"] = "https://github.com/Azure/azure-rest-api-specs/pull/38387",
                ["sdkReleaseType"] = "beta"
            };
            var expectedCall = new ExpectedToolCall("azsdk_create_release_plan", expectedInputs);

            var toolCallValidator = new ToolCallValidator(
                "Expected tool: azsdk_create_release_plan",
                expectedToolCalls: [expectedCall],
                forbiddenToolNames: ["azsdk_verify_setup"]);

            return [toolCallValidator];
        }
    }
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.