Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ public class SessionExecutor : IDisposable
public async Task<ExecutionResult> ExecuteAsync(ExecutionConfig config)
{
var stopwatch = Stopwatch.StartNew();
var toolCalls = new List<string>();
var toolCalls = new List<ToolCallRecord>();
var pendingTimestamps = new Dictionary<string, double>();

try
{
Expand Down Expand Up @@ -56,10 +57,28 @@ public async Task<ExecutionResult> ExecuteAsync(ExecutionConfig config)
{
Console.WriteLine($"Model is calling tool: {input.ToolName}");
config.OnActivity?.Invoke($"Calling tool: {input.ToolName}");
pendingTimestamps[input.ToolName] = input.Timestamp;
return Task.FromResult<PreToolUseHookOutput?>(null);
},
OnPostToolUse = (input, invocation) =>
{
double? durationMs = pendingTimestamps.TryGetValue(input.ToolName, out var startTs)
? input.Timestamp - startTs
: null;

var mcpServerName = input.ToolName.Contains("__")
? input.ToolName.Split("__", 2)[0]
: null;

toolCalls.Add(new ToolCallRecord
{
ToolName = input.ToolName,
ToolArgs = input.ToolArgs,
ToolResult = input.ToolResult,
DurationMs = durationMs,
McpServerName = mcpServerName,
Timestamp = startTs,
});
if (input.ToolName == "skill")
{
toolCalls.Add($"{input.ToolName} {input.ToolArgs?.ToString()}");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ public async Task<string> ReadFileAsync(string relativePath)
public async Task WriteExecutionLogAsync(
string scenarioName,
IReadOnlyList<object> messages,
IReadOnlyList<string> toolCalls,
IReadOnlyList<ToolCallRecord> toolCalls,
string? gitDiff,
TimeSpan duration,
bool passed,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ public async Task<Workspace> PrepareAsync(RepoConfig repo, string scenarioId)
// Create worktree using the resolved SHA (safe for concurrent use)
await CreateWorktreeAsync(barePath, worktreePath, commitSha, repo.SparseCheckoutPaths);

SetupWorkspaceEnvironment();

return new Workspace(workspaceRoot, repo.Name);
}

Expand Down Expand Up @@ -279,4 +281,10 @@ private static string EscapeArgument(string arg)

return arg;
}

/// <summary>
/// Sets the environment variable that forces the tools into test mode.
/// </summary>
private void SetupWorkspaceEnvironment()
{
    // The tools are switched into test mode through this variable.
    const string testModeVariable = "AZSDKTOOLS_AGENT_TESTING";
    const string enabled = "true";

    Environment.SetEnvironmentVariable(testModeVariable, enabled);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ public class BenchmarkResult
public string? GitDiff { get; init; }

/// <summary>
/// Gets the list of tool calls made during execution.
/// Gets the tool calls made during execution.
/// </summary>
public IReadOnlyList<string> ToolCalls { get; init; } = [];
public IReadOnlyList<ToolCallRecord> ToolCalls { get; init; } = [];

/// <summary>
/// Gets the path to the workspace where the benchmark was executed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ public class ExecutionResult
/// <summary>The conversation messages from the session.</summary>
public IReadOnlyList<object> Messages { get; init; } = [];

/// <summary>Tool calls made during execution (for debugging).</summary>
public IReadOnlyList<string> ToolCalls { get; init; } = [];
/// <summary>Tool calls made during execution.</summary>
public IReadOnlyList<ToolCallRecord> ToolCalls { get; init; } = [];
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

namespace Azure.Sdk.Tools.Cli.Benchmarks.Models;

/// <summary>
/// Describes an expected tool call with optional input validation.
/// </summary>
public class ExpectedToolCall
{
    /// <summary>Gets the expected tool name (short name without MCP prefix).</summary>
    public string ToolName { get; }

    /// <summary>
    /// Gets the expected input key-value pairs to validate. The dictionary is empty when
    /// no input validation was requested.
    /// Keys are parameter names; values are the expected values.
    /// String values use case-insensitive substring matching (to handle variable path prefixes).
    /// Numeric and boolean values use exact matching.
    /// </summary>
    public IReadOnlyDictionary<string, object?> ExpectedInputs { get; }

    /// <summary>
    /// Creates an expected tool call that only validates the tool was called (no input checks).
    /// </summary>
    /// <param name="toolName">The expected tool name.</param>
    /// <exception cref="ArgumentNullException"><paramref name="toolName"/> is null.</exception>
    public ExpectedToolCall(string toolName)
    {
        ArgumentNullException.ThrowIfNull(toolName);

        ToolName = toolName;
        ExpectedInputs = new Dictionary<string, object?>();
    }

    /// <summary>
    /// Creates an expected tool call that validates both the tool name and its inputs.
    /// </summary>
    /// <param name="toolName">The expected tool name.</param>
    /// <param name="expectedInputs">
    /// The expected inputs, keyed by parameter name. The entries are copied, so changes made
    /// to this dictionary after construction do not affect <see cref="ExpectedInputs"/>.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="toolName"/> or <paramref name="expectedInputs"/> is null.
    /// </exception>
    public ExpectedToolCall(string toolName, Dictionary<string, object?> expectedInputs)
    {
        ArgumentNullException.ThrowIfNull(toolName);
        ArgumentNullException.ThrowIfNull(expectedInputs);

        ToolName = toolName;

        // Snapshot the caller's dictionary (keeping its key comparer) so the expectation
        // cannot be altered through the original mutable reference.
        ExpectedInputs = new Dictionary<string, object?>(expectedInputs, expectedInputs.Comparer);
    }

    /// <summary>Returns the expected tool name.</summary>
    public override string ToString() => ToolName;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using System.Text.Json;

namespace Azure.Sdk.Tools.Cli.Benchmarks.Models;

/// <summary>
/// A single tool invocation captured while a benchmark was executing.
/// </summary>
public class ToolCallRecord
{
    /// <summary>Gets the name of the tool that was called (may include an MCP prefix).</summary>
    public required string ToolName { get; init; }

    /// <summary>
    /// Gets the raw arguments object handed over by the SDK hook (typically a JsonElement),
    /// or null if no arguments are available.
    /// </summary>
    public object? ToolArgs { get; init; }

    /// <summary>Gets what the tool returned, or null if no result is available.</summary>
    public object? ToolResult { get; init; }

    /// <summary>Gets how long the call took, in milliseconds, or null if unknown.</summary>
    public double? DurationMs { get; init; }

    /// <summary>Gets the MCP server name taken from the tool name prefix, or null.</summary>
    public string? McpServerName { get; init; }

    /// <summary>
    /// Gets the timestamp when the tool call was recorded.
    /// Defaults to the current UTC time when the record is created.
    /// </summary>
    public DateTime Timestamp { get; init; } = DateTime.UtcNow;

    /// <summary>
    /// Exposes <see cref="ToolArgs"/> as a dictionary of JSON values keyed by property name.
    /// Lookups in a populated result ignore key casing; if the JSON object contains
    /// several properties whose names differ only by case, the last one wins.
    /// </summary>
    /// <returns>
    /// The properties of the argument object, or an empty dictionary when
    /// <see cref="ToolArgs"/> is null or is not a JSON object.
    /// </returns>
    public IReadOnlyDictionary<string, JsonElement> GetArgsAsDictionary()
    {
        if (ToolArgs is JsonElement { ValueKind: JsonValueKind.Object } argsObject)
        {
            var argsByName = new Dictionary<string, JsonElement>(StringComparer.OrdinalIgnoreCase);
            foreach (JsonProperty property in argsObject.EnumerateObject())
            {
                argsByName[property.Name] = property.Value;
            }

            return argsByName;
        }

        // Anything other than a JSON object has no named arguments to expose.
        return new Dictionary<string, JsonElement>();
    }

    /// <summary>Returns the tool name.</summary>
    public override string ToString()
    {
        return ToolName;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public class ValidationContext
/// <summary>
/// Gets the tool calls made during execution.
/// </summary>
public IReadOnlyList<string> ToolCalls { get; init; } = [];
public IReadOnlyList<ToolCallRecord> ToolCalls { get; init; } = [];

/// <summary>
/// Gets the conversation messages from the agent session.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using Azure.Sdk.Tools.Cli.Benchmarks.Models;
using Azure.Sdk.Tools.Cli.Benchmarks.Validation;
using Azure.Sdk.Tools.Cli.Benchmarks.Validation.Validators;

namespace Azure.Sdk.Tools.Cli.Benchmarks.Scenarios;

/// <summary>
/// Benchmark scenario that asks for the status of the spec PR on the current branch and
/// expects the agent to call azsdk_get_pull_request_link_for_current_branch.
/// Migrated from evaluation scenario: Evaluate_GetPullRequestLinkForCurrentBranch.
/// </summary>
public class GetPrLinkCurrentBranchScenario : BenchmarkScenario
{
    /// <inheritdoc />
    public override string Name
    {
        get { return "get-pr-link-current-branch"; }
    }

    /// <inheritdoc />
    public override string Description
    {
        get { return "Verify the agent calls azsdk_get_pull_request_link_for_current_branch for PR status."; }
    }

    /// <inheritdoc />
    public override string[] Tags
    {
        get { return new[] { "github" }; }
    }

    /// <inheritdoc />
    public override RepoConfig Repo
    {
        get
        {
            // Sparse checkout of the specs repository, limited to the Contoso sample service.
            return new RepoConfig
            {
                Owner = "Azure",
                Name = "azure-rest-api-specs",
                Ref = "main",
                SparseCheckoutPaths = ["specification/contosowidgetmanager"]
            };
        }
    }

    /// <inheritdoc />
    public override string Prompt
    {
        get
        {
            return """
                What's the status of the spec PR in my current branch? Only check the status once.
                My setup has already been verified, do not run azsdk_verify_setup.
                The repository root is the relative path ./azure-rest-api-specs.
                """;
        }
    }

    /// <inheritdoc />
    public override IEnumerable<IValidator> Validators
    {
        get
        {
            // Only the tool name is checked here; no input expectations are attached.
            var expectedCall = new ExpectedToolCall("azsdk_get_pull_request_link_for_current_branch");

            var toolCallValidator = new ToolCallValidator(
                "Expected tool: azsdk_get_pull_request_link_for_current_branch",
                expectedToolCalls: [expectedCall],
                forbiddenToolNames: ["azsdk_verify_setup"]);

            return [toolCallValidator];
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using Azure.Sdk.Tools.Cli.Benchmarks.Models;
using Azure.Sdk.Tools.Cli.Benchmarks.Validation;
using Azure.Sdk.Tools.Cli.Benchmarks.Validation.Validators;

namespace Azure.Sdk.Tools.Cli.Benchmarks.Scenarios;

/// <summary>
/// Benchmark scenario that asks for the SDK generation pipeline status of a specific build
/// and expects the agent to call azsdk_get_pipeline_status.
/// Adapted from evaluation scenario: Evaluate_CheckSDKGenerationStatus.
/// The original scenario loaded a mid-conversation JSON trace; this benchmark uses a
/// standalone prompt capturing the same intent.
/// </summary>
public class CheckSdkGenerationStatusScenario : BenchmarkScenario
{
    /// <inheritdoc />
    public override string Name
    {
        get { return "check-sdk-generation-status"; }
    }

    /// <inheritdoc />
    public override string Description
    {
        get { return "Verify the agent calls azsdk_get_pipeline_status to check SDK generation."; }
    }

    /// <inheritdoc />
    public override string[] Tags
    {
        get { return new[] { "pipeline" }; }
    }

    /// <inheritdoc />
    public override RepoConfig Repo
    {
        get
        {
            // Sparse checkout of the specs repository, limited to the Contoso sample service.
            return new RepoConfig
            {
                Owner = "Azure",
                Name = "azure-rest-api-specs",
                Ref = "main",
                SparseCheckoutPaths = ["specification/contosowidgetmanager"]
            };
        }
    }

    /// <inheritdoc />
    public override string Prompt
    {
        get
        {
            return """
                Check the SDK generation pipeline status for build ID 5513110.
                My setup has already been verified, do not run azsdk_verify_setup.
                """;
        }
    }

    /// <inheritdoc />
    public override IEnumerable<IValidator> Validators
    {
        get
        {
            // The build ID from the prompt is the one input expected on the tool call.
            var expectedInputs = new Dictionary<string, object?>();
            expectedInputs["buildId"] = 5513110;

            var expectedCall = new ExpectedToolCall("azsdk_get_pipeline_status", expectedInputs);

            var toolCallValidator = new ToolCallValidator(
                "Expected tool: azsdk_get_pipeline_status",
                expectedToolCalls: [expectedCall],
                forbiddenToolNames: ["azsdk_verify_setup"]);

            return [toolCallValidator];
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using Azure.Sdk.Tools.Cli.Benchmarks.Models;
using Azure.Sdk.Tools.Cli.Benchmarks.Validation;
using Azure.Sdk.Tools.Cli.Benchmarks.Validation.Validators;

namespace Azure.Sdk.Tools.Cli.Benchmarks.Scenarios;

/// <summary>
/// Benchmark scenario that asks for a release plan to be created and expects the agent
/// to call the azsdk_create_release_plan tool.
/// Migrated from evaluation scenario: Evaluate_CreateReleasePlan.
/// </summary>
public class CreateReleasePlanScenario : BenchmarkScenario
{
    /// <inheritdoc />
    public override string Name
    {
        get { return "create-release-plan"; }
    }

    /// <inheritdoc />
    public override string Description
    {
        get { return "Verify the agent calls azsdk_create_release_plan with appropriate context."; }
    }

    /// <inheritdoc />
    public override string[] Tags
    {
        get { return new[] { "release-plan" }; }
    }

    /// <inheritdoc />
    public override RepoConfig Repo
    {
        get
        {
            // Sparse checkout of the specs repository, limited to the Contoso sample service.
            return new RepoConfig
            {
                Owner = "Azure",
                Name = "azure-rest-api-specs",
                Ref = "main",
                SparseCheckoutPaths = ["specification/contosowidgetmanager"]
            };
        }
    }

    /// <inheritdoc />
    public override string Prompt
    {
        get
        {
            return """
                Create a release plan for the Contoso Widget Manager, no need to get it afterwards only create.
                My setup has already been verified, do not run azsdk_verify_setup. Here is all the context you need:
                TypeSpec project located at "specification/contosowidgetmanager/Contoso.WidgetManager".
                Use service tree ID "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f",
                product tree ID "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e",
                target release timeline "December 2025",
                API version "2022-11-01-preview",
                SDK release type "beta",
                and link it to the spec pull request "https://github.com/Azure/azure-rest-api-specs/pull/38387".
                """;
        }
    }

    /// <inheritdoc />
    public override IEnumerable<IValidator> Validators
    {
        get
        {
            // Expected inputs reuse values quoted in the prompt; the release timeline is not checked.
            var expectedInputs = new Dictionary<string, object?>();
            expectedInputs["typeSpecProjectPath"] = "specification/contosowidgetmanager/Contoso.WidgetManager";
            expectedInputs["serviceTreeId"] = "a7f2b8e4-9c1d-4a3e-b6f9-2d8e5a7c3b1f";
            expectedInputs["productTreeId"] = "f1a8c5d2-6e4b-4f7a-9c2d-8b5e1f3a6c9e";
            expectedInputs["specApiVersion"] = "2022-11-01-preview";
            expectedInputs["specPullRequestUrl"] = "https://github.com/Azure/azure-rest-api-specs/pull/38387";
            expectedInputs["sdkReleaseType"] = "beta";

            var expectedCall = new ExpectedToolCall("azsdk_create_release_plan", expectedInputs);

            var toolCallValidator = new ToolCallValidator(
                "Expected tool: azsdk_create_release_plan",
                expectedToolCalls: [expectedCall],
                forbiddenToolNames: ["azsdk_verify_setup"]);

            return [toolCallValidator];
        }
    }
}
Loading