Skip to content

Commit 796ce43

Browse files
authored
Generate HTML view for evals (#12408)
* Fix async methods * Reporting Html
1 parent ff9cf24 commit 796ce43

3 files changed

Lines changed: 65 additions & 17 deletions

File tree

tools/ai-evals/azsdk-mcp/Evaluators/ExpectedToolInputEvaluator.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ public ValueTask<EvaluationResult> EvaluateAsync(
3030
}
3131

3232
// Get tool calls to compare them
33-
var expectedToolCalls = await GetToolContent(context.ChatMessages, context.ToolNames, true);
34-
var actualToolCalls = await GetToolContent(modelResponse.Messages, context.ToolNames, false);
33+
var expectedToolCalls = GetToolContent(context.ChatMessages, context.ToolNames, true);
34+
var actualToolCalls = GetToolContent(modelResponse.Messages, context.ToolNames, false);
3535

3636
// Make sure we have tool calls to compare
3737
if (!expectedToolCalls.Any())
@@ -144,7 +144,7 @@ private static void Interpret(BooleanMetric metric)
144144
}
145145
}
146146

147-
private async Task<IEnumerable<FunctionCallContent>> GetToolContent(IEnumerable<ChatMessage> messages, IEnumerable<string> toolNames, bool simplify)
147+
private IEnumerable<FunctionCallContent> GetToolContent(IEnumerable<ChatMessage> messages, IEnumerable<string> toolNames, bool simplify)
148148
{
149149
var result = messages
150150
.Where(message => message.Role == ChatRole.Assistant)

tools/ai-evals/azsdk-mcp/Scenarios/AzsdkTypeSpecGeneration_Step02_TypespecValidation.cs

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
using Azure.Sdk.Tools.McpEvals.Models;
55
using Microsoft.Extensions.AI.Evaluation;
66
using Microsoft.Extensions.AI.Evaluation.Reporting;
7+
using Microsoft.Extensions.AI.Evaluation.Reporting.Formats.Html;
8+
using Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
9+
using ModelContextProtocol.Protocol;
710
using NUnit.Framework;
811

912
namespace Azure.Sdk.Tools.McpEvals.Scenarios
@@ -19,15 +22,23 @@ public async Task AzsdkTypeSpecGeneration_Step02_TypespecValidation()
1922
var fullChat = json.ChatHistory.Append(json.NextMessage);
2023

2124
// 2. Get chat response
22-
var expectedToolCalls = SerializationHelper.NumberOfToolCalls(json.ExpectedOutcome, ToolNames);
23-
var response = await ChatCompletion!.GetChatResponseAsync(fullChat, expectedToolCalls);
25+
var expectedToolCalls = SerializationHelper.NumberOfToolCalls(json.ExpectedOutcome, s_toolNames);
26+
var response = await s_chatCompletion!.GetChatResponseAsync(fullChat, expectedToolCalls);
2427

2528
// 3. Custom Evaluator to check tool inputs
26-
var expectedToolInputEvaluator = new ExpectedToolInputEvaluator();
29+
// Layers the reporting configuration on top of the custom evaluator to produce a nice HTML report.
30+
// This cannot be made static because each test has to define which evaluators it wants to use.
31+
var reportingConfiguration = DiskBasedReportingConfiguration.Create(
32+
executionName: s_executionName, // Having a static execution name allows us to see all results in one report
33+
storageRootPath: ReportingPath,
34+
evaluators: [new ExpectedToolInputEvaluator()], // In this test we only want to run the ExpectedToolInputEvaluator
35+
chatConfiguration: s_chatConfig,
36+
enableResponseCaching: true);
37+
await using ScenarioRun scenarioRun = await reportingConfiguration.CreateScenarioRunAsync(this.ScenarioName);
2738

28-
// Pass the expected outcome through the additional context.
29-
var additionalContext = new ExpectedToolInputEvaluatorContext(json.ExpectedOutcome, ToolNames);
30-
var result = await expectedToolInputEvaluator.EvaluateAsync(fullChat, response, additionalContext: [additionalContext]);
39+
// Pass the expected outcome through the additional context, then run the evaluation.
40+
var additionalContext = new ExpectedToolInputEvaluatorContext(json.ExpectedOutcome, s_toolNames);
41+
var result = await scenarioRun.EvaluateAsync(fullChat, response, additionalContext: [additionalContext]);
3142

3243
// 4. Assert the results
3344
EvaluationRating[] expectedRatings = [EvaluationRating.Good, EvaluationRating.Exceptional];
Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
1+
using AwesomeAssertions.Specialized;
2+
using Azure.Sdk.Tools.McpEvals.Evaluators;
13
using Azure.Sdk.Tools.McpEvals.Helpers;
24
using Microsoft.Extensions.AI;
5+
using Microsoft.Extensions.AI.Evaluation;
6+
using Microsoft.Extensions.AI.Evaluation.Quality;
7+
using Microsoft.Extensions.AI.Evaluation.Reporting;
8+
using Microsoft.Extensions.AI.Evaluation.Reporting.Formats.Html;
9+
using Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
310
using ModelContextProtocol.Client;
411
using NUnit.Framework;
512

@@ -9,18 +16,48 @@ namespace Azure.Sdk.Tools.McpEvals.Scenarios
916
public partial class Scenario
1017
{
1118
// Static services shared across all tests
12-
protected static IChatClient? ChatClient;
13-
protected static IMcpClient? McpClient;
14-
protected static ChatCompletion? ChatCompletion;
15-
protected static IEnumerable<string> ToolNames;
19+
protected static IChatClient? s_chatClient;
20+
protected static IMcpClient? s_mcpClient;
21+
protected static ChatCompletion? s_chatCompletion;
22+
protected static IEnumerable<string> s_toolNames;
23+
protected static ReportingConfiguration s_reportingConfiguration;
24+
protected static ChatConfiguration s_chatConfig;
25+
private static string s_executionName;
26+
private string ScenarioName => $"{TestContext.CurrentContext.Test.ClassName}.{TestContext.CurrentContext.Test.Name}";
27+
private string ReportingPath => Path.Combine(TestContext.CurrentContext.TestDirectory, "reports");
28+
1629

1730
[OneTimeSetUp]
1831
public async Task GlobalSetup()
1932
{
20-
ChatClient = TestSetup.GetChatClient();
21-
McpClient = await TestSetup.GetMcpClientAsync();
22-
ChatCompletion = TestSetup.GetChatCompletion(ChatClient, McpClient);
23-
ToolNames = (await McpClient.ListToolsAsync()).Select(tool => tool.Name)!;
33+
s_chatClient = TestSetup.GetChatClient();
34+
s_mcpClient = await TestSetup.GetMcpClientAsync();
35+
s_chatConfig = new ChatConfiguration(s_chatClient);
36+
s_chatCompletion = TestSetup.GetChatCompletion(s_chatClient, s_mcpClient);
37+
s_toolNames = (await s_mcpClient.ListToolsAsync()).Select(tool => tool.Name)!;
38+
s_executionName = $"{DateTime.Now:yyyyMMddTHHmmss}";
39+
}
40+
41+
42+
[OneTimeTearDown]
43+
public async Task GlobalTearDown()
44+
{
45+
// Generate an HTML report for all the evaluations run
46+
IEvaluationResultStore resultStore = new DiskBasedResultStore(ReportingPath);
47+
var allResults = new List<ScenarioRunResult>();
48+
49+
await foreach (string executionName in resultStore.GetLatestExecutionNamesAsync(count: 1))
50+
{
51+
await foreach (ScenarioRunResult scenarioResult in resultStore.ReadResultsAsync(executionName))
52+
{
53+
allResults.Add(scenarioResult);
54+
}
55+
}
56+
57+
var timestamp = DateTime.Now.ToString("yyyyMMdd_HHmmss");
58+
string reportFilePath = Path.Combine(ReportingPath, $"report-{timestamp}.html");
59+
IEvaluationReportWriter reportWriter = new HtmlReportWriter(reportFilePath);
60+
await reportWriter.WriteReportAsync(allResults);
2461
}
2562
}
2663
}

0 commit comments

Comments
 (0)