Generate HTML view for evals (#12408)

jeo02 · web-flow · commit 796ce43276a4 · 2025-10-08T10:33:25.000-07:00
* Fix async methods

* Reporting HTML
diff --git a/tools/ai-evals/azsdk-mcp/Evaluators/ExpectedToolInputEvaluator.cs b/tools/ai-evals/azsdk-mcp/Evaluators/ExpectedToolInputEvaluator.cs
@@ -30,8 +30,8 @@ public ValueTask<EvaluationResult> EvaluateAsync(
             }
 
             // Get tool calls to compare them
-            var expectedToolCalls = await GetToolContent(context.ChatMessages, context.ToolNames, true);
-            var actualToolCalls = await GetToolContent(modelResponse.Messages, context.ToolNames, false);
+            var expectedToolCalls = GetToolContent(context.ChatMessages, context.ToolNames, true);
+            var actualToolCalls = GetToolContent(modelResponse.Messages, context.ToolNames, false);
 
             // Make sure we have tool calls to compare
             if (!expectedToolCalls.Any())
@@ -144,7 +144,7 @@ private static void Interpret(BooleanMetric metric)
             }
         }
 
-        private async Task<IEnumerable<FunctionCallContent>> GetToolContent(IEnumerable<ChatMessage> messages, IEnumerable<string> toolNames, bool simplify)
+        private IEnumerable<FunctionCallContent> GetToolContent(IEnumerable<ChatMessage> messages, IEnumerable<string> toolNames, bool simplify)
         {
             var result = messages
                 .Where(message => message.Role == ChatRole.Assistant)
diff --git a/tools/ai-evals/azsdk-mcp/Scenarios/AzsdkTypeSpecGeneration_Step02_TypespecValidation.cs b/tools/ai-evals/azsdk-mcp/Scenarios/AzsdkTypeSpecGeneration_Step02_TypespecValidation.cs
@@ -4,6 +4,9 @@
 using Azure.Sdk.Tools.McpEvals.Models;
 using Microsoft.Extensions.AI.Evaluation;
 using Microsoft.Extensions.AI.Evaluation.Reporting;
+using Microsoft.Extensions.AI.Evaluation.Reporting.Formats.Html;
+using Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
+using ModelContextProtocol.Protocol;
 using NUnit.Framework;
 
 namespace Azure.Sdk.Tools.McpEvals.Scenarios
@@ -19,15 +22,23 @@ public async Task AzsdkTypeSpecGeneration_Step02_TypespecValidation()
             var fullChat = json.ChatHistory.Append(json.NextMessage);
 
             // 2. Get chat response
-            var expectedToolCalls = SerializationHelper.NumberOfToolCalls(json.ExpectedOutcome, ToolNames);
-            var response = await ChatCompletion!.GetChatResponseAsync(fullChat, expectedToolCalls);
+            var expectedToolCalls = SerializationHelper.NumberOfToolCalls(json.ExpectedOutcome, s_toolNames);
+            var response = await s_chatCompletion!.GetChatResponseAsync(fullChat, expectedToolCalls);
 
             // 3. Custom Evaluator to check tool inputs
-            var expectedToolInputEvaluator = new ExpectedToolInputEvaluator();
+            // Layer a reporting configuration on top of the evaluator so the run is stored and can be rendered as an HTML report.
+            // The configuration cannot be static because each test defines which evaluators it wants to use.
+            var reportingConfiguration = DiskBasedReportingConfiguration.Create(
+                executionName: s_executionName,                     // Having a static execution name allows us to see all results in one report
+                storageRootPath: ReportingPath,
+                evaluators: [new ExpectedToolInputEvaluator()],     // In this test we only want to run the ExpectedToolInputEvaluator
+                chatConfiguration: s_chatConfig,
+                enableResponseCaching: true);
+            await using ScenarioRun scenarioRun = await reportingConfiguration.CreateScenarioRunAsync(this.ScenarioName);
 
-            // Pass the expected outcome through the additional context. 
-            var additionalContext = new ExpectedToolInputEvaluatorContext(json.ExpectedOutcome, ToolNames);
-            var result = await expectedToolInputEvaluator.EvaluateAsync(fullChat, response, additionalContext: [additionalContext]);
+            // Pass the expected outcome through the additional context, then run the evaluation.
+            var additionalContext = new ExpectedToolInputEvaluatorContext(json.ExpectedOutcome, s_toolNames);
+            var result = await scenarioRun.EvaluateAsync(fullChat, response, additionalContext: [additionalContext]);
 
             // 4. Assert the results
             EvaluationRating[] expectedRatings = [EvaluationRating.Good, EvaluationRating.Exceptional];
diff --git a/tools/ai-evals/azsdk-mcp/Scenarios/Scenario.cs b/tools/ai-evals/azsdk-mcp/Scenarios/Scenario.cs
@@ -1,5 +1,12 @@
+using AwesomeAssertions.Specialized;
+using Azure.Sdk.Tools.McpEvals.Evaluators;
 using Azure.Sdk.Tools.McpEvals.Helpers;
 using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+using Microsoft.Extensions.AI.Evaluation.Quality;
+using Microsoft.Extensions.AI.Evaluation.Reporting;
+using Microsoft.Extensions.AI.Evaluation.Reporting.Formats.Html;
+using Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
 using ModelContextProtocol.Client;
 using NUnit.Framework;
 
@@ -9,18 +16,48 @@ namespace Azure.Sdk.Tools.McpEvals.Scenarios
     public partial class Scenario
     {
         // Static services shared across all tests
-        protected static IChatClient? ChatClient;
-        protected static IMcpClient? McpClient;
-        protected static ChatCompletion? ChatCompletion;
-        protected static IEnumerable<string> ToolNames;
+        protected static IChatClient? s_chatClient;
+        protected static IMcpClient? s_mcpClient;
+        protected static ChatCompletion? s_chatCompletion;
+        protected static IEnumerable<string> s_toolNames;
+        protected static ReportingConfiguration s_reportingConfiguration;
+        protected static ChatConfiguration s_chatConfig;
+        private static string s_executionName;
+        private string ScenarioName => $"{TestContext.CurrentContext.Test.ClassName}.{TestContext.CurrentContext.Test.Name}";
+        private string ReportingPath => Path.Combine(TestContext.CurrentContext.TestDirectory, "reports");
+
 
         [OneTimeSetUp]
         public async Task GlobalSetup()
         {
-            ChatClient = TestSetup.GetChatClient();
-            McpClient = await TestSetup.GetMcpClientAsync();
-            ChatCompletion = TestSetup.GetChatCompletion(ChatClient, McpClient);
-            ToolNames = (await McpClient.ListToolsAsync()).Select(tool => tool.Name)!;
+            s_chatClient = TestSetup.GetChatClient();
+            s_mcpClient = await TestSetup.GetMcpClientAsync();
+            s_chatConfig = new ChatConfiguration(s_chatClient);
+            s_chatCompletion = TestSetup.GetChatCompletion(s_chatClient, s_mcpClient);
+            s_toolNames = (await s_mcpClient.ListToolsAsync()).Select(tool => tool.Name)!;
+            s_executionName = $"{DateTime.Now:yyyyMMddTHHmmss}";
+        }
+
+
+        /// <summary>
+        /// Writes one HTML report covering every scenario result stored under this run's execution name.
+        /// </summary>
+        [OneTimeTearDown]
+        public async Task GlobalTearDown()
+        {
+            // Read this run's own execution: the "latest" one in the shared reports folder may belong to an earlier run.
+            if (string.IsNullOrEmpty(s_executionName)) { return; } // GlobalSetup never assigned a name, so nothing to report
+            IEvaluationResultStore resultStore = new DiskBasedResultStore(ReportingPath);
+            var allResults = new List<ScenarioRunResult>();
+            await foreach (ScenarioRunResult scenarioResult in resultStore.ReadResultsAsync(s_executionName))
+            {
+                allResults.Add(scenarioResult);
+            }
+
+            var timestamp = DateTime.Now.ToString("yyyyMMdd_HHmmss");
+            string reportFilePath = Path.Combine(ReportingPath, $"report-{timestamp}.html");
+            IEvaluationReportWriter reportWriter = new HtmlReportWriter(reportFilePath);
+            await reportWriter.WriteReportAsync(allResults);
+        }
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -30,8 +30,8 @@ public ValueTask<EvaluationResult> EvaluateAsync(`
`30`	`30`	`}`
`31`	`31`
`32`	`32`	`// Get tool calls to compare them`
`33`		`- var expectedToolCalls = await GetToolContent(context.ChatMessages, context.ToolNames, true);`
`34`		`- var actualToolCalls = await GetToolContent(modelResponse.Messages, context.ToolNames, false);`
	`33`	`+ var expectedToolCalls = GetToolContent(context.ChatMessages, context.ToolNames, true);`
	`34`	`+ var actualToolCalls = GetToolContent(modelResponse.Messages, context.ToolNames, false);`
`35`	`35`
`36`	`36`	`// Make sure we have tool calls to compare`
`37`	`37`	`if (!expectedToolCalls.Any())`
`@@ -144,7 +144,7 @@ private static void Interpret(BooleanMetric metric)`
`144`	`144`	`}`
`145`	`145`	`}`
`146`	`146`
`147`		`- private async Task<IEnumerable<FunctionCallContent>> GetToolContent(IEnumerable<ChatMessage> messages, IEnumerable<string> toolNames, bool simplify)`
	`147`	`+ private IEnumerable<FunctionCallContent> GetToolContent(IEnumerable<ChatMessage> messages, IEnumerable<string> toolNames, bool simplify)`
`148`	`148`	`{`
`149`	`149`	`var result = messages`
`150`	`150`	`.Where(message => message.Role == ChatRole.Assistant)`