fix: display eval status per metric type instead of overall status

stefanoamorelli · stefanoamorelli · commit 4a8d456c1d5b · 2025-11-29T20:22:07.000+02:00
Previously, when viewing eval results, every message in an invocation showed the same pass/fail status: if response_match_score failed, tool calls would incorrectly show ❌ even when tool_trajectory_avg_score passed. Now, tool-related events (those with a functionCall or functionResponse part) display the tool_trajectory_avg_score result, while the other evaluated events, such as text responses, display the response_match_score result. This gives accurate per-metric feedback in the eval UI. Fixes #187
diff --git a/src/app/components/eval-tab/eval-tab.component.spec.ts b/src/app/components/eval-tab/eval-tab.component.spec.ts
@@ -93,4 +93,127 @@ describe('EvalTabComponent', () => {
   it('should create', () => {
     expect(component).toBeTruthy();
   });
+
+  describe('addEvalCaseResultToEvents', () => {
+    it('should show pass for tool events when tool_trajectory passes but response_match fails', () => {
+      const res = {
+        events: [
+          {author: 'user', content: {parts: [{text: 'hello'}]}},
+          {
+            author: 'bot',
+            content: {parts: [{functionCall: {name: 'test_tool', args: {}}}]},
+          },
+          {
+            author: 'bot',
+            content:
+                {parts: [{functionResponse: {name: 'test_tool', response: {}}}]},
+          },
+          {author: 'bot', content: {parts: [{text: 'final response'}]}},
+        ],
+      };
+
+      const evalCaseResult = {
+        setId: 'test-set',
+        evalId: 'test-eval',
+        finalEvalStatus: 2,
+        evalMetricResults: [],
+        overallEvalMetricResults: [],
+        evalMetricResultPerInvocation: [
+          {
+            evalMetricResults: [
+              {
+                metricName: 'tool_trajectory_avg_score',
+                evalStatus: 1,  // passed
+                score: 1,
+                threshold: 1,
+              },
+              {
+                metricName: 'response_match_score',
+                evalStatus: 2,  // failed
+                score: 0,
+                threshold: 0.7,
+              },
+            ],
+            actualInvocation: {
+              intermediateData: {toolUses: []},
+              finalResponse: {parts: [{text: 'actual'}]},
+            },
+            expectedInvocation: {
+              intermediateData: {toolUses: []},
+              finalResponse: {parts: [{text: 'expected'}]},
+            },
+          },
+        ],
+        sessionId: 'test-session',
+        sessionDetails: {},
+      };
+
+      const result =
+          (component as any).addEvalCaseResultToEvents(res, evalCaseResult);
+      // Tool events must follow the tool metric, the text reply the other.
+      expect(result.events[1].evalStatus)
+          .withContext('functionCall event should pass').toBe(1);
+      expect(result.events[2].evalStatus)
+          .withContext('functionResponse event should pass').toBe(1);
+      expect(result.events[3].evalStatus)
+          .withContext('final text response should fail').toBe(2);
+    });
+
+    it('should show fail for tool events when tool_trajectory fails but response_match passes', () => {
+      const res = {
+        events: [
+          {author: 'user', content: {parts: [{text: 'hello'}]}},
+          {
+            author: 'bot',
+            content: {parts: [{functionCall: {name: 'test_tool', args: {}}}]},
+          },
+          {author: 'bot', content: {parts: [{text: 'final response'}]}},
+        ],
+      };
+
+      const evalCaseResult = {
+        setId: 'test-set',
+        evalId: 'test-eval',
+        finalEvalStatus: 2,
+        evalMetricResults: [],
+        overallEvalMetricResults: [],
+        evalMetricResultPerInvocation: [
+          {
+            evalMetricResults: [
+              {
+                metricName: 'tool_trajectory_avg_score',
+                evalStatus: 2,  // failed
+                score: 0.5,
+                threshold: 1,
+              },
+              {
+                metricName: 'response_match_score',
+                evalStatus: 1,  // passed
+                score: 0.9,
+                threshold: 0.7,
+              },
+            ],
+            actualInvocation: {
+              intermediateData: {toolUses: [{name: 'wrong_tool', args: {}}]},
+              finalResponse: {parts: [{text: 'actual'}]},
+            },
+            expectedInvocation: {
+              intermediateData: {toolUses: [{name: 'test_tool', args: {}}]},
+              finalResponse: {parts: [{text: 'expected'}]},
+            },
+          },
+        ],
+        sessionId: 'test-session',
+        sessionDetails: {},
+      };
+
+      const result =
+          (component as any).addEvalCaseResultToEvents(res, evalCaseResult);
+      // Tool events must follow the tool metric, the text reply the other.
+      expect(result.events[1].evalStatus)
+          .withContext('functionCall event should fail').toBe(2);
+      expect(result.events[2].evalStatus)
+          .withContext('final text response should pass').toBe(1);
+    });
+  });
 });
diff --git a/src/app/components/eval-tab/eval-tab.component.ts b/src/app/components/eval-tab/eval-tab.component.ts
@@ -324,16 +324,24 @@ export class EvalTabComponent implements OnInit, OnChanges {
           currentInvocationIndex++;
         } else {
           const invocationResult = invocationResults[currentInvocationIndex];
+          const isToolEvent = this.isToolRelatedEvent(event);
+          const relevantMetric = isToolEvent ?
+              'tool_trajectory_avg_score' :
+              'response_match_score';
+
           let evalStatus = 1;
           let failedMetric = '';
           let score = 1;
           let threshold = 1;
+
           for (const evalMetricResult of invocationResult.evalMetricResults) {
-            if (evalMetricResult.evalStatus === 2) {
-              evalStatus = 2;
-              failedMetric = evalMetricResult.metricName;
-              score = evalMetricResult.score;
-              threshold = evalMetricResult.threshold;
+            if (evalMetricResult.metricName === relevantMetric) {
+              evalStatus = evalMetricResult.evalStatus;
+              if (evalMetricResult.evalStatus === 2) {
+                failedMetric = evalMetricResult.metricName;
+                score = evalMetricResult.score;
+                threshold = evalMetricResult.threshold;
+              }
               break;
             }
           }
@@ -350,6 +358,14 @@ export class EvalTabComponent implements OnInit, OnChanges {
     return res;
   }
 
+  /** True when any content part is a functionCall or functionResponse. */
+  private isToolRelatedEvent(event: any): boolean {
+    const parts = event.content && event.content.parts;
+    if (!parts) return false;
+    return parts.some(
+        (p: any) => p.functionCall || p.functionResponse);
+  }
+
   private addEvalFieldsToBotEvent(
       event: any, invocationResult: any, failedMetric: string, score: number,
       threshold: number) {