Skip to content

Commit 4a8d456

Browse files
fix: display eval status per metric type instead of overall status
Previously, when viewing eval results, all messages in an invocation showed the same pass/fail status. If response_match_score failed, tool calls would incorrectly show ❌ even when tool_trajectory_avg_score passed. Now, tool-related events (functionCall, functionResponse) display the tool_trajectory_avg_score result, while text responses display the response_match_score result. This gives accurate per-metric feedback in the eval UI. Fixes #187
1 parent 56bf9fd commit 4a8d456

File tree

2 files changed

+144
-5
lines changed

2 files changed

+144
-5
lines changed

src/app/components/eval-tab/eval-tab.component.spec.ts

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,4 +93,127 @@ describe('EvalTabComponent', () => {
9393
it('should create', () => {
9494
expect(component).toBeTruthy();
9595
});
96+
97+
describe('addEvalCaseResultToEvents', () => {
  // Status codes as used by the fixtures below; the expectation messages in
  // these specs label 1 as "pass" and 2 as "fail".
  const PASSED = 1;
  const FAILED = 2;

  /** Builds a user-authored event carrying a single text part. */
  const userEvent = (text: string) =>
      ({author: 'user', content: {parts: [{text}]}});

  /** Builds a bot-authored event carrying the single given part. */
  const botEvent = (part: any) => ({author: 'bot', content: {parts: [part]}});

  /** Builds one per-invocation metric result entry. */
  const metric =
      (metricName: string, evalStatus: number, score: number,
       threshold: number) => ({metricName, evalStatus, score, threshold});

  /** Builds an actual/expected invocation payload. */
  const invocation = (toolUses: any[], text: string) => ({
    intermediateData: {toolUses},
    finalResponse: {parts: [{text}]},
  });

  /** Wraps a single per-invocation result in a full eval case result. */
  const evalCaseResultFor = (perInvocationResult: any) => ({
    setId: 'test-set',
    evalId: 'test-eval',
    finalEvalStatus: FAILED,
    evalMetricResults: [],
    overallEvalMetricResults: [],
    evalMetricResultPerInvocation: [perInvocationResult],
    sessionId: 'test-session',
    sessionDetails: {},
  });

  /** Runs the private method under test and returns its annotated events. */
  const annotate = (events: any[], evalCaseResult: any) =>
      (component as any)
          .addEvalCaseResultToEvents({events}, evalCaseResult)
          .events;

  it('should show pass for tool events when tool_trajectory passes but response_match fails', () => {
    const events = [
      userEvent('hello'),
      botEvent({functionCall: {name: 'test_tool', args: {}}}),
      botEvent({functionResponse: {name: 'test_tool', response: {}}}),
      botEvent({text: 'final response'}),
    ];
    const evalCaseResult = evalCaseResultFor({
      evalMetricResults: [
        metric('tool_trajectory_avg_score', PASSED, 1, 1),
        metric('response_match_score', FAILED, 0, 0.7),
      ],
      actualInvocation: invocation([], 'actual'),
      expectedInvocation: invocation([], 'expected'),
    });

    const annotated = annotate(events, evalCaseResult);

    // Tool events follow the trajectory metric; the text reply follows the
    // response-match metric.
    expect(annotated[1].evalStatus).toBe(1, 'functionCall event should pass');
    expect(annotated[2].evalStatus)
        .toBe(1, 'functionResponse event should pass');
    expect(annotated[3].evalStatus).toBe(2, 'final text response should fail');
  });

  it('should show fail for tool events when tool_trajectory fails but response_match passes', () => {
    const events = [
      userEvent('hello'),
      botEvent({functionCall: {name: 'test_tool', args: {}}}),
      botEvent({text: 'final response'}),
    ];
    const evalCaseResult = evalCaseResultFor({
      evalMetricResults: [
        metric('tool_trajectory_avg_score', FAILED, 0.5, 1),
        metric('response_match_score', PASSED, 0.9, 0.7),
      ],
      actualInvocation: invocation([{name: 'wrong_tool', args: {}}], 'actual'),
      expectedInvocation:
          invocation([{name: 'test_tool', args: {}}], 'expected'),
    });

    const annotated = annotate(events, evalCaseResult);

    expect(annotated[1].evalStatus).toBe(2, 'functionCall event should fail');
    expect(annotated[2].evalStatus).toBe(1, 'final text response should pass');
  });
});
96219
});

src/app/components/eval-tab/eval-tab.component.ts

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -324,16 +324,24 @@ export class EvalTabComponent implements OnInit, OnChanges {
324324
currentInvocationIndex++;
325325
} else {
326326
const invocationResult = invocationResults[currentInvocationIndex];
327+
const isToolEvent = this.isToolRelatedEvent(event);
328+
const relevantMetric = isToolEvent ?
329+
'tool_trajectory_avg_score' :
330+
'response_match_score';
331+
327332
let evalStatus = 1;
328333
let failedMetric = '';
329334
let score = 1;
330335
let threshold = 1;
336+
331337
for (const evalMetricResult of invocationResult.evalMetricResults) {
332-
if (evalMetricResult.evalStatus === 2) {
333-
evalStatus = 2;
334-
failedMetric = evalMetricResult.metricName;
335-
score = evalMetricResult.score;
336-
threshold = evalMetricResult.threshold;
338+
if (evalMetricResult.metricName === relevantMetric) {
339+
evalStatus = evalMetricResult.evalStatus;
340+
if (evalMetricResult.evalStatus === 2) {
341+
failedMetric = evalMetricResult.metricName;
342+
score = evalMetricResult.score;
343+
threshold = evalMetricResult.threshold;
344+
}
337345
break;
338346
}
339347
}
@@ -350,6 +358,14 @@ export class EvalTabComponent implements OnInit, OnChanges {
350358
return res;
351359
}
352360

361+
/**
 * Reports whether an event carries a tool interaction, i.e. whether any of
 * its content parts has a truthy `functionCall` or `functionResponse`.
 *
 * Tolerates malformed input: a missing event, missing content, a `parts`
 * value that is not an array, and null/undefined entries inside `parts` all
 * yield `false` instead of throwing.
 *
 * @param event The event to inspect; expected to look like
 *     `{content: {parts: [...]}}`.
 * @returns `true` if at least one part is a function call or function
 *     response, `false` otherwise.
 */
private isToolRelatedEvent(event: any): boolean {
  const parts = event && event.content ? event.content.parts : undefined;
  if (!Array.isArray(parts)) {
    return false;
  }
  // Coerce explicitly so the predicate yields a strict boolean rather than
  // the functionCall/functionResponse object itself.
  return parts.some(
      (part: any) =>
          Boolean(part && (part.functionCall || part.functionResponse)));
}
368+
353369
private addEvalFieldsToBotEvent(
354370
event: any, invocationResult: any, failedMetric: string, score: number,
355371
threshold: number) {

0 commit comments

Comments
 (0)