diff --git a/src/app/components/eval-tab/eval-tab.component.spec.ts b/src/app/components/eval-tab/eval-tab.component.spec.ts index b5372c12..055a93b1 100644 --- a/src/app/components/eval-tab/eval-tab.component.spec.ts +++ b/src/app/components/eval-tab/eval-tab.component.spec.ts @@ -93,4 +93,127 @@ describe('EvalTabComponent', () => { it('should create', () => { expect(component).toBeTruthy(); }); + + /** Specs for addEvalCaseResultToEvents: they expect functionCall and functionResponse events to carry the evalStatus of tool_trajectory_avg_score, and text events to carry the evalStatus of response_match_score. */ describe('addEvalCaseResultToEvents', () => { + /* Fixture: tool_trajectory_avg_score has evalStatus 1 and response_match_score has evalStatus 2, so events 1 and 2 (tool call and tool response) are expected to carry 1 and event 3 (final text) is expected to carry 2. */ it('should show pass for tool events when tool_trajectory passes but response_match fails', () => { + const res = { + events: [ + {author: 'user', content: {parts: [{text: 'hello'}]}}, + { + author: 'bot', + content: {parts: [{functionCall: {name: 'test_tool', args: {}}}]}, + }, + { + author: 'bot', + content: + {parts: [{functionResponse: {name: 'test_tool', response: {}}}]}, + }, + {author: 'bot', content: {parts: [{text: 'final response'}]}}, + ], + }; + + const evalCaseResult = { + setId: 'test-set', + evalId: 'test-eval', + finalEvalStatus: 2, + evalMetricResults: [], + overallEvalMetricResults: [], + evalMetricResultPerInvocation: [ + { + evalMetricResults: [ + { + metricName: 'tool_trajectory_avg_score', + evalStatus: 1, + score: 1, + threshold: 1, + }, + { + metricName: 'response_match_score', + evalStatus: 2, + score: 0, + threshold: 0.7, + }, + ], + actualInvocation: { + intermediateData: {toolUses: []}, + finalResponse: {parts: [{text: 'actual'}]}, + }, + expectedInvocation: { + intermediateData: {toolUses: []}, + finalResponse: {parts: [{text: 'expected'}]}, + }, + }, + ], + sessionId: 'test-session', + sessionDetails: {}, + }; + + /* The any cast switches off type checking for this call; presumably addEvalCaseResultToEvents is not public on the component - confirm. */ const result = + (component as any).addEvalCaseResultToEvents(res, evalCaseResult); + + /* NOTE(review): the second argument to toBe is meant as a failure description; confirm the test framework version in use accepts and reports it. */ expect(result.events[1].evalStatus) + .toBe(1, 'functionCall event should pass'); + expect(result.events[2].evalStatus) + .toBe(1, 'functionResponse event should pass'); + expect(result.events[3].evalStatus) + .toBe(2, 'final text response should fail'); + }); + + /* Inverse fixture: tool_trajectory_avg_score has evalStatus 2 and response_match_score has evalStatus 1, so event 1 (tool call) is expected to carry 2 and event 2 (final text) is expected to carry 1. */ it('should show fail for tool 
events when tool_trajectory fails but response_match passes', () => { + const res = { + events: [ + {author: 'user', content: {parts: [{text: 'hello'}]}}, + { + author: 'bot', + content: {parts: [{functionCall: {name: 'test_tool', args: {}}}]}, + }, + {author: 'bot', content: {parts: [{text: 'final response'}]}}, + ], + }; + + const evalCaseResult = { + setId: 'test-set', + evalId: 'test-eval', + finalEvalStatus: 2, + evalMetricResults: [], + overallEvalMetricResults: [], + evalMetricResultPerInvocation: [ + { + evalMetricResults: [ + { + metricName: 'tool_trajectory_avg_score', + evalStatus: 2, + score: 0.5, + threshold: 1, + }, + { + metricName: 'response_match_score', + evalStatus: 1, + score: 0.9, + threshold: 0.7, + }, + ], + actualInvocation: { + intermediateData: {toolUses: [{name: 'wrong_tool', args: {}}]}, + finalResponse: {parts: [{text: 'actual'}]}, + }, + expectedInvocation: { + intermediateData: {toolUses: [{name: 'test_tool', args: {}}]}, + finalResponse: {parts: [{text: 'expected'}]}, + }, + }, + ], + sessionId: 'test-session', + sessionDetails: {}, + }; + + const result = + (component as any).addEvalCaseResultToEvents(res, evalCaseResult); + + expect(result.events[1].evalStatus) + .toBe(2, 'functionCall event should fail'); + expect(result.events[2].evalStatus) + .toBe(1, 'final text response should pass'); + }); + }); }); diff --git a/src/app/components/eval-tab/eval-tab.component.ts b/src/app/components/eval-tab/eval-tab.component.ts index 520d1261..7c88f853 100644 --- a/src/app/components/eval-tab/eval-tab.component.ts +++ b/src/app/components/eval-tab/eval-tab.component.ts @@ -324,16 +324,24 @@ export class EvalTabComponent implements OnInit, OnChanges { currentInvocationIndex++; } else { const invocationResult = invocationResults[currentInvocationIndex]; + const isToolEvent = this.isToolRelatedEvent(event); + const relevantMetric = isToolEvent ? 
+ 'tool_trajectory_avg_score' : + 'response_match_score'; + /* The defaults below are kept unchanged when no entry of evalMetricResults has a metricName equal to relevantMetric. */ let evalStatus = 1; let failedMetric = ''; let score = 1; let threshold = 1; + /* Only the first entry whose metricName equals relevantMetric decides the status of this event, and the loop stops there; failedMetric, score and threshold are copied only when that entry has evalStatus 2. NOTE(review): entries with any other metricName are ignored here even when their evalStatus is 2 - confirm this is intended. */ for (const evalMetricResult of invocationResult.evalMetricResults) { - if (evalMetricResult.evalStatus === 2) { - evalStatus = 2; - failedMetric = evalMetricResult.metricName; - score = evalMetricResult.score; - threshold = evalMetricResult.threshold; + if (evalMetricResult.metricName === relevantMetric) { + evalStatus = evalMetricResult.evalStatus; + if (evalMetricResult.evalStatus === 2) { + failedMetric = evalMetricResult.metricName; + score = evalMetricResult.score; + threshold = evalMetricResult.threshold; + } break; } } @@ -350,6 +358,14 @@ export class EvalTabComponent implements OnInit, OnChanges { return res; } + /** Returns true when event.content.parts holds at least one part with a truthy functionCall or functionResponse; returns false when content or parts is missing. */ private isToolRelatedEvent(event: any): boolean { + if (!event.content || !event.content.parts) { + return false; + } + return event.content.parts.some( + (part: any) => part.functionCall || part.functionResponse); + } + private addEvalFieldsToBotEvent( event: any, invocationResult: any, failedMetric: string, score: number, threshold: number) {