From d049868851a22bc4a78a388334768d63141de34e Mon Sep 17 00:00:00 2001 From: Zain Dana Harper Date: Tue, 30 Jun 2026 07:41:53 -0700 Subject: [PATCH] Fix red main CI: format TypeScript sources with Prettier The TypeScript Lint workflow runs `prettier --check "src/**/*.ts" "test/**/*.ts"` and currently fails on main with "Code style issues found in 66 files", which cancels the rest of the TypeScript CI matrix. This applies `prettier --write` (pinned prettier 3.6.2 from typescript/package.json) over those files and contains no other changes, so the lint job passes again. The two pre-existing eslint unused-variable *warnings* are left untouched as they do not fail the build. Co-Authored-By: Claude Opus 4.8 --- typescript/src/evaluate/compare.ts | 12 +- typescript/src/evaluate/confident.ts | 107 ++++++++++-------- typescript/src/evaluate/console-report.ts | 38 +++++-- typescript/src/evaluate/evaluate.ts | 5 +- typescript/src/evaluate/trace-eval.ts | 7 +- .../langchain/callback-handler.ts | 20 +++- .../answer-relevancy/answer-relevancy.ts | 39 +++++-- .../src/metrics/answer-relevancy/schema.ts | 4 +- .../src/metrics/arena-g-eval/arena-g-eval.ts | 19 ++-- typescript/src/metrics/arena-g-eval/utils.ts | 5 +- .../argument-correctness.ts | 33 ++++-- typescript/src/metrics/bias/bias.ts | 35 ++++-- .../contextual-precision.ts | 34 ++++-- .../contextual-recall/contextual-recall.ts | 40 ++++--- .../contextual-relevancy.ts | 40 ++++--- .../conversation-completeness.ts | 55 +++++---- .../conversational-g-eval.ts | 50 ++++---- .../src/metrics/faithfulness/faithfulness.ts | 57 +++++++--- typescript/src/metrics/g-eval/g-eval.ts | 51 ++++++--- typescript/src/metrics/g-eval/utils.ts | 8 +- .../metrics/goal-accuracy/goal-accuracy.ts | 29 +++-- .../metrics/hallucination/hallucination.ts | 30 +++-- .../json-correctness/json-correctness.ts | 19 +++- .../knowledge-retention.ts | 55 +++++---- .../metrics/mcp-use-metric/mcp-use-metric.ts | 35 ++++-- .../src/metrics/mcp/mcp-task-completion.ts | 13 ++- .../src/metrics/mcp/multi-turn-mcp-use.ts | 40 ++++--- typescript/src/metrics/mcp/schema.ts | 15 ++- typescript/src/metrics/misuse/misuse.ts | 39 +++++-- .../image-coherence/index.ts | 5 +- .../multimodal-metrics/image-editing/index.ts | 5 +- .../image-helpfulness/index.ts | 5 +- .../image-reference/index.ts | 5 +- .../multimodal-metrics/text-to-image/index.ts | 5 +- .../src/metrics/non-advice/non-advice.ts | 39 +++++-- .../src/metrics/pii-leakage/pii-leakage.ts | 24 ++-- .../src/metrics/plan-adherence/index.ts | 5 +- .../metrics/plan-adherence/plan-adherence.ts | 17 ++- typescript/src/metrics/plan-quality/index.ts | 5 +- .../src/metrics/plan-quality/plan-quality.ts | 22 +++- .../prompt-alignment/prompt-alignment.ts | 37 ++++-- .../metrics/role-adherence/role-adherence.ts | 28 ++--- .../metrics/role-violation/role-violation.ts | 37 ++++-- .../src/metrics/step-efficiency/index.ts | 5 +- .../metrics/summarization/summarization.ts | 60 ++++++---- .../src/metrics/task-completion/index.ts | 5 +- .../task-completion/task-completion.ts | 17 ++- .../tool-correctness/tool-correctness.ts | 21 ++-- typescript/src/metrics/tool-use/tool-use.ts | 23 ++-- .../topic-adherence/topic-adherence.ts | 51 +++++---- typescript/src/metrics/toxicity/toxicity.ts | 35 ++++-- .../turn-contextual-precision.ts | 68 +++++++---- .../turn-contextual-recall.ts | 65 ++++++----- .../turn-contextual-relevancy.ts | 71 +++++++----- .../turn-faithfulness/turn-faithfulness.ts | 73 +++++++----- .../metrics/turn-relevancy/turn-relevancy.ts | 37 +++--- .../src/models/gateways/openrouter-model.ts | 4 +- .../src/models/gateways/portkey-model.ts | 3 +- .../src/models/providers/deepseek-model.ts | 4 +- typescript/src/models/providers/kimi-model.ts | 6 +- .../src/models/providers/openai-model.ts | 3 +- .../src/test-case/conversational-test-case.ts | 18 ++- typescript/src/test-case/llm-test-case.ts | 15 ++- typescript/src/test-case/mcp.ts | 4 +- typescript/src/test-case/mllm-image.ts | 2 +- typescript/src/tracing/tracing.ts | 5 +- 66 files changed, 1176 insertions(+), 592 deletions(-) diff --git a/typescript/src/evaluate/compare.ts b/typescript/src/evaluate/compare.ts index 6678c88815..dd9bace257 100644 --- a/typescript/src/evaluate/compare.ts +++ b/typescript/src/evaluate/compare.ts @@ -146,7 +146,12 @@ export async function compare( for (const w of winners) counts[w] = (counts[w] ?? 0) + 1; if (display.printResults) { - printArenaCompleted(counts, runDuration, winners.length, hasCost ? totalCost : 0); + printArenaCompleted( + counts, + runDuration, + winners.length, + hasCost ? totalCost : 0, + ); } // Post to Confident AI as an experiment (no-op unless logged in). @@ -171,7 +176,10 @@ function printArenaCompleted( const sorted = Object.entries(counts).sort((a, b) => b[1] - a[1]); const breakdown = sorted.length ? sorted - .map(([name, wins]) => ` » ${GREEN}${BOLD}${name}${RESET}: ${wins} wins`) + .map( + ([name, wins]) => + ` » ${GREEN}${BOLD}${name}${RESET}: ${wins} wins`, + ) .join("\n") : "No winners"; const cost = tokenCost ? `${tokenCost} USD` : "None"; diff --git a/typescript/src/evaluate/confident.ts b/typescript/src/evaluate/confident.ts index d74f0ba0cb..7bd0984d69 100644 --- a/typescript/src/evaluate/confident.ts +++ b/typescript/src/evaluate/confident.ts @@ -48,7 +48,12 @@ function buildMetricsScores(cases: EvaluatedCase[]) { for (const { metricsData } of cases) { for (const m of metricsData) { if (m.skipped) continue; - const e = map.get(m.name) ?? { scores: [], passes: 0, fails: 0, errors: 0 }; + const e = map.get(m.name) ?? { + scores: [], + passes: 0, + fails: 0, + errors: 0, + }; if (m.error) { e.errors += 1; } else { @@ -81,53 +86,55 @@ export async function postTestRun( let totalCost = 0; let hasCost = false; - cases.forEach(({ testCase, metricsData, runDuration: caseDuration, trace }, order) => { - const success = metricsData.every((m) => m.skipped || m.success); - if (success) testPassed += 1; - else testFailed += 1; + cases.forEach( + ({ testCase, metricsData, runDuration: caseDuration, trace }, order) => { + const success = metricsData.every((m) => m.skipped || m.success); + if (success) testPassed += 1; + else testFailed += 1; - const evaluationCost = caseCost(metricsData); - if (evaluationCost != null) { - totalCost += evaluationCost; - hasCost = true; - } - const metricsDataApi = metricsData.map(convertMetricData); + const evaluationCost = caseCost(metricsData); + if (evaluationCost != null) { + totalCost += evaluationCost; + hasCost = true; + } + const metricsDataApi = metricsData.map(convertMetricData); - if (testCase instanceof ConversationalTestCase) { - conversationalTestCases.push({ - name: testCase.name ?? `test_case_${order}`, - success, - metricsData: metricsDataApi, - runDuration: caseDuration, - evaluationCost, - order, - turns: testCase.turns.map((t, i) => convertTurn(t, i)), - scenario: testCase.scenario, - expectedOutcome: testCase.expectedOutcome, - userDescription: testCase.userDescription, - chatbotRole: testCase.chatbotRole, - imagesMapping: testCase.getImagesMapping(), - }); - } else { - testCases.push({ - name: testCase.name ?? `test_case_${order}`, - input: testCase.input, - actualOutput: testCase.actualOutput, - expectedOutput: testCase.expectedOutput, - context: testCase.context, - retrievalContext: resolveRetrievalContext(testCase.retrievalContext), - toolsCalled: testCase.toolsCalled?.map(convertTool), - expectedTools: testCase.expectedTools?.map(convertTool), - success, - metricsData: metricsDataApi, - runDuration: caseDuration, - evaluationCost, - order, - imagesMapping: testCase.getImagesMapping(), - trace, - }); - } - }); + if (testCase instanceof ConversationalTestCase) { + conversationalTestCases.push({ + name: testCase.name ?? `test_case_${order}`, + success, + metricsData: metricsDataApi, + runDuration: caseDuration, + evaluationCost, + order, + turns: testCase.turns.map((t, i) => convertTurn(t, i)), + scenario: testCase.scenario, + expectedOutcome: testCase.expectedOutcome, + userDescription: testCase.userDescription, + chatbotRole: testCase.chatbotRole, + imagesMapping: testCase.getImagesMapping(), + }); + } else { + testCases.push({ + name: testCase.name ?? `test_case_${order}`, + input: testCase.input, + actualOutput: testCase.actualOutput, + expectedOutput: testCase.expectedOutput, + context: testCase.context, + retrievalContext: resolveRetrievalContext(testCase.retrievalContext), + toolsCalled: testCase.toolsCalled?.map(convertTool), + expectedTools: testCase.expectedTools?.map(convertTool), + success, + metricsData: metricsDataApi, + runDuration: caseDuration, + evaluationCost, + order, + imagesMapping: testCase.getImagesMapping(), + trace, + }); + } + }, + ); const payload = { testCases, @@ -261,7 +268,13 @@ export async function postExperiment( testCases: e.testCases, conversationalTestCases: [], metricsScores: [ - { metric: metricName, scores: e.scores, passes: e.passes, fails: e.fails, errors: e.errors }, + { + metric: metricName, + scores: e.scores, + passes: e.passes, + fails: e.fails, + errors: e.errors, + }, ], identifier: e.identifier, testPassed: e.testPassed, diff --git a/typescript/src/evaluate/console-report.ts b/typescript/src/evaluate/console-report.ts index f39cbfe916..d46ea8032d 100644 --- a/typescript/src/evaluate/console-report.ts +++ b/typescript/src/evaluate/console-report.ts @@ -143,7 +143,11 @@ function wrapCell(c: string, width: number): string[] { } /** A labeled panel line (`Label: value`) wrapped to `inner`, continuations indented. */ -function wrapLabeledLine(prefix: string, value: string, inner: number): string[] { +function wrapLabeledLine( + prefix: string, + value: string, + inner: number, +): string[] { const indent = visLen(prefix); const chunks = wrapText(value, Math.max(10, inner - indent)); return chunks.map((chunk, i) => @@ -222,7 +226,9 @@ function tableLines( function metricStatusCell(m: MetricData): string { if (m.skipped) return `${YELLOW}${BOLD}SKIP${RESET}`; if (m.error) return `${RED}${BOLD}ERROR${RESET}`; - return m.success ? `${GREEN}${BOLD}PASS${RESET}` : `${RED}${BOLD}FAIL${RESET}`; + return m.success + ? `${GREEN}${BOLD}PASS${RESET}` + : `${RED}${BOLD}FAIL${RESET}`; } /** @@ -265,16 +271,32 @@ export function printResultsTable( lines.push(`${CYAN}${BOLD}Conversation Turns${RESET}`); for (const turn of tc.turns ?? []) { const role = turn.role.charAt(0).toUpperCase() + turn.role.slice(1); - lines.push(...wrapLabeledLine(` ${BOLD}${role}:${RESET} `, turn.content, inner)); + lines.push( + ...wrapLabeledLine(` ${BOLD}${role}:${RESET} `, turn.content, inner), + ); } } else { - lines.push(...wrapLabeledLine(`${CYAN}${BOLD}Input:${RESET} `, String(tc.input), inner)); lines.push( - ...wrapLabeledLine(`${CYAN}${BOLD}Actual Output:${RESET} `, String(tc.actualOutput), inner), + ...wrapLabeledLine( + `${CYAN}${BOLD}Input:${RESET} `, + String(tc.input), + inner, + ), + ); + lines.push( + ...wrapLabeledLine( + `${CYAN}${BOLD}Actual Output:${RESET} `, + String(tc.actualOutput), + inner, + ), ); if (tc.expectedOutput && tc.expectedOutput !== "N/A") { lines.push( - ...wrapLabeledLine(`${CYAN}${BOLD}Expected Output:${RESET} `, tc.expectedOutput, inner), + ...wrapLabeledLine( + `${CYAN}${BOLD}Expected Output:${RESET} `, + tc.expectedOutput, + inner, + ), ); } } @@ -408,7 +430,9 @@ export function exportToMarkdown( const ts = `${d.getFullYear()}${pad2(d.getMonth() + 1)}${pad2(d.getDate())}` + `_${pad2(d.getHours())}${pad2(d.getMinutes())}${pad2(d.getSeconds())}`; - const safe = (evaluationName || "evaluation").replace(/\s+/g, "_").toLowerCase(); + const safe = (evaluationName || "evaluation") + .replace(/\s+/g, "_") + .toLowerCase(); const filepath = path.join(outputDir, `${safe}_${ts}.${fileType}`); const sorted = [...testResults].sort( diff --git a/typescript/src/evaluate/evaluate.ts b/typescript/src/evaluate/evaluate.ts index e4e10c359c..5c793c2778 100644 --- a/typescript/src/evaluate/evaluate.ts +++ b/typescript/src/evaluate/evaluate.ts @@ -226,7 +226,10 @@ export async function runMetric( // Dispatched in `evaluate`, so the metric matches the test case type. await (metric.measure as (tc: AnyTestCase) => Promise)(testCase); } catch (e) { - if (e instanceof MissingTestCaseParamsError && errorCfg.skipOnMissingParams) { + if ( + e instanceof MissingTestCaseParamsError && + errorCfg.skipOnMissingParams + ) { metric.skipped = true; } else if (errorCfg.ignoreErrors) { metric.error = (e as Error).message; diff --git a/typescript/src/evaluate/trace-eval.ts b/typescript/src/evaluate/trace-eval.ts index 2ed3415094..8107b49b6a 100644 --- a/typescript/src/evaluate/trace-eval.ts +++ b/typescript/src/evaluate/trace-eval.ts @@ -99,7 +99,12 @@ export async function evaluateTrace( const metricsData: MetricData[] = []; for (const metric of metrics) { metricsData.push( - await runMetric(metric, testCase, errorCfg, options.onMetric ?? (() => {})), + await runMetric( + metric, + testCase, + errorCfg, + options.onMetric ?? (() => {}), + ), ); } scope.metricsData = metricsData; // also attach to the span/trace diff --git a/typescript/src/integrations/langchain/callback-handler.ts b/typescript/src/integrations/langchain/callback-handler.ts index 4c7f282eed..806ee5eed0 100644 --- a/typescript/src/integrations/langchain/callback-handler.ts +++ b/typescript/src/integrations/langchain/callback-handler.ts @@ -203,9 +203,9 @@ export class DeepEvalCallbackHandler // trace by ancestry so it is not left dangling. const traceUuid = this.hierarchy.getTraceUuid(uuidStr); if (traceUuid && traceManager.getTraceByUuid(traceUuid)) { - const others = Array.from(traceManager.getActiveSpans().values()).filter( - (s) => s.traceUuid === traceUuid, - ); + const others = Array.from( + traceManager.getActiveSpans().values(), + ).filter((s) => s.traceUuid === traceUuid); if (others.length === 0) { traceManager.setTraceStatus(traceUuid, TraceSpanStatus.ERRORED); traceManager.endTrace(traceUuid); @@ -408,7 +408,12 @@ export class DeepEvalCallbackHandler } } - async handleToolEnd(output: any, runId: string, _parentRunId?: string, _tags?: string[]) { + async handleToolEnd( + output: any, + runId: string, + _parentRunId?: string, + _tags?: string[], + ) { const uuidStr = String(runId); const toolSpan: any = traceManager.getSpanByUuid(uuidStr); @@ -432,7 +437,12 @@ export class DeepEvalCallbackHandler this.hierarchy.cleanupRun(uuidStr); } - async handleToolError(err: any, runId: string, _parentRunId?: string, _tags?: string[]) { + async handleToolError( + err: any, + runId: string, + _parentRunId?: string, + _tags?: string[], + ) { const uuidStr = String(runId); const toolSpan: any = traceManager.getSpanByUuid(uuidStr); diff --git a/typescript/src/metrics/answer-relevancy/answer-relevancy.ts b/typescript/src/metrics/answer-relevancy/answer-relevancy.ts index bbddfc1cb7..32334e44e2 100644 --- a/typescript/src/metrics/answer-relevancy/answer-relevancy.ts +++ b/typescript/src/metrics/answer-relevancy/answer-relevancy.ts @@ -75,9 +75,14 @@ export class AnswerRelevancyMetric extends BaseMetric { } private async generateStatements(actualOutput: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_statements", { - actual_output: actualOutput, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_statements", + { + actual_output: actualOutput, + }, + ); const { statements } = await generateWithSchema( this, prompt, @@ -90,10 +95,15 @@ export class AnswerRelevancyMetric extends BaseMetric { input: string, ): Promise { if (this.statements.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - input, - statements: this.statements, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + input, + statements: this.statements, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -103,11 +113,16 @@ export class AnswerRelevancyMetric extends BaseMetric { const irrelevantStatements = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "no") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - irrelevant_statements: irrelevantStatements, - input, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + irrelevant_statements: irrelevantStatements, + input, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/answer-relevancy/schema.ts b/typescript/src/metrics/answer-relevancy/schema.ts index f4ae0fce2f..2067b589ab 100644 --- a/typescript/src/metrics/answer-relevancy/schema.ts +++ b/typescript/src/metrics/answer-relevancy/schema.ts @@ -17,4 +17,6 @@ export const AnswerRelevancyScoreReasonSchema = z.object({ reason: z.string(), }); -export type AnswerRelevancyVerdict = z.infer; +export type AnswerRelevancyVerdict = z.infer< + typeof AnswerRelevancyVerdictSchema +>; diff --git a/typescript/src/metrics/arena-g-eval/arena-g-eval.ts b/typescript/src/metrics/arena-g-eval/arena-g-eval.ts index d3540e3e4d..b87d957a06 100644 --- a/typescript/src/metrics/arena-g-eval/arena-g-eval.ts +++ b/typescript/src/metrics/arena-g-eval/arena-g-eval.ts @@ -156,13 +156,18 @@ export class ArenaGEval extends BaseArenaMetric { reason: string, dummyToReal: Record, ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "rewrite_reason", { - reason, - // Pass as JSON text: Nunjucks renders a bare object as "[object Object]" - // (the resolver only gives arrays a Python-repr toString). The template - // renders {{ dummy_to_real_names }} directly and its example is JSON. - dummy_to_real_names: JSON.stringify(dummyToReal), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "rewrite_reason", + { + reason, + // Pass as JSON text: Nunjucks renders a bare object as "[object Object]" + // (the resolver only gives arrays a Python-repr toString). The template + // renders {{ dummy_to_real_names }} directly and its example is JSON. + dummy_to_real_names: JSON.stringify(dummyToReal), + }, + ); const { rewritten_reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/arena-g-eval/utils.ts b/typescript/src/metrics/arena-g-eval/utils.ts index 13f222c77b..4b506ee281 100644 --- a/typescript/src/metrics/arena-g-eval/utils.ts +++ b/typescript/src/metrics/arena-g-eval/utils.ts @@ -36,7 +36,10 @@ function formattedLLMTestCase( tc: LLMTestCase, ): string { const data: Record = {}; - if (params.includes(SingleTurnParams.ACTUAL_OUTPUT) && tc.actualOutput != null) + if ( + params.includes(SingleTurnParams.ACTUAL_OUTPUT) && + tc.actualOutput != null + ) data.actual_output = tc.actualOutput; if (params.includes(SingleTurnParams.CONTEXT) && tc.context != null) data.context = tc.context; diff --git a/typescript/src/metrics/argument-correctness/argument-correctness.ts b/typescript/src/metrics/argument-correctness/argument-correctness.ts index d6bd6541b6..921ec25d0c 100644 --- a/typescript/src/metrics/argument-correctness/argument-correctness.ts +++ b/typescript/src/metrics/argument-correctness/argument-correctness.ts @@ -66,7 +66,10 @@ export class ArgumentCorrectnessMetric extends BaseMetric { this.score = 1; this.reason = "No tool calls provided"; } else { - this.verdicts = await this.generateVerdicts(testCase.input, toolsCalled); + this.verdicts = await this.generateVerdicts( + testCase.input, + toolsCalled, + ); this.score = this.calculateScore(); this.reason = await this.generateReason(testCase.input); } @@ -86,10 +89,15 @@ export class ArgumentCorrectnessMetric extends BaseMetric { input: string, toolsCalled: ToolCall[], ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - input, - stringified_tools_called: printToolsCalled(toolsCalled), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + input, + stringified_tools_called: printToolsCalled(toolsCalled), + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -99,11 +107,16 @@ export class ArgumentCorrectnessMetric extends BaseMetric { const incorrectToolCallsReasons = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "no") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - incorrect_tool_calls_reasons: incorrectToolCallsReasons, - input, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + incorrect_tool_calls_reasons: incorrectToolCallsReasons, + input, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/bias/bias.ts b/typescript/src/metrics/bias/bias.ts index cff78f3878..bc77177397 100644 --- a/typescript/src/metrics/bias/bias.ts +++ b/typescript/src/metrics/bias/bias.ts @@ -80,18 +80,28 @@ export class BiasMetric extends BaseMetric { } private async generateOpinions(actualOutput: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_opinions", { - actual_output: actualOutput, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_opinions", + { + actual_output: actualOutput, + }, + ); const { opinions } = await generateWithSchema(this, prompt, OpinionsSchema); return opinions; } private async generateVerdicts(): Promise { if (this.opinions.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - opinions: this.opinions, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + opinions: this.opinions, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -101,10 +111,15 @@ export class BiasMetric extends BaseMetric { const biases = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "yes") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - biases, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + biases, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/contextual-precision/contextual-precision.ts b/typescript/src/metrics/contextual-precision/contextual-precision.ts index 01cb214c1b..c16e85ea0b 100644 --- a/typescript/src/metrics/contextual-precision/contextual-precision.ts +++ b/typescript/src/metrics/contextual-precision/contextual-precision.ts @@ -126,13 +126,18 @@ export class ContextualPrecisionMetric extends BaseMetric { retrievalContext: string[], ): Promise { const n = retrievalContext.length; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - input, - expected_output: expectedOutput, - document_count_str: ` (${n} document${n > 1 ? "s" : ""})`, - context_to_display: retrievalContext, - multimodal_note: "", - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + input, + expected_output: expectedOutput, + document_count_str: ` (${n} document${n > 1 ? "s" : ""})`, + context_to_display: retrievalContext, + multimodal_note: "", + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -143,11 +148,16 @@ export class ContextualPrecisionMetric extends BaseMetric { verdict: v.verdict, reason: v.reason, })); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - input, - verdicts, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + input, + verdicts, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/contextual-recall/contextual-recall.ts b/typescript/src/metrics/contextual-recall/contextual-recall.ts index f71830899b..7be0158fa1 100644 --- a/typescript/src/metrics/contextual-recall/contextual-recall.ts +++ b/typescript/src/metrics/contextual-recall/contextual-recall.ts @@ -82,14 +82,19 @@ export class ContextualRecallMetric extends BaseMetric { expectedOutput: string, retrievalContext: string[], ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - expected_output: expectedOutput, - content_type: "sentence", - content_type_plural: "sentences", - content_or: "sentence", - context_to_display: retrievalContext, - node_instruction: "", - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + expected_output: expectedOutput, + content_type: "sentence", + content_type_plural: "sentences", + content_or: "sentence", + context_to_display: retrievalContext, + node_instruction: "", + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -104,13 +109,18 @@ export class ContextualRecallMetric extends BaseMetric { if (v.verdict.toLowerCase() === "yes") supportiveReasons.push(v.reason); else unsupportiveReasons.push(v.reason); } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - expected_output: expectedOutput, - supportive_reasons: supportiveReasons, - unsupportive_reasons: unsupportiveReasons, - score: (this.score ?? 0).toFixed(2), - content_type: "sentence", - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + expected_output: expectedOutput, + supportive_reasons: supportiveReasons, + unsupportive_reasons: unsupportiveReasons, + score: (this.score ?? 0).toFixed(2), + content_type: "sentence", + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/contextual-relevancy/contextual-relevancy.ts b/typescript/src/metrics/contextual-relevancy/contextual-relevancy.ts index 122d94e354..a45ea790be 100644 --- a/typescript/src/metrics/contextual-relevancy/contextual-relevancy.ts +++ b/typescript/src/metrics/contextual-relevancy/contextual-relevancy.ts @@ -25,7 +25,7 @@ const EXTRACTION_INSTRUCTIONS = "high level information found in the context, before deciding on a " + "verdict and optionally a reason for each statement."; const EMPTY_CONTEXT_INSTRUCTION = - '\nIf provided context contains no actual content or statements then: ' + + "\nIf provided context contains no actual content or statements then: " + 'give "no" as a "verdict",\nput context into "statement", and ' + '"No statements found in provided context." into "reason".'; @@ -97,14 +97,19 @@ export class ContextualRelevancyMetric extends BaseMetric { input: string, context: string, ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - input, - context, - context_type: "context", - statement_or_image: "statement", - extraction_instructions: EXTRACTION_INSTRUCTIONS, - empty_context_instruction: EMPTY_CONTEXT_INSTRUCTION, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + input, + context, + context_type: "context", + statement_or_image: "statement", + extraction_instructions: EXTRACTION_INSTRUCTIONS, + empty_context_instruction: EMPTY_CONTEXT_INSTRUCTION, + }, + ); return generateWithSchema(this, prompt, ContextualRelevancyVerdictsSchema); } @@ -119,12 +124,17 @@ export class ContextualRelevancyMetric extends BaseMetric { else relevantStatements.push(v.statement); } } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - input, - irrelevant_statements: irrelevantStatements, - relevant_statements: relevantStatements, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + input, + irrelevant_statements: irrelevantStatements, + relevant_statements: relevantStatements, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/conversation-completeness/conversation-completeness.ts b/typescript/src/metrics/conversation-completeness/conversation-completeness.ts index 9e2c1d4109..10583e6dc7 100644 --- a/typescript/src/metrics/conversation-completeness/conversation-completeness.ts +++ b/typescript/src/metrics/conversation-completeness/conversation-completeness.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -87,9 +83,14 @@ export class ConversationCompletenessMetric extends BaseConversationalMetric { } private async extractUserIntentions(turns: Turn[]): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "extract_user_intentions", { - turns: turns.map((turn) => convertTurnToDict(turn)), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "extract_user_intentions", + { + turns: turns.map((turn) => convertTurnToDict(turn)), + }, + ); const { intentions } = await generateWithSchema( this, prompt, @@ -102,23 +103,39 @@ export class ConversationCompletenessMetric extends BaseConversationalMetric { turns: Turn[], intention: string, ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - turns: turns.map((turn) => convertTurnToDict(turn)), - intention, - }); - return generateWithSchema(this, prompt, ConversationCompletenessVerdictSchema); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + turns: turns.map((turn) => convertTurnToDict(turn)), + intention, + }, + ); + return generateWithSchema( + this, + prompt, + ConversationCompletenessVerdictSchema, + ); } private async generateReason(): Promise { if (!this.includeReason) return undefined; const incompletenesses = this.verdicts - .filter((v) => v?.verdict != null && v.verdict.trim().toLowerCase() === "no") + .filter( + (v) => v?.verdict != null && v.verdict.trim().toLowerCase() === "no", + ) .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - score: this.score, - incompletenesses, - intentions: this.userIntentions, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + score: this.score, + incompletenesses, + intentions: this.userIntentions, + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/conversational-g-eval/conversational-g-eval.ts b/typescript/src/metrics/conversational-g-eval/conversational-g-eval.ts index e2af41b1be..b80b4e2a43 100644 --- a/typescript/src/metrics/conversational-g-eval/conversational-g-eval.ts +++ b/typescript/src/metrics/conversational-g-eval/conversational-g-eval.ts @@ -127,12 +127,17 @@ export class ConversationalGEval extends BaseConversationalMetric { private async generateEvaluationSteps(): Promise { if (this.evaluationSteps) return this.evaluationSteps; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_evaluation_steps", { - criteria: this.criteria, - parameters: constructConversationalGEvalTurnParamsString( - this.evaluationParams, - ), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_evaluation_steps", + { + criteria: this.criteria, + parameters: constructConversationalGEvalTurnParamsString( + this.evaluationParams, + ), + }, + ); const { steps } = await generateWithSchema(this, prompt, StepsSchema); return steps; } @@ -140,20 +145,25 @@ export class ConversationalGEval extends BaseConversationalMetric { private async evaluate( testCase: ConversationalTestCase, ): Promise<[number, string]> { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_evaluation_results", { - evaluation_steps: numberEvaluationSteps(this.evaluationSteps ?? []), - test_case_content: constructNonTurnsTestCaseString( - this.evaluationParams, - testCase, - ), - turns: testCase.turns.map((t) => - convertTurnToDict(t, this.evaluationParams), - ), - parameters: constructConversationalGEvalTurnParamsString( - this.evaluationParams, - ), - rubric: this.rubric ? formatRubrics(this.rubric) : null, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_evaluation_results", + { + evaluation_steps: numberEvaluationSteps(this.evaluationSteps ?? []), + test_case_content: constructNonTurnsTestCaseString( + this.evaluationParams, + testCase, + ), + turns: testCase.turns.map((t) => + convertTurnToDict(t, this.evaluationParams), + ), + parameters: constructConversationalGEvalTurnParamsString( + this.evaluationParams, + ), + rubric: this.rubric ? formatRubrics(this.rubric) : null, + }, + ); const { score, reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/faithfulness/faithfulness.ts b/typescript/src/metrics/faithfulness/faithfulness.ts index 27d84ba10b..6f65649c11 100644 --- a/typescript/src/metrics/faithfulness/faithfulness.ts +++ b/typescript/src/metrics/faithfulness/faithfulness.ts @@ -22,7 +22,8 @@ const TEMPLATE_CLASS = "FaithfulnessMetric"; function truthsLimitPhrase(limit?: number): string { if (limit == null) return " FACTUAL, undisputed truths"; - if (limit === 1) return " the single most important FACTUAL, undisputed truth"; + if (limit === 1) + return " the single most important FACTUAL, undisputed truth"; return ` the ${limit} most important FACTUAL, undisputed truths per document`; } @@ -103,30 +104,45 @@ export class FaithfulnessMetric extends BaseMetric { } private async generateTruths(retrievalContext: string[]): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_truths", { - retrieval_context: retrievalContext.join("\n\n"), - limit: truthsLimitPhrase(this.truthsExtractionLimit), - multimodal_instruction: "", - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_truths", + { + retrieval_context: retrievalContext.join("\n\n"), + limit: truthsLimitPhrase(this.truthsExtractionLimit), + multimodal_instruction: "", + }, + ); const { truths } = await generateWithSchema(this, prompt, TruthsSchema); return truths; } private async generateClaims(actualOutput: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_claims", { - actual_output: actualOutput, - multimodal_instruction: "", - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_claims", + { + actual_output: actualOutput, + multimodal_instruction: "", + }, + ); const { claims } = await generateWithSchema(this, prompt, ClaimsSchema); return claims; } private async generateVerdicts(): Promise { if (this.claims.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - claims: this.claims, - retrieval_context: this.truths.join("\n\n"), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + claims: this.claims, + retrieval_context: this.truths.join("\n\n"), + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -140,10 +156,15 @@ export class FaithfulnessMetric extends BaseMetric { else if (vd === "idk" && this.penalizeAmbiguousClaims) contradictions.push(`(Ambiguous) ${v.reason}`); } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - contradictions, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + contradictions, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/g-eval/g-eval.ts b/typescript/src/metrics/g-eval/g-eval.ts index f868d0c8c2..721e6a5c5a 100644 --- a/typescript/src/metrics/g-eval/g-eval.ts +++ b/typescript/src/metrics/g-eval/g-eval.ts @@ -114,10 +114,15 @@ export class GEval extends BaseMetric { private async generateEvaluationSteps(): Promise { if (this.evaluationSteps) return this.evaluationSteps; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_evaluation_steps", { - criteria: this.criteria, - parameters: constructGEvalParamsString(this.evaluationParams), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_evaluation_steps", + { + criteria: this.criteria, + parameters: constructGEvalParamsString(this.evaluationParams), + }, + ); const { steps } = await generateWithSchema(this, prompt, StepsSchema); return steps; } @@ -131,20 +136,30 @@ export class GEval extends BaseMetric { const numberedSteps = numberEvaluationSteps(this.evaluationSteps ?? []); const prompt = this.strictMode - ? resolveTemplate("metrics", TEMPLATE_CLASS, "generate_strict_evaluation_results", { - evaluation_steps: numberedSteps, - test_case_content: testCaseContent, - parameters, - _additional_context: null, - }) - : resolveTemplate("metrics", TEMPLATE_CLASS, "generate_evaluation_results", { - evaluation_steps: numberedSteps, - test_case_content: testCaseContent, - parameters, - rubric: this.rubric ? formatRubrics(this.rubric) : null, - score_range: this.scoreRange, - _additional_context: null, - }); + ? resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_strict_evaluation_results", + { + evaluation_steps: numberedSteps, + test_case_content: testCaseContent, + parameters, + _additional_context: null, + }, + ) + : resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_evaluation_results", + { + evaluation_steps: numberedSteps, + test_case_content: testCaseContent, + parameters, + rubric: this.rubric ? formatRubrics(this.rubric) : null, + score_range: this.scoreRange, + _additional_context: null, + }, + ); const { score, reason } = await generateWithSchema( this, diff --git a/typescript/src/metrics/g-eval/utils.ts b/typescript/src/metrics/g-eval/utils.ts index 0c0a816ddf..6863fd52da 100644 --- a/typescript/src/metrics/g-eval/utils.ts +++ b/typescript/src/metrics/g-eval/utils.ts @@ -58,9 +58,7 @@ function formatValue(value: unknown): string { } /** Join param labels: "A", "A and B", or "A, B, and C". */ -export function constructGEvalParamsString( - params: SingleTurnParams[], -): string { +export function constructGEvalParamsString(params: SingleTurnParams[]): string { const labels = params.map((p) => G_EVAL_PARAMS[p] ?? p); if (labels.length === 1) return labels[0]; if (labels.length === 2) return labels.join(" and "); @@ -102,7 +100,9 @@ export function getScoreRange(rubrics?: Rubric[]): [number, number] { } /** Sort rubrics by start and reject overlaps. Returns undefined for none. */ -export function validateAndSortRubrics(rubrics?: Rubric[]): Rubric[] | undefined { +export function validateAndSortRubrics( + rubrics?: Rubric[], +): Rubric[] | undefined { if (!rubrics || rubrics.length === 0) return undefined; const sorted = [...rubrics].sort((a, b) => a.scoreRange[0] - b.scoreRange[0]); for (let i = 0; i < sorted.length; i++) { diff --git a/typescript/src/metrics/goal-accuracy/goal-accuracy.ts b/typescript/src/metrics/goal-accuracy/goal-accuracy.ts index dcd40ae9ea..db36789f48 100644 --- a/typescript/src/metrics/goal-accuracy/goal-accuracy.ts +++ b/typescript/src/metrics/goal-accuracy/goal-accuracy.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -66,9 +62,7 @@ export class GoalAccuracyMetric extends BaseConversationalMetric { checkConversationalTestCaseParams(testCase, this.requiredParams, this); this.evaluationCost = this.usingNativeModel ? 0 : undefined; - const tasks = this.goalAndStepsTaken( - getUnitInteractions(testCase.turns), - ); + const tasks = this.goalAndStepsTaken(getUnitInteractions(testCase.turns)); [this.goalScores, this.planScores] = await Promise.all([ Promise.all( tasks.map((t) => @@ -150,13 +144,18 @@ export class GoalAccuracyMetric extends BaseConversationalMetric { const planEvaluations = this.planScores .map((p) => `Score: ${p.score}, Reason: ${p.reason} \n`) .join(""); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "get_final_reason", { - final_score: this.score, - threshold: this.threshold, - goal_evaluations: goalEvaluations, - // NOTE: matches Python's misspelled template variable. - plan_evalautions: planEvaluations, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_final_reason", + { + final_score: this.score, + threshold: this.threshold, + goal_evaluations: goalEvaluations, + // NOTE: matches Python's misspelled template variable. + plan_evalautions: planEvaluations, + }, + ); // Free-text reason (no schema), mirroring Python's raw `model.generate`. const { output, cost } = await this.model!.generate(prompt); this.accrueCost(cost); diff --git a/typescript/src/metrics/hallucination/hallucination.ts b/typescript/src/metrics/hallucination/hallucination.ts index 2e11f1a1ed..334266a5d2 100644 --- a/typescript/src/metrics/hallucination/hallucination.ts +++ b/typescript/src/metrics/hallucination/hallucination.ts @@ -82,11 +82,16 @@ export class HallucinationMetric extends BaseMetric { actualOutput: string, contexts: string[], ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - actual_output: actualOutput, - contexts, - contexts_count: contexts.length, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + actual_output: actualOutput, + contexts, + contexts_count: contexts.length, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -100,11 +105,16 @@ export class HallucinationMetric extends BaseMetric { factualAlignments.push(v.reason); else contradictions.push(v.reason); } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - factual_alignments: factualAlignments, - contradictions, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + factual_alignments: factualAlignments, + contradictions, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/json-correctness/json-correctness.ts b/typescript/src/metrics/json-correctness/json-correctness.ts index 92f7450c1c..18abd623b0 100644 --- a/typescript/src/metrics/json-correctness/json-correctness.ts +++ b/typescript/src/metrics/json-correctness/json-correctness.ts @@ -92,11 +92,20 @@ export class JsonCorrectnessMetric extends BaseMetric { if (!this.includeReason) return undefined; if (this.score === 1) return DEFAULT_CORRECT_REASON; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - actual_output: actualOutput, - expected_schema: JSON.stringify(toJsonSchema(this.expectedSchema), null, 4), - is_valid_json: false, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + actual_output: actualOutput, + expected_schema: JSON.stringify( + toJsonSchema(this.expectedSchema), + null, + 4, + ), + is_valid_json: false, + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/knowledge-retention/knowledge-retention.ts b/typescript/src/metrics/knowledge-retention/knowledge-retention.ts index 820726c3c6..9cc92f4559 100644 --- a/typescript/src/metrics/knowledge-retention/knowledge-retention.ts +++ b/typescript/src/metrics/knowledge-retention/knowledge-retention.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -85,15 +81,22 @@ export class KnowledgeRetentionMetric extends BaseConversationalMetric { } /** Extract knowledge from each user turn (assistant turns get `null`). */ - private async generateKnowledges(turns: Turn[]): Promise<(Knowledge | null)[]> { + private async generateKnowledges( + turns: Turn[], + ): Promise<(Knowledge | null)[]> { const knowledges: (Knowledge | null)[] = new Array(turns.length).fill(null); const extracted = await Promise.all( turns.map(async (turn, i) => { if (turn.role === "assistant") return null; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "extract_data", { - user_message: turn.content, - previous_turns: turns.slice(0, i).map((t) => convertTurnToDict(t)), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "extract_data", + { + user_message: turn.content, + previous_turns: turns.slice(0, i).map((t) => convertTurnToDict(t)), + }, + ); return generateWithSchema(this, prompt, KnowledgeSchema); }), ); @@ -115,11 +118,20 @@ export class KnowledgeRetentionMetric extends BaseConversationalMetric { .filter((k): k is Knowledge => k != null && k.data != null) .map((k) => k.data); if (accumulatedKnowledge.length === 0) return null; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdict", { - llm_message: turn.content, - accumulated_knowledge: accumulatedKnowledge, - }); - return generateWithSchema(this, prompt, KnowledgeRetentionVerdictSchema); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdict", + { + llm_message: turn.content, + accumulated_knowledge: accumulatedKnowledge, + }, + ); + return generateWithSchema( + this, + prompt, + KnowledgeRetentionVerdictSchema, + ); }), ); return results.filter((v): v is KnowledgeRetentionVerdict => v != null); @@ -130,10 +142,15 @@ export class KnowledgeRetentionMetric extends BaseConversationalMetric { const attritions = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "yes") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - attritions, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + attritions, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/mcp-use-metric/mcp-use-metric.ts b/typescript/src/metrics/mcp-use-metric/mcp-use-metric.ts index bf8777530e..fca37d132c 100644 --- a/typescript/src/metrics/mcp-use-metric/mcp-use-metric.ts +++ b/typescript/src/metrics/mcp-use-metric/mcp-use-metric.ts @@ -84,20 +84,30 @@ export class MCPUseMetric extends BaseMetric { const primScore = await generateWithSchema( this, - resolveTemplate("metrics", TEMPLATE_CLASS, "get_primitive_correctness_prompt", { - test_case: testCaseVars, - available_primitives: availablePrimitives, - primitives_used: primitivesUsed, - }), + resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_primitive_correctness_prompt", + { + test_case: testCaseVars, + available_primitives: availablePrimitives, + primitives_used: primitivesUsed, + }, + ), MCPPrimitivesScoreSchema, ); const argScore = await generateWithSchema( this, - resolveTemplate("metrics", TEMPLATE_CLASS, "get_mcp_argument_correctness_prompt", { - test_case: testCaseVars, - available_primitives: availablePrimitives, - primitives_used: primitivesUsed, - }), + resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_mcp_argument_correctness_prompt", + { + test_case: testCaseVars, + available_primitives: availablePrimitives, + primitives_used: primitivesUsed, + }, + ), MCPArgsScoreSchema, ); @@ -129,7 +139,10 @@ export class MCPUseMetric extends BaseMetric { let availablePrimitives = "MCP Primitives Available: \n"; for (const server of mcpServers) { availablePrimitives += `MCP Server ${server.serverName}\n`; - availablePrimitives += block("Available Tools", server.availableTools ?? []); + availablePrimitives += block( + "Available Tools", + server.availableTools ?? [], + ); availablePrimitives += block( "Available Resources", server.availableResources ?? [], diff --git a/typescript/src/metrics/mcp/mcp-task-completion.ts b/typescript/src/metrics/mcp/mcp-task-completion.ts index abe68f744e..fc2d86fbb9 100644 --- a/typescript/src/metrics/mcp/mcp-task-completion.ts +++ b/typescript/src/metrics/mcp/mcp-task-completion.ts @@ -65,10 +65,15 @@ export class MCPTaskCompletionMetric extends BaseConversationalMetric { tasks.map((task) => generateWithSchema( this, - resolveTemplate("metrics", TEMPLATE_CLASS, "get_task_completion_score", { - task, - steps_taken: taskStepsTakenText(task), - }), + resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_task_completion_score", + { + task, + steps_taken: taskStepsTakenText(task), + }, + ), TaskScoreSchema, ), ), diff --git a/typescript/src/metrics/mcp/multi-turn-mcp-use.ts b/typescript/src/metrics/mcp/multi-turn-mcp-use.ts index dfb4b369a6..a485782115 100644 --- a/typescript/src/metrics/mcp/multi-turn-mcp-use.ts +++ b/typescript/src/metrics/mcp/multi-turn-mcp-use.ts @@ -12,7 +12,11 @@ import { checkConversationalTestCaseParams, getUnitInteractions, } from "../conversational-utils"; -import { getTasks, taskStepsTakenText, availableMcpServersBlock } from "./utils"; +import { + getTasks, + taskStepsTakenText, + availableMcpServersBlock, +} from "./utils"; import { ToolScoreSchema, ArgsScoreSchema, @@ -75,11 +79,16 @@ export class MultiTurnMCPUseMetric extends BaseConversationalMetric { tasks.map((task) => generateWithSchema( this, - resolveTemplate("metrics", TEMPLATE_CLASS, "get_tool_correctness_score", { - task, - available_tools: availableTools, - steps_taken: taskStepsTakenText(task), - }), + resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_tool_correctness_score", + { + task, + available_tools: availableTools, + steps_taken: taskStepsTakenText(task), + }, + ), ToolScoreSchema, ), ), @@ -88,13 +97,18 @@ export class MultiTurnMCPUseMetric extends BaseConversationalMetric { tasks.map((task) => generateWithSchema( this, - resolveTemplate("metrics", TEMPLATE_CLASS, "get_args_correctness_score", { - task, - available_tools: availableTools, - available_resources: availableResources, - available_prompts: availablePrompts, - steps_taken: taskStepsTakenText(task), - }), + resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_args_correctness_score", + { + task, + available_tools: availableTools, + available_resources: availableResources, + available_prompts: availablePrompts, + steps_taken: taskStepsTakenText(task), + }, + ), ArgsScoreSchema, ), ), diff --git a/typescript/src/metrics/mcp/schema.ts b/typescript/src/metrics/mcp/schema.ts index b29eeed0ed..8760979110 100644 --- a/typescript/src/metrics/mcp/schema.ts +++ b/typescript/src/metrics/mcp/schema.ts @@ -2,9 +2,18 @@ import { z } from "zod"; // Mirrors deepeval/metrics/mcp/schema.py. -export const TaskScoreSchema = z.object({ score: z.number(), reason: z.string() }); -export const ToolScoreSchema = z.object({ score: z.number(), reason: z.string() }); -export const ArgsScoreSchema = z.object({ score: z.number(), reason: z.string() }); +export const TaskScoreSchema = z.object({ + score: z.number(), + reason: z.string(), +}); +export const ToolScoreSchema = z.object({ + score: z.number(), + reason: z.string(), +}); +export const ArgsScoreSchema = z.object({ + score: z.number(), + reason: z.string(), +}); export const ReasonSchema = z.object({ reason: z.string() }); export type TaskScore = z.infer; diff --git a/typescript/src/metrics/misuse/misuse.ts b/typescript/src/metrics/misuse/misuse.ts index 46030600b3..6040717219 100644 --- a/typescript/src/metrics/misuse/misuse.ts +++ b/typescript/src/metrics/misuse/misuse.ts @@ -83,20 +83,30 @@ export class MisuseMetric extends BaseMetric { } private async generateMisuses(actualOutput: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_misuses", { - actual_output: actualOutput, - domain: this.domain, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_misuses", + { + actual_output: actualOutput, + domain: this.domain, + }, + ); const { misuses } = await generateWithSchema(this, prompt, MisusesSchema); return misuses; } private async generateVerdicts(): Promise { if (this.misuses.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - misuses: this.misuses, - domain: this.domain, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + misuses: this.misuses, + domain: this.domain, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -106,10 +116,15 @@ export class MisuseMetric extends BaseMetric { const misuseViolations = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "yes") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - misuse_violations: misuseViolations, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + misuse_violations: misuseViolations, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/multimodal-metrics/image-coherence/index.ts b/typescript/src/metrics/multimodal-metrics/image-coherence/index.ts index 6884cc571a..ceb1ebfd15 100644 --- a/typescript/src/metrics/multimodal-metrics/image-coherence/index.ts +++ b/typescript/src/metrics/multimodal-metrics/image-coherence/index.ts @@ -1 +1,4 @@ -export { ImageCoherenceMetric, type ImageCoherenceMetricOptions } from "./image-coherence"; +export { + ImageCoherenceMetric, + type ImageCoherenceMetricOptions, +} from "./image-coherence"; diff --git a/typescript/src/metrics/multimodal-metrics/image-editing/index.ts b/typescript/src/metrics/multimodal-metrics/image-editing/index.ts index e02ff3f926..54f7aedff1 100644 --- a/typescript/src/metrics/multimodal-metrics/image-editing/index.ts +++ b/typescript/src/metrics/multimodal-metrics/image-editing/index.ts @@ -1 +1,4 @@ -export { ImageEditingMetric, type ImageEditingMetricOptions } from "./image-editing"; +export { + ImageEditingMetric, + type ImageEditingMetricOptions, +} from "./image-editing"; diff --git a/typescript/src/metrics/multimodal-metrics/image-helpfulness/index.ts b/typescript/src/metrics/multimodal-metrics/image-helpfulness/index.ts index 8432366a98..78fae9a9ce 100644 --- a/typescript/src/metrics/multimodal-metrics/image-helpfulness/index.ts +++ b/typescript/src/metrics/multimodal-metrics/image-helpfulness/index.ts @@ -1 +1,4 @@ -export { ImageHelpfulnessMetric, type ImageHelpfulnessMetricOptions } from "./image-helpfulness"; +export { + ImageHelpfulnessMetric, + type ImageHelpfulnessMetricOptions, +} from "./image-helpfulness"; diff --git a/typescript/src/metrics/multimodal-metrics/image-reference/index.ts b/typescript/src/metrics/multimodal-metrics/image-reference/index.ts index 9c657bbe7d..d91b92fcde 100644 --- a/typescript/src/metrics/multimodal-metrics/image-reference/index.ts +++ b/typescript/src/metrics/multimodal-metrics/image-reference/index.ts @@ -1 +1,4 @@ -export { ImageReferenceMetric, type ImageReferenceMetricOptions } from "./image-reference"; +export { + ImageReferenceMetric, + type ImageReferenceMetricOptions, +} from "./image-reference"; diff --git a/typescript/src/metrics/multimodal-metrics/text-to-image/index.ts b/typescript/src/metrics/multimodal-metrics/text-to-image/index.ts index 85643cc751..1c37064d3e 100644 --- a/typescript/src/metrics/multimodal-metrics/text-to-image/index.ts +++ b/typescript/src/metrics/multimodal-metrics/text-to-image/index.ts @@ -1 +1,4 @@ -export { TextToImageMetric, type TextToImageMetricOptions } from "./text-to-image"; +export { + TextToImageMetric, + type TextToImageMetricOptions, +} from "./text-to-image"; diff --git a/typescript/src/metrics/non-advice/non-advice.ts b/typescript/src/metrics/non-advice/non-advice.ts index ebbee53e13..9147f94110 100644 --- a/typescript/src/metrics/non-advice/non-advice.ts +++ b/typescript/src/metrics/non-advice/non-advice.ts @@ -83,20 +83,30 @@ export class NonAdviceMetric extends BaseMetric { } private async generateAdvices(actualOutput: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_advices", { - actual_output: actualOutput, - advice_types: this.adviceTypes, - advice_types_str: this.adviceTypes.join(", "), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_advices", + { + actual_output: actualOutput, + advice_types: this.adviceTypes, + advice_types_str: this.adviceTypes.join(", "), + }, + ); const { advices } = await generateWithSchema(this, prompt, AdvicesSchema); return advices; } private async generateVerdicts(): Promise { if (this.advices.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - advices: this.advices, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + advices: this.advices, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -106,10 +116,15 @@ export class NonAdviceMetric extends BaseMetric { const nonAdviceViolations = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "yes") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - non_advice_violations: nonAdviceViolations, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + non_advice_violations: nonAdviceViolations, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/pii-leakage/pii-leakage.ts b/typescript/src/metrics/pii-leakage/pii-leakage.ts index 44ff87b5cf..8d725b7b96 100644 --- a/typescript/src/metrics/pii-leakage/pii-leakage.ts +++ b/typescript/src/metrics/pii-leakage/pii-leakage.ts @@ -92,9 +92,14 @@ export class PIILeakageMetric extends BaseMetric { private async generateVerdicts(): Promise { if (this.extractedPii.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - extracted_pii: this.extractedPii, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + extracted_pii: this.extractedPii, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -104,10 +109,15 @@ export class PIILeakageMetric extends BaseMetric { const privacyViolations = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "yes") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - privacy_violations: privacyViolations, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + privacy_violations: privacyViolations, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/plan-adherence/index.ts b/typescript/src/metrics/plan-adherence/index.ts index 388df9486c..ed9eef447e 100644 --- a/typescript/src/metrics/plan-adherence/index.ts +++ b/typescript/src/metrics/plan-adherence/index.ts @@ -1,2 +1,5 @@ -export { PlanAdherenceMetric, type PlanAdherenceMetricOptions } from "./plan-adherence"; +export { + PlanAdherenceMetric, + type PlanAdherenceMetricOptions, +} from "./plan-adherence"; export * from "./schema"; diff --git a/typescript/src/metrics/plan-adherence/plan-adherence.ts b/typescript/src/metrics/plan-adherence/plan-adherence.ts index bd598f8e36..7108c83bd3 100644 --- a/typescript/src/metrics/plan-adherence/plan-adherence.ts +++ b/typescript/src/metrics/plan-adherence/plan-adherence.ts @@ -8,7 +8,11 @@ import { checkSingleTurnParams, constructVerboseLogs, } from "../utils"; -import { TaskSchema, AgentPlanSchema, PlanAdherenceScoreSchema } from "./schema"; +import { + TaskSchema, + AgentPlanSchema, + PlanAdherenceScoreSchema, +} from "./schema"; // `extract_task_from_trace` lives under StepEfficiencyMetric (shared, mirrors Python). const TASK_TEMPLATE_CLASS = "StepEfficiencyMetric"; @@ -68,9 +72,14 @@ export class PlanAdherenceMetric extends BaseMetric { const { task } = await generateWithSchema( this, - resolveTemplate("metrics", TASK_TEMPLATE_CLASS, "extract_task_from_trace", { - trace_json: json, - }), + resolveTemplate( + "metrics", + TASK_TEMPLATE_CLASS, + "extract_task_from_trace", + { + trace_json: json, + }, + ), TaskSchema, ); const { plan } = await generateWithSchema( diff --git a/typescript/src/metrics/plan-quality/index.ts b/typescript/src/metrics/plan-quality/index.ts index 441ed74389..7bce058b50 100644 --- a/typescript/src/metrics/plan-quality/index.ts +++ b/typescript/src/metrics/plan-quality/index.ts @@ -1,2 +1,5 @@ -export { PlanQualityMetric, type PlanQualityMetricOptions } from "./plan-quality"; +export { + PlanQualityMetric, + type PlanQualityMetricOptions, +} from "./plan-quality"; export * from "./schema"; diff --git a/typescript/src/metrics/plan-quality/plan-quality.ts b/typescript/src/metrics/plan-quality/plan-quality.ts index 8c9c75c169..18aaf89603 100644 --- a/typescript/src/metrics/plan-quality/plan-quality.ts +++ b/typescript/src/metrics/plan-quality/plan-quality.ts @@ -70,16 +70,26 @@ export class PlanQualityMetric extends BaseMetric { const { task } = await generateWithSchema( this, - resolveTemplate("metrics", TASK_TEMPLATE_CLASS, "extract_task_from_trace", { - trace_json: json, - }), + resolveTemplate( + "metrics", + TASK_TEMPLATE_CLASS, + "extract_task_from_trace", + { + trace_json: json, + }, + ), TaskSchema, ); const { plan } = await generateWithSchema( this, - resolveTemplate("metrics", PLAN_TEMPLATE_CLASS, "extract_plan_from_trace", { - trace_json_str: json, - }), + resolveTemplate( + "metrics", + PLAN_TEMPLATE_CLASS, + "extract_plan_from_trace", + { + trace_json_str: json, + }, + ), AgentPlanSchema, ); diff --git a/typescript/src/metrics/prompt-alignment/prompt-alignment.ts b/typescript/src/metrics/prompt-alignment/prompt-alignment.ts index 57c4e5d8ad..c161cdb087 100644 --- a/typescript/src/metrics/prompt-alignment/prompt-alignment.ts +++ b/typescript/src/metrics/prompt-alignment/prompt-alignment.ts @@ -38,7 +38,10 @@ export class PromptAlignmentMetric extends BaseMetric { verdicts: PromptAlignmentVerdict[] = []; constructor(options: PromptAlignmentMetricOptions) { - if (!options.promptInstructions || options.promptInstructions.length === 0) { + if ( + !options.promptInstructions || + options.promptInstructions.length === 0 + ) { throw new Error("'promptInstructions' must not be empty."); } const strictMode = options.strictMode ?? false; @@ -92,11 +95,16 @@ export class PromptAlignmentMetric extends BaseMetric { input: string, actualOutput: string, ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - prompt_instructions: this.promptInstructions, - input, - actual_output: actualOutput, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + prompt_instructions: this.promptInstructions, + input, + actual_output: actualOutput, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -109,12 +117,17 @@ export class PromptAlignmentMetric extends BaseMetric { const unalignmentReasons = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "no") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - unalignment_reasons: unalignmentReasons, - input, - actual_output: actualOutput, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + unalignment_reasons: unalignmentReasons, + input, + actual_output: actualOutput, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/role-adherence/role-adherence.ts b/typescript/src/metrics/role-adherence/role-adherence.ts index 4e798fd7da..178bdde2ab 100644 --- a/typescript/src/metrics/role-adherence/role-adherence.ts +++ b/typescript/src/metrics/role-adherence/role-adherence.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -89,7 +85,8 @@ export class RoleAdherenceMetric extends BaseConversationalMetric { turns: Turn[], role: string, ): Promise { - const prompt = resolveTemplate("metrics", + const prompt = resolveTemplate( + "metrics", TEMPLATE_CLASS, "extract_out_of_character_response_verdicts", { turns: turns.map((turn) => convertTurnToDict(turn)), role }, @@ -109,13 +106,18 @@ export class RoleAdherenceMetric extends BaseConversationalMetric { private async generateReason(role: string): Promise { if (!this.includeReason) return undefined; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - score: this.score, - role, - out_of_character_responses: this.outOfCharacterVerdicts.map( - (v) => v.ai_message, - ), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + score: this.score, + role, + out_of_character_responses: this.outOfCharacterVerdicts.map( + (v) => v.ai_message, + ), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/role-violation/role-violation.ts b/typescript/src/metrics/role-violation/role-violation.ts index 4165afa36f..6746d384a8 100644 --- a/typescript/src/metrics/role-violation/role-violation.ts +++ b/typescript/src/metrics/role-violation/role-violation.ts @@ -85,10 +85,15 @@ export class RoleViolationMetric extends BaseMetric { } private async detectRoleViolations(actualOutput: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "detect_role_violations", { - actual_output: actualOutput, - expected_role: this.role, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "detect_role_violations", + { + actual_output: actualOutput, + expected_role: this.role, + }, + ); const { role_violations } = await generateWithSchema( this, prompt, @@ -99,9 +104,14 @@ export class RoleViolationMetric extends BaseMetric { private async generateVerdicts(): Promise { if (this.roleViolations.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - role_violations: this.roleViolations, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + role_violations: this.roleViolations, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -111,10 +121,15 @@ export class RoleViolationMetric extends BaseMetric { const violationReasons = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "yes") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - role_violations: violationReasons, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + role_violations: violationReasons, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/step-efficiency/index.ts b/typescript/src/metrics/step-efficiency/index.ts index 12faa85723..3cacd461c0 100644 --- a/typescript/src/metrics/step-efficiency/index.ts +++ b/typescript/src/metrics/step-efficiency/index.ts @@ -1,2 +1,5 @@ -export { StepEfficiencyMetric, type StepEfficiencyMetricOptions } from "./step-efficiency"; +export { + StepEfficiencyMetric, + type StepEfficiencyMetricOptions, +} from "./step-efficiency"; export * from "./schema"; diff --git a/typescript/src/metrics/summarization/summarization.ts b/typescript/src/metrics/summarization/summarization.ts index d4b2a4a2ce..f5552239ee 100644 --- a/typescript/src/metrics/summarization/summarization.ts +++ b/typescript/src/metrics/summarization/summarization.ts @@ -24,7 +24,8 @@ const TEMPLATE_CLASS = "SummarizationMetric"; // Borrowed from faithfulness (Python imports `_faithfulness_truths_limit_phrase`). function truthsLimitPhrase(limit?: number): string { if (limit == null) return " FACTUAL, undisputed truths"; - if (limit === 1) return " the single most important FACTUAL, undisputed truth"; + if (limit === 1) + return " the single most important FACTUAL, undisputed truth"; return ` the ${limit} most important FACTUAL, undisputed truths per document`; } @@ -138,20 +139,30 @@ export class SummarizationMetric extends BaseMetric { // --- truths/claims (borrow Faithfulness templates) --- private async generateTruths(text: string): Promise { - const prompt = resolveTemplate("metrics", "FaithfulnessMetric", "generate_truths", { - retrieval_context: text, - limit: truthsLimitPhrase(this.truthsExtractionLimit), - multimodal_instruction: "", - }); + const prompt = resolveTemplate( + "metrics", + "FaithfulnessMetric", + "generate_truths", + { + retrieval_context: text, + limit: truthsLimitPhrase(this.truthsExtractionLimit), + multimodal_instruction: "", + }, + ); const { truths } = await generateWithSchema(this, prompt, TruthsSchema); return truths; } private async generateClaims(text: string): Promise { - const prompt = resolveTemplate("metrics", "FaithfulnessMetric", "generate_claims", { - actual_output: text, - multimodal_instruction: "", - }); + const prompt = resolveTemplate( + "metrics", + "FaithfulnessMetric", + "generate_claims", + { + actual_output: text, + multimodal_instruction: "", + }, + ); const { claims } = await generateWithSchema(this, prompt, ClaimsSchema); return claims; } @@ -162,7 +173,8 @@ export class SummarizationMetric extends BaseMetric { SummarizationAlignmentVerdict[] > { if (this.claims.length === 0) return []; - const prompt = resolveTemplate("metrics", + const prompt = resolveTemplate( + "metrics", TEMPLATE_CLASS, "generate_alignment_verdicts", { @@ -203,10 +215,15 @@ export class SummarizationMetric extends BaseMetric { } private async generateAssessmentQuestions(text: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_questions", { - text, - n: this.n, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_questions", + { + text, + n: this.n, + }, + ); const { questions } = await generateWithSchema( this, prompt, @@ -216,10 +233,15 @@ export class SummarizationMetric extends BaseMetric { } private async generateAnswers(text: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_answers", { - questions: this.assessmentQuestions, - text, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_answers", + { + questions: this.assessmentQuestions, + text, + }, + ); const { answers } = await generateWithSchema(this, prompt, AnswersSchema); return answers; } diff --git a/typescript/src/metrics/task-completion/index.ts b/typescript/src/metrics/task-completion/index.ts index 3d4d265df8..fd33eeb11e 100644 --- a/typescript/src/metrics/task-completion/index.ts +++ b/typescript/src/metrics/task-completion/index.ts @@ -1,2 +1,5 @@ -export { TaskCompletionMetric, type TaskCompletionMetricOptions } from "./task-completion"; +export { + TaskCompletionMetric, + type TaskCompletionMetricOptions, +} from "./task-completion"; export * from "./schema"; diff --git a/typescript/src/metrics/task-completion/task-completion.ts b/typescript/src/metrics/task-completion/task-completion.ts index e40bdcf06e..35177e5c58 100644 --- a/typescript/src/metrics/task-completion/task-completion.ts +++ b/typescript/src/metrics/task-completion/task-completion.ts @@ -97,11 +97,18 @@ export class TaskCompletionMetric extends BaseMetric { "extract_task_and_outcome_from_trace", { trace_json: JSON.stringify(testCase._traceDict) }, ) - : resolveTemplate("metrics", TEMPLATE_CLASS, "extract_goal_and_outcome", { - input: testCase.input, - actual_output: testCase.actualOutput, - tools_called_formatted: printToolsCalled(testCase.toolsCalled ?? []), - }); + : resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "extract_goal_and_outcome", + { + input: testCase.input, + actual_output: testCase.actualOutput, + tools_called_formatted: printToolsCalled( + testCase.toolsCalled ?? [], + ), + }, + ); return generateWithSchema(this, prompt, TaskAndOutcomeSchema); } diff --git a/typescript/src/metrics/tool-correctness/tool-correctness.ts b/typescript/src/metrics/tool-correctness/tool-correctness.ts index be1e5b93f6..3aaaa8642b 100644 --- a/typescript/src/metrics/tool-correctness/tool-correctness.ts +++ b/typescript/src/metrics/tool-correctness/tool-correctness.ts @@ -127,8 +127,7 @@ export class ToolCorrectnessMetric extends BaseMetric { }; const combined = Math.min(toolCallingScore, toolSelectionScore.score); - this.score = - this.strictMode && combined < this.threshold ? 0 : combined; + this.score = this.strictMode && combined < this.threshold ? 0 : combined; this.reason = this.constructFinalReason( this.generateReason(), toolSelectionScore.reason, @@ -152,11 +151,16 @@ export class ToolCorrectnessMetric extends BaseMetric { private async getToolSelectionScore( userInput: string, ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "get_tool_selection_score", { - user_input: userInput, - tools_called: printToolsCalled(this.toolsCalled), - available_tools: printToolsCalled(this.availableTools ?? []), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_tool_selection_score", + { + user_input: userInput, + tools_called: printToolsCalled(this.toolsCalled), + available_tools: printToolsCalled(this.availableTools ?? []), + }, + ); return generateWithSchema(this, prompt, ToolSelectionScoreSchema); } @@ -360,7 +364,8 @@ export class ToolCorrectnessMetric extends BaseMetric { lcs.map((t) => t.name), ); const issues: string[] = []; - if (missing.length) issues.push(`missing tools ${JSON.stringify(missing)}`); + if (missing.length) + issues.push(`missing tools ${JSON.stringify(missing)}`); if (outOfOrder.length) issues.push(`out-of-order tools ${JSON.stringify(outOfOrder)}`); return `Incorrect tool usage: ${issues.join(" and ")}; expected ${JSON.stringify(expectedNames)}, called ${JSON.stringify(calledNames)}. See more details above.`; diff --git a/typescript/src/metrics/tool-use/tool-use.ts b/typescript/src/metrics/tool-use/tool-use.ts index 6269665b13..8010059f07 100644 --- a/typescript/src/metrics/tool-use/tool-use.ts +++ b/typescript/src/metrics/tool-use/tool-use.ts @@ -142,19 +142,25 @@ export class ToolUseMetric extends BaseConversationalMetric { private async getToolSelectionScore( u: UserInputAndTools, ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "get_tool_selection_score", { - user_input: u.user_messages, - assistant_messages: u.assistant_messages, - tools_called: u.tools_called, - available_tools: u.available_tools, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_tool_selection_score", + { + user_input: u.user_messages, + assistant_messages: u.assistant_messages, + tools_called: u.tools_called, + available_tools: u.available_tools, + }, + ); return generateWithSchema(this, prompt, ToolSelectionScoreSchema); } private async getArgumentCorrectnessScore( u: UserInputAndTools, ): Promise { - const prompt = resolveTemplate("metrics", + const prompt = resolveTemplate( + "metrics", TEMPLATE_CLASS, "get_argument_correctness_score", { @@ -184,7 +190,8 @@ export class ToolUseMetric extends BaseConversationalMetric { const allScoresAndReasons = scores .map((s) => `\nScore: ${s.score} \nReason: ${s.reason} \n`) .join(""); - const prompt = resolveTemplate("metrics", + const prompt = resolveTemplate( + "metrics", TEMPLATE_CLASS, "get_tool_selection_final_reason", { diff --git a/typescript/src/metrics/topic-adherence/topic-adherence.ts b/typescript/src/metrics/topic-adherence/topic-adherence.ts index 76dc6aed71..d180ddb513 100644 --- a/typescript/src/metrics/topic-adherence/topic-adherence.ts +++ b/typescript/src/metrics/topic-adherence/topic-adherence.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -77,7 +73,12 @@ export class TopicAdherenceMetric extends BaseConversationalMetric { const verdicts = await Promise.all( qaPairs.map((qa) => this.getQaVerdict(qa)), ); - const tally: Record = { TP: [], TN: [], FP: [], FN: [] }; + const tally: Record = { + TP: [], + TN: [], + FP: [], + FN: [], + }; for (const v of verdicts) tally[v.verdict].push(v.reason); this.score = this.calculateScore(tally); @@ -109,11 +110,16 @@ export class TopicAdherenceMetric extends BaseConversationalMetric { } private async getQaVerdict(qaPair: QAPair): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "get_qa_pair_verdict", { - relevant_topics: this.relevantTopics, - question: qaPair.question, - response: qaPair.response, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "get_qa_pair_verdict", + { + relevant_topics: this.relevantTopics, + question: qaPair.question, + response: qaPair.response, + }, + ); return generateWithSchema(this, prompt, RelevancyVerdictSchema); } @@ -135,15 +141,20 @@ export class TopicAdherenceMetric extends BaseConversationalMetric { } const line = (reasons: string[]) => reasons.length ? prettifyList(reasons) : "(none)"; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - success: this.success, - score: this.score, - threshold: this.threshold, - true_positives_reason_line: line(tally.TP), - true_negatives_reason_line: line(tally.TN), - false_positives_reason_line: line(tally.FP), - false_negatives_reason_line: line(tally.FN), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + success: this.success, + score: this.score, + threshold: this.threshold, + true_positives_reason_line: line(tally.TP), + true_negatives_reason_line: line(tally.TN), + false_positives_reason_line: line(tally.FP), + false_negatives_reason_line: line(tally.FN), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/toxicity/toxicity.ts b/typescript/src/metrics/toxicity/toxicity.ts index c402f6f633..320580edaf 100644 --- a/typescript/src/metrics/toxicity/toxicity.ts +++ b/typescript/src/metrics/toxicity/toxicity.ts @@ -79,18 +79,28 @@ export class ToxicityMetric extends BaseMetric { } private async generateOpinions(actualOutput: string): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_opinions", { - actual_output: actualOutput, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_opinions", + { + actual_output: actualOutput, + }, + ); const { opinions } = await generateWithSchema(this, prompt, OpinionsSchema); return opinions; } private async generateVerdicts(): Promise { if (this.opinions.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - opinions: this.opinions, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + opinions: this.opinions, + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -100,10 +110,15 @@ export class ToxicityMetric extends BaseMetric { const toxics = this.verdicts .filter((v) => v.verdict.trim().toLowerCase() === "yes") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - toxics, - score: (this.score ?? 0).toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + toxics, + score: (this.score ?? 0).toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/turn-contextual-precision/turn-contextual-precision.ts b/typescript/src/metrics/turn-contextual-precision/turn-contextual-precision.ts index b3a845914e..6eae38f889 100644 --- a/typescript/src/metrics/turn-contextual-precision/turn-contextual-precision.ts +++ b/typescript/src/metrics/turn-contextual-precision/turn-contextual-precision.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -107,7 +103,9 @@ export class TurnContextualPrecisionMetric extends BaseConversationalMetric { for (const turn of window) { if (turn.role === "user") userContent += `\n${turn.content} `; else if (turn.retrievalContext != null) - retrievalContext.push(...resolveRetrievalContext(turn.retrievalContext)); + retrievalContext.push( + ...resolveRetrievalContext(turn.retrievalContext), + ); } const verdicts = await this.generateVerdicts( @@ -124,7 +122,11 @@ export class TurnContextualPrecisionMetric extends BaseConversationalMetric { }; } const score = this.calculateInteractionScore(verdicts); - const reason = await this.getInteractionReason(userContent, score, verdicts); + const reason = await this.getInteractionReason( + userContent, + score, + verdicts, + ); return { score: this.strictMode && score < this.threshold ? 0 : score, reason, @@ -139,13 +141,18 @@ export class TurnContextualPrecisionMetric extends BaseConversationalMetric { ): Promise { if (retrievalContext.length === 0) return []; const n = retrievalContext.length; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - input, - expected_outcome: expectedOutcome, - document_count_str: ` (${n} document${n > 1 ? "s" : ""})`, - context_to_display: retrievalContext, - multimodal_note: "", - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + input, + expected_outcome: expectedOutcome, + document_count_str: ` (${n} document${n > 1 ? "s" : ""})`, + context_to_display: retrievalContext, + multimodal_note: "", + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -156,11 +163,19 @@ export class TurnContextualPrecisionMetric extends BaseConversationalMetric { verdicts: ContextualPrecisionVerdict[], ): Promise { if (!this.includeReason) return undefined; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - input, - verdicts: verdicts.map((v) => ({ verdict: v.verdict, reason: v.reason })), - score: score.toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + input, + verdicts: verdicts.map((v) => ({ + verdict: v.verdict, + reason: v.reason, + })), + score: score.toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, @@ -197,11 +212,16 @@ export class TurnContextualPrecisionMetric extends BaseConversationalMetric { if (this.scores.length === 0) { return "There were no interactions with retrieval context to evaluate, hence the score is 1"; } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_final_reason", { - final_score: this.score, - success: this.success, - reasons: this.scores.map((s) => s.reason), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_final_reason", + { + final_score: this.score, + success: this.success, + reasons: this.scores.map((s) => s.reason), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/turn-contextual-recall/turn-contextual-recall.ts b/typescript/src/metrics/turn-contextual-recall/turn-contextual-recall.ts index 9186ac2897..c251b5a9ba 100644 --- a/typescript/src/metrics/turn-contextual-recall/turn-contextual-recall.ts +++ b/typescript/src/metrics/turn-contextual-recall/turn-contextual-recall.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -105,7 +101,9 @@ export class TurnContextualRecallMetric extends BaseConversationalMetric { const retrievalContext: string[] = []; for (const turn of window) { if (turn.role !== "user" && turn.retrievalContext != null) - retrievalContext.push(...resolveRetrievalContext(turn.retrievalContext)); + retrievalContext.push( + ...resolveRetrievalContext(turn.retrievalContext), + ); } const verdicts = await this.generateVerdicts( @@ -138,14 +136,19 @@ export class TurnContextualRecallMetric extends BaseConversationalMetric { retrievalContext: string[], ): Promise { if (retrievalContext.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - expected_outcome: expectedOutcome, - content_type: "sentence", - content_type_plural: "sentences", - content_or: "sentence", - context_to_display: retrievalContext, - node_instruction: "", - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + expected_outcome: expectedOutcome, + content_type: "sentence", + content_type_plural: "sentences", + content_or: "sentence", + context_to_display: retrievalContext, + node_instruction: "", + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -162,13 +165,18 @@ export class TurnContextualRecallMetric extends BaseConversationalMetric { if (v.verdict.toLowerCase() === "yes") supportive.push(v.reason); else unsupportive.push(v.reason); } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - expected_outcome: expectedOutcome, - supportive_reasons: supportive, - unsupportive_reasons: unsupportive, - score: score.toFixed(2), - content_type: "sentence", - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + expected_outcome: expectedOutcome, + supportive_reasons: supportive, + unsupportive_reasons: unsupportive, + score: score.toFixed(2), + content_type: "sentence", + }, + ); const { reason } = await generateWithSchema( this, prompt, @@ -196,11 +204,16 @@ export class TurnContextualRecallMetric extends BaseConversationalMetric { if (this.scores.length === 0) { return "There were no interactions with retrieval context to evaluate, hence the score is 1"; } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_final_reason", { - final_score: this.score, - success: this.success, - reasons: this.scores.map((s) => s.reason), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_final_reason", + { + final_score: this.score, + success: this.success, + reasons: this.scores.map((s) => s.reason), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/turn-contextual-relevancy/turn-contextual-relevancy.ts b/typescript/src/metrics/turn-contextual-relevancy/turn-contextual-relevancy.ts index f782c0c6ac..058e68fa97 100644 --- a/typescript/src/metrics/turn-contextual-relevancy/turn-contextual-relevancy.ts +++ b/typescript/src/metrics/turn-contextual-relevancy/turn-contextual-relevancy.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -32,7 +28,7 @@ const EXTRACTION_INSTRUCTIONS = "high level information found in the context, before deciding on a " + "verdict and optionally a reason for each statement."; const EMPTY_CONTEXT_INSTRUCTION = - '\nIf provided context contains no actual content or statements then: ' + + "\nIf provided context contains no actual content or statements then: " + 'give "no" as a "verdict",\nput context into "statement", and ' + '"No statements found in provided context." into "reason".'; @@ -113,7 +109,9 @@ export class TurnContextualRelevancyMetric extends BaseConversationalMetric { for (const turn of window) { if (turn.role === "user") userContent += `\n${turn.content} `; else if (turn.retrievalContext != null) - retrievalContext.push(...resolveRetrievalContext(turn.retrievalContext)); + retrievalContext.push( + ...resolveRetrievalContext(turn.retrievalContext), + ); } const verdicts = await this.generateVerdicts(userContent, retrievalContext); @@ -126,7 +124,11 @@ export class TurnContextualRelevancyMetric extends BaseConversationalMetric { }; } const score = this.calculateInteractionScore(verdicts); - const reason = await this.getInteractionReason(userContent, score, verdicts); + const reason = await this.getInteractionReason( + userContent, + score, + verdicts, + ); return { score: this.strictMode && score < this.threshold ? 0 : score, reason, @@ -141,14 +143,19 @@ export class TurnContextualRelevancyMetric extends BaseConversationalMetric { if (retrievalContext.length === 0) return []; const perNode = await Promise.all( retrievalContext.map(async (context) => { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - input, - context, - context_type: "context", - statement_or_image: "statement", - extraction_instructions: EXTRACTION_INSTRUCTIONS, - empty_context_instruction: EMPTY_CONTEXT_INSTRUCTION, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + input, + context, + context_type: "context", + statement_or_image: "statement", + extraction_instructions: EXTRACTION_INSTRUCTIONS, + empty_context_instruction: EMPTY_CONTEXT_INSTRUCTION, + }, + ); const { verdicts } = await generateWithSchema( this, prompt, @@ -172,12 +179,17 @@ export class TurnContextualRelevancyMetric extends BaseConversationalMetric { if (v.verdict.toLowerCase() === "no") irrelevant.push(v.reason); else relevant.push(v.statement); } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - input, - irrelevant_statements: irrelevant, - relevant_statements: relevant, - score: score.toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + input, + irrelevant_statements: irrelevant, + relevant_statements: relevant, + score: score.toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, @@ -205,11 +217,16 @@ export class TurnContextualRelevancyMetric extends BaseConversationalMetric { if (this.scores.length === 0) { return "There were no interactions with retrieval context to evaluate, hence the score is 1"; } - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_final_reason", { - final_score: this.score, - success: this.success, - reasons: this.scores.map((s) => s.reason), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_final_reason", + { + final_score: this.score, + success: this.success, + reasons: this.scores.map((s) => s.reason), + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/turn-faithfulness/turn-faithfulness.ts b/typescript/src/metrics/turn-faithfulness/turn-faithfulness.ts index 771d2e7b32..a5cdad2492 100644 --- a/typescript/src/metrics/turn-faithfulness/turn-faithfulness.ts +++ b/typescript/src/metrics/turn-faithfulness/turn-faithfulness.ts @@ -1,9 +1,5 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; import { @@ -144,10 +140,15 @@ export class TurnFaithfulnessMetric extends BaseConversationalMetric { } private async generateTruths(retrievalContext: string[]): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_truths", { - reference_context: retrievalContext.join("\n\n"), - limit_description: limitDescription(this.truthsExtractionLimit), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_truths", + { + reference_context: retrievalContext.join("\n\n"), + limit_description: limitDescription(this.truthsExtractionLimit), + }, + ); const { truths } = await generateWithSchema(this, prompt, TruthsSchema); return truths; } @@ -156,10 +157,15 @@ export class TurnFaithfulnessMetric extends BaseConversationalMetric { userContent: string, assistantContent: string, ): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_claims", { - input: userContent, - assistant_output: assistantContent, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_claims", + { + input: userContent, + assistant_output: assistantContent, + }, + ); const { claims } = await generateWithSchema(this, prompt, ClaimsSchema); return claims; } @@ -169,10 +175,15 @@ export class TurnFaithfulnessMetric extends BaseConversationalMetric { truths: string[], ): Promise { if (claims.length === 0) return []; - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - claims, - reference_context: truths.join("\n\n"), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + claims, + reference_context: truths.join("\n\n"), + }, + ); const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema); return verdicts; } @@ -206,10 +217,15 @@ export class TurnFaithfulnessMetric extends BaseConversationalMetric { const contradictions = verdicts .filter((v) => v.verdict.trim().toLowerCase() === "no") .map((v) => v.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - contradictions, - score: score.toFixed(2), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + contradictions, + score: score.toFixed(2), + }, + ); const { reason } = await generateWithSchema( this, prompt, @@ -230,11 +246,16 @@ export class TurnFaithfulnessMetric extends BaseConversationalMetric { return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"; } const reasons = this.scores.map((s) => s.reason); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_final_reason", { - final_score: this.score, - success: this.success, - reasons, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_final_reason", + { + final_score: this.score, + success: this.success, + reasons, + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/metrics/turn-relevancy/turn-relevancy.ts b/typescript/src/metrics/turn-relevancy/turn-relevancy.ts index 4b3710a193..e1af0cfecf 100644 --- a/typescript/src/metrics/turn-relevancy/turn-relevancy.ts +++ b/typescript/src/metrics/turn-relevancy/turn-relevancy.ts @@ -1,12 +1,13 @@ import { BaseConversationalMetric } from "../base-conversational-metric"; -import { - ConversationalTestCase, - MultiTurnParams, - Turn, -} from "../../test-case"; +import { ConversationalTestCase, MultiTurnParams, Turn } from "../../test-case"; import { DeepEvalBaseLLM } from "../../models"; import { resolveTemplate } from "../../templates"; -import { initializeModel, generateWithSchema, constructVerboseLogs, prettifyList } from "../utils"; +import { + initializeModel, + generateWithSchema, + constructVerboseLogs, + prettifyList, +} from "../utils"; import { checkConversationalTestCaseParams, getUnitInteractions, @@ -89,9 +90,14 @@ export class TurnRelevancyMetric extends BaseConversationalMetric { } private async generateVerdict(window: Turn[]): Promise { - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", { - sliding_window: window.map((turn) => convertTurnToDict(turn)), - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_verdicts", + { + sliding_window: window.map((turn) => convertTurnToDict(turn)), + }, + ); return generateWithSchema(this, prompt, TurnRelevancyVerdictSchema); } @@ -108,10 +114,15 @@ export class TurnRelevancyMetric extends BaseConversationalMetric { "message number": `${index + 1}`, reason: verdict.reason, })); - const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", { - score: this.score, - irrelevancies, - }); + const prompt = resolveTemplate( + "metrics", + TEMPLATE_CLASS, + "generate_reason", + { + score: this.score, + irrelevancies, + }, + ); const { reason } = await generateWithSchema( this, prompt, diff --git a/typescript/src/models/gateways/openrouter-model.ts b/typescript/src/models/gateways/openrouter-model.ts index 99881cac93..89d1d3521b 100644 --- a/typescript/src/models/gateways/openrouter-model.ts +++ b/typescript/src/models/gateways/openrouter-model.ts @@ -26,7 +26,9 @@ export class OpenRouterModel extends DeepEvalOpenAICompatibleModel { DEFAULT_OPENROUTER_MODEL, apiKey: options.apiKey ?? process.env.OPENROUTER_API_KEY, baseURL: - options.baseURL ?? process.env.OPENROUTER_BASE_URL ?? OPENROUTER_BASE_URL, + options.baseURL ?? + process.env.OPENROUTER_BASE_URL ?? + OPENROUTER_BASE_URL, }); } } diff --git a/typescript/src/models/gateways/portkey-model.ts b/typescript/src/models/gateways/portkey-model.ts index bf239fc5b1..7111d41b41 100644 --- a/typescript/src/models/gateways/portkey-model.ts +++ b/typescript/src/models/gateways/portkey-model.ts @@ -27,7 +27,8 @@ export class PortkeyModel extends DeepEvalOpenAICompatibleModel { ...options, model: options.model ?? process.env.PORTKEY_MODEL_NAME, apiKey, - baseURL: options.baseURL ?? process.env.PORTKEY_BASE_URL ?? PORTKEY_BASE_URL, + baseURL: + options.baseURL ?? process.env.PORTKEY_BASE_URL ?? PORTKEY_BASE_URL, defaultHeaders: { ...(options.defaultHeaders ?? {}), ...(apiKey ? { "x-portkey-api-key": apiKey } : {}), diff --git a/typescript/src/models/providers/deepseek-model.ts b/typescript/src/models/providers/deepseek-model.ts index f1c56765d7..7e551da802 100644 --- a/typescript/src/models/providers/deepseek-model.ts +++ b/typescript/src/models/providers/deepseek-model.ts @@ -20,7 +20,9 @@ export class DeepSeekModel extends DeepEvalOpenAICompatibleModel { super({ ...options, model: - options.model ?? process.env.DEEPSEEK_MODEL_NAME ?? DEFAULT_DEEPSEEK_MODEL, + options.model ?? + process.env.DEEPSEEK_MODEL_NAME ?? + DEFAULT_DEEPSEEK_MODEL, apiKey: options.apiKey ?? process.env.DEEPSEEK_API_KEY, baseURL: options.baseURL ?? DEEPSEEK_BASE_URL, }); diff --git a/typescript/src/models/providers/kimi-model.ts b/typescript/src/models/providers/kimi-model.ts index 114787ccb5..57106620d3 100644 --- a/typescript/src/models/providers/kimi-model.ts +++ b/typescript/src/models/providers/kimi-model.ts @@ -20,9 +20,11 @@ export class KimiModel extends DeepEvalOpenAICompatibleModel { constructor(options: KimiModelOptions = {}) { super({ ...options, - model: options.model ?? process.env.MOONSHOT_MODEL_NAME ?? DEFAULT_KIMI_MODEL, + model: + options.model ?? process.env.MOONSHOT_MODEL_NAME ?? DEFAULT_KIMI_MODEL, apiKey: options.apiKey ?? process.env.MOONSHOT_API_KEY, - baseURL: options.baseURL ?? process.env.MOONSHOT_BASE_URL ?? MOONSHOT_BASE_URL, + baseURL: + options.baseURL ?? process.env.MOONSHOT_BASE_URL ?? MOONSHOT_BASE_URL, }); } } diff --git a/typescript/src/models/providers/openai-model.ts b/typescript/src/models/providers/openai-model.ts index d84c43c9f6..f09ea2726a 100644 --- a/typescript/src/models/providers/openai-model.ts +++ b/typescript/src/models/providers/openai-model.ts @@ -19,7 +19,8 @@ export class OpenAIModel extends DeepEvalOpenAICompatibleModel { constructor(options: OpenAIModelOptions = {}) { super({ ...options, - model: options.model ?? process.env.OPENAI_MODEL_NAME ?? DEFAULT_OPENAI_MODEL, + model: + options.model ?? process.env.OPENAI_MODEL_NAME ?? DEFAULT_OPENAI_MODEL, apiKey: options.apiKey ?? process.env.OPENAI_API_KEY, }); } diff --git a/typescript/src/test-case/conversational-test-case.ts b/typescript/src/test-case/conversational-test-case.ts index fb063b22b9..fe63b85e1e 100644 --- a/typescript/src/test-case/conversational-test-case.ts +++ b/typescript/src/test-case/conversational-test-case.ts @@ -1,5 +1,11 @@ import { ToolCall, RetrievedContextData } from "./llm-test-case"; -import { checkIfMultimodal, extractImageIdsFromList, extractImageIdsFromString, MLLM_IMAGE_REGISTRY, MLLMImage } from "./mllm-image"; +import { + checkIfMultimodal, + extractImageIdsFromList, + extractImageIdsFromString, + MLLM_IMAGE_REGISTRY, + MLLMImage, +} from "./mllm-image"; import { MCPServer, MCPToolCall, @@ -152,7 +158,9 @@ export class ConversationalTestCase { this.turns.forEach((turn) => { extractImageIdsFromString(turn.content, ids); extractImageIdsFromList( - turn.retrievalContext?.map((c) => (typeof c === "string" ? c : c.context)), + turn.retrievalContext?.map((c) => + typeof c === "string" ? c : c.context, + ), ids, ); }); @@ -172,7 +180,11 @@ export class ConversationalTestCase { /** Auto-detect multimodality from image slugs in the fields/turns (mirrors Python). */ private detectMultimodal(): boolean { const has = (s?: string) => s != null && checkIfMultimodal(s); - if (has(this.scenario) || has(this.expectedOutcome) || has(this.userDescription)) { + if ( + has(this.scenario) || + has(this.expectedOutcome) || + has(this.userDescription) + ) { return true; } for (const turn of this.turns ?? []) { diff --git a/typescript/src/test-case/llm-test-case.ts b/typescript/src/test-case/llm-test-case.ts index 8dc610a4d0..fbd7887af9 100644 --- a/typescript/src/test-case/llm-test-case.ts +++ b/typescript/src/test-case/llm-test-case.ts @@ -5,7 +5,14 @@ import { MCPPromptCall, validateMcpServers, } from "./mcp"; -import { checkIfMultimodal, extractImageIdsFromList, extractImageIdsFromString, MLLM_IMAGE_REGISTRY, MLLMImage, SLUG_PATTERN } from "./mllm-image"; +import { + checkIfMultimodal, + extractImageIdsFromList, + extractImageIdsFromString, + MLLM_IMAGE_REGISTRY, + MLLMImage, + SLUG_PATTERN, +} from "./mllm-image"; export enum SingleTurnParams { INPUT = "input", @@ -154,9 +161,11 @@ export class LLMTestCase { extractImageIdsFromString(this.actualOutput, ids); extractImageIdsFromString(this.expectedOutput, ids); extractImageIdsFromList(this.context, ids); - + extractImageIdsFromList( - this.retrievalContext?.map((c) => (typeof c === "string" ? c : c.context)), + this.retrievalContext?.map((c) => + typeof c === "string" ? c : c.context, + ), ids, ); diff --git a/typescript/src/test-case/mcp.ts b/typescript/src/test-case/mcp.ts index 534dea8296..5eaf9bcdd5 100644 --- a/typescript/src/test-case/mcp.ts +++ b/typescript/src/test-case/mcp.ts @@ -95,7 +95,9 @@ export function validateMcpServers(mcpServers: MCPServer[]): void { Array.isArray(x) && x.every((i) => typeof i === "object" && i !== null); for (const s of mcpServers) { if (s.availableTools != null && !isObjList(s.availableTools)) { - throw new TypeError("'availableTools' must be a list of MCP Tool objects"); + throw new TypeError( + "'availableTools' must be a list of MCP Tool objects", + ); } if (s.availableResources != null && !isObjList(s.availableResources)) { throw new TypeError( diff --git a/typescript/src/test-case/mllm-image.ts b/typescript/src/test-case/mllm-image.ts index f03987145f..bf3395b712 100644 --- a/typescript/src/test-case/mllm-image.ts +++ b/typescript/src/test-case/mllm-image.ts @@ -201,4 +201,4 @@ export function extractImageIdsFromList( for (const item of lst) { extractImageIdsFromString(item, targetSet); } -} \ No newline at end of file +} diff --git a/typescript/src/tracing/tracing.ts b/typescript/src/tracing/tracing.ts index 2d9263af62..cdcb210737 100644 --- a/typescript/src/tracing/tracing.ts +++ b/typescript/src/tracing/tracing.ts @@ -419,7 +419,10 @@ export class TraceManager { put("expectedTools", span.expectedTools); // Span-type extras (model / tools) when present, like Python's api span. put("model", (span as { model?: unknown }).model); - put("availableTools", (span as { availableTools?: unknown }).availableTools); + put( + "availableTools", + (span as { availableTools?: unknown }).availableTools, + ); dict.children = (span.children ?? []).map((child) => this.createNestedSpansDict(child), );