confident-ai · HarperZ9 · Jun 30, 2026
diff --git a/typescript/src/evaluate/compare.ts b/typescript/src/evaluate/compare.ts
@@ -146,7 +146,12 @@ export async function compare(
   for (const w of winners) counts[w] = (counts[w] ?? 0) + 1;
 
   if (display.printResults) {
-    printArenaCompleted(counts, runDuration, winners.length, hasCost ? totalCost : 0);
+    printArenaCompleted(
+      counts,
+      runDuration,
+      winners.length,
+      hasCost ? totalCost : 0,
+    );
   }
 
   // Post to Confident AI as an experiment (no-op unless logged in).
@@ -171,7 +176,10 @@ function printArenaCompleted(
   const sorted = Object.entries(counts).sort((a, b) => b[1] - a[1]);
   const breakdown = sorted.length
     ? sorted
-        .map(([name, wins]) => `    » ${GREEN}${BOLD}${name}${RESET}: ${wins} wins`)
+        .map(
+          ([name, wins]) =>
+            `    » ${GREEN}${BOLD}${name}${RESET}: ${wins} wins`,
+        )
         .join("\n")
     : "No winners";
   const cost = tokenCost ? `${tokenCost} USD` : "None";

diff --git a/typescript/src/evaluate/confident.ts b/typescript/src/evaluate/confident.ts
@@ -48,7 +48,12 @@ function buildMetricsScores(cases: EvaluatedCase[]) {
   for (const { metricsData } of cases) {
     for (const m of metricsData) {
       if (m.skipped) continue;
-      const e = map.get(m.name) ?? { scores: [], passes: 0, fails: 0, errors: 0 };
+      const e = map.get(m.name) ?? {
+        scores: [],
+        passes: 0,
+        fails: 0,
+        errors: 0,
+      };
       if (m.error) {
         e.errors += 1;
       } else {
@@ -81,53 +86,55 @@ export async function postTestRun(
   let totalCost = 0;
   let hasCost = false;
 
-  cases.forEach(({ testCase, metricsData, runDuration: caseDuration, trace }, order) => {
-    const success = metricsData.every((m) => m.skipped || m.success);
-    if (success) testPassed += 1;
-    else testFailed += 1;
+  cases.forEach(
+    ({ testCase, metricsData, runDuration: caseDuration, trace }, order) => {
+      const success = metricsData.every((m) => m.skipped || m.success);
+      if (success) testPassed += 1;
+      else testFailed += 1;
 
-    const evaluationCost = caseCost(metricsData);
-    if (evaluationCost != null) {
-      totalCost += evaluationCost;
-      hasCost = true;
-    }
-    const metricsDataApi = metricsData.map(convertMetricData);
+      const evaluationCost = caseCost(metricsData);
+      if (evaluationCost != null) {
+        totalCost += evaluationCost;
+        hasCost = true;
+      }
+      const metricsDataApi = metricsData.map(convertMetricData);
 
-    if (testCase instanceof ConversationalTestCase) {
-      conversationalTestCases.push({
-        name: testCase.name ?? `test_case_${order}`,
-        success,
-        metricsData: metricsDataApi,
-        runDuration: caseDuration,
-        evaluationCost,
-        order,
-        turns: testCase.turns.map((t, i) => convertTurn(t, i)),
-        scenario: testCase.scenario,
-        expectedOutcome: testCase.expectedOutcome,
-        userDescription: testCase.userDescription,
-        chatbotRole: testCase.chatbotRole,
-        imagesMapping: testCase.getImagesMapping(),
-      });
-    } else {
-      testCases.push({
-        name: testCase.name ?? `test_case_${order}`,
-        input: testCase.input,
-        actualOutput: testCase.actualOutput,
-        expectedOutput: testCase.expectedOutput,
-        context: testCase.context,
-        retrievalContext: resolveRetrievalContext(testCase.retrievalContext),
-        toolsCalled: testCase.toolsCalled?.map(convertTool),
-        expectedTools: testCase.expectedTools?.map(convertTool),
-        success,
-        metricsData: metricsDataApi,
-        runDuration: caseDuration,
-        evaluationCost,
-        order,
-        imagesMapping: testCase.getImagesMapping(),
-        trace,
-      });
-    }
-  });
+      if (testCase instanceof ConversationalTestCase) {
+        conversationalTestCases.push({
+          name: testCase.name ?? `test_case_${order}`,
+          success,
+          metricsData: metricsDataApi,
+          runDuration: caseDuration,
+          evaluationCost,
+          order,
+          turns: testCase.turns.map((t, i) => convertTurn(t, i)),
+          scenario: testCase.scenario,
+          expectedOutcome: testCase.expectedOutcome,
+          userDescription: testCase.userDescription,
+          chatbotRole: testCase.chatbotRole,
+          imagesMapping: testCase.getImagesMapping(),
+        });
+      } else {
+        testCases.push({
+          name: testCase.name ?? `test_case_${order}`,
+          input: testCase.input,
+          actualOutput: testCase.actualOutput,
+          expectedOutput: testCase.expectedOutput,
+          context: testCase.context,
+          retrievalContext: resolveRetrievalContext(testCase.retrievalContext),
+          toolsCalled: testCase.toolsCalled?.map(convertTool),
+          expectedTools: testCase.expectedTools?.map(convertTool),
+          success,
+          metricsData: metricsDataApi,
+          runDuration: caseDuration,
+          evaluationCost,
+          order,
+          imagesMapping: testCase.getImagesMapping(),
+          trace,
+        });
+      }
+    },
+  );
 
   const payload = {
     testCases,
@@ -261,7 +268,13 @@ export async function postExperiment(
     testCases: e.testCases,
     conversationalTestCases: [],
     metricsScores: [
-      { metric: metricName, scores: e.scores, passes: e.passes, fails: e.fails, errors: e.errors },
+      {
+        metric: metricName,
+        scores: e.scores,
+        passes: e.passes,
+        fails: e.fails,
+        errors: e.errors,
+      },
     ],
     identifier: e.identifier,
     testPassed: e.testPassed,

diff --git a/typescript/src/evaluate/console-report.ts b/typescript/src/evaluate/console-report.ts
@@ -143,7 +143,11 @@ function wrapCell(c: string, width: number): string[] {
 }
 
 /** A labeled panel line (`Label: value`) wrapped to `inner`, continuations indented. */
-function wrapLabeledLine(prefix: string, value: string, inner: number): string[] {
+function wrapLabeledLine(
+  prefix: string,
+  value: string,
+  inner: number,
+): string[] {
   const indent = visLen(prefix);
   const chunks = wrapText(value, Math.max(10, inner - indent));
   return chunks.map((chunk, i) =>
@@ -222,7 +226,9 @@ function tableLines(
 function metricStatusCell(m: MetricData): string {
   if (m.skipped) return `${YELLOW}${BOLD}SKIP${RESET}`;
   if (m.error) return `${RED}${BOLD}ERROR${RESET}`;
-  return m.success ? `${GREEN}${BOLD}PASS${RESET}` : `${RED}${BOLD}FAIL${RESET}`;
+  return m.success
+    ? `${GREEN}${BOLD}PASS${RESET}`
+    : `${RED}${BOLD}FAIL${RESET}`;
 }
 
 /**
@@ -265,16 +271,32 @@ export function printResultsTable(
       lines.push(`${CYAN}${BOLD}Conversation Turns${RESET}`);
       for (const turn of tc.turns ?? []) {
         const role = turn.role.charAt(0).toUpperCase() + turn.role.slice(1);
-        lines.push(...wrapLabeledLine(`  ${BOLD}${role}:${RESET} `, turn.content, inner));
+        lines.push(
+          ...wrapLabeledLine(`  ${BOLD}${role}:${RESET} `, turn.content, inner),
+        );
       }
     } else {
-      lines.push(...wrapLabeledLine(`${CYAN}${BOLD}Input:${RESET} `, String(tc.input), inner));
       lines.push(
-        ...wrapLabeledLine(`${CYAN}${BOLD}Actual Output:${RESET} `, String(tc.actualOutput), inner),
+        ...wrapLabeledLine(
+          `${CYAN}${BOLD}Input:${RESET} `,
+          String(tc.input),
+          inner,
+        ),
+      );
+      lines.push(
+        ...wrapLabeledLine(
+          `${CYAN}${BOLD}Actual Output:${RESET} `,
+          String(tc.actualOutput),
+          inner,
+        ),
       );
       if (tc.expectedOutput && tc.expectedOutput !== "N/A") {
         lines.push(
-          ...wrapLabeledLine(`${CYAN}${BOLD}Expected Output:${RESET} `, tc.expectedOutput, inner),
+          ...wrapLabeledLine(
+            `${CYAN}${BOLD}Expected Output:${RESET} `,
+            tc.expectedOutput,
+            inner,
+          ),
         );
       }
     }
@@ -408,7 +430,9 @@ export function exportToMarkdown(
   const ts =
     `${d.getFullYear()}${pad2(d.getMonth() + 1)}${pad2(d.getDate())}` +
     `_${pad2(d.getHours())}${pad2(d.getMinutes())}${pad2(d.getSeconds())}`;
-  const safe = (evaluationName || "evaluation").replace(/\s+/g, "_").toLowerCase();
+  const safe = (evaluationName || "evaluation")
+    .replace(/\s+/g, "_")
+    .toLowerCase();
   const filepath = path.join(outputDir, `${safe}_${ts}.${fileType}`);
 
   const sorted = [...testResults].sort(

diff --git a/typescript/src/evaluate/evaluate.ts b/typescript/src/evaluate/evaluate.ts
@@ -226,7 +226,10 @@ export async function runMetric(
     // Dispatched in `evaluate`, so the metric matches the test case type.
     await (metric.measure as (tc: AnyTestCase) => Promise<number>)(testCase);
   } catch (e) {
-    if (e instanceof MissingTestCaseParamsError && errorCfg.skipOnMissingParams) {
+    if (
+      e instanceof MissingTestCaseParamsError &&
+      errorCfg.skipOnMissingParams
+    ) {
       metric.skipped = true;
     } else if (errorCfg.ignoreErrors) {
       metric.error = (e as Error).message;

diff --git a/typescript/src/evaluate/trace-eval.ts b/typescript/src/evaluate/trace-eval.ts
@@ -99,7 +99,12 @@ export async function evaluateTrace(
     const metricsData: MetricData[] = [];
     for (const metric of metrics) {
       metricsData.push(
-        await runMetric(metric, testCase, errorCfg, options.onMetric ?? (() => {})),
+        await runMetric(
+          metric,
+          testCase,
+          errorCfg,
+          options.onMetric ?? (() => {}),
+        ),
       );
     }
     scope.metricsData = metricsData; // also attach to the span/trace

diff --git a/typescript/src/integrations/langchain/callback-handler.ts b/typescript/src/integrations/langchain/callback-handler.ts
@@ -203,9 +203,9 @@ export class DeepEvalCallbackHandler
       // trace by ancestry so it is not left dangling.
       const traceUuid = this.hierarchy.getTraceUuid(uuidStr);
       if (traceUuid && traceManager.getTraceByUuid(traceUuid)) {
-        const others = Array.from(traceManager.getActiveSpans().values()).filter(
-          (s) => s.traceUuid === traceUuid,
-        );
+        const others = Array.from(
+          traceManager.getActiveSpans().values(),
+        ).filter((s) => s.traceUuid === traceUuid);
         if (others.length === 0) {
           traceManager.setTraceStatus(traceUuid, TraceSpanStatus.ERRORED);
           traceManager.endTrace(traceUuid);
@@ -408,7 +408,12 @@ export class DeepEvalCallbackHandler
     }
   }
 
-  async handleToolEnd(output: any, runId: string, _parentRunId?: string, _tags?: string[]) {
+  async handleToolEnd(
+    output: any,
+    runId: string,
+    _parentRunId?: string,
+    _tags?: string[],
+  ) {
     const uuidStr = String(runId);
     const toolSpan: any = traceManager.getSpanByUuid(uuidStr);
 
@@ -432,7 +437,12 @@ export class DeepEvalCallbackHandler
     this.hierarchy.cleanupRun(uuidStr);
   }
 
-  async handleToolError(err: any, runId: string, _parentRunId?: string, _tags?: string[]) {
+  async handleToolError(
+    err: any,
+    runId: string,
+    _parentRunId?: string,
+    _tags?: string[],
+  ) {
     const uuidStr = String(runId);
     const toolSpan: any = traceManager.getSpanByUuid(uuidStr);
 

diff --git a/typescript/src/metrics/answer-relevancy/answer-relevancy.ts b/typescript/src/metrics/answer-relevancy/answer-relevancy.ts
@@ -75,9 +75,14 @@ export class AnswerRelevancyMetric extends BaseMetric {
   }
 
   private async generateStatements(actualOutput: string): Promise<string[]> {
-    const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_statements", {
-      actual_output: actualOutput,
-    });
+    const prompt = resolveTemplate(
+      "metrics",
+      TEMPLATE_CLASS,
+      "generate_statements",
+      {
+        actual_output: actualOutput,
+      },
+    );
     const { statements } = await generateWithSchema(
       this,
       prompt,
@@ -90,10 +95,15 @@ export class AnswerRelevancyMetric extends BaseMetric {
     input: string,
   ): Promise<AnswerRelevancyVerdict[]> {
     if (this.statements.length === 0) return [];
-    const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", {
-      input,
-      statements: this.statements,
-    });
+    const prompt = resolveTemplate(
+      "metrics",
+      TEMPLATE_CLASS,
+      "generate_verdicts",
+      {
+        input,
+        statements: this.statements,
+      },
+    );
     const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema);
     return verdicts;
   }
@@ -103,11 +113,16 @@ export class AnswerRelevancyMetric extends BaseMetric {
     const irrelevantStatements = this.verdicts
       .filter((v) => v.verdict.trim().toLowerCase() === "no")
       .map((v) => v.reason);
-    const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", {
-      irrelevant_statements: irrelevantStatements,
-      input,
-      score: (this.score ?? 0).toFixed(2),
-    });
+    const prompt = resolveTemplate(
+      "metrics",
+      TEMPLATE_CLASS,
+      "generate_reason",
+      {
+        irrelevant_statements: irrelevantStatements,
+        input,
+        score: (this.score ?? 0).toFixed(2),
+      },
+    );
     const { reason } = await generateWithSchema(
       this,
       prompt,

diff --git a/typescript/src/metrics/answer-relevancy/schema.ts b/typescript/src/metrics/answer-relevancy/schema.ts
@@ -17,4 +17,6 @@ export const AnswerRelevancyScoreReasonSchema = z.object({
   reason: z.string(),
 });
 
-export type AnswerRelevancyVerdict = z.infer<typeof AnswerRelevancyVerdictSchema>;
+export type AnswerRelevancyVerdict = z.infer<
+  typeof AnswerRelevancyVerdictSchema
+>;
diff --git a/typescript/src/metrics/arena-g-eval/arena-g-eval.ts b/typescript/src/metrics/arena-g-eval/arena-g-eval.ts
@@ -156,13 +156,18 @@ export class ArenaGEval extends BaseArenaMetric {
     reason: string,
     dummyToReal: Record<string, string>,
   ): Promise<string> {
-    const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "rewrite_reason", {
-      reason,
-      // Pass as JSON text: Nunjucks renders a bare object as "[object Object]"
-      // (the resolver only gives arrays a Python-repr toString). The template
-      // renders {{ dummy_to_real_names }} directly and its example is JSON.
-      dummy_to_real_names: JSON.stringify(dummyToReal),
-    });
+    const prompt = resolveTemplate(
+      "metrics",
+      TEMPLATE_CLASS,
+      "rewrite_reason",
+      {
+        reason,
+        // Pass as JSON text: Nunjucks renders a bare object as "[object Object]"
+        // (the resolver only gives arrays a Python-repr toString). The template
+        // renders {{ dummy_to_real_names }} directly and its example is JSON.
+        dummy_to_real_names: JSON.stringify(dummyToReal),
+      },
+    );
     const { rewritten_reason } = await generateWithSchema(
       this,
       prompt,