Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions typescript/src/evaluate/compare.ts
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,12 @@ export async function compare(
for (const w of winners) counts[w] = (counts[w] ?? 0) + 1;

if (display.printResults) {
printArenaCompleted(counts, runDuration, winners.length, hasCost ? totalCost : 0);
printArenaCompleted(
counts,
runDuration,
winners.length,
hasCost ? totalCost : 0,
);
}

// Post to Confident AI as an experiment (no-op unless logged in).
Expand All @@ -171,7 +176,10 @@ function printArenaCompleted(
const sorted = Object.entries(counts).sort((a, b) => b[1] - a[1]);
const breakdown = sorted.length
? sorted
.map(([name, wins]) => ` » ${GREEN}${BOLD}${name}${RESET}: ${wins} wins`)
.map(
([name, wins]) =>
` » ${GREEN}${BOLD}${name}${RESET}: ${wins} wins`,
)
.join("\n")
: "No winners";
const cost = tokenCost ? `${tokenCost} USD` : "None";
Expand Down
107 changes: 60 additions & 47 deletions typescript/src/evaluate/confident.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,12 @@ function buildMetricsScores(cases: EvaluatedCase[]) {
for (const { metricsData } of cases) {
for (const m of metricsData) {
if (m.skipped) continue;
const e = map.get(m.name) ?? { scores: [], passes: 0, fails: 0, errors: 0 };
const e = map.get(m.name) ?? {
scores: [],
passes: 0,
fails: 0,
errors: 0,
};
if (m.error) {
e.errors += 1;
} else {
Expand Down Expand Up @@ -81,53 +86,55 @@ export async function postTestRun(
let totalCost = 0;
let hasCost = false;

cases.forEach(({ testCase, metricsData, runDuration: caseDuration, trace }, order) => {
const success = metricsData.every((m) => m.skipped || m.success);
if (success) testPassed += 1;
else testFailed += 1;
cases.forEach(
({ testCase, metricsData, runDuration: caseDuration, trace }, order) => {
const success = metricsData.every((m) => m.skipped || m.success);
if (success) testPassed += 1;
else testFailed += 1;

const evaluationCost = caseCost(metricsData);
if (evaluationCost != null) {
totalCost += evaluationCost;
hasCost = true;
}
const metricsDataApi = metricsData.map(convertMetricData);
const evaluationCost = caseCost(metricsData);
if (evaluationCost != null) {
totalCost += evaluationCost;
hasCost = true;
}
const metricsDataApi = metricsData.map(convertMetricData);

if (testCase instanceof ConversationalTestCase) {
conversationalTestCases.push({
name: testCase.name ?? `test_case_${order}`,
success,
metricsData: metricsDataApi,
runDuration: caseDuration,
evaluationCost,
order,
turns: testCase.turns.map((t, i) => convertTurn(t, i)),
scenario: testCase.scenario,
expectedOutcome: testCase.expectedOutcome,
userDescription: testCase.userDescription,
chatbotRole: testCase.chatbotRole,
imagesMapping: testCase.getImagesMapping(),
});
} else {
testCases.push({
name: testCase.name ?? `test_case_${order}`,
input: testCase.input,
actualOutput: testCase.actualOutput,
expectedOutput: testCase.expectedOutput,
context: testCase.context,
retrievalContext: resolveRetrievalContext(testCase.retrievalContext),
toolsCalled: testCase.toolsCalled?.map(convertTool),
expectedTools: testCase.expectedTools?.map(convertTool),
success,
metricsData: metricsDataApi,
runDuration: caseDuration,
evaluationCost,
order,
imagesMapping: testCase.getImagesMapping(),
trace,
});
}
});
if (testCase instanceof ConversationalTestCase) {
conversationalTestCases.push({
name: testCase.name ?? `test_case_${order}`,
success,
metricsData: metricsDataApi,
runDuration: caseDuration,
evaluationCost,
order,
turns: testCase.turns.map((t, i) => convertTurn(t, i)),
scenario: testCase.scenario,
expectedOutcome: testCase.expectedOutcome,
userDescription: testCase.userDescription,
chatbotRole: testCase.chatbotRole,
imagesMapping: testCase.getImagesMapping(),
});
} else {
testCases.push({
name: testCase.name ?? `test_case_${order}`,
input: testCase.input,
actualOutput: testCase.actualOutput,
expectedOutput: testCase.expectedOutput,
context: testCase.context,
retrievalContext: resolveRetrievalContext(testCase.retrievalContext),
toolsCalled: testCase.toolsCalled?.map(convertTool),
expectedTools: testCase.expectedTools?.map(convertTool),
success,
metricsData: metricsDataApi,
runDuration: caseDuration,
evaluationCost,
order,
imagesMapping: testCase.getImagesMapping(),
trace,
});
}
},
);

const payload = {
testCases,
Expand Down Expand Up @@ -261,7 +268,13 @@ export async function postExperiment(
testCases: e.testCases,
conversationalTestCases: [],
metricsScores: [
{ metric: metricName, scores: e.scores, passes: e.passes, fails: e.fails, errors: e.errors },
{
metric: metricName,
scores: e.scores,
passes: e.passes,
fails: e.fails,
errors: e.errors,
},
],
identifier: e.identifier,
testPassed: e.testPassed,
Expand Down
38 changes: 31 additions & 7 deletions typescript/src/evaluate/console-report.ts
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,11 @@ function wrapCell(c: string, width: number): string[] {
}

/** A labeled panel line (`Label: value`) wrapped to `inner`, continuations indented. */
function wrapLabeledLine(prefix: string, value: string, inner: number): string[] {
function wrapLabeledLine(
prefix: string,
value: string,
inner: number,
): string[] {
const indent = visLen(prefix);
const chunks = wrapText(value, Math.max(10, inner - indent));
return chunks.map((chunk, i) =>
Expand Down Expand Up @@ -222,7 +226,9 @@ function tableLines(
function metricStatusCell(m: MetricData): string {
if (m.skipped) return `${YELLOW}${BOLD}SKIP${RESET}`;
if (m.error) return `${RED}${BOLD}ERROR${RESET}`;
return m.success ? `${GREEN}${BOLD}PASS${RESET}` : `${RED}${BOLD}FAIL${RESET}`;
return m.success
? `${GREEN}${BOLD}PASS${RESET}`
: `${RED}${BOLD}FAIL${RESET}`;
}

/**
Expand Down Expand Up @@ -265,16 +271,32 @@ export function printResultsTable(
lines.push(`${CYAN}${BOLD}Conversation Turns${RESET}`);
for (const turn of tc.turns ?? []) {
const role = turn.role.charAt(0).toUpperCase() + turn.role.slice(1);
lines.push(...wrapLabeledLine(` ${BOLD}${role}:${RESET} `, turn.content, inner));
lines.push(
...wrapLabeledLine(` ${BOLD}${role}:${RESET} `, turn.content, inner),
);
}
} else {
lines.push(...wrapLabeledLine(`${CYAN}${BOLD}Input:${RESET} `, String(tc.input), inner));
lines.push(
...wrapLabeledLine(`${CYAN}${BOLD}Actual Output:${RESET} `, String(tc.actualOutput), inner),
...wrapLabeledLine(
`${CYAN}${BOLD}Input:${RESET} `,
String(tc.input),
inner,
),
);
lines.push(
...wrapLabeledLine(
`${CYAN}${BOLD}Actual Output:${RESET} `,
String(tc.actualOutput),
inner,
),
);
if (tc.expectedOutput && tc.expectedOutput !== "N/A") {
lines.push(
...wrapLabeledLine(`${CYAN}${BOLD}Expected Output:${RESET} `, tc.expectedOutput, inner),
...wrapLabeledLine(
`${CYAN}${BOLD}Expected Output:${RESET} `,
tc.expectedOutput,
inner,
),
);
}
}
Expand Down Expand Up @@ -408,7 +430,9 @@ export function exportToMarkdown(
const ts =
`${d.getFullYear()}${pad2(d.getMonth() + 1)}${pad2(d.getDate())}` +
`_${pad2(d.getHours())}${pad2(d.getMinutes())}${pad2(d.getSeconds())}`;
const safe = (evaluationName || "evaluation").replace(/\s+/g, "_").toLowerCase();
const safe = (evaluationName || "evaluation")
.replace(/\s+/g, "_")
.toLowerCase();
const filepath = path.join(outputDir, `${safe}_${ts}.${fileType}`);

const sorted = [...testResults].sort(
Expand Down
5 changes: 4 additions & 1 deletion typescript/src/evaluate/evaluate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,10 @@ export async function runMetric(
// Dispatched in `evaluate`, so the metric matches the test case type.
await (metric.measure as (tc: AnyTestCase) => Promise<number>)(testCase);
} catch (e) {
if (e instanceof MissingTestCaseParamsError && errorCfg.skipOnMissingParams) {
if (
e instanceof MissingTestCaseParamsError &&
errorCfg.skipOnMissingParams
) {
metric.skipped = true;
} else if (errorCfg.ignoreErrors) {
metric.error = (e as Error).message;
Expand Down
7 changes: 6 additions & 1 deletion typescript/src/evaluate/trace-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,12 @@ export async function evaluateTrace(
const metricsData: MetricData[] = [];
for (const metric of metrics) {
metricsData.push(
await runMetric(metric, testCase, errorCfg, options.onMetric ?? (() => {})),
await runMetric(
metric,
testCase,
errorCfg,
options.onMetric ?? (() => {}),
),
);
}
scope.metricsData = metricsData; // also attach to the span/trace
Expand Down
20 changes: 15 additions & 5 deletions typescript/src/integrations/langchain/callback-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,9 @@ export class DeepEvalCallbackHandler
// trace by ancestry so it is not left dangling.
const traceUuid = this.hierarchy.getTraceUuid(uuidStr);
if (traceUuid && traceManager.getTraceByUuid(traceUuid)) {
const others = Array.from(traceManager.getActiveSpans().values()).filter(
(s) => s.traceUuid === traceUuid,
);
const others = Array.from(
traceManager.getActiveSpans().values(),
).filter((s) => s.traceUuid === traceUuid);
if (others.length === 0) {
traceManager.setTraceStatus(traceUuid, TraceSpanStatus.ERRORED);
traceManager.endTrace(traceUuid);
Expand Down Expand Up @@ -408,7 +408,12 @@ export class DeepEvalCallbackHandler
}
}

async handleToolEnd(output: any, runId: string, _parentRunId?: string, _tags?: string[]) {
async handleToolEnd(
output: any,
runId: string,
_parentRunId?: string,
_tags?: string[],
) {
const uuidStr = String(runId);
const toolSpan: any = traceManager.getSpanByUuid(uuidStr);

Expand All @@ -432,7 +437,12 @@ export class DeepEvalCallbackHandler
this.hierarchy.cleanupRun(uuidStr);
}

async handleToolError(err: any, runId: string, _parentRunId?: string, _tags?: string[]) {
async handleToolError(
err: any,
runId: string,
_parentRunId?: string,
_tags?: string[],
) {
const uuidStr = String(runId);
const toolSpan: any = traceManager.getSpanByUuid(uuidStr);

Expand Down
39 changes: 27 additions & 12 deletions typescript/src/metrics/answer-relevancy/answer-relevancy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,14 @@ export class AnswerRelevancyMetric extends BaseMetric {
}

private async generateStatements(actualOutput: string): Promise<string[]> {
const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_statements", {
actual_output: actualOutput,
});
const prompt = resolveTemplate(
"metrics",
TEMPLATE_CLASS,
"generate_statements",
{
actual_output: actualOutput,
},
);
const { statements } = await generateWithSchema(
this,
prompt,
Expand All @@ -90,10 +95,15 @@ export class AnswerRelevancyMetric extends BaseMetric {
input: string,
): Promise<AnswerRelevancyVerdict[]> {
if (this.statements.length === 0) return [];
const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_verdicts", {
input,
statements: this.statements,
});
const prompt = resolveTemplate(
"metrics",
TEMPLATE_CLASS,
"generate_verdicts",
{
input,
statements: this.statements,
},
);
const { verdicts } = await generateWithSchema(this, prompt, VerdictsSchema);
return verdicts;
}
Expand All @@ -103,11 +113,16 @@ export class AnswerRelevancyMetric extends BaseMetric {
const irrelevantStatements = this.verdicts
.filter((v) => v.verdict.trim().toLowerCase() === "no")
.map((v) => v.reason);
const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "generate_reason", {
irrelevant_statements: irrelevantStatements,
input,
score: (this.score ?? 0).toFixed(2),
});
const prompt = resolveTemplate(
"metrics",
TEMPLATE_CLASS,
"generate_reason",
{
irrelevant_statements: irrelevantStatements,
input,
score: (this.score ?? 0).toFixed(2),
},
);
const { reason } = await generateWithSchema(
this,
prompt,
Expand Down
4 changes: 3 additions & 1 deletion typescript/src/metrics/answer-relevancy/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,6 @@ export const AnswerRelevancyScoreReasonSchema = z.object({
reason: z.string(),
});

export type AnswerRelevancyVerdict = z.infer<typeof AnswerRelevancyVerdictSchema>;
export type AnswerRelevancyVerdict = z.infer<
typeof AnswerRelevancyVerdictSchema
>;
19 changes: 12 additions & 7 deletions typescript/src/metrics/arena-g-eval/arena-g-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,18 @@ export class ArenaGEval extends BaseArenaMetric {
reason: string,
dummyToReal: Record<string, string>,
): Promise<string> {
const prompt = resolveTemplate("metrics", TEMPLATE_CLASS, "rewrite_reason", {
reason,
// Pass as JSON text: Nunjucks renders a bare object as "[object Object]"
// (the resolver only gives arrays a Python-repr toString). The template
// renders {{ dummy_to_real_names }} directly and its example is JSON.
dummy_to_real_names: JSON.stringify(dummyToReal),
});
const prompt = resolveTemplate(
"metrics",
TEMPLATE_CLASS,
"rewrite_reason",
{
reason,
// Pass as JSON text: Nunjucks renders a bare object as "[object Object]"
// (the resolver only gives arrays a Python-repr toString). The template
// renders {{ dummy_to_real_names }} directly and its example is JSON.
dummy_to_real_names: JSON.stringify(dummyToReal),
},
);
const { rewritten_reason } = await generateWithSchema(
this,
prompt,
Expand Down
Loading
Loading