Skip to content

Commit 44ec0c5

Browse files
committed
added eval framework package
1 parent 8d6b652 commit 44ec0c5

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+3890
-2
lines changed

packages/mcp-eval-framework/ONBOARDING.md

Lines changed: 447 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
{
  "name": "@salesforce/mcp-eval-framework",
  "description": "Generic MCP agent evaluation framework — use with any MCP provider",
  "version": "0.0.1",
  "private": true,
  "type": "module",
  "main": "src/index.ts",
  "dependencies": {
    "chalk": "^5.4.0"
  },
  "devDependencies": {
    "@types/node": "^22.16.5",
    "typescript": "^5.9.2"
  }
}
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import type {
2+
TestCase,
3+
ParsedAgentOutput,
4+
ToolEfficiencyConfig,
5+
DomainEndStateResult,
6+
} from "./types.js";
7+
8+
/**
9+
* A rubric function takes the agent's prompt, output, and ground truth,
10+
* and returns a prompt string for the LLM judge to score one dimension.
11+
* The returned string should instruct the judge to output JSON: { "score": 1-5, "justification": "..." }
12+
*/
13+
export type RubricFn = (
14+
prompt: string,
15+
agentOutput: string,
16+
groundTruth: string
17+
) => string;
18+
19+
/**
20+
* EvalAdapter — the single extension point for domain-specific behavior.
21+
*
22+
* Each MCP provider implements this interface once.
23+
* The framework calls these methods at the right times during the eval run.
24+
*/
25+
export interface EvalAdapter {
26+
/**
27+
* The MCP tool name the agent should invoke.
28+
* Used by evaluateEndState() to check tool selection.
29+
* e.g. "scan_apex_class_for_antipatterns"
30+
*/
31+
toolName: string;
32+
33+
/**
34+
* Claude CLI --allowedTools glob that auto-approves MCP tools.
35+
* e.g. "mcp__salesforce__*,mcp__salesforce-dx__*"
36+
*/
37+
allowedToolsPattern: string;
38+
39+
/**
40+
* Returns additional {placeholder} substitutions for the prompt template,
41+
* beyond the built-ins {filePath} and {directory}.
42+
*
43+
* Example: return { apexFilePath: filePath } for backward compat with
44+
* existing YAML test cases that use {apexFilePath}.
45+
*/
46+
resolvePromptPlaceholders(
47+
testCase: TestCase,
48+
filePath: string,
49+
directory: string
50+
): Record<string, string>;
51+
52+
/**
53+
* Deterministic scoring: did the agent get the right answer?
54+
* Called after the agent run completes.
55+
*
56+
* @param output - Parsed agent output (tool calls + response text)
57+
* @param rawOutput - Raw stdout string (for text-matching fallback)
58+
* @param expected - The expectedResults from the test case (your domain shape)
59+
* @param config - toolEfficiency config from the test case scoring section
60+
*/
61+
evaluateEndState(
62+
output: ParsedAgentOutput,
63+
rawOutput: string,
64+
expected: Record<string, unknown>,
65+
config: ToolEfficiencyConfig
66+
): DomainEndStateResult;
67+
68+
/**
69+
* Converts expectedResults to a natural language ground truth string.
70+
* Used by the LLM judge to understand what the correct answer looks like.
71+
*/
72+
formatGroundTruth(expected: Record<string, unknown>): string;
73+
74+
/**
75+
* Returns lines to display per test case in the terminal (domain-specific).
76+
* Called both during the run (inline progress) and in the final report.
77+
*
78+
* Example return value:
79+
* ["FP: none", "GGD: ✔ found 1/1 severity: ok"]
80+
*/
81+
formatEndStateDetails(result: DomainEndStateResult): string[];
82+
83+
/**
84+
* Four rubric functions for LLM-as-judge scoring.
85+
* Each returns a prompt string for the judge to score one dimension.
86+
*/
87+
rubrics: {
88+
factualAccuracy: RubricFn;
89+
completeness: RubricFn;
90+
toolEfficiency: RubricFn;
91+
responseQuality: RubricFn;
92+
};
93+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/**
2+
* @salesforce/mcp-eval-framework
3+
*
4+
* Generic MCP agent evaluation framework.
5+
* Implement EvalAdapter to plug in your domain-specific tool, test cases, and scoring logic.
6+
*
7+
* Quick start:
8+
* 1. Add this package as a devDependency
9+
* 2. Implement EvalAdapter in eval/adapter/index.ts
10+
* 3. Write test cases in eval/test-cases/*.yaml
11+
* 4. Create eval/run-eval.ts using the boilerplate below
12+
*
13+
* @see ONBOARDING.md for the full guide
14+
*/
15+
16+
// Core extension point
17+
export type { EvalAdapter, RubricFn } from "./adapter.js";
18+
19+
// Generic types
20+
export type {
21+
TestCase,
22+
ScoringConfig,
23+
DimensionConfig,
24+
ToolEfficiencyConfig,
25+
CliInvocationResult,
26+
CliInvokerConfig,
27+
DomainEndStateResult,
28+
JudgeScore,
29+
RubricResult,
30+
CaseResult,
31+
EvalReport,
32+
EvalRunConfig,
33+
EvalSummary,
34+
ParsedToolCall,
35+
ParsedAgentOutput,
36+
} from "./types.js";
37+
38+
// Runner
39+
export { EvalRunner } from "./runner/eval-runner.js";
40+
export type { EvalRunnerOptions } from "./runner/eval-runner.js";
41+
export { FixtureManager } from "./runner/fixture-manager.js";
42+
export { ClaudeCliInvoker } from "./runner/claude-cli-invoker.js";
43+
44+
// Scoring
45+
export { JudgeEvaluator } from "./scoring/judge-evaluator.js";
46+
47+
// Reporting
48+
export { ConsoleReporter } from "./reporting/console-reporter.js";
49+
export { JsonReporter } from "./reporting/json-reporter.js";
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import chalk from "chalk";
2+
import type { EvalReport, CaseResult, EvalSummary } from "../types.js";
3+
import type { EvalAdapter } from "../adapter.js";
4+
5+
/**
6+
* Console reporter: prints color-coded eval results to stdout.
7+
* Domain-specific per-case details are delegated to adapter.formatEndStateDetails().
8+
*/
9+
export class ConsoleReporter {
10+
private readonly adapter: EvalAdapter;
11+
12+
constructor(adapter: EvalAdapter) {
13+
this.adapter = adapter;
14+
}
15+
16+
report(evalReport: EvalReport): void {
17+
this.printHeader(evalReport);
18+
this.printCaseResults(evalReport.results);
19+
this.printSummary(evalReport.summary, evalReport.config.judgeEnabled);
20+
}
21+
22+
private printHeader(report: EvalReport): void {
23+
console.log("");
24+
console.log(chalk.bold("MCP Agent Evaluation Report"));
25+
console.log(chalk.gray("─".repeat(60)));
26+
console.log(` Timestamp: ${report.timestamp}`);
27+
console.log(` Model: ${report.config.model}`);
28+
const judgeLabel = report.config.judgeEnabled
29+
? `enabled (${report.config.judgeModel ?? "sonnet"})`
30+
: "disabled (fast mode)";
31+
console.log(` Judge: ${judgeLabel}`);
32+
console.log(` Threshold: ${report.config.passThreshold}/5.0`);
33+
if (report.config.filter) {
34+
console.log(` Filter: ${report.config.filter}`);
35+
}
36+
console.log(chalk.gray("─".repeat(60)));
37+
console.log("");
38+
}
39+
40+
private printCaseResults(results: CaseResult[]): void {
41+
for (const result of results) {
42+
this.printCaseResult(result);
43+
}
44+
}
45+
46+
private printCaseResult(result: CaseResult): void {
47+
const statusIcon = result.pass ? chalk.green("PASS") : chalk.red("FAIL");
48+
const scoreStr = result.finalScore.toFixed(2);
49+
50+
console.log(
51+
`${statusIcon} ${chalk.bold(result.testCaseId)} (${scoreStr}/5.0)`
52+
);
53+
console.log(chalk.gray(` ${result.description}`));
54+
55+
// Generic deterministic checks
56+
const es = result.endStateScore;
57+
const toolCheck = es.toolSelected ? chalk.green("yes") : chalk.red("no");
58+
const callsCheck = es.toolCallCountWithinMax
59+
? chalk.green(`${es.toolCallCount}`)
60+
: chalk.red(`${es.toolCallCount} (exceeded max)`);
61+
62+
console.log(
63+
chalk.gray(` Tool selected: ${toolCheck} | Tool calls: ${callsCheck}`)
64+
);
65+
66+
// Domain-specific details from adapter
67+
const details = this.adapter.formatEndStateDetails(es);
68+
for (const line of details) {
69+
console.log(chalk.gray(` ${line}`));
70+
}
71+
72+
// Judge scores
73+
if (result.judgeScore) {
74+
const js = result.judgeScore;
75+
console.log(
76+
chalk.gray(
77+
` Judge: factual=${js.factualAccuracy.score} completeness=${js.completeness.score} efficiency=${js.toolEfficiency.score} quality=${js.responseQuality.score} => composite=${js.compositeScore.toFixed(2)}`
78+
)
79+
);
80+
}
81+
82+
console.log("");
83+
}
84+
85+
private printSummary(summary: EvalSummary, judgeEnabled: boolean): void {
86+
console.log(chalk.gray("═".repeat(60)));
87+
console.log(chalk.bold("Summary"));
88+
console.log(chalk.gray("─".repeat(60)));
89+
90+
const passRateColor =
91+
summary.passRate >= 0.8
92+
? chalk.green
93+
: summary.passRate >= 0.5
94+
? chalk.yellow
95+
: chalk.red;
96+
97+
console.log(` Total: ${summary.totalCases} cases`);
98+
console.log(
99+
` Passed: ${chalk.green(String(summary.passedCases))} | Failed: ${chalk.red(String(summary.failedCases))}`
100+
);
101+
console.log(
102+
` Pass rate: ${passRateColor((summary.passRate * 100).toFixed(1) + "%")}`
103+
);
104+
console.log(` Avg score: ${summary.averageScore.toFixed(2)}/5.0`);
105+
106+
if (judgeEnabled && summary.averageByDimension) {
107+
const dim = summary.averageByDimension;
108+
console.log("");
109+
console.log(chalk.bold(" Per-Dimension Averages:"));
110+
console.log(` Factual accuracy: ${dim.factualAccuracy.toFixed(2)}`);
111+
console.log(` Completeness: ${dim.completeness.toFixed(2)}`);
112+
console.log(` Tool efficiency: ${dim.toolEfficiency.toFixed(2)}`);
113+
console.log(` Response quality: ${dim.responseQuality.toFixed(2)}`);
114+
}
115+
116+
if (summary.worstCases.length > 0) {
117+
console.log("");
118+
console.log(chalk.bold(" Worst Cases:"));
119+
for (const wc of summary.worstCases) {
120+
console.log(
121+
chalk.red(` ${wc.testCaseId}: ${wc.score.toFixed(2)}/5.0`)
122+
);
123+
}
124+
}
125+
126+
console.log(chalk.gray("═".repeat(60)));
127+
console.log("");
128+
}
129+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import * as fs from "node:fs";
2+
import * as path from "node:path";
3+
import type { EvalReport } from "../types.js";
4+
5+
/**
6+
* JSON reporter: writes the full EvalReport to a timestamped file
7+
* in eval/results/.
8+
*/
9+
export class JsonReporter {
10+
private readonly resultsDir: string;
11+
12+
constructor(resultsDir: string) {
13+
this.resultsDir = resultsDir;
14+
}
15+
16+
/**
17+
* Writes the evaluation report to a JSON file.
18+
* Returns the path to the written file.
19+
*/
20+
report(evalReport: EvalReport): string {
21+
fs.mkdirSync(this.resultsDir, { recursive: true });
22+
23+
const timestamp = evalReport.timestamp.replace(/[:.]/g, "-");
24+
const fileName = `eval-${timestamp}.json`;
25+
const filePath = path.join(this.resultsDir, fileName);
26+
27+
fs.writeFileSync(filePath, JSON.stringify(evalReport, null, 2), "utf-8");
28+
29+
return filePath;
30+
}
31+
}

0 commit comments

Comments
 (0)