Skip to content

Commit 44ec0c5

Browse files
committed
added eval framework package
1 parent 8d6b652 commit 44ec0c5

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+3890
-2
lines changed

packages/mcp-eval-framework/ONBOARDING.md

Lines changed: 447 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
{
  "name": "@salesforce/mcp-eval-framework",
  "description": "Generic MCP agent evaluation framework — use with any MCP provider",
  "version": "0.0.1",
  "private": true,
  "type": "module",
  "main": "src/index.ts",
  "dependencies": {
    "chalk": "^5.4.0"
  },
  "devDependencies": {
    "@types/node": "^22.16.5",
    "typescript": "^5.9.2"
  }
}
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import type {
2+
TestCase,
3+
ParsedAgentOutput,
4+
ToolEfficiencyConfig,
5+
DomainEndStateResult,
6+
} from "./types.js";
7+
8+
/**
9+
* A rubric function takes the agent's prompt, output, and ground truth,
10+
* and returns a prompt string for the LLM judge to score one dimension.
11+
* The returned string should instruct the judge to output JSON: { "score": 1-5, "justification": "..." }
12+
*/
13+
export type RubricFn = (
14+
prompt: string,
15+
agentOutput: string,
16+
groundTruth: string
17+
) => string;
18+
19+
/**
20+
* EvalAdapter — the single extension point for domain-specific behavior.
21+
*
22+
* Each MCP provider implements this interface once.
23+
* The framework calls these methods at the right times during the eval run.
24+
*/
25+
export interface EvalAdapter {
26+
/**
27+
* The MCP tool name the agent should invoke.
28+
* Used by evaluateEndState() to check tool selection.
29+
* e.g. "scan_apex_class_for_antipatterns"
30+
*/
31+
toolName: string;
32+
33+
/**
34+
* Claude CLI --allowedTools glob that auto-approves MCP tools.
35+
* e.g. "mcp__salesforce__*,mcp__salesforce-dx__*"
36+
*/
37+
allowedToolsPattern: string;
38+
39+
/**
40+
* Returns additional {placeholder} substitutions for the prompt template,
41+
* beyond the built-ins {filePath} and {directory}.
42+
*
43+
* Example: return { apexFilePath: filePath } for backward compat with
44+
* existing YAML test cases that use {apexFilePath}.
45+
*/
46+
resolvePromptPlaceholders(
47+
testCase: TestCase,
48+
filePath: string,
49+
directory: string
50+
): Record<string, string>;
51+
52+
/**
53+
* Deterministic scoring: did the agent get the right answer?
54+
* Called after the agent run completes.
55+
*
56+
* @param output - Parsed agent output (tool calls + response text)
57+
* @param rawOutput - Raw stdout string (for text-matching fallback)
58+
* @param expected - The expectedResults from the test case (your domain shape)
59+
* @param config - toolEfficiency config from the test case scoring section
60+
*/
61+
evaluateEndState(
62+
output: ParsedAgentOutput,
63+
rawOutput: string,
64+
expected: Record<string, unknown>,
65+
config: ToolEfficiencyConfig
66+
): DomainEndStateResult;
67+
68+
/**
69+
* Converts expectedResults to a natural language ground truth string.
70+
* Used by the LLM judge to understand what the correct answer looks like.
71+
*/
72+
formatGroundTruth(expected: Record<string, unknown>): string;
73+
74+
/**
75+
* Returns lines to display per test case in the terminal (domain-specific).
76+
* Called both during the run (inline progress) and in the final report.
77+
*
78+
* Example return value:
79+
* ["FP: none", "GGD: ✔ found 1/1 severity: ok"]
80+
*/
81+
formatEndStateDetails(result: DomainEndStateResult): string[];
82+
83+
/**
84+
* Four rubric functions for LLM-as-judge scoring.
85+
* Each returns a prompt string for the judge to score one dimension.
86+
*/
87+
rubrics: {
88+
factualAccuracy: RubricFn;
89+
completeness: RubricFn;
90+
toolEfficiency: RubricFn;
91+
responseQuality: RubricFn;
92+
};
93+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/**
2+
* @salesforce/mcp-eval-framework
3+
*
4+
* Generic MCP agent evaluation framework.
5+
* Implement EvalAdapter to plug in your domain-specific tool, test cases, and scoring logic.
6+
*
7+
* Quick start:
8+
* 1. Add this package as a devDependency
9+
* 2. Implement EvalAdapter in eval/adapter/index.ts
10+
* 3. Write test cases in eval/test-cases/*.yaml
11+
* 4. Create eval/run-eval.ts using the boilerplate below
12+
*
13+
* @see ONBOARDING.md for the full guide
14+
*/
15+
16+
// Core extension point
17+
export type { EvalAdapter, RubricFn } from "./adapter.js";
18+
19+
// Generic types
20+
export type {
21+
TestCase,
22+
ScoringConfig,
23+
DimensionConfig,
24+
ToolEfficiencyConfig,
25+
CliInvocationResult,
26+
CliInvokerConfig,
27+
DomainEndStateResult,
28+
JudgeScore,
29+
RubricResult,
30+
CaseResult,
31+
EvalReport,
32+
EvalRunConfig,
33+
EvalSummary,
34+
ParsedToolCall,
35+
ParsedAgentOutput,
36+
} from "./types.js";
37+
38+
// Runner
39+
export { EvalRunner } from "./runner/eval-runner.js";
40+
export type { EvalRunnerOptions } from "./runner/eval-runner.js";
41+
export { FixtureManager } from "./runner/fixture-manager.js";
42+
export { ClaudeCliInvoker } from "./runner/claude-cli-invoker.js";
43+
44+
// Scoring
45+
export { JudgeEvaluator } from "./scoring/judge-evaluator.js";
46+
47+
// Reporting
48+
export { ConsoleReporter } from "./reporting/console-reporter.js";
49+
export { JsonReporter } from "./reporting/json-reporter.js";
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import chalk from "chalk";
2+
import type { EvalReport, CaseResult, EvalSummary } from "../types.js";
3+
import type { EvalAdapter } from "../adapter.js";
4+
5+
/**
6+
* Console reporter: prints color-coded eval results to stdout.
7+
* Domain-specific per-case details are delegated to adapter.formatEndStateDetails().
8+
*/
9+
export class ConsoleReporter {
10+
private readonly adapter: EvalAdapter;
11+
12+
constructor(adapter: EvalAdapter) {
13+
this.adapter = adapter;
14+
}
15+
16+
report(evalReport: EvalReport): void {
17+
this.printHeader(evalReport);
18+
this.printCaseResults(evalReport.results);
19+
this.printSummary(evalReport.summary, evalReport.config.judgeEnabled);
20+
}
21+
22+
private printHeader(report: EvalReport): void {
23+
console.log("");
24+
console.log(chalk.bold("MCP Agent Evaluation Report"));
25+
console.log(chalk.gray("─".repeat(60)));
26+
console.log(` Timestamp: ${report.timestamp}`);
27+
console.log(` Model: ${report.config.model}`);
28+
const judgeLabel = report.config.judgeEnabled
29+
? `enabled (${report.config.judgeModel ?? "sonnet"})`
30+
: "disabled (fast mode)";
31+
console.log(` Judge: ${judgeLabel}`);
32+
console.log(` Threshold: ${report.config.passThreshold}/5.0`);
33+
if (report.config.filter) {
34+
console.log(` Filter: ${report.config.filter}`);
35+
}
36+
console.log(chalk.gray("─".repeat(60)));
37+
console.log("");
38+
}
39+
40+
private printCaseResults(results: CaseResult[]): void {
41+
for (const result of results) {
42+
this.printCaseResult(result);
43+
}
44+
}
45+
46+
private printCaseResult(result: CaseResult): void {
47+
const statusIcon = result.pass ? chalk.green("PASS") : chalk.red("FAIL");
48+
const scoreStr = result.finalScore.toFixed(2);
49+
50+
console.log(
51+
`${statusIcon} ${chalk.bold(result.testCaseId)} (${scoreStr}/5.0)`
52+
);
53+
console.log(chalk.gray(` ${result.description}`));
54+
55+
// Generic deterministic checks
56+
const es = result.endStateScore;
57+
const toolCheck = es.toolSelected ? chalk.green("yes") : chalk.red("no");
58+
const callsCheck = es.toolCallCountWithinMax
59+
? chalk.green(`${es.toolCallCount}`)
60+
: chalk.red(`${es.toolCallCount} (exceeded max)`);
61+
62+
console.log(
63+
chalk.gray(` Tool selected: ${toolCheck} | Tool calls: ${callsCheck}`)
64+
);
65+
66+
// Domain-specific details from adapter
67+
const details = this.adapter.formatEndStateDetails(es);
68+
for (const line of details) {
69+
console.log(chalk.gray(` ${line}`));
70+
}
71+
72+
// Judge scores
73+
if (result.judgeScore) {
74+
const js = result.judgeScore;
75+
console.log(
76+
chalk.gray(
77+
` Judge: factual=${js.factualAccuracy.score} completeness=${js.completeness.score} efficiency=${js.toolEfficiency.score} quality=${js.responseQuality.score} => composite=${js.compositeScore.toFixed(2)}`
78+
)
79+
);
80+
}
81+
82+
console.log("");
83+
}
84+
85+
private printSummary(summary: EvalSummary, judgeEnabled: boolean): void {
86+
console.log(chalk.gray("═".repeat(60)));
87+
console.log(chalk.bold("Summary"));
88+
console.log(chalk.gray("─".repeat(60)));
89+
90+
const passRateColor =
91+
summary.passRate >= 0.8
92+
? chalk.green
93+
: summary.passRate >= 0.5
94+
? chalk.yellow
95+
: chalk.red;
96+
97+
console.log(` Total: ${summary.totalCases} cases`);
98+
console.log(
99+
` Passed: ${chalk.green(String(summary.passedCases))} | Failed: ${chalk.red(String(summary.failedCases))}`
100+
);
101+
console.log(
102+
` Pass rate: ${passRateColor((summary.passRate * 100).toFixed(1) + "%")}`
103+
);
104+
console.log(` Avg score: ${summary.averageScore.toFixed(2)}/5.0`);
105+
106+
if (judgeEnabled && summary.averageByDimension) {
107+
const dim = summary.averageByDimension;
108+
console.log("");
109+
console.log(chalk.bold(" Per-Dimension Averages:"));
110+
console.log(` Factual accuracy: ${dim.factualAccuracy.toFixed(2)}`);
111+
console.log(` Completeness: ${dim.completeness.toFixed(2)}`);
112+
console.log(` Tool efficiency: ${dim.toolEfficiency.toFixed(2)}`);
113+
console.log(` Response quality: ${dim.responseQuality.toFixed(2)}`);
114+
}
115+
116+
if (summary.worstCases.length > 0) {
117+
console.log("");
118+
console.log(chalk.bold(" Worst Cases:"));
119+
for (const wc of summary.worstCases) {
120+
console.log(
121+
chalk.red(` ${wc.testCaseId}: ${wc.score.toFixed(2)}/5.0`)
122+
);
123+
}
124+
}
125+
126+
console.log(chalk.gray("═".repeat(60)));
127+
console.log("");
128+
}
129+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import * as fs from "node:fs";
2+
import * as path from "node:path";
3+
import type { EvalReport } from "../types.js";
4+
5+
/**
6+
* JSON reporter: writes the full EvalReport to a timestamped file
7+
* in eval/results/.
8+
*/
9+
export class JsonReporter {
10+
private readonly resultsDir: string;
11+
12+
constructor(resultsDir: string) {
13+
this.resultsDir = resultsDir;
14+
}
15+
16+
/**
17+
* Writes the evaluation report to a JSON file.
18+
* Returns the path to the written file.
19+
*/
20+
report(evalReport: EvalReport): string {
21+
fs.mkdirSync(this.resultsDir, { recursive: true });
22+
23+
const timestamp = evalReport.timestamp.replace(/[:.]/g, "-");
24+
const fileName = `eval-${timestamp}.json`;
25+
const filePath = path.join(this.resultsDir, fileName);
26+
27+
fs.writeFileSync(filePath, JSON.stringify(evalReport, null, 2), "utf-8");
28+
29+
return filePath;
30+
}
31+
}

0 commit comments

Comments
 (0)