eliza/packages/benchmarks/trust/run.ts at develop · elizaOS/eliza · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env bun
/**
 * Trust Security Benchmark Runner
 *
 * Usage: bun run benchmarks/trust/run.ts [--handler oracle]
 *
 * Runs the adversarial test corpus against a handler and reports
 * precision, recall, F1 per category plus overall metrics.
 */

import { TEST_CORPUS } from "./corpus";
import { perfectHandler } from "./handlers/perfect";
import { formatReport } from "./reporter";
import { scoreResults } from "./scorer";
import type { DetectionResult, TrustBenchmarkHandler } from "./types";

// ── Handler registry ───────────────────────────────────────────────────────

const HANDLERS: Record<string, TrustBenchmarkHandler> = {
  oracle: perfectHandler,
};

// ── Runner ─────────────────────────────────────────────────────────────────

async function runBenchmark(
  handler: TrustBenchmarkHandler,
): Promise<DetectionResult[]> {
  const results: DetectionResult[] = [];

  for (const testCase of TEST_CORPUS) {
    let detection: { detected: boolean; confidence: number };

    switch (testCase.category) {
      case "prompt_injection":
        detection = await handler.detectInjection(testCase.input);
        break;
      case "social_engineering":
        detection = await handler.detectSocialEngineering(testCase.input);
        break;
      case "impersonation":
        detection = await handler.detectImpersonation(
          testCase.input,
          testCase.existingUsers ?? [],
        );
        break;
      case "credential_theft":
        detection = await handler.detectCredentialTheft(testCase.input);
        break;
      case "benign": {
        // For benign cases, run through ALL detectors and flag if ANY fires
        const [inj, se, cred] = await Promise.all([
          handler.detectInjection(testCase.input),
          handler.detectSocialEngineering(testCase.input),
          handler.detectCredentialTheft(testCase.input),
        ]);
        detection = {
          detected: inj.detected || se.detected || cred.detected,
          confidence: Math.max(inj.confidence, se.confidence, cred.confidence),
        };
        break;
      }
      default:
        detection = { detected: false, confidence: 0 };
    }

    results.push({
      testId: testCase.id,
      detected: detection.detected,
      confidence: detection.confidence,
      detectedType: detection.detected ? testCase.category : undefined,
    });
  }

  return results;
}

// ── Main ───────────────────────────────────────────────────────────────────

async function main() {
  const handlerName = process.argv.includes("--handler")
    ? process.argv[process.argv.indexOf("--handler") + 1]
    : "oracle";

  const handler = HANDLERS[handlerName];
  if (!handler) {
    console.error(
      `Unknown handler: "${handlerName}". Available: ${Object.keys(HANDLERS).join(", ")}`,
    );
    process.exit(1);
  }

  console.log(`Running trust benchmark with handler: ${handler.name}`);
  console.log(`Test corpus: ${TEST_CORPUS.length} cases`);
  console.log("");

  const detections = await runBenchmark(handler);
  const result = scoreResults(TEST_CORPUS, detections);
  const report = formatReport(result, handler.name, TEST_CORPUS, detections);

  console.log(report);

  // Write JSON results
  const resultsPath = new URL("./results.json", import.meta.url).pathname;
  await Bun.write(resultsPath, JSON.stringify(result, null, 2));
  console.log(`Results written to ${resultsPath}`);

  // Exit code based on overall quality
  if (result.overallF1 < 0.5) {
    console.log("\nWARNING: Overall F1 below 50% threshold");
    process.exit(1);
  }
}

main().catch((err) => {
  console.error("Benchmark failed:", err);
  process.exit(1);
});