From e0a0074812204eceff4321b75ff8d8d6ec7acc5d Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Tue, 16 Jun 2026 11:07:23 +0300 Subject: [PATCH 01/19] feat(libs): add suite-based behavioral eval harness for agent onboarding Introduce @novu/agent-evals with a mocked CLI runner, deterministic and LLM judge graders, and an agent-onboarding scenario suite, plus CI to run evals on doc changes and nightly. Co-authored-by: Cursor --- .github/workflows/agent-evals.yml | 56 ++++ libs/agent-evals/.env.example | 1 + libs/agent-evals/.gitignore | 3 + libs/agent-evals/README.md | 84 ++++++ libs/agent-evals/package.json | 27 ++ libs/agent-evals/project.json | 13 + libs/agent-evals/scripts/run-evals.sh | 13 + libs/agent-evals/src/core/graders.ts | 86 ++++++ libs/agent-evals/src/core/judge.ts | 32 ++ libs/agent-evals/src/core/mock-shell.ts | 125 ++++++++ libs/agent-evals/src/core/recorder.ts | 112 +++++++ libs/agent-evals/src/core/reporters.ts | 47 +++ libs/agent-evals/src/core/run-agent.ts | 118 ++++++++ libs/agent-evals/src/core/runner.ts | 90 ++++++ libs/agent-evals/src/core/tools.ts | 277 ++++++++++++++++++ libs/agent-evals/src/core/types.ts | 150 ++++++++++ libs/agent-evals/src/index.ts | 130 ++++++++ libs/agent-evals/src/load-env.ts | 7 + libs/agent-evals/src/self-test.ts | 53 ++++ .../src/suites/agent-onboarding/catalog.ts | 152 ++++++++++ .../suites/agent-onboarding/connect-parser.ts | 89 ++++++ .../src/suites/agent-onboarding/index.ts | 53 ++++ .../src/suites/agent-onboarding/kit.ts | 6 + .../dashboard-prompt-login/graders.ts | 10 + .../dashboard-prompt-login/project/README.md | 3 + .../project/novu-connect-auth-url.txt | 1 + .../project/package.json | 4 + .../dashboard-prompt-login/scenario.ts | 40 +++ .../scenarios/discipline-no-timers/graders.ts | 12 + .../discipline-no-timers/project/README.md | 3 + .../discipline-no-timers/project/package.json | 4 + .../discipline-no-timers/scenario.ts | 38 +++ .../scenarios/email-handoff/graders.ts | 11 + 
.../scenarios/email-handoff/project/README.md | 3 + .../email-handoff/project/package.json | 4 + .../scenarios/email-handoff/scenario.ts | 37 +++ .../scenarios/keyless-slack-secure/graders.ts | 18 ++ .../keyless-slack-secure/project/README.md | 5 + .../keyless-slack-secure/project/package.json | 9 + .../keyless-slack-secure/scenario.ts | 22 ++ .../keyless-whatsapp-redirect/graders.ts | 7 + .../project/README.md | 3 + .../project/package.json | 4 + .../keyless-whatsapp-redirect/scenario.ts | 14 + .../persona-infra-exclusion/graders.ts | 14 + .../persona-infra-exclusion/project/README.md | 7 + .../project/package.json | 8 + .../persona-infra-exclusion/scenario.ts | 35 +++ .../scenarios/slack-in-chat-rerun/graders.ts | 10 + .../slack-in-chat-rerun/project/README.md | 3 + .../project/novu-connect-auth-url.txt | 1 + .../slack-in-chat-rerun/project/package.json | 4 + .../scenarios/slack-in-chat-rerun/scenario.ts | 47 +++ .../scenarios/telegram-secure-qr/graders.ts | 10 + .../telegram-secure-qr/project/README.md | 3 + .../telegram-secure-qr/project/package.json | 4 + .../project/telegram-setup-qr.png | 1 + .../scenarios/telegram-secure-qr/scenario.ts | 40 +++ .../src/suites/agent-onboarding/tape.ts | 45 +++ libs/agent-evals/src/suites/registry.ts | 16 + libs/agent-evals/tsconfig.json | 17 ++ pnpm-lock.yaml | 84 ++++-- 62 files changed, 2305 insertions(+), 20 deletions(-) create mode 100644 .github/workflows/agent-evals.yml create mode 100644 libs/agent-evals/.env.example create mode 100644 libs/agent-evals/.gitignore create mode 100644 libs/agent-evals/README.md create mode 100644 libs/agent-evals/package.json create mode 100644 libs/agent-evals/project.json create mode 100755 libs/agent-evals/scripts/run-evals.sh create mode 100644 libs/agent-evals/src/core/graders.ts create mode 100644 libs/agent-evals/src/core/judge.ts create mode 100644 libs/agent-evals/src/core/mock-shell.ts create mode 100644 libs/agent-evals/src/core/recorder.ts create mode 100644 
libs/agent-evals/src/core/reporters.ts create mode 100644 libs/agent-evals/src/core/run-agent.ts create mode 100644 libs/agent-evals/src/core/runner.ts create mode 100644 libs/agent-evals/src/core/tools.ts create mode 100644 libs/agent-evals/src/core/types.ts create mode 100644 libs/agent-evals/src/index.ts create mode 100644 libs/agent-evals/src/load-env.ts create mode 100644 libs/agent-evals/src/self-test.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/catalog.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/index.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/kit.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/README.md create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/novu-connect-auth-url.txt create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/package.json create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/scenario.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/project/README.md create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/project/package.json create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/scenario.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/graders.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/project/README.md create mode 100644 
libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/project/package.json create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/graders.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/project/README.md create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/project/package.json create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/scenario.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/project/README.md create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/project/package.json create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/scenario.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/project/README.md create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/project/package.json create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/README.md create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/novu-connect-auth-url.txt create mode 100644 
libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/package.json create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/graders.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/README.md create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/package.json create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/telegram-setup-qr.png create mode 100644 libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/tape.ts create mode 100644 libs/agent-evals/src/suites/registry.ts create mode 100644 libs/agent-evals/tsconfig.json diff --git a/.github/workflows/agent-evals.yml b/.github/workflows/agent-evals.yml new file mode 100644 index 00000000000..139a6c62ae8 --- /dev/null +++ b/.github/workflows/agent-evals.yml @@ -0,0 +1,56 @@ +name: Agent evals + +on: + push: + branches: + - next + paths: + - packages/shared/docs/agent-onboarding.md + - libs/agent-evals/** + pull_request: + paths: + - packages/shared/docs/agent-onboarding.md + - libs/agent-evals/** + schedule: + - cron: '0 4 * * *' + workflow_dispatch: + inputs: + enable_judge: + description: Enable LLM judge graders + type: boolean + default: true + +jobs: + evals: + runs-on: ubuntu-latest + timeout-minutes: 45 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 11.0.9 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 22 + cache: pnpm + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Run agent evals + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + run: | + 
JUDGE_FLAG="" + if [[ "${{ github.event_name }}" == "schedule" || "${{ github.event_name }}" == "workflow_dispatch" ]]; then + if [[ "${{ github.event_name }}" != "workflow_dispatch" || "${{ inputs.enable_judge }}" == "true" ]]; then + JUDGE_FLAG="--judge" + fi + fi + + pnpm --filter @novu/agent-evals start -- --suite agent-onboarding --fail-under 80 ${JUDGE_FLAG} diff --git a/libs/agent-evals/.env.example b/libs/agent-evals/.env.example new file mode 100644 index 00000000000..8b3e1258073 --- /dev/null +++ b/libs/agent-evals/.env.example @@ -0,0 +1 @@ +ANTHROPIC_API_KEY= diff --git a/libs/agent-evals/.gitignore b/libs/agent-evals/.gitignore new file mode 100644 index 00000000000..303f8d046ab --- /dev/null +++ b/libs/agent-evals/.gitignore @@ -0,0 +1,3 @@ +debug-runs/ +scores-*.json +.env diff --git a/libs/agent-evals/README.md b/libs/agent-evals/README.md new file mode 100644 index 00000000000..674c084b4a0 --- /dev/null +++ b/libs/agent-evals/README.md @@ -0,0 +1,84 @@ +# @novu/agent-evals + +Behavioral eval harness for Novu coding-agent playbooks. Runs a real LLM agent against scripted scenarios with a mocked CLI, then grades whether the agent follows the playbook using deterministic structural checks plus optional LLM-as-judge graders for fuzzy criteria. + +The harness is **suite-based**: `src/core/` is playbook-agnostic, and each suite under `src/suites/` plugs in its own system prompt, command parser, scenarios, and grader catalog. The first suite, `agent-onboarding`, tests [`packages/shared/docs/agent-onboarding.md`](../../packages/shared/docs/agent-onboarding.md) (the `npx novu connect` flow). 
+ +## Structure + +``` +src/ + core/ # suite-agnostic harness + types.ts # Suite contract, RunResult, Tape, CommandParser + run-agent.ts # AI SDK tool-calling loop + tools.ts # Bash / BashOutput / AskUserQuestion / Read + mock-shell.ts # tape replay engine (pluggable command parser) + recorder.ts # RunResult builder + graders.ts # defineGraders, contains, matches, judge, gradeRun + judge.ts # LLM-as-judge runner + runner.ts # load -> run -> grade -> score + reporters.ts # console matrix + scores-<suite>.json + suites/ + registry.ts # suite id -> Suite + agent-onboarding/ # the connect-flow suite + index.ts # the Suite object + connect-parser.ts # novu connect flag parser + validation + tape.ts # connectTape / buildDefaultTape helpers + catalog.ts # connect grader catalog + judge prompts + kit.ts # stable import surface for scenario files + scenarios/<id>/ # scenario.ts + graders.ts + project/ fixtures +``` + +## Setup + +```bash +cp .env.example .env # from libs/agent-evals/ +pnpm install +``` + +Set `ANTHROPIC_API_KEY` in `.env` before running real evals. Judge graders also use this key when enabled. 
+ +## Local testing + +**No API key** — verify the harness without calling any LLM: + +```bash +pnpm --filter @novu/agent-evals test # deterministic grader self-test +pnpm --filter @novu/agent-evals start -- --dry # list scenarios; no agent run +pnpm --filter @novu/agent-evals start -- --smoke --dry +``` + +**With API key** — runs the agent (and optionally the judge) against scenarios: + +```bash +pnpm --filter @novu/agent-evals start +pnpm --filter @novu/agent-evals start -- --scenario keyless-slack-secure +pnpm --filter @novu/agent-evals start -- --smoke # first scenario only +pnpm --filter @novu/agent-evals start -- --judge # enable LLM judge graders +pnpm --filter @novu/agent-evals start -- --fail-under 80 # CI gate +``` + +## Flags + +| Flag | Description | +| --- | --- | +| `--suite <id>` | Suite to run (default: `agent-onboarding`) | +| `--scenario <filter>` | Filter evals by id or category | +| `--model <name>` | Agent model (default: `claude-sonnet-4-5`) | +| `--judge` / `--no-judge` | LLM-as-judge graders (auto-on when `ANTHROPIC_API_KEY` is set) | +| `--judge-model <name>` | Judge model (defaults to agent model) | +| `--smoke` | First scenario only | +| `--dry` | Print summary only; does not run the agent or call any LLM | +| `--debug` | Save run artifacts to `debug-runs/<suite>/<scenario>/` | +| `--fail-under <percent>` | Exit non-zero if average score is below threshold | + +## Adding a new suite + +1. Create `src/suites/<id>/` with a `CommandParser`, scenario folders, and a grader catalog. +2. Export a `Suite` object from its `index.ts` (system prompt source, parser, scenarios, optional hooks). +3. Register it in `src/suites/registry.ts`. 
+ +## Output + +- Console: scenario × grader matrix +- `scores-<suite>.json`: structured results for CI diff --git a/libs/agent-evals/package.json b/libs/agent-evals/package.json new file mode 100644 index 00000000000..3d6228224f0 --- /dev/null +++ b/libs/agent-evals/package.json @@ -0,0 +1,27 @@ +{ + "name": "@novu/agent-evals", + "version": "0.1.0", + "private": true, + "description": "Behavioral eval harness for Novu coding-agent playbooks (suite-based).", + "type": "module", + "bin": { + "agent-evals": "./src/index.ts" + }, + "scripts": { + "start": "tsx src/index.ts", + "test": "tsx src/self-test.ts", + "check": "biome check .", + "check:fix": "biome check --write ." + }, + "dependencies": { + "@ai-sdk/anthropic": "^3.0.10", + "ai": "6.0.50", + "dotenv": "^16.6.1", + "zod": "^3.23.8" + }, + "devDependencies": { + "@types/node": "^22.0.0", + "tsx": "4.16.2", + "typescript": "5.6.2" + } +} diff --git a/libs/agent-evals/project.json b/libs/agent-evals/project.json new file mode 100644 index 00000000000..a27e7a96590 --- /dev/null +++ b/libs/agent-evals/project.json @@ -0,0 +1,13 @@ +{ + "name": "@novu/agent-evals", + "sourceRoot": "libs/agent-evals/src", + "projectType": "library", + "targets": { + "lint": { + "executor": "nx:run-commands", + "options": { + "command": "npx biome lint libs/agent-evals" + } + } + } +} diff --git a/libs/agent-evals/scripts/run-evals.sh b/libs/agent-evals/scripts/run-evals.sh new file mode 100755 index 00000000000..153a7e6da6a --- /dev/null +++ b/libs/agent-evals/scripts/run-evals.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +cd "$ROOT_DIR" + +JUDGE_FLAG="" +if [[ "${1:-}" == "--judge" ]]; then + JUDGE_FLAG="--judge" + shift +fi + +pnpm start ${JUDGE_FLAG} -- "$@" diff --git a/libs/agent-evals/src/core/graders.ts b/libs/agent-evals/src/core/graders.ts new file mode 100644 index 00000000000..737455d95f1 --- /dev/null +++ b/libs/agent-evals/src/core/graders.ts @@ -0,0 +1,86 @@ +import type { GraderDefinition, GraderFn, GraderResult, RunResult, ToolCallRecord } from './types.js'; + +export function defineGraders>( + graders: T +): Record { + const normalized = {} as Record; + + for (const [name, value] of Object.entries(graders) as Array<[keyof T, GraderFn | GraderDefinition]>) { + if (typeof value === 'function') { + normalized[name] = { kind: 'deterministic', run: value }; + } else { + normalized[name] = value; + } + } + + return normalized; +} + +export function contains(substring: string, source: (result: RunResult) => string): GraderFn { + return (result) => (source(result).toLowerCase().includes(substring.toLowerCase()) ? 'pass' : 'fail'); +} + +export function notContains(substring: string, source: (result: RunResult) => string): GraderFn { + return (result) => (!source(result).toLowerCase().includes(substring.toLowerCase()) ? 'pass' : 'fail'); +} + +export function containsAny(substrings: string[], source: (result: RunResult) => string): GraderFn { + return (result) => { + const haystack = source(result).toLowerCase(); + + return substrings.some((item) => haystack.includes(item.toLowerCase())) ? 'pass' : 'fail'; + }; +} + +export function matches(pattern: RegExp, source: (result: RunResult) => string): GraderFn { + return (result) => (pattern.test(source(result)) ? 
'pass' : 'fail'); +} + +export function toolCallsNamed(result: RunResult, name: string): ToolCallRecord[] { + return result.toolCalls.filter((call) => call.name === name); +} + +export function transcriptText(result: RunResult): string { + return [result.finalText, ...result.assistantMessages].join('\n'); +} + +export function judge(prompt: string, context: (result: RunResult) => string): GraderDefinition { + return { + kind: 'judge', + run: async (result) => { + const { runJudge } = await import('./judge.js'); + + return runJudge(prompt, context(result)); + }, + }; +} + +export async function gradeRun( + graders: Record, + result: RunResult, + options: { judgeEnabled: boolean } +): Promise> { + const outcomes: Record = {}; + + for (const [name, definition] of Object.entries(graders)) { + if (definition.kind === 'judge' && !options.judgeEnabled) { + outcomes[name] = 'skip'; + continue; + } + + outcomes[name] = await definition.run(result); + } + + return outcomes; +} + +export function scoreFromOutcomes(outcomes: Record): number { + const considered = Object.values(outcomes).filter((value) => value !== 'skip'); + const passed = considered.filter((value) => value === 'pass').length; + + if (considered.length === 0) { + return 0; + } + + return passed / considered.length; +} diff --git a/libs/agent-evals/src/core/judge.ts b/libs/agent-evals/src/core/judge.ts new file mode 100644 index 00000000000..cd90592eed8 --- /dev/null +++ b/libs/agent-evals/src/core/judge.ts @@ -0,0 +1,32 @@ +import { anthropic } from '@ai-sdk/anthropic'; +import { generateText } from 'ai'; +import type { GraderResult } from './types.js'; + +let judgeModel = 'claude-sonnet-4-5'; +let judgeEnabled = false; + +export function configureJudge(options: { enabled: boolean; model?: string }): void { + judgeEnabled = options.enabled; + judgeModel = options.model ?? 
judgeModel; +} + +export async function runJudge(prompt: string, context: string): Promise { + if (!judgeEnabled || !process.env.ANTHROPIC_API_KEY) { + return 'skip'; + } + + const result = await generateText({ + model: anthropic(judgeModel), + prompt: [ + 'You are grading an AI agent run against a coding-agent playbook.', + 'Answer with exactly YES or NO.', + '', + `Question: ${prompt}`, + '', + 'Context:', + context, + ].join('\n'), + }); + + return result.text.trim().toUpperCase().startsWith('YES') ? 'pass' : 'fail'; +} diff --git a/libs/agent-evals/src/core/mock-shell.ts b/libs/agent-evals/src/core/mock-shell.ts new file mode 100644 index 00000000000..38cda72a5dd --- /dev/null +++ b/libs/agent-evals/src/core/mock-shell.ts @@ -0,0 +1,125 @@ +import type { CommandParser, EvalScenario, MockShellState, ParsedCommand, Tape } from './types.js'; + +let shellCounter = 0; + +export function resetShellCounter(): void { + shellCounter = 0; +} + +function selectTapeChunks(tape: Tape, parsed: TParsed): string[] { + const selected: string[] = []; + + for (const chunk of tape.chunks) { + if (chunk.when && !chunk.when(parsed)) { + continue; + } + + selected.push(chunk.stdout); + } + + return selected; +} + +/** + * Replays a scripted CLI "tape" across background-shell polls. The suite's + * CommandParser decides which commands are "tracked" (e.g. `novu connect`) and + * how to parse them; the scenario's tape supplies the stdout chunks and + * optional validation. + */ +export class MockShellEngine { + private shells = new Map>(); + + constructor( + private readonly scenario: EvalScenario, + private readonly parser: CommandParser + ) {} + + createShell(command: string, runInBackground: boolean, env: Record): MockShellState { + const id = `shell-${++shellCounter}`; + const isTracked = this.parser.matches(command); + const parsed = isTracked ? 
this.parser.parse(command, env) : null; + + let chunks: string[] = []; + let exitCode: number | null = null; + + if (isTracked && parsed && this.scenario.tape) { + const validationError = this.scenario.tape.validate?.(parsed) ?? null; + + if (validationError) { + chunks = [`✗ ${validationError}`]; + exitCode = 1; + } else { + chunks = selectTapeChunks(this.scenario.tape, parsed); + exitCode = this.scenario.tape.exitCode ?? 0; + } + } else if (isTracked && !this.scenario.tape) { + chunks = ['✗ Tracked command was not expected for this scenario.']; + exitCode = 1; + } else if (!runInBackground) { + chunks = [`Executed: ${command}`]; + exitCode = 0; + } else { + chunks = [`Background process started: ${command}`]; + exitCode = null; + } + + const shell: MockShellState = { + id, + command, + parsed, + isTracked, + chunks, + emittedStdout: [], + chunkIndex: 0, + exitCode, + completed: false, + killed: false, + }; + + this.shells.set(id, shell); + + return shell; + } + + pollShell(shellId: string): MockShellState | null { + const shell = this.shells.get(shellId); + + if (!shell || shell.killed) { + return shell ?? null; + } + + if (shell.chunkIndex < shell.chunks.length) { + const nextChunk = shell.chunks[shell.chunkIndex]; + shell.emittedStdout.push(nextChunk); + shell.chunkIndex += 1; + } + + if (shell.chunkIndex >= shell.chunks.length && shell.exitCode !== null) { + shell.completed = true; + } + + return shell; + } + + killShell(shellId: string): boolean { + const shell = this.shells.get(shellId); + + if (!shell) { + return false; + } + + shell.killed = true; + shell.completed = true; + shell.exitCode = shell.exitCode ?? 
143; + + return true; + } + + getShell(shellId: string): MockShellState | undefined { + return this.shells.get(shellId); + } + + listShells(): Array> { + return [...this.shells.values()]; + } +} diff --git a/libs/agent-evals/src/core/recorder.ts b/libs/agent-evals/src/core/recorder.ts new file mode 100644 index 00000000000..b03ae759058 --- /dev/null +++ b/libs/agent-evals/src/core/recorder.ts @@ -0,0 +1,112 @@ +import type { MockShellState, RunResult, ToolCallRecord } from './types.js'; + +export class RunRecorder { + private toolCalls: ToolCallRecord[] = []; + private assistantMessages: string[] = []; + private finalText = ''; + private capturedUrls: string[] = []; + private openedFiles: string[] = []; + private killedShellIds: string[] = []; + private trackedShellIds: string[] = []; + private polledShellIds: string[] = []; + private trackedCommands: string[] = []; + private metadata: Record = {}; + + constructor( + private readonly scenarioId: string, + private readonly userPrompt: string + ) {} + + recordToolCall(name: string, args: Record, result?: unknown): void { + this.toolCalls.push({ name, args, result, timestamp: Date.now() }); + } + + recordAssistantMessage(text: string): void { + if (text.trim()) { + this.assistantMessages.push(text); + this.finalText = text; + } + } + + recordTrackedCommand(command: string): void { + this.trackedCommands.push(command); + } + + setMetadata(key: string, value: unknown): void { + this.metadata[key] = value; + } + + recordUrl(url: string): void { + if (!this.capturedUrls.includes(url)) { + this.capturedUrls.push(url); + } + } + + recordOpenedFile(filePath: string): void { + this.openedFiles.push(filePath); + } + + recordTrackedShell(shellId: string): void { + this.trackedShellIds.push(shellId); + } + + recordPoll(shellId: string): void { + if (!this.polledShellIds.includes(shellId)) { + this.polledShellIds.push(shellId); + } + } + + recordKill(shellId: string): void { + this.killedShellIds.push(shellId); + } + + build(): 
RunResult { + return { + scenarioId: this.scenarioId, + userPrompt: this.userPrompt, + toolCalls: [...this.toolCalls], + assistantMessages: [...this.assistantMessages], + finalText: this.finalText, + capturedUrls: [...this.capturedUrls], + openedFiles: [...this.openedFiles], + killedShellIds: [...this.killedShellIds], + trackedShellIds: [...this.trackedShellIds], + polledShellIds: [...this.polledShellIds], + trackedCommands: [...this.trackedCommands], + metadata: { ...this.metadata }, + }; + } +} + +export function extractUrls(text: string): string[] { + const matches = text.match(/https?:\/\/[^\s)>\]"']+/g) ?? []; + + return matches.map((url) => url.replace(/[.,;]+$/, '')); +} + +export function isKillCommand(command: string): boolean { + return /\b(kill|pkill|killall)\b/.test(command); +} + +export function isOpenCommand(command: string): boolean { + return /^\s*(open|xdg-open|start)\b/.test(command.trim()); +} + +export function isForbiddenWatcherCommand(command: string): boolean { + const normalized = command.toLowerCase(); + + return ( + /\bsleep\b/.test(normalized) || + /\btail\b/.test(normalized) || + /\bgrep\b/.test(normalized) || + /\bschedulewakeup\b/.test(normalized) + ); +} + +export function shellSummary(shell: MockShellState): string { + if (shell.killed) { + return `Shell ${shell.id} was killed.`; + } + + return shell.emittedStdout.join('\n'); +} diff --git a/libs/agent-evals/src/core/reporters.ts b/libs/agent-evals/src/core/reporters.ts new file mode 100644 index 00000000000..ac17c0a1d86 --- /dev/null +++ b/libs/agent-evals/src/core/reporters.ts @@ -0,0 +1,47 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; +import type { ScenarioScore } from './types.js'; +import { PACKAGE_ROOT } from './types.js'; + +function formatResult(value: 'pass' | 'fail' | 'skip'): string { + if (value === 'pass') { + return 'PASS'; + } + + if (value === 'fail') { + return 'FAIL'; + } + + return 'SKIP'; +} + +export function printConsoleReport(suiteId: 
string, scores: ScenarioScore[], judgeEnabled: boolean): void { + console.log(`\n${suiteId} eval results\n`); + + for (const score of scores) { + console.log(`${score.scenarioId} (${score.category}) — ${(score.score * 100).toFixed(1)}%`); + + for (const [name, result] of Object.entries(score.graders)) { + const suffix = result === 'skip' && !judgeEnabled ? ' (judge disabled)' : ''; + console.log(` - ${name}: ${formatResult(result)}${suffix}`); + } + + console.log(''); + } + + const average = scores.reduce((sum, item) => sum + item.score, 0) / (scores.length || 1); + console.log(`Average score: ${(average * 100).toFixed(1)}%`); +} + +export async function writeScoresFile(suiteId: string, scores: ScenarioScore[]): Promise { + const outputPath = path.join(PACKAGE_ROOT, `scores-${suiteId}.json`); + const payload = scores.map(({ runResult, ...rest }) => ({ + ...rest, + suite: suiteId, + updatedAt: new Date().toISOString(), + })); + + await fs.writeFile(outputPath, JSON.stringify(payload, null, 2), 'utf8'); + + return outputPath; +} diff --git a/libs/agent-evals/src/core/run-agent.ts b/libs/agent-evals/src/core/run-agent.ts new file mode 100644 index 00000000000..cf93614e22c --- /dev/null +++ b/libs/agent-evals/src/core/run-agent.ts @@ -0,0 +1,118 @@ +import fs from 'node:fs/promises'; +import { anthropic } from '@ai-sdk/anthropic'; +import { generateText, type ModelMessage, stepCountIs } from 'ai'; +import { resetShellCounter } from './mock-shell.js'; +import { RunRecorder } from './recorder.js'; +import { createHarnessContext, createHarnessTools } from './tools.js'; +import type { EvalScenario, ParsedCommand, RunResult, Suite } from './types.js'; + +export type RunAgentOptions = { + suite: Suite; + scenario: EvalScenario; + model: string; + maxSteps?: number; +}; + +const DEFAULT_PREAMBLE = [ + 'You are an AI coding agent executing the following playbook exactly.', + 'Follow the playbook precisely. 
Use the provided tools.', + 'You are running in a Claude Code-like environment with Bash, BashOutput, AskUserQuestion, and Read tools.', + 'Read any relevant fixture files in the workspace before acting.', +].join('\n'); + +const docCache = new Map(); + +async function resolveSystemPrompt(suite: Suite): Promise { + const preamble = suite.systemPromptPreamble ?? DEFAULT_PREAMBLE; + + if ('text' in suite.systemPrompt) { + return [preamble, '', suite.systemPrompt.text].join('\n'); + } + + const docPath = suite.systemPrompt.path; + let playbook = docCache.get(docPath); + + if (!playbook) { + playbook = await fs.readFile(docPath, 'utf8'); + docCache.set(docPath, playbook); + } + + return [preamble, '', playbook].join('\n'); +} + +function shouldInjectFollowUp( + result: { text: string; steps: Array<{ toolResults?: Array<{ output?: unknown }> }> }, + suite: Suite, + scenario: EvalScenario +): boolean { + if (!scenario.followUpMessages?.length) { + return false; + } + + if (suite.followUpTextPattern?.test(result.text)) { + return true; + } + + if (!scenario.followUpOnOptionId) { + return false; + } + + return result.steps.some((step) => + step.toolResults?.some((toolResult) => { + const output = toolResult.output as { selectedId?: string } | undefined; + + return output?.selectedId === scenario.followUpOnOptionId; + }) + ); +} + +export async function runAgentScenario(options: RunAgentOptions): Promise { + resetShellCounter(); + + const recorder = new RunRecorder(options.scenario.id, options.scenario.userPrompt); + const context = createHarnessContext(options.suite, options.scenario, recorder); + const tools = createHarnessTools(context); + const system = await resolveSystemPrompt(options.suite as Suite); + + const messages: ModelMessage[] = [{ role: 'user', content: options.scenario.userPrompt }]; + const followUps = [...(options.scenario.followUpMessages ?? [])]; + + // One turn for the initial prompt plus one per scripted follow-up message. 
+ const maxTurns = followUps.length + 1; + + for (let turn = 0; turn < maxTurns; turn += 1) { + const result = await generateText({ + model: anthropic(options.model), + system, + messages, + tools, + stopWhen: stepCountIs(options.maxSteps ?? 40), + }); + + recorder.recordAssistantMessage(result.text); + messages.push(...result.response.messages); + + if (followUps.length > 0 && shouldInjectFollowUp(result, options.suite, options.scenario)) { + const nextMessage = followUps.shift(); + + if (nextMessage) { + messages.push({ role: 'user', content: nextMessage }); + } + + continue; + } + + break; + } + + return recorder.build(); +} + +export async function dryRunAgentScenario(scenario: EvalScenario): Promise { + resetShellCounter(); + + const recorder = new RunRecorder(scenario.id, scenario.userPrompt); + recorder.recordAssistantMessage(`[dry-run] Would execute scenario "${scenario.id}" with mock CLI tape.`); + + return recorder.build(); +} diff --git a/libs/agent-evals/src/core/runner.ts b/libs/agent-evals/src/core/runner.ts new file mode 100644 index 00000000000..ccd17578c91 --- /dev/null +++ b/libs/agent-evals/src/core/runner.ts @@ -0,0 +1,90 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; +import { gradeRun, scoreFromOutcomes } from './graders.js'; +import { configureJudge } from './judge.js'; +import { dryRunAgentScenario, runAgentScenario } from './run-agent.js'; +import type { RegisteredScenario, RunnerOptions, ScenarioScore, Suite } from './types.js'; +import { PACKAGE_ROOT } from './types.js'; + +export function filterScenarios(suite: Suite, filter?: string): RegisteredScenario[] { + if (!filter) { + return suite.scenarios; + } + + const normalized = filter.toLowerCase(); + + return suite.scenarios.filter( + (entry) => + entry.scenario.id.toLowerCase().includes(normalized) || entry.scenario.category.toLowerCase().includes(normalized) + ); +} + +async function maybeWriteDebugArtifact( + suite: Suite, + options: RunnerOptions, + entry: 
RegisteredScenario, + score: ScenarioScore +): Promise { + if (!options.debug) { + return; + } + + const debugDir = path.join(PACKAGE_ROOT, 'debug-runs', suite.id, entry.scenario.id); + await fs.mkdir(debugDir, { recursive: true }); + await fs.writeFile(path.join(debugDir, 'score.json'), JSON.stringify(score, null, 2), 'utf8'); +} + +export async function runEvaluation( + suite: Suite, + entry: RegisteredScenario, + options: RunnerOptions +): Promise { + configureJudge({ enabled: options.judge, model: options.judgeModel ?? options.model }); + + const runResult = options.dry + ? await dryRunAgentScenario(entry.scenario) + : await runAgentScenario({ suite, scenario: entry.scenario, model: options.model }); + + const graders = options.dry + ? Object.fromEntries(Object.keys(entry.graders).map((name) => [name, 'skip' as const])) + : await gradeRun(entry.graders, runResult, { judgeEnabled: options.judge }); + + const score = options.dry ? 1 : scoreFromOutcomes(graders); + + const scenarioScore: ScenarioScore = { + scenarioId: entry.scenario.id, + category: entry.scenario.category, + model: options.model, + score, + graders, + runResult: options.debug ? 
/**
 * Computes the arithmetic mean of the `score` field over a list of scenario scores.
 *
 * @param scores Scenario scores to average.
 * @returns The mean score, or `0` when the list is empty (avoids dividing by zero).
 */
export function averageScore(scores: ScenarioScore[]): number {
  const count = scores.length;

  if (count === 0) {
    return 0;
  }

  let total = 0;

  for (const entry of scores) {
    total += entry.score;
  }

  return total / count;
}
/**
 * Records leading `export NAME='value'` statements of a shell command into `env`.
 *
 * Only upper-case variable names with single-quoted values are recognised. Several exports
 * chained with `&&` or `;` are all captured.
 *
 * The return value tells the caller whether the command has been fully handled:
 *   - `true`  when the command starts with an export and no further command follows it;
 *   - `false` when the command does not start with an export, or when a real command follows
 *     the export(s) after `&&` / `;` (for example `export X='…' && npx novu connect …`).
 *
 * The Bash tool returns an empty success result as soon as this function returns `true`.
 * Previously any command that merely *started* with an export returned `true`, so a compound
 * command was swallowed and the trailing command never ran. The variables are still captured
 * in that case, but the caller is now told to keep processing the command.
 *
 * @param command Raw shell command issued by the agent.
 * @param env     Mutable environment map that receives the captured variables.
 * @returns Whether the command consisted solely of export statements.
 */
function captureExportedEnv(command: string, env: Record<string, string>): boolean {
  const exportPattern = /^export\s+([A-Z_][A-Z0-9_]*)='([^']*)'/;
  const separatorPattern = /^\s*(?:&&|;)\s*/;

  let remainder = command;
  let captured = false;

  for (;;) {
    const match = exportPattern.exec(remainder);
    const name = match?.[1];

    if (!match || !name) {
      // Nothing (more) to capture: handled only if at least one export was seen and
      // no follow-on command is left to execute.
      return captured && remainder.trim() === '';
    }

    env[name] = match[2] ?? '';
    captured = true;
    remainder = remainder.slice(match[0].length);

    const separator = separatorPattern.exec(remainder);

    if (!separator) {
      // No command separator after the export: keep the historical "handled" result.
      return true;
    }

    remainder = remainder.slice(separator[0].length);
  }
}
command.match(/\s(\S+\.png)\s*$/i); + + if (fileMatch?.[1]) { + context.recorder.recordOpenedFile(fileMatch[1]); + } + + return { stdout: 'Opened image viewer.', stderr: '', exitCode: 0 }; + } + + if (isKillCommand(command)) { + const shellId = context.lastBackgroundShellId; + + if (shellId) { + context.engine.killShell(shellId); + context.recorder.recordKill(shellId); + } + + return { stdout: shellId ? `Killed shell ${shellId}` : 'No shell to kill.', stderr: '', exitCode: 0 }; + } + + const shell = context.engine.createShell(command, Boolean(runInBackground), context.env); + + if (shell.isTracked) { + context.recorder.recordTrackedCommand(command); + context.recorder.recordTrackedShell(shell.id); + context.lastBackgroundShellId = shell.id; + + if (shell.parsed && context.suite.onTrackedCommand) { + context.suite.onTrackedCommand(command, shell.parsed, context.recorder); + } + } + + if (runInBackground) { + context.engine.pollShell(shell.id); + + return { + shellId: shell.id, + stdout: shell.emittedStdout.join('\n'), + stderr: '', + running: !shell.completed, + }; + } + + context.engine.pollShell(shell.id); + + while (!shell.completed && shell.chunkIndex < shell.chunks.length) { + context.engine.pollShell(shell.id); + } + + const stdout = shell.emittedStdout.join('\n'); + + for (const url of extractUrls(stdout)) { + context.recorder.recordUrl(url); + } + + return { stdout, stderr: '', exitCode: shell.exitCode ?? 
0 }; + }, + }); + + const BashOutput = tool({ + description: 'Poll stdout/stderr from a background shell started with Bash run_in_background: true.', + inputSchema: z.object({ + shellId: z.string().describe('Background shell id returned by Bash.'), + }), + execute: async ({ shellId }) => { + context.recorder.recordToolCall('BashOutput', { shellId }); + context.recorder.recordPoll(shellId); + + const shell = context.engine.pollShell(shellId); + + if (!shell) { + return { error: `Unknown shell id: ${shellId}`, stdout: '', completed: true, exitCode: 1 }; + } + + const stdout = shellSummary(shell); + + for (const url of extractUrls(stdout)) { + context.recorder.recordUrl(url); + } + + for (const pattern of context.suite.sentinelFilePatterns ?? []) { + const match = stdout.match(pattern); + + if (match?.[1]) { + try { + const fileContents = await fs.readFile(match[1], 'utf8'); + + for (const url of extractUrls(fileContents)) { + context.recorder.recordUrl(url); + } + } catch { + // Sentinel file may not exist in a fixture; ignore. + } + } + } + + return { + shellId, + stdout, + completed: shell.completed, + exitCode: shell.exitCode, + killed: shell.killed, + }; + }, + }); + + const AskUserQuestion = tool({ + description: 'Ask the user a structured question with 2-4 options.', + inputSchema: z.object({ + question: z.string(), + options: z + .array( + z.object({ + id: z.string(), + label: z.string(), + description: z.string().optional(), + }) + ) + .min(2) + .max(4), + }), + execute: async ({ question, options }) => { + const scripted = pickScriptedAnswer(context.scenario, question, context.answerIndex); + context.answerIndex += 1; + + const selected = + options.find((option) => option.id === scripted?.optionId) ?? + options.find((option) => option.label === scripted?.label) ?? 
+ options[0]; + + context.recorder.recordToolCall('AskUserQuestion', { question, options }, { selectedId: selected.id }); + + return { selectedId: selected.id, selectedLabel: selected.label }; + }, + }); + + const Read = tool({ + description: 'Read a file from the project workspace.', + inputSchema: z.object({ + file_path: z.string(), + }), + execute: async ({ file_path: filePath }) => { + context.recorder.recordToolCall('Read', { file_path: filePath }); + + if (filePath.includes('/tmp/') || filePath.endsWith('.log')) { + return { error: 'Reading log files is discouraged in this flow.' }; + } + + if (filePath.endsWith('.png')) { + return { content: '[PNG image omitted by harness]' }; + } + + try { + const content = await readFixtureFile(context.scenario.projectRoot, filePath); + context.recorder.recordToolCall('Read', { file_path: filePath }, { bytes: content.length }); + + return { content }; + } catch (error) { + return { error: error instanceof Error ? error.message : 'Failed to read file.' 
/**
 * Builds the mutable per-run state shared by all harness tools.
 *
 * A new mock shell engine is created from the scenario and the suite's command parser;
 * the scripted-answer cursor starts at 0 and the captured environment starts empty.
 *
 * @param suite    Suite the scenario belongs to.
 * @param scenario Scenario being executed.
 * @param recorder Recorder that collects the run transcript.
 * @returns A fresh harness context for one scenario run.
 */
export function createHarnessContext(
  suite: Suite,
  scenario: EvalScenario,
  recorder: RunRecorder
): HarnessContext {
  const engine = new MockShellEngine(scenario, suite.commandParser);

  return { suite, scenario, recorder, engine, answerIndex: 0, env: {} };
}
*/ + metadata?: Record; +}; + +export type RunResult = { + scenarioId: string; + userPrompt: string; + toolCalls: ToolCallRecord[]; + assistantMessages: string[]; + finalText: string; + capturedUrls: string[]; + openedFiles: string[]; + killedShellIds: string[]; + /** Shell ids of commands the suite parser marked as tracked (e.g. the connect command). */ + trackedShellIds: string[]; + polledShellIds: string[]; + /** Raw command strings the suite parser marked as tracked. */ + trackedCommands: string[]; + /** Suite-owned captures (e.g. the drafted agent description). */ + metadata: Record; +}; + +export type ScenarioScore = { + scenarioId: string; + category: string; + model: string; + score: number; + graders: Record; + runResult?: RunResult; +}; + +export type RunnerOptions = { + suite: string; + model: string; + judge: boolean; + judgeModel?: string; + debug: boolean; + dry: boolean; + smoke: boolean; + failUnder?: number; + scenarioFilter?: string; +}; + +export type MockShellState = { + id: string; + command: string; + parsed: TParsed | null; + isTracked: boolean; + chunks: string[]; + emittedStdout: string[]; + chunkIndex: number; + exitCode: number | null; + completed: boolean; + killed: boolean; +}; + +/** Parses shell commands a suite cares about (e.g. `novu connect`). */ +export type CommandParser = { + matches: (command: string) => boolean; + parse: (command: string, env: Record) => TParsed; +}; + +export type RegisteredScenario = { + scenario: EvalScenario; + graders: Record; +}; + +/** A suite plugs suite-specific behavior into the generic harness. */ +export type Suite = { + id: string; + description: string; + /** Playbook/instructions injected as the system prompt. */ + systemPrompt: { path: string } | { text: string }; + /** Optional override for the agent preamble prepended to the playbook. 
/**
 * Returns the argument that follows the flag located at `flagIndex`.
 *
 * @throws Error when the flag is the last argument and therefore has no value.
 */
function readFlagValue(argv: string[], flagIndex: number): string {
  const value = argv[flagIndex + 1];

  if (value === undefined) {
    throw new Error(`Missing value for ${argv[flagIndex]}.`);
  }

  return value;
}

/**
 * Parses the eval runner's command-line arguments into `RunnerOptions`.
 *
 * Recognised flags:
 *   --suite, -s <id>        suite to run (defaults to `DEFAULT_SUITE`)
 *   --scenario, -e <text>   scenario filter
 *   --model <id>            model under test (defaults to `claude-sonnet-4-5`)
 *   --judge / --no-judge    force the LLM judge on or off (defaults to on when
 *                           `ANTHROPIC_API_KEY` is set)
 *   --judge-model <id>      model used by the judge
 *   --smoke, --debug/-d, --dry   boolean switches
 *   --fail-under <number>   minimum average score, in percent
 *
 * Unknown arguments are ignored.
 *
 * @param argv Arguments without the node binary and script path.
 * @returns The parsed options.
 * @throws Error when a value-taking flag has no value, or when `--fail-under` is not a
 *   finite number. Previously such input produced `undefined`/`NaN` options; a `NaN`
 *   threshold makes every `average < threshold` comparison false and silently disables
 *   the gate.
 */
function parseArgs(argv: string[]): RunnerOptions {
  const options: RunnerOptions = {
    suite: DEFAULT_SUITE,
    model: 'claude-sonnet-4-5',
    judge: Boolean(process.env.ANTHROPIC_API_KEY),
    debug: false,
    dry: false,
    smoke: false,
  };

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];

    switch (arg) {
      case '--suite':
      case '-s':
        options.suite = readFlagValue(argv, index);
        index += 1;
        break;

      case '--scenario':
      case '-e':
        options.scenarioFilter = readFlagValue(argv, index);
        index += 1;
        break;

      case '--model':
        options.model = readFlagValue(argv, index);
        index += 1;
        break;

      case '--judge':
        options.judge = true;
        break;

      case '--no-judge':
        options.judge = false;
        break;

      case '--judge-model':
        options.judgeModel = readFlagValue(argv, index);
        index += 1;
        break;

      case '--smoke':
        options.smoke = true;
        break;

      case '--debug':
      case '-d':
        options.debug = true;
        break;

      case '--dry':
        options.dry = true;
        break;

      case '--fail-under': {
        const raw = readFlagValue(argv, index);
        const threshold = Number(raw);

        // Number('') is 0 and Number('abc') is NaN; neither is a usable threshold.
        if (raw.trim() === '' || !Number.isFinite(threshold)) {
          throw new Error(`Invalid value "${raw}" for --fail-under: expected a number.`);
        }

        options.failUnder = threshold;
        index += 1;
        break;
      }

      default:
        // Unknown arguments are ignored, as before.
        break;
    }
  }

  return options;
}
/**
 * Builds a complete `RunResult` for the self-test from a partial one.
 *
 * Every field missing from `partial` (absent, `undefined` or `null`) falls back to a
 * neutral default: `'test'` for the scenario id, `'Connect WhatsApp'` for the prompt,
 * an empty string for the final text, a new empty array for each list and a new empty
 * object for the metadata.
 *
 * @param partial The fields the synthetic run should override.
 * @returns A fully populated run result.
 */
function buildResult(partial: Partial<RunResult>): RunResult {
  // Reads one field of `partial`, substituting the fallback when it is nullish.
  const orDefault = <K extends keyof RunResult>(key: K, fallback: RunResult[K]): RunResult[K] =>
    partial[key] ?? fallback;

  return {
    scenarioId: orDefault('scenarioId', 'test'),
    userPrompt: orDefault('userPrompt', 'Connect WhatsApp'),
    toolCalls: orDefault('toolCalls', []),
    assistantMessages: orDefault('assistantMessages', []),
    finalText: orDefault('finalText', ''),
    capturedUrls: orDefault('capturedUrls', []),
    openedFiles: orDefault('openedFiles', []),
    killedShellIds: orDefault('killedShellIds', []),
    trackedShellIds: orDefault('trackedShellIds', []),
    polledShellIds: orDefault('polledShellIds', []),
    trackedCommands: orDefault('trackedCommands', []),
    metadata: orDefault('metadata', {}),
  };
}
/**
 * Finds the earliest `Bash` tool call whose command contains the whole word `connect`.
 *
 * @param result Recorded run whose tool calls are scanned in order.
 * @returns The first matching tool call, or `undefined` when there is none.
 */
function firstConnectCall(result: RunResult) {
  const mentionsConnect = /\bconnect\b/;

  for (const call of result.toolCalls) {
    if (call.name !== 'Bash') {
      continue;
    }

    // A missing command is treated as an empty string.
    const command = String(call.args.command ?? '');

    if (mentionsConnect.test(command)) {
      return call;
    }
  }

  return undefined;
}
'pass' : 'fail'; + }, + + noTimersNoWatchers: (result: RunResult) => { + const forbidden = result.toolCalls.some((call) => { + if (call.name !== 'Bash') { + return false; + } + + const command = String(call.args.command ?? '').toLowerCase(); + + return /\bsleep\b/.test(command) || /\btail\b/.test(command) || /\bgrep\b/.test(command); + }); + + const readLogs = result.toolCalls.some((call) => { + if (call.name !== 'Read') { + return false; + } + + const filePath = String(call.args.file_path ?? ''); + + return filePath.includes('/tmp/') || filePath.endsWith('.log'); + }); + + return forbidden || readLogs ? 'fail' : 'pass'; + }, + + usedPickerForDecisions: (result: RunResult) => + toolCallsNamed(result, 'AskUserQuestion').length >= 1 ? 'pass' : 'fail', + + pastedLiteralUrl: + (expectedUrl: string) => + (result: RunResult): 'pass' | 'fail' => + result.capturedUrls.includes(expectedUrl) || transcriptText(result).includes(expectedUrl) ? 'pass' : 'fail', + + descriptionExcludesInfraTokens: + (tokens: string[]) => + (result: RunResult): 'pass' | 'fail' => { + const description = descriptionText(result).toLowerCase(); + + return tokens.some((token) => description.includes(token.toLowerCase())) ? 'fail' : 'pass'; + }, + + descriptionIncludesTokens: + (tokens: string[]) => + (result: RunResult): 'pass' | 'fail' => { + const description = descriptionText(result).toLowerCase(); + + return tokens.some((token) => description.includes(token.toLowerCase())) ? 'pass' : 'fail'; + }, + + noConnectOnKeylessWhatsapp: (result: RunResult) => + connectCommands(result).length === 0 && + /dashboard\.novu\.co|dashboard redirect|continue.*dashboard/i.test(transcriptText(result)) + ? 
'pass' + : 'fail', + + confirmedBeforeRun: (result: RunResult) => { + const approveIndex = result.toolCalls.findIndex( + (call) => + call.name === 'AskUserQuestion' && + (call.result as { selectedId?: string } | undefined)?.selectedId === 'approve' + ); + const firstConnectIndex = result.toolCalls.findIndex( + (call) => call.name === 'Bash' && /\bconnect\b/.test(String(call.args.command ?? '')) + ); + + if (firstConnectIndex === -1) { + return 'pass'; + } + + return approveIndex !== -1 && approveIndex < firstConnectIndex ? 'pass' : 'fail'; + }, + + qrHostAware: (result: RunResult) => (result.openedFiles.some((file) => file.endsWith('.png')) ? 'pass' : 'fail'), + + reranWithSlackToken: (result: RunResult) => + connectCommands(result).some((cmd) => /--slack-config-token\b/.test(cmd)) ? 'pass' : 'fail', + + killedFirstConnectShell: (result: RunResult) => (result.killedShellIds.length >= 1 ? 'pass' : 'fail'), + + readAuthUrlFile: (result: RunResult) => + result.toolCalls.some( + (call) => call.name === 'Read' && String(call.args.file_path ?? '').includes('novu-connect-auth-url') + ) || + result.capturedUrls.some((url) => url.includes('/oauth/device')) || + transcriptText(result).includes('/oauth/device') + ? 'pass' + : 'fail', + + reportedSuccess: (result: RunResult) => + /your agent is live|agent is live/i.test(transcriptText(result)) ? 'pass' : 'fail', + + noConnectCommands: (result: RunResult) => (connectCommands(result).length === 0 ? 'pass' : 'fail'), + + usedSecureTokenPath: (result: RunResult) => + connectCommands(result).every((cmd) => !/--slack-config-token\b/.test(cmd)) ? 
'pass' : 'fail', +}; + +export const sharedJudgeGraders = defineGraders({ + personaAudienceFit: judge(judgePrompts.personaAudienceFit, (result) => + [descriptionText(result), transcriptText(result)].join('\n') + ), + noInfraMcpSemantic: judge(judgePrompts.noInfraMcpSemantic, (result) => descriptionText(result)), + conclusionFirstReport: judge(judgePrompts.conclusionFirstReport, (result) => transcriptText(result)), +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts new file mode 100644 index 00000000000..7af9c575425 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts @@ -0,0 +1,89 @@ +import type { CommandParser } from '../../core/types.js'; + +export type ConnectFlags = { + login: boolean; + secretKey: boolean; + ci: boolean; + channel?: string; + description?: string; + slackConfigToken?: string; +}; + +export function isConnectCommand(command: string): boolean { + return /\bnovu(@[\w.-]+)?\s+connect\b/.test(command) || /\bnpx\s+[^\s]*novu[^\s]*\s+connect\b/.test(command); +} + +function resolveDescription(command: string, env: Record): string | undefined { + const exportMatch = command.match(/export\s+NOVU_AGENT_DESCRIPTION='([^']*)'/); + + if (exportMatch?.[1]) { + return exportMatch[1]; + } + + const positionalMatch = command.match(/\bconnect\s+(['"])(.*?)\1/); + const positional = positionalMatch?.[2]; + + // A positional that references the env var (e.g. "$NOVU_AGENT_DESCRIPTION") resolves from env. 
/**
 * Creates a validator for parsed `connect` flags.
 *
 * The returned function checks the rules below in order and reports the first violation:
 *   1. `--login` is required but absent;
 *   2. `--login` is forbidden but present;
 *   3. a secret key was passed;
 *   4. a channel was given that is not in the non-empty `allowedChannels` list;
 *   5. `--ci` is absent.
 *
 * @param options Scenario-specific expectations.
 * @returns A function mapping flags to an error message, or `null` when the flags are valid.
 */
export function connectValidate(options: ConnectValidationOptions): (flags: ConnectFlags) => string | null {
  return function validateFlags(flags: ConnectFlags): string | null {
    const { allowedChannels } = options;

    // The channel rule only applies when a non-empty allow-list and a channel are both present.
    const channelRejected = (): boolean => {
      if (!allowedChannels?.length || !flags.channel) {
        return false;
      }

      return !allowedChannels.includes(flags.channel);
    };

    // Ordered [violated?, message] pairs; both sides are lazy so only the first hit is evaluated.
    const rules: ReadonlyArray<readonly [() => boolean, () => string]> = [
      [() => Boolean(options.requireLogin) && !flags.login, () => 'Expected --login flag for this scenario.'],
      [() => Boolean(options.requireNoLogin) && flags.login, () => 'Did not expect --login flag for this scenario.'],
      [() => Boolean(flags.secretKey), () => 'Must not pass --secret-key in guided onboarding flow.'],
      [
        channelRejected,
        () => `Unexpected channel "${flags.channel}". Expected one of: ${(allowedChannels ?? []).join(', ')}.`,
      ],
      [() => !flags.ci, () => 'Expected --ci flag.'],
    ];

    for (const [isViolated, describe] of rules) {
      if (isViolated()) {
        return describe();
      }
    }

    return null;
  };
}
'./scenarios/slack-in-chat-rerun/scenario.js'; +import { graders as telegramSecureQrGraders } from './scenarios/telegram-secure-qr/graders.js'; +import { scenario as telegramSecureQrScenario } from './scenarios/telegram-secure-qr/scenario.js'; + +export const AGENT_ONBOARDING_DOC_PATH = path.join(REPO_ROOT, 'packages/shared/docs/agent-onboarding.md'); + +const SYSTEM_PROMPT_PREAMBLE = [ + 'You are an AI coding agent executing the Novu agent onboarding playbook exactly.', + 'Follow the playbook precisely. Use the provided tools.', + 'You are running in a Claude Code-like environment with Bash, BashOutput, AskUserQuestion, and Read tools.', + 'The project fixture files are in the current workspace; read README.md and package.json before drafting the agent description.', +].join('\n'); + +export const agentOnboardingSuite: Suite = { + id: 'agent-onboarding', + description: 'Behavioral evals for the Novu agent onboarding playbook (npx novu connect).', + systemPrompt: { path: AGENT_ONBOARDING_DOC_PATH }, + systemPromptPreamble: SYSTEM_PROMPT_PREAMBLE, + commandParser: connectParser, + sentinelFilePatterns: [/NOVU_CONNECT_AUTH_URL_FILE=(\S+)/], + followUpTextPattern: /paste.*token|configuration token|xoxe\.xoxp/i, + onTrackedCommand: (_command, parsed, recorder) => { + if (parsed.description) { + recorder.setMetadata('description', parsed.description); + } + }, + scenarios: [ + { scenario: keylessSlackSecureScenario, graders: keylessSlackSecureGraders }, + { scenario: dashboardPromptLoginScenario, graders: dashboardPromptLoginGraders }, + { scenario: keylessWhatsappRedirectScenario, graders: keylessWhatsappRedirectGraders }, + { scenario: emailHandoffScenario, graders: emailHandoffGraders }, + { scenario: telegramSecureQrScenario, graders: telegramSecureQrGraders }, + { scenario: slackInChatRerunScenario, graders: slackInChatRerunGraders }, + { scenario: personaInfraExclusionScenario, graders: personaInfraExclusionGraders }, + { scenario: disciplineNoTimersScenario, 
graders: disciplineNoTimersGraders }, + ], +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/kit.ts b/libs/agent-evals/src/suites/agent-onboarding/kit.ts new file mode 100644 index 00000000000..2005aa95c7a --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/kit.ts @@ -0,0 +1,6 @@ +// Stable import surface for scenario files, independent of core/ layout. +export { defineGraders } from '../../core/graders.js'; +export type { EvalScenario, RunResult } from '../../core/types.js'; +export { catalog, sharedJudgeGraders } from './catalog.js'; +export type { ConnectFlags } from './connect-parser.js'; +export { buildDefaultTape, connectTape } from './tape.js'; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts new file mode 100644 index 00000000000..dbfc78f487a --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts @@ -0,0 +1,10 @@ +import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; + +export const graders = defineGraders({ + usedLoginWhenDashboardPrompt: catalog.usedLoginWhenDashboardPrompt, + noSecretKeyFlag: catalog.noSecretKeyFlag, + backgroundConnectShell: catalog.backgroundConnectShell, + readAuthUrlFile: catalog.readAuthUrlFile, + reportedSuccess: catalog.reportedSuccess, + ...sharedJudgeGraders, +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/README.md b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/README.md new file mode 100644 index 00000000000..70a890d9daa --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/README.md @@ -0,0 +1,3 @@ +# Acme Support + +Acme helps shoppers track orders and billing questions. 
diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/novu-connect-auth-url.txt b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/novu-connect-auth-url.txt new file mode 100644 index 00000000000..1596d1d33d6 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/novu-connect-auth-url.txt @@ -0,0 +1 @@ +https://dashboard.novu.test/oauth/device/login-abc diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/package.json b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/package.json new file mode 100644 index 00000000000..39c7aa062c7 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/project/package.json @@ -0,0 +1,4 @@ +{ + "name": "acme-support", + "description": "Customer support tooling for Acme shoppers" +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/scenario.ts new file mode 100644 index 00000000000..9eec25e76d9 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/scenario.ts @@ -0,0 +1,40 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { type ConnectFlags, connectTape, type EvalScenario } from '../../kit.js'; + +const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); + +export const scenario: EvalScenario = { + id: 'dashboard-prompt-login', + category: 'authenticated', + description: 'Dashboard prompt must force --login and deliver auth URL from file.', + userPrompt: + "I'm signed in to the Novu dashboard. 
Add an agent to my app and connect it to Slack following the onboarding instructions.", + projectRoot: path.join(scenarioDir, 'project'), + scriptedAnswers: [ + { questionContains: 'channel', optionId: 'slack' }, + { questionContains: 'description', optionId: 'approve' }, + { questionContains: 'token', optionId: 'secure' }, + ], + tape: connectTape({ + requireLogin: true, + allowedChannels: ['slack'], + chunks: [ + { + stdout: `NOVU_CONNECT_AUTH_URL_FILE=${path.join(scenarioDir, 'project/novu-connect-auth-url.txt')}`, + }, + { stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.novu.test/slack/login-1' }, + { stdout: 'NOVU_CONNECT_SLACK_CONFIG_TOKEN_SAVED=1' }, + { stdout: 'NOVU_CONNECT_SLACK_AUTHORIZE_URL=https://slack.test/oauth/login-1' }, + { + stdout: [ + '✓ Your agent is live.', + ' Agent: Dashboard Agent (dash-agent-1)', + ' → Check Slack — your agent just messaged you.', + ' Dashboard: https://dashboard.novu.test/agents/dash-agent-1', + ].join('\n'), + }, + ], + exitCode: 0, + }), +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts new file mode 100644 index 00000000000..d10a0087628 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts @@ -0,0 +1,12 @@ +import { catalog, defineGraders, type RunResult } from '../../kit.js'; + +function polledAtLeast(result: RunResult, count: number): 'pass' | 'fail' { + return result.polledShellIds.length >= count ? 
'pass' : 'fail'; +} + +export const graders = defineGraders({ + noTimersNoWatchers: catalog.noTimersNoWatchers, + backgroundConnectShell: catalog.backgroundConnectShell, + polledMultipleTimes: (result) => polledAtLeast(result, 3), + reportedSuccess: catalog.reportedSuccess, +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/project/README.md b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/project/README.md new file mode 100644 index 00000000000..d8f969e0fb1 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/project/README.md @@ -0,0 +1,3 @@ +# Discipline Demo + +A simple project for testing Novu connect shell polling discipline. diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/project/package.json b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/project/package.json new file mode 100644 index 00000000000..0a03cad85d5 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/project/package.json @@ -0,0 +1,4 @@ +{ + "name": "discipline-demo", + "description": "Demo project for connect shell discipline" +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/scenario.ts new file mode 100644 index 00000000000..accc0a2bff3 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/scenario.ts @@ -0,0 +1,38 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { type ConnectFlags, connectTape, type EvalScenario } from '../../kit.js'; + +const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); + +export const scenario: EvalScenario = { + id: 'discipline-no-timers', + category: 'discipline', + description: 'Agent must poll BashOutput repeatedly without 
sleep/tail/grep watchers.', + userPrompt: 'Connect a Novu agent to Slack for this project.', + projectRoot: path.join(scenarioDir, 'project'), + scriptedAnswers: [ + { questionContains: 'channel', optionId: 'slack' }, + { questionContains: 'description', optionId: 'approve' }, + { questionContains: 'token', optionId: 'secure' }, + ], + tape: connectTape({ + requireNoLogin: true, + allowedChannels: ['slack'], + chunks: [ + { stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.novu.test/slack/discipline-1' }, + { stdout: 'Waiting for Slack App Configuration Token...' }, + { stdout: 'Still waiting for Slack App Configuration Token...' }, + { stdout: 'NOVU_CONNECT_SLACK_CONFIG_TOKEN_SAVED=1' }, + { stdout: 'NOVU_CONNECT_SLACK_AUTHORIZE_URL=https://slack.test/oauth/discipline-1' }, + { stdout: 'Waiting for Slack OAuth...' }, + { + stdout: [ + '✓ Your agent is live.', + ' Agent: Discipline Agent (discipline-agent-1)', + ' Claim your agent: https://dashboard.novu.test/claim/discipline-token', + ].join('\n'), + }, + ], + exitCode: 0, + }), +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/graders.ts new file mode 100644 index 00000000000..827a49f1851 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/graders.ts @@ -0,0 +1,11 @@ +import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; + +const mailtoUrl = 'mailto:connect+agent123@inbound.novu.test?subject=Novu%20Connect'; + +export const graders = defineGraders({ + noSecretKeyFlag: catalog.noSecretKeyFlag, + backgroundConnectShell: catalog.backgroundConnectShell, + pastedMailto: catalog.pastedLiteralUrl(mailtoUrl), + reportedSuccess: catalog.reportedSuccess, + ...sharedJudgeGraders, +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/project/README.md 
b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/project/README.md new file mode 100644 index 00000000000..d5d1fdeecb2 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/project/README.md @@ -0,0 +1,3 @@ +# Inbox Helper + +Inbox Helper answers member questions over email. diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/project/package.json b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/project/package.json new file mode 100644 index 00000000000..7019f8d3699 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/project/package.json @@ -0,0 +1,4 @@ +{ + "name": "inbox-helper", + "description": "Email assistant for members" +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts new file mode 100644 index 00000000000..fbff9108015 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts @@ -0,0 +1,37 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { type ConnectFlags, connectTape, type EvalScenario } from '../../kit.js'; + +const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); + +const inboundAddress = 'connect+agent123@inbound.novu.test'; +const mailtoUrl = `mailto:${inboundAddress}?subject=Novu%20Connect`; + +export const scenario: EvalScenario = { + id: 'email-handoff', + category: 'keyless', + description: 'Email channel delivers mailto and inbound address handoffs.', + userPrompt: 'Set up a Novu agent on email for this project.', + projectRoot: path.join(scenarioDir, 'project'), + scriptedAnswers: [ + { questionContains: 'channel', optionId: 'email' }, + { questionContains: 'description', optionId: 'approve' }, + ], + tape: connectTape({ + requireNoLogin: true, + allowedChannels: ['email'], + chunks: 
[ + { stdout: `NOVU_CONNECT_INBOUND_ADDRESS=${inboundAddress}` }, + { stdout: `NOVU_CONNECT_MAILTO=${mailtoUrl}` }, + { + stdout: [ + '✓ Your agent is live.', + ' Agent: Email Agent (email-agent-1)', + ' → Check Email — your agent just messaged you.', + ' Claim your agent: https://dashboard.novu.test/claim/email-token', + ].join('\n'), + }, + ], + exitCode: 0, + }), +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/graders.ts new file mode 100644 index 00000000000..354d776515e --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/graders.ts @@ -0,0 +1,18 @@ +import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; + +const setupUrl = 'https://setup.novu.test/slack/abc123'; +const authorizeUrl = 'https://slack.test/oauth/authorize/xyz'; +const claimUrl = 'https://dashboard.novu.test/claim/token-abc'; + +export const graders = defineGraders({ + noSecretKeyFlag: catalog.noSecretKeyFlag, + backgroundConnectShell: catalog.backgroundConnectShell, + usedPickerForDecisions: catalog.usedPickerForDecisions, + confirmedBeforeRun: catalog.confirmedBeforeRun, + usedSecureTokenPath: catalog.usedSecureTokenPath, + pastedSetupUrl: catalog.pastedLiteralUrl(setupUrl), + pastedAuthorizeUrl: catalog.pastedLiteralUrl(authorizeUrl), + reportedClaimLink: catalog.pastedLiteralUrl(claimUrl), + reportedSuccess: catalog.reportedSuccess, + ...sharedJudgeGraders, +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/project/README.md b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/project/README.md new file mode 100644 index 00000000000..f42778f6278 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/project/README.md @@ -0,0 +1,5 @@ +# Cellar + +Cellar is a wine bar inventory app for staff to 
check stock levels, par, vendor details, purchase orders, and invoices. + +The audience is wine bar staff, not developers. diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/project/package.json b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/project/package.json new file mode 100644 index 00000000000..26237231fb5 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/project/package.json @@ -0,0 +1,9 @@ +{ + "name": "cellar-inventory", + "description": "Inventory management for Cellar wine bar staff", + "keywords": [ + "wine", + "inventory", + "hospitality" + ] +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/scenario.ts new file mode 100644 index 00000000000..d3279792f1b --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/scenario.ts @@ -0,0 +1,22 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { buildDefaultTape, type ConnectFlags, type EvalScenario } from '../../kit.js'; + +const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); + +export const scenario: EvalScenario = { + id: 'keyless-slack-secure', + category: 'keyless', + description: 'Keyless Slack secure setup path with background shell polling.', + userPrompt: 'Help me connect a Novu managed agent to Slack for this project.', + projectRoot: path.join(scenarioDir, 'project'), + scriptedAnswers: [ + { questionContains: 'channel', optionId: 'slack' }, + { questionContains: 'description', optionId: 'approve' }, + { questionContains: 'token', optionId: 'secure' }, + ], + tape: buildDefaultTape({ + requireNoLogin: true, + allowedChannels: ['slack'], + }), +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts 
b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts new file mode 100644 index 00000000000..4c57f4fd5ed --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts @@ -0,0 +1,7 @@ +import { catalog, defineGraders } from '../../kit.js'; + +export const graders = defineGraders({ + noConnectCommands: catalog.noConnectCommands, + noConnectOnKeylessWhatsapp: catalog.noConnectOnKeylessWhatsapp, + usedPickerForDecisions: catalog.usedPickerForDecisions, +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/project/README.md b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/project/README.md new file mode 100644 index 00000000000..cfcabf50b1a --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/project/README.md @@ -0,0 +1,3 @@ +# Shop Chat + +Shop Chat helps customers buy products over WhatsApp. 
diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/project/package.json b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/project/package.json new file mode 100644 index 00000000000..c08b9e136c7 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/project/package.json @@ -0,0 +1,4 @@ +{ + "name": "shop-chat", + "description": "WhatsApp shopping assistant" +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/scenario.ts new file mode 100644 index 00000000000..98dbdbd13bc --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/scenario.ts @@ -0,0 +1,14 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { type ConnectFlags, type EvalScenario } from '../../kit.js'; + +const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); + +export const scenario: EvalScenario = { + id: 'keyless-whatsapp-redirect', + category: 'keyless', + description: 'Keyless WhatsApp/Teams must redirect to dashboard without running connect.', + userPrompt: 'Connect a Novu agent to WhatsApp for this project.', + projectRoot: path.join(scenarioDir, 'project'), + scriptedAnswers: [{ questionContains: 'channel', optionId: 'dashboard' }], +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts new file mode 100644 index 00000000000..5fb8ee96457 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts @@ -0,0 +1,14 @@ +import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; + +export const graders = defineGraders({ + descriptionExcludesInfraTokens: 
catalog.descriptionExcludesInfraTokens([ + 'postgres', + 'resend', + 'mongodb', + 'github', + 'sentry', + ]), + descriptionIncludesAudience: catalog.descriptionIncludesTokens(['staff', 'wine']), + confirmedBeforeRun: catalog.confirmedBeforeRun, + ...sharedJudgeGraders, +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/project/README.md b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/project/README.md new file mode 100644 index 00000000000..4ca3df73f98 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/project/README.md @@ -0,0 +1,7 @@ +# Cellar Backend + +Cellar uses PostgreSQL for inventory storage and Resend for transactional email delivery. + +Cellar's wine bar staff use the app to check stock levels, par, vendor details, purchase orders, and invoices. + +The end users are wine bar staff, not developers. diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/project/package.json b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/project/package.json new file mode 100644 index 00000000000..c23f7f9a951 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/project/package.json @@ -0,0 +1,8 @@ +{ + "name": "cellar-backend", + "description": "Wine bar inventory platform", + "dependencies": { + "pg": "^8.0.0", + "resend": "^4.0.0" + } +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts new file mode 100644 index 00000000000..4561c75b93e --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts @@ -0,0 +1,35 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { type ConnectFlags, connectTape, type 
EvalScenario } from '../../kit.js'; + +const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); + +export const scenario: EvalScenario = { + id: 'persona-infra-exclusion', + category: 'inference', + description: 'Agent description must exclude infra tokens and name the end-user audience.', + userPrompt: 'Connect a Novu agent to Slack for this project.', + projectRoot: path.join(scenarioDir, 'project'), + scriptedAnswers: [ + { questionContains: 'channel', optionId: 'slack' }, + { questionContains: 'description', optionId: 'approve' }, + { questionContains: 'token', optionId: 'secure' }, + ], + tape: connectTape({ + requireNoLogin: true, + allowedChannels: ['slack'], + chunks: [ + { stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.novu.test/slack/persona-1' }, + { stdout: 'NOVU_CONNECT_SLACK_CONFIG_TOKEN_SAVED=1' }, + { stdout: 'NOVU_CONNECT_SLACK_AUTHORIZE_URL=https://slack.test/oauth/persona-1' }, + { + stdout: [ + '✓ Your agent is live.', + ' Agent: Persona Agent (persona-agent-1)', + ' Claim your agent: https://dashboard.novu.test/claim/persona-token', + ].join('\n'), + }, + ], + exitCode: 0, + }), +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts new file mode 100644 index 00000000000..4ae74f03626 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts @@ -0,0 +1,10 @@ +import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; + +export const graders = defineGraders({ + usedLoginWhenDashboardPrompt: catalog.usedLoginWhenDashboardPrompt, + killedFirstConnectShell: catalog.killedFirstConnectShell, + reranWithSlackToken: catalog.reranWithSlackToken, + pastedAuthorizeUrl: catalog.pastedLiteralUrl('https://slack.test/oauth/rerun-token'), + reportedSuccess: catalog.reportedSuccess, + ...sharedJudgeGraders, +}); diff --git 
a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/README.md b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/README.md new file mode 100644 index 00000000000..f7be4c97cad --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/README.md @@ -0,0 +1,3 @@ +# Ops Slack + +Ops Slack helps operations staff coordinate daily tasks. diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/novu-connect-auth-url.txt b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/novu-connect-auth-url.txt new file mode 100644 index 00000000000..f05e71fc560 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/novu-connect-auth-url.txt @@ -0,0 +1 @@ +https://dashboard.novu.test/oauth/device/rerun-abc diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/package.json b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/package.json new file mode 100644 index 00000000000..a3d7043f94f --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/project/package.json @@ -0,0 +1,4 @@ +{ + "name": "ops-slack", + "description": "Slack assistant for operations staff" +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts new file mode 100644 index 00000000000..8770cfc6c72 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts @@ -0,0 +1,47 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { type ConnectFlags, connectTape, type EvalScenario } from '../../kit.js'; + +const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); + 
+export const scenario: EvalScenario = { + id: 'slack-in-chat-rerun', + category: 'authenticated', + description: 'Slack in_chat path kills first shell and reruns with --slack-config-token.', + userPrompt: "I'm signed in to the Novu dashboard. Connect my agent to Slack.", + projectRoot: path.join(scenarioDir, 'project'), + scriptedAnswers: [ + { questionContains: 'channel', optionId: 'slack' }, + { questionContains: 'description', optionId: 'approve' }, + { questionContains: 'token', optionId: 'in_chat' }, + ], + followUpMessages: ['Here is my Slack App Configuration Token: xoxe.xoxp-test-token'], + followUpOnOptionId: 'in_chat', + tape: connectTape({ + requireLogin: true, + allowedChannels: ['slack'], + chunks: [ + { + stdout: `NOVU_CONNECT_AUTH_URL_FILE=${path.join(scenarioDir, 'project/novu-connect-auth-url.txt')}`, + }, + { + stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.novu.test/slack/rerun-1', + when: (flags) => !flags.slackConfigToken, + }, + { + stdout: 'NOVU_CONNECT_SLACK_AUTHORIZE_URL=https://slack.test/oauth/rerun-token', + when: (flags) => Boolean(flags.slackConfigToken), + }, + { + stdout: [ + '✓ Your agent is live.', + ' Agent: Slack Rerun Agent (slack-rerun-1)', + ' → Check Slack — your agent just messaged you.', + ' Dashboard: https://dashboard.novu.test/agents/slack-rerun-1', + ].join('\n'), + when: (flags) => Boolean(flags.slackConfigToken), + }, + ], + exitCode: 0, + }), +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/graders.ts new file mode 100644 index 00000000000..53c78770ba3 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/graders.ts @@ -0,0 +1,10 @@ +import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; + +export const graders = defineGraders({ + noSecretKeyFlag: catalog.noSecretKeyFlag, + backgroundConnectShell: 
catalog.backgroundConnectShell, + qrHostAware: catalog.qrHostAware, + pastedSetupUrl: catalog.pastedLiteralUrl('https://setup.novu.test/telegram/abc'), + reportedSuccess: catalog.reportedSuccess, + ...sharedJudgeGraders, +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/README.md b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/README.md new file mode 100644 index 00000000000..720e5ceb2b1 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/README.md @@ -0,0 +1,3 @@ +# Cellar Telegram + +Cellar helps guests ask wine questions on Telegram. diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/package.json b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/package.json new file mode 100644 index 00000000000..2a330626652 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/package.json @@ -0,0 +1,4 @@ +{ + "name": "cellar-telegram", + "description": "Telegram support bot for wine bar guests" +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/telegram-setup-qr.png b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/telegram-setup-qr.png new file mode 100644 index 00000000000..087b77518d5 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/project/telegram-setup-qr.png @@ -0,0 +1 @@ +Telegram setup QR placeholder diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts new file mode 100644 index 00000000000..361f02310c0 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts @@ -0,0 +1,40 @@ +import path from 'node:path'; 
+import { fileURLToPath } from 'node:url'; +import { type ConnectFlags, connectTape, type EvalScenario } from '../../kit.js'; + +const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); +const qrPath = path.join(scenarioDir, 'project/telegram-setup-qr.png'); + +export const scenario: EvalScenario = { + id: 'telegram-secure-qr', + category: 'keyless', + description: 'Telegram secure setup with host-aware QR delivery via open.', + userPrompt: 'Connect a Novu agent to Telegram for this project.', + projectRoot: path.join(scenarioDir, 'project'), + scriptedAnswers: [ + { questionContains: 'channel', optionId: 'telegram' }, + { questionContains: 'description', optionId: 'approve' }, + { questionContains: 'token', optionId: 'secure' }, + ], + tape: connectTape({ + requireNoLogin: true, + allowedChannels: ['telegram'], + chunks: [ + { stdout: 'NOVU_CONNECT_TELEGRAM_BOTFATHER_URL=https://t.me/botfather' }, + { stdout: 'NOVU_CONNECT_TELEGRAM_SETUP_URL=https://setup.novu.test/telegram/abc' }, + { stdout: `NOVU_CONNECT_TELEGRAM_SETUP_QR_PNG=${qrPath}` }, + { stdout: 'NOVU_CONNECT_TELEGRAM_DEEPLINK_URL=https://t.me/cellar_support_bot?start=connect' }, + { stdout: 'NOVU_CONNECT_TELEGRAM_BOT_USERNAME=cellar_support_bot' }, + { stdout: `NOVU_CONNECT_TELEGRAM_DEEPLINK_QR_PNG=${qrPath}` }, + { + stdout: [ + '✓ Your agent is live.', + ' Agent: Telegram Agent (telegram-agent-1)', + ' → Check Telegram — your agent just messaged you.', + ' Claim your agent: https://dashboard.novu.test/claim/telegram-token', + ].join('\n'), + }, + ], + exitCode: 0, + }), +}; diff --git a/libs/agent-evals/src/suites/agent-onboarding/tape.ts b/libs/agent-evals/src/suites/agent-onboarding/tape.ts new file mode 100644 index 00000000000..2bbf8d5714e --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/tape.ts @@ -0,0 +1,45 @@ +import type { Tape, TapeChunk } from '../../core/types.js'; +import { type ConnectFlags, type ConnectValidationOptions, connectValidate } from 
'./connect-parser.js'; + +export type ConnectTapeOptions = ConnectValidationOptions & { + chunks: Array>; + exitCode?: number; +}; + +/** Build a connect tape, wiring connect-specific validation into the generic `validate` hook. */ +export function connectTape(options: ConnectTapeOptions): Tape { + return { + chunks: options.chunks, + exitCode: options.exitCode ?? 0, + validate: connectValidate({ + requireLogin: options.requireLogin, + requireNoLogin: options.requireNoLogin, + allowedChannels: options.allowedChannels, + }), + }; +} + +/** Default keyless Slack tape used by the canonical scenario. */ +export function buildDefaultTape(overrides?: Partial): Tape { + const defaultChunks: Array> = [ + { stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.novu.test/slack/abc123' }, + { stdout: 'NOVU_CONNECT_SLACK_CONFIG_TOKEN_SAVED=1' }, + { stdout: 'NOVU_CONNECT_SLACK_AUTHORIZE_URL=https://slack.test/oauth/authorize/xyz' }, + { + stdout: [ + '✓ Your agent is live.', + ' Agent: Demo Agent (demo-agent-1)', + ' → Check Slack — your agent just messaged you.', + ' Claim your agent: https://dashboard.novu.test/claim/token-abc', + ].join('\n'), + }, + ]; + + return connectTape({ + chunks: overrides?.chunks ?? defaultChunks, + exitCode: overrides?.exitCode ?? 0, + requireNoLogin: overrides?.requireNoLogin ?? true, + allowedChannels: overrides?.allowedChannels ?? 
['slack'], + requireLogin: overrides?.requireLogin, + }); +} diff --git a/libs/agent-evals/src/suites/registry.ts b/libs/agent-evals/src/suites/registry.ts new file mode 100644 index 00000000000..cb28f5b62f6 --- /dev/null +++ b/libs/agent-evals/src/suites/registry.ts @@ -0,0 +1,16 @@ +import type { ParsedCommand, Suite } from '../core/types.js'; +import { agentOnboardingSuite } from './agent-onboarding/index.js'; + +export const suites: Record> = { + [agentOnboardingSuite.id]: agentOnboardingSuite as unknown as Suite, +}; + +export const DEFAULT_SUITE = agentOnboardingSuite.id; + +export function getSuite(id: string): Suite | undefined { + return suites[id]; +} + +export function listSuiteIds(): string[] { + return Object.keys(suites); +} diff --git a/libs/agent-evals/tsconfig.json b/libs/agent-evals/tsconfig.json new file mode 100644 index 00000000000..d9fbdf9a4e1 --- /dev/null +++ b/libs/agent-evals/tsconfig.json @@ -0,0 +1,17 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "module": "ESNext", + "moduleResolution": "bundler", + "target": "ES2022", + "lib": ["ES2022"], + "strict": true, + "strictNullChecks": true, + "esModuleInterop": true, + "skipLibCheck": true, + "resolveJsonModule": true, + "noEmit": true, + "types": ["node"] + }, + "include": ["src/**/*.ts"] +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1d1674c9e56..6a174a17208 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -2465,6 +2465,31 @@ importers: specifier: ^4.49.0 version: 4.68.1 + libs/agent-evals: + dependencies: + '@ai-sdk/anthropic': + specifier: ^3.0.10 + version: 3.0.13(zod@3.25.76) + ai: + specifier: 6.0.50 + version: 6.0.50(zod@3.25.76) + dotenv: + specifier: ^16.6.1 + version: 16.6.1 + zod: + specifier: ^3.23.8 + version: 3.25.76 + devDependencies: + '@types/node': + specifier: ^22.0.0 + version: 22.15.13 + tsx: + specifier: 4.16.2 + version: 4.16.2 + typescript: + specifier: 5.6.2 + version: 5.6.2 + libs/application-generic: dependencies: 
'@anthropic-ai/aws-sdk': @@ -19242,9 +19267,6 @@ packages: resolution: {integrity: sha512-g0QYk1dZBxGwk+Ngc+ltRH2IBp2f7zBkBMBJZCDerh6EhlhSR6+9irMCuT/09zD6qkarHUSn529sK/yL4S27mg==} engines: {node: '>= 0.4'} - get-tsconfig@4.7.5: - resolution: {integrity: sha512-ZCuZCnlqNzjb4QprAzXKdpp/gh6KTxSJuw3IBsPnV/7fV4NxC9ckB+vPTt8w7fJA0TaSD7c55BR47JD6MEDyDw==} - get-tsconfig@4.8.0: resolution: {integrity: sha512-Pgba6TExTZ0FJAn1qkJAjIeKoDJ3CsI2ChuLohJnZl/tTU8MVrq3b+2t5UOPfRa4RMsorClBjJALkJUMjG1PAw==} @@ -27830,6 +27852,12 @@ snapshots: '@ai-sdk/provider-utils': 4.0.6(zod@3.25.20) zod: 3.25.20 + '@ai-sdk/anthropic@3.0.13(zod@3.25.76)': + dependencies: + '@ai-sdk/provider': 3.0.3 + '@ai-sdk/provider-utils': 4.0.6(zod@3.25.76) + zod: 3.25.76 + '@ai-sdk/gateway@3.0.14(zod@4.3.5)': dependencies: '@ai-sdk/provider': 3.0.3 @@ -27851,6 +27879,13 @@ snapshots: '@vercel/oidc': 3.1.0 zod: 3.25.20 + '@ai-sdk/gateway@3.0.23(zod@3.25.76)': + dependencies: + '@ai-sdk/provider': 3.0.5 + '@ai-sdk/provider-utils': 4.0.9(zod@3.25.76) + '@vercel/oidc': 3.1.0 + zod: 3.25.76 + '@ai-sdk/gateway@3.0.23(zod@4.3.5)': dependencies: '@ai-sdk/provider': 3.0.5 @@ -27900,6 +27935,13 @@ snapshots: eventsource-parser: 3.0.6 zod: 3.25.20 + '@ai-sdk/provider-utils@4.0.6(zod@3.25.76)': + dependencies: + '@ai-sdk/provider': 3.0.3 + '@standard-schema/spec': 1.1.0 + eventsource-parser: 3.0.6 + zod: 3.25.76 + '@ai-sdk/provider-utils@4.0.6(zod@4.3.5)': dependencies: '@ai-sdk/provider': 3.0.3 @@ -27914,6 +27956,13 @@ snapshots: eventsource-parser: 3.0.6 zod: 3.25.20 + '@ai-sdk/provider-utils@4.0.9(zod@3.25.76)': + dependencies: + '@ai-sdk/provider': 3.0.5 + '@standard-schema/spec': 1.1.0 + eventsource-parser: 3.0.6 + zod: 3.25.76 + '@ai-sdk/provider-utils@4.0.9(zod@4.3.5)': dependencies: '@ai-sdk/provider': 3.0.5 @@ -31657,7 +31706,7 @@ snapshots: '@opentelemetry/api': 1.9.0 '@opentelemetry/semantic-conventions': 1.39.0 '@standard-schema/spec': 1.1.0 - better-call: 1.3.2(zod@4.3.5) + better-call: 1.3.2(zod@4.3.6) 
jose: 6.1.3 kysely: 0.28.17 nanostores: 1.2.0 @@ -31699,7 +31748,7 @@ snapshots: fast-xml-parser: 5.7.3 jose: 6.1.3 samlify: 2.13.1 - zod: 4.3.5 + zod: 4.3.6 '@better-auth/sso@1.5.6(@better-auth/core@1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0))(@better-auth/utils@0.3.1)(better-auth@1.5.6(d9d7da76424e2ace7367f5b90edcc4b7))(better-call@1.3.2(zod@4.3.6))': dependencies: @@ -44701,6 +44750,14 @@ snapshots: '@opentelemetry/api': 1.9.0 zod: 3.25.20 + ai@6.0.50(zod@3.25.76): + dependencies: + '@ai-sdk/gateway': 3.0.23(zod@3.25.76) + '@ai-sdk/provider': 3.0.5 + '@ai-sdk/provider-utils': 4.0.9(zod@3.25.76) + '@opentelemetry/api': 1.9.0 + zod: 3.25.76 + ai@6.0.50(zod@4.3.5): dependencies: '@ai-sdk/gateway': 3.0.23(zod@4.3.5) @@ -45481,7 +45538,7 @@ snapshots: '@better-fetch/fetch': 1.1.21 '@noble/ciphers': 2.1.1 '@noble/hashes': 2.0.1 - better-call: 1.3.2(zod@4.3.5) + better-call: 1.3.2(zod@4.3.6) defu: 6.1.6 jose: 6.1.3 kysely: 0.28.17 @@ -45500,15 +45557,6 @@ snapshots: - '@cloudflare/workers-types' - '@opentelemetry/api' - better-call@1.3.2(zod@4.3.5): - dependencies: - '@better-auth/utils': 0.3.1 - '@better-fetch/fetch': 1.1.21 - rou3: 0.7.12 - set-cookie-parser: 3.1.0 - optionalDependencies: - zod: 4.3.5 - better-call@1.3.2(zod@4.3.6): dependencies: '@better-auth/utils': 0.3.1 @@ -48882,10 +48930,6 @@ snapshots: es-errors: 1.3.0 get-intrinsic: 1.3.0 - get-tsconfig@4.7.5: - dependencies: - resolve-pkg-maps: 1.0.0 - get-tsconfig@4.8.0: dependencies: resolve-pkg-maps: 1.0.0 @@ -58389,7 +58433,7 @@ snapshots: tsx@4.16.2: dependencies: esbuild: 0.28.1 - get-tsconfig: 4.7.5 + get-tsconfig: 4.8.0 optionalDependencies: fsevents: 2.3.3 From d3d61fdb7a048f922ecbaddb9930a691c0d0b6bb Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Wed, 17 Jun 2026 10:30:25 +0300 Subject: [PATCH 02/19] feat(libs): refine agent-evals harness and wire shared doc export fixes 
NV-8059 Co-authored-by: Cursor --- libs/agent-evals/README.md | 134 +++++++++++++++++- libs/agent-evals/package.json | 1 + libs/agent-evals/src/core/graders.ts | 62 ++++++-- libs/agent-evals/src/core/judge.ts | 29 +++- libs/agent-evals/src/core/reporters.ts | 81 +++++++---- .../src/core/resolve-package-file.ts | 7 + libs/agent-evals/src/core/run-agent.ts | 17 ++- libs/agent-evals/src/core/runner.ts | 62 ++++++-- libs/agent-evals/src/core/types.ts | 13 +- libs/agent-evals/src/index.ts | 3 +- .../src/suites/agent-onboarding/catalog.ts | 121 ++++++++++------ .../suites/agent-onboarding/connect-parser.ts | 69 ++++++++- .../src/suites/agent-onboarding/index.ts | 6 +- packages/shared/package.json | 4 +- pnpm-lock.yaml | 3 + 15 files changed, 482 insertions(+), 130 deletions(-) create mode 100644 libs/agent-evals/src/core/resolve-package-file.ts diff --git a/libs/agent-evals/README.md b/libs/agent-evals/README.md index 674c084b4a0..b8610091e07 100644 --- a/libs/agent-evals/README.md +++ b/libs/agent-evals/README.md @@ -2,7 +2,139 @@ Behavioral eval harness for Novu coding-agent playbooks. Runs a real LLM agent against scripted scenarios with a mocked CLI, then grades whether the agent follows the playbook using deterministic structural checks plus optional LLM-as-judge graders for fuzzy criteria. -The harness is **suite-based**: `src/core/` is playbook-agnostic, and each suite under `src/suites/` plugs in its own system prompt, command parser, scenarios, and grader catalog. The first suite, `agent-onboarding`, tests [`packages/shared/docs/agent-onboarding.md`](../../packages/shared/docs/agent-onboarding.md) (the `npx novu connect` flow). +The harness is **suite-based**: `src/core/` is playbook-agnostic, and each suite under `src/suites/` plugs in its own system prompt, command parser, scenarios, and grader catalog. 
The first suite, `agent-onboarding`, tests `@novu/shared/docs/agent-onboarding.md` (the `npx novu connect` flow), resolved via the `@novu/shared` package export. + +## Architecture + +### Layer overview + +The package splits into three layers: a CLI entrypoint, a playbook-agnostic **core harness**, and pluggable **suites** that supply scenarios, tapes, and graders. + +```mermaid +flowchart TB + subgraph entry["Entry (src/index.ts)"] + CLI["CLI flags\n(--suite, --scenario, --judge, …)"] + Registry["suites/registry.ts"] + end + + subgraph core["Core harness (src/core/)"] + Runner["runner.ts\nload → run → grade → score"] + RunAgent["run-agent.ts\nAI SDK tool-calling loop"] + Tools["tools.ts\nBash · BashOutput · AskUserQuestion · Read"] + MockShell["mock-shell.ts\nTape replay engine"] + Recorder["recorder.ts\nRunResult builder"] + Graders["graders.ts\ncontains · matches · judge"] + Judge["judge.ts\nLLM-as-judge"] + Reporters["reporters.ts\nconsole + scores JSON"] + end + + subgraph suite["Suite (src/suites/{name}/)"] + SuiteObj["Suite contract\nsystem prompt · parser · hooks"] + Scenarios["scenarios/{id}/\nscenario.ts · graders.ts · project/"] + Parser["CommandParser\n(e.g. connect-parser.ts)"] + Tape["Tape\nscripted CLI stdout chunks"] + end + + CLI --> Registry + Registry --> Runner + Runner --> RunAgent + RunAgent --> Tools + Tools --> MockShell + Tools --> Recorder + RunAgent --> Recorder + Runner --> Graders + Graders --> Judge + Runner --> Reporters + + SuiteObj --> RunAgent + Parser --> MockShell + Tape --> MockShell + Scenarios --> Runner + Scenarios --> Graders +``` + +### Execution flow + +Each scenario runs the real agent against a mocked environment, then grades the recorded behavior. 
+ +```mermaid +sequenceDiagram + participant User as CLI + participant Runner as runner.ts + participant Agent as run-agent.ts + participant LLM as Anthropic model + participant Tools as Harness tools + participant Shell as MockShellEngine + participant Rec as RunRecorder + participant Grade as graders.ts + participant Out as reporters.ts + + User->>Runner: runAllEvaluations(suite, options) + loop each scenario + Runner->>Agent: runAgentScenario(suite, scenario) + Agent->>Agent: resolveSystemPrompt(playbook doc) + Agent->>LLM: generateText(system + user prompt, tools) + loop tool-calling steps + LLM->>Tools: Bash / BashOutput / AskUserQuestion / Read + alt tracked command (e.g. novu connect) + Tools->>Shell: createShell → replay tape chunks + Shell-->>Tools: scripted stdout + Tools->>Rec: record tracked command, URLs, polls + else AskUserQuestion + Tools->>Rec: pick scriptedAnswers[answerIndex] + else Read fixture + Tools->>Rec: read scenario project/ files + end + Tools-->>LLM: tool result + end + opt followUpMessages / followUpOnOptionId + Agent->>LLM: inject scripted user follow-up + end + Agent->>Rec: build() → RunResult + Runner->>Grade: gradeRun(scenario graders, RunResult) + alt deterministic grader + Grade->>Grade: contains / matches on transcript and toolCalls + else judge grader (--judge) + Grade->>LLM: runJudge(prompt, context) + end + Grade-->>Runner: pass / fail / skip per grader + Runner->>Runner: scoreFromOutcomes → ScenarioScore + end + Runner->>Out: createGraderProgressReporter + writeScoresFile + Out-->>User: per-grader progress lines and scores-{suite}.json +``` + +### Key concepts + +| Concept | Role | +| --- | --- | +| **Suite** | Plugs a playbook (system prompt), `CommandParser`, scenario list, and optional hooks (`onTrackedCommand`, sentinel URL patterns) into the generic harness. | +| **Scenario** | One eval case: user prompt, fixture `project/`, scripted user answers, optional CLI **tape**, and follow-up messages. 
| +| **Tape** | Ordered stdout chunks replayed when the agent runs a tracked command; `when(parsed)` can branch on parsed flags. | +| **CommandParser** | Decides which shell commands are tracked (e.g. `novu connect`) and parses them for tape selection and validation. | +| **RunResult** | Everything the agent did: tool calls, assistant text, captured URLs, polled/killed shells, suite metadata. | +| **Graders** | **Deterministic** checks on `RunResult` structure, or **judge** graders that call a second LLM pass for fuzzy criteria. | + +### Mock CLI model + +The harness simulates a Claude Code–like environment without running real `novu connect`: + +```mermaid +stateDiagram-v2 + [*] --> Bash: agent runs command + Bash --> Tracked: parser.matches(command) + Bash --> Untracked: other commands + Tracked --> TapeReplay: scenario.tape supplies chunks + TapeReplay --> Background: run_in_background=true + TapeReplay --> Foreground: synchronous run + Background --> BashOutput: agent polls shell id + BashOutput --> TapeReplay: emit next chunk until exitCode + Untracked --> Stub: generic stdout or reject watchers + note right of TapeReplay + connect-parser validates flags; + validate() can fail the command + end note +``` ## Structure diff --git a/libs/agent-evals/package.json b/libs/agent-evals/package.json index 3d6228224f0..5d5d52fded1 100644 --- a/libs/agent-evals/package.json +++ b/libs/agent-evals/package.json @@ -14,6 +14,7 @@ "check:fix": "biome check --write ." 
}, "dependencies": { + "@novu/shared": "workspace:*", "@ai-sdk/anthropic": "^3.0.10", "ai": "6.0.50", "dotenv": "^16.6.1", diff --git a/libs/agent-evals/src/core/graders.ts b/libs/agent-evals/src/core/graders.ts index 737455d95f1..2b65106491b 100644 --- a/libs/agent-evals/src/core/graders.ts +++ b/libs/agent-evals/src/core/graders.ts @@ -1,4 +1,14 @@ -import type { GraderDefinition, GraderFn, GraderResult, RunResult, ToolCallRecord } from './types.js'; +import { runJudge } from './judge.js'; +import type { GraderDefinition, GraderFn, GraderOutcome, GraderResult, RunResult, ToolCallRecord } from './types.js'; + +/** Helper for graders that want to explain a failure inline. */ +export function fail(reason: string): GraderOutcome { + return { status: 'fail', reason }; +} + +function toOutcome(value: GraderResult | GraderOutcome): GraderOutcome { + return typeof value === 'string' ? { status: value } : value; +} export function defineGraders>( graders: T @@ -47,36 +57,58 @@ export function transcriptText(result: RunResult): string { export function judge(prompt: string, context: (result: RunResult) => string): GraderDefinition { return { kind: 'judge', - run: async (result) => { - const { runJudge } = await import('./judge.js'); - - return runJudge(prompt, context(result)); - }, + run: async (result) => runJudge(prompt, context(result)), }; } +export type GradeRunOptions = { + judgeEnabled: boolean; + onGraderStart?: (name: string, kind: GraderDefinition['kind']) => void; + onGraderResult?: (name: string, outcome: GraderOutcome, kind: GraderDefinition['kind']) => void; +}; + export async function gradeRun( graders: Record, result: RunResult, - options: { judgeEnabled: boolean } -): Promise> { - const outcomes: Record = {}; + options: GradeRunOptions +): Promise> { + const outcomes: Record = {}; + const entries = Object.entries(graders); - for (const [name, definition] of Object.entries(graders)) { + for (const [name, definition] of entries) { if (definition.kind === 
'judge' && !options.judgeEnabled) { - outcomes[name] = 'skip'; + outcomes[name] = { status: 'skip' }; + options.onGraderResult?.(name, outcomes[name], definition.kind); continue; } - outcomes[name] = await definition.run(result); + options.onGraderStart?.(name, definition.kind); + outcomes[name] = toOutcome(await definition.run(result)); + options.onGraderResult?.(name, outcomes[name], definition.kind); } return outcomes; } -export function scoreFromOutcomes(outcomes: Record): number { - const considered = Object.values(outcomes).filter((value) => value !== 'skip'); - const passed = considered.filter((value) => value === 'pass').length; +export function formatGraderStatus(status: string, kind: GraderDefinition['kind'], judgeEnabled: boolean): string { + const isJudgeGrader = kind === 'judge'; + + if (status.toUpperCase() === 'SKIP' && isJudgeGrader && !judgeEnabled) { + return 'SKIP (judge disabled)'; + } + + if (isJudgeGrader && judgeEnabled) { + const label = status === 'evaluating…' ? 'evaluating…' : status.toUpperCase(); + + return `${label} (judge)`; + } + + return status === 'evaluating…' ? 
'evaluating…' : status.toUpperCase(); +} + +export function scoreFromOutcomes(outcomes: Record): number { + const considered = Object.values(outcomes).filter((value) => value.status !== 'skip'); + const passed = considered.filter((value) => value.status === 'pass').length; if (considered.length === 0) { return 0; diff --git a/libs/agent-evals/src/core/judge.ts b/libs/agent-evals/src/core/judge.ts index cd90592eed8..702b53dcd5c 100644 --- a/libs/agent-evals/src/core/judge.ts +++ b/libs/agent-evals/src/core/judge.ts @@ -1,6 +1,6 @@ import { anthropic } from '@ai-sdk/anthropic'; import { generateText } from 'ai'; -import type { GraderResult } from './types.js'; +import type { GraderOutcome, GraderResult } from './types.js'; let judgeModel = 'claude-sonnet-4-5'; let judgeEnabled = false; @@ -10,16 +10,18 @@ export function configureJudge(options: { enabled: boolean; model?: string }): v judgeModel = options.model ?? judgeModel; } -export async function runJudge(prompt: string, context: string): Promise { +export async function runJudge(prompt: string, context: string): Promise { if (!judgeEnabled || !process.env.ANTHROPIC_API_KEY) { - return 'skip'; + return { status: 'skip' }; } const result = await generateText({ model: anthropic(judgeModel), prompt: [ 'You are grading an AI agent run against a coding-agent playbook.', - 'Answer with exactly YES or NO.', + 'First, write one sentence of reasoning explaining your verdict.', + 'Then, on the final line, answer with exactly YES, NO, or UNKNOWN.', + 'Answer UNKNOWN only if the context does not contain enough information to judge the question.', '', `Question: ${prompt}`, '', @@ -28,5 +30,22 @@ export async function runJudge(prompt: string, context: string): Promise line.trim()) + .filter((line) => line.length > 0); + + const verdictLine = lines.at(-1) ?? 
''; + const verdict = verdictLine.toUpperCase(); + const reason = lines.slice(0, -1).join(' ').trim() || undefined; + + // Escape hatch: a starved judge abstains instead of counting as a failure. + if (verdict.startsWith('UNKNOWN')) { + return { status: 'skip' }; + } + + const status: GraderResult = verdict.startsWith('YES') ? 'pass' : 'fail'; + + return status === 'fail' ? { status, reason } : { status }; } diff --git a/libs/agent-evals/src/core/reporters.ts b/libs/agent-evals/src/core/reporters.ts index ac17c0a1d86..7c4e9d1a75f 100644 --- a/libs/agent-evals/src/core/reporters.ts +++ b/libs/agent-evals/src/core/reporters.ts @@ -1,38 +1,9 @@ import fs from 'node:fs/promises'; import path from 'node:path'; -import type { ScenarioScore } from './types.js'; +import { formatGraderStatus } from './graders.js'; +import type { GraderDefinition, GraderOutcome, ScenarioScore } from './types.js'; import { PACKAGE_ROOT } from './types.js'; -function formatResult(value: 'pass' | 'fail' | 'skip'): string { - if (value === 'pass') { - return 'PASS'; - } - - if (value === 'fail') { - return 'FAIL'; - } - - return 'SKIP'; -} - -export function printConsoleReport(suiteId: string, scores: ScenarioScore[], judgeEnabled: boolean): void { - console.log(`\n${suiteId} eval results\n`); - - for (const score of scores) { - console.log(`${score.scenarioId} (${score.category}) — ${(score.score * 100).toFixed(1)}%`); - - for (const [name, result] of Object.entries(score.graders)) { - const suffix = result === 'skip' && !judgeEnabled ? 
' (judge disabled)' : ''; - console.log(` - ${name}: ${formatResult(result)}${suffix}`); - } - - console.log(''); - } - - const average = scores.reduce((sum, item) => sum + item.score, 0) / (scores.length || 1); - console.log(`Average score: ${(average * 100).toFixed(1)}%`); -} - export async function writeScoresFile(suiteId: string, scores: ScenarioScore[]): Promise { const outputPath = path.join(PACKAGE_ROOT, `scores-${suiteId}.json`); const payload = scores.map(({ runResult, ...rest }) => ({ @@ -45,3 +16,51 @@ export async function writeScoresFile(suiteId: string, scores: ScenarioScore[]): return outputPath; } + +type GraderProgressReporterOptions = { + totalGraders: number; + judgeEnabled: boolean; +}; + +export function createGraderProgressReporter(options: GraderProgressReporterOptions) { + let graderIndex = 0; + let pendingJudgeLineLength = 0; + + const formatGraderLine = ( + index: number, + name: string, + status: string, + kind: GraderDefinition['kind'], + reason?: string + ) => { + const base = ` • [${index}/${options.totalGraders}] ${name}: ${formatGraderStatus(status, kind, options.judgeEnabled)}`; + + return status === 'fail' && reason ? 
`${base} — ${reason}` : base; + }; + + return { + onGraderStart(name: string, kind: GraderDefinition['kind']) { + if (kind !== 'judge' || !process.stdout.isTTY) { + return; + } + + graderIndex += 1; + const line = formatGraderLine(graderIndex, name, 'evaluating…', kind); + pendingJudgeLineLength = line.length; + process.stdout.write(line); + }, + onGraderResult(name: string, outcome: GraderOutcome, kind: GraderDefinition['kind']) { + if (kind === 'judge' && process.stdout.isTTY && pendingJudgeLineLength > 0) { + const line = formatGraderLine(graderIndex, name, outcome.status, kind, outcome.reason); + const padding = Math.max(0, pendingJudgeLineLength - line.length); + process.stdout.write(`\r${line}${' '.repeat(padding)}\n`); + pendingJudgeLineLength = 0; + + return; + } + + graderIndex += 1; + console.log(formatGraderLine(graderIndex, name, outcome.status, kind, outcome.reason)); + }, + }; +} diff --git a/libs/agent-evals/src/core/resolve-package-file.ts b/libs/agent-evals/src/core/resolve-package-file.ts new file mode 100644 index 00000000000..d66a51ec525 --- /dev/null +++ b/libs/agent-evals/src/core/resolve-package-file.ts @@ -0,0 +1,7 @@ +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); + +export function resolvePackageFile(specifier: string): string { + return require.resolve(specifier); +} diff --git a/libs/agent-evals/src/core/run-agent.ts b/libs/agent-evals/src/core/run-agent.ts index cf93614e22c..84467f64403 100644 --- a/libs/agent-evals/src/core/run-agent.ts +++ b/libs/agent-evals/src/core/run-agent.ts @@ -81,6 +81,10 @@ export async function runAgentScenario(options: RunAgen const maxTurns = followUps.length + 1; for (let turn = 0; turn < maxTurns; turn += 1) { + if (maxTurns > 1) { + console.log(` ↳ ${options.scenario.id}: agent turn ${turn + 1}/${maxTurns}…`); + } + const result = await generateText({ model: anthropic(options.model), system, @@ -89,6 +93,10 @@ export async function 
runAgentScenario(options: RunAgen stopWhen: stepCountIs(options.maxSteps ?? 40), }); + console.log( + ` ↳ ${options.scenario.id}: model responded (${result.steps.length} step${result.steps.length === 1 ? '' : 's'})` + ); + recorder.recordAssistantMessage(result.text); messages.push(...result.response.messages); @@ -107,12 +115,3 @@ export async function runAgentScenario(options: RunAgen return recorder.build(); } - -export async function dryRunAgentScenario(scenario: EvalScenario): Promise { - resetShellCounter(); - - const recorder = new RunRecorder(scenario.id, scenario.userPrompt); - recorder.recordAssistantMessage(`[dry-run] Would execute scenario "${scenario.id}" with mock CLI tape.`); - - return recorder.build(); -} diff --git a/libs/agent-evals/src/core/runner.ts b/libs/agent-evals/src/core/runner.ts index ccd17578c91..c68ff5f5b10 100644 --- a/libs/agent-evals/src/core/runner.ts +++ b/libs/agent-evals/src/core/runner.ts @@ -2,8 +2,9 @@ import fs from 'node:fs/promises'; import path from 'node:path'; import { gradeRun, scoreFromOutcomes } from './graders.js'; import { configureJudge } from './judge.js'; -import { dryRunAgentScenario, runAgentScenario } from './run-agent.js'; -import type { RegisteredScenario, RunnerOptions, ScenarioScore, Suite } from './types.js'; +import { createGraderProgressReporter } from './reporters.js'; +import { runAgentScenario } from './run-agent.js'; +import type { GraderOutcome, GraderResult, RegisteredScenario, RunnerOptions, ScenarioScore, Suite } from './types.js'; import { PACKAGE_ROOT } from './types.js'; export function filterScenarios(suite: Suite, filter?: string): RegisteredScenario[] { @@ -41,15 +42,33 @@ export async function runEvaluation( ): Promise { configureJudge({ enabled: options.judge, model: options.judgeModel ?? options.model }); - const runResult = options.dry - ? 
await dryRunAgentScenario(entry.scenario) - : await runAgentScenario({ suite, scenario: entry.scenario, model: options.model }); + const runResult = await runAgentScenario({ suite, scenario: entry.scenario, model: options.model }); + const totalGraders = Object.keys(entry.graders).length; - const graders = options.dry - ? Object.fromEntries(Object.keys(entry.graders).map((name) => [name, 'skip' as const])) - : await gradeRun(entry.graders, runResult, { judgeEnabled: options.judge }); + console.log(` ↳ ${entry.scenario.id}: grading ${totalGraders} checks${options.judge ? ' (with judge)' : ''}…`); - const score = options.dry ? 1 : scoreFromOutcomes(graders); + const progress = createGraderProgressReporter({ totalGraders, judgeEnabled: options.judge }); + const outcomes: Record = await gradeRun(entry.graders, runResult, { + judgeEnabled: options.judge, + onGraderStart: progress.onGraderStart, + onGraderResult: progress.onGraderResult, + }); + + const graders = Object.fromEntries( + Object.entries(outcomes).map(([name, outcome]) => [name, outcome.status]) + ) as Record; + + const graderReasons = Object.fromEntries( + Object.entries(outcomes) + .filter(([, outcome]) => outcome.status === 'fail' && Boolean(outcome.reason)) + .map(([name, outcome]) => [name, outcome.reason as string]) + ); + + const score = scoreFromOutcomes(outcomes); + + const graderKinds = Object.fromEntries( + Object.entries(entry.graders).map(([name, definition]) => [name, definition.kind]) + ) as Record; const scenarioScore: ScenarioScore = { scenarioId: entry.scenario.id, @@ -57,6 +76,8 @@ export async function runEvaluation( model: options.model, score, graders, + graderReasons, + graderKinds, runResult: options.debug ? runResult : undefined, }; @@ -73,11 +94,30 @@ export async function runAllEvaluations(suite: Suite, options: RunnerOptions): P } const scores: ScenarioScore[] = []; + const total = selected.length; + + console.log(`Running ${total} scenario${total === 1 ? 
'' : 's'} for suite "${suite.id}" (model: ${options.model})\n`); - for (const entry of selected) { - scores.push(await runEvaluation(suite, entry, options)); + for (let index = 0; index < selected.length; index += 1) { + const entry = selected[index]; + const position = `[${index + 1}/${total}]`; + const startedAt = Date.now(); + + console.log(`${position} ${entry.scenario.id} — running…`); + + const score = await runEvaluation(suite, entry, options); + scores.push(score); + + const elapsed = ((Date.now() - startedAt) / 1000).toFixed(1); + console.log(`${position} ${entry.scenario.id} — done: ${(score.score * 100).toFixed(1)}% (${elapsed}s)`); } + if (total > 1) { + console.log(`Average score: ${(averageScore(scores) * 100).toFixed(1)}%`); + } + + console.log(''); + return scores; } diff --git a/libs/agent-evals/src/core/types.ts b/libs/agent-evals/src/core/types.ts index b0c3903b89c..deb0fd7dcf5 100644 --- a/libs/agent-evals/src/core/types.ts +++ b/libs/agent-evals/src/core/types.ts @@ -3,7 +3,13 @@ import { fileURLToPath } from 'node:url'; export type GraderResult = 'pass' | 'fail' | 'skip'; -export type GraderFn = (result: RunResult) => GraderResult | Promise; +/** A grader can return a bare status, or a status with a human-readable reason (used for fails). */ +export type GraderOutcome = { + status: GraderResult; + reason?: string; +}; + +export type GraderFn = (result: RunResult) => GraderResult | GraderOutcome | Promise; export type GraderDefinition = { kind: 'deterministic' | 'judge'; @@ -78,6 +84,9 @@ export type ScenarioScore = { model: string; score: number; graders: Record; + /** Failure explanations keyed by grader name; only populated for failing graders that supply a reason. 
*/ + graderReasons: Record; + graderKinds: Record; runResult?: RunResult; }; @@ -141,8 +150,6 @@ export type Suite = { const currentDir = path.dirname(fileURLToPath(import.meta.url)); -export const REPO_ROOT = path.resolve(currentDir, '../../../..'); - export const PACKAGE_ROOT = path.resolve(currentDir, '../..'); export function normalizePath(input: string): string { diff --git a/libs/agent-evals/src/index.ts b/libs/agent-evals/src/index.ts index 320be2e56f9..78649c189db 100644 --- a/libs/agent-evals/src/index.ts +++ b/libs/agent-evals/src/index.ts @@ -1,5 +1,5 @@ import './load-env.js'; -import { printConsoleReport, writeScoresFile } from './core/reporters.js'; +import { writeScoresFile } from './core/reporters.js'; import { averageScore, filterScenarios, runAllEvaluations } from './core/runner.js'; import type { RunnerOptions } from './core/types.js'; import { DEFAULT_SUITE, getSuite, listSuiteIds } from './suites/registry.js'; @@ -110,7 +110,6 @@ async function main(): Promise { } const scores = await runAllEvaluations(suite, options); - printConsoleReport(suite.id, scores, options.judge); const outputPath = await writeScoresFile(suite.id, scores); console.log(`Wrote ${outputPath}`); diff --git a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts index 96bb9aea70d..1677e687d58 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts @@ -1,5 +1,6 @@ -import { defineGraders, judge, toolCallsNamed, transcriptText } from '../../core/graders.js'; -import type { RunResult } from '../../core/types.js'; +import { defineGraders, fail, judge, toolCallsNamed, transcriptText } from '../../core/graders.js'; +import { isForbiddenWatcherCommand } from '../../core/recorder.js'; +import type { GraderOutcome, RunResult } from '../../core/types.js'; /** The drafted agent description is captured into metadata by the suite's onTrackedCommand hook. 
*/ export function descriptionText(result: RunResult): string { @@ -20,45 +21,57 @@ export const judgePrompts = { noInfraMcpSemantic: 'Does the drafted agent description avoid naming internal infrastructure or backend plumbing (databases, email delivery APIs, queues, caches, dev tooling) even via synonyms?', conclusionFirstReport: - 'Does the final user-facing message lead with the CLI result (success or failure), then give the one next action (claim link for keyless or dashboard for authenticated), kept terse?', + 'You are given only the final user-facing message. Does its first line/sentence state the CLI result (success or failure), followed by the single next action (claim link for keyless or dashboard for authenticated), without leading with setup steps or process narration?', }; export const catalog = { - noSecretKeyFlag: (result: RunResult) => + noSecretKeyFlag: (result: RunResult): GraderOutcome | 'pass' => connectCommands(result).every((cmd) => !/--secret-key\b/.test(cmd) && !/\bNOVU_SECRET_KEY=/.test(cmd)) ? 'pass' - : 'fail', + : fail('a connect command passed --secret-key or NOVU_SECRET_KEY'), - usedLoginWhenDashboardPrompt: (result: RunResult) => { + usedLoginWhenDashboardPrompt: (result: RunResult): GraderOutcome | 'pass' => { if (!/signed in to the Novu dashboard/i.test(result.userPrompt)) { return 'pass'; } - return connectCommands(result).some((cmd) => /--login\b/.test(cmd)) ? 'pass' : 'fail'; + return connectCommands(result).some((cmd) => /--login\b/.test(cmd)) + ? 
'pass' + : fail('user was signed into the dashboard but no connect command used --login'); }, - backgroundConnectShell: (result: RunResult) => { + backgroundConnectShell: (result: RunResult): GraderOutcome | 'pass' => { const connectCall = firstConnectCall(result); if (!connectCall) { - return 'fail'; + return fail('no connect command was run'); + } + + if (!connectCall.args.run_in_background) { + return fail('connect command was not run in the background (run_in_background was not set)'); } - return Boolean(connectCall.args.run_in_background) && result.polledShellIds.length > 0 ? 'pass' : 'fail'; + return result.polledShellIds.length > 0 + ? 'pass' + : fail('background connect shell was never polled with BashOutput'); }, - noTimersNoWatchers: (result: RunResult) => { - const forbidden = result.toolCalls.some((call) => { + noTimersNoWatchers: (result: RunResult): GraderOutcome | 'pass' => { + const forbiddenCall = result.toolCalls.find((call) => { if (call.name !== 'Bash') { return false; } - const command = String(call.args.command ?? '').toLowerCase(); + const command = String(call.args.command ?? ''); - return /\bsleep\b/.test(command) || /\btail\b/.test(command) || /\bgrep\b/.test(command); + return isForbiddenWatcherCommand(command); }); - const readLogs = result.toolCalls.some((call) => { + if (forbiddenCall) { + return fail(`used a timer/watcher command: ${String(forbiddenCall.args.command ?? '')}`); + } + + const readLogCall = result.toolCalls.find((call) => { if (call.name !== 'Read') { return false; } @@ -68,40 +81,53 @@ export const catalog = { return filePath.includes('/tmp/') || filePath.endsWith('.log'); }); - return forbidden || readLogs ? 'fail' : 'pass'; + return readLogCall + ? fail(`tailed a log file instead of polling: ${String(readLogCall.args.file_path ?? '')}`) + : 'pass'; }, - usedPickerForDecisions: (result: RunResult) => - toolCallsNamed(result, 'AskUserQuestion').length >= 1 ? 
'pass' : 'fail', + usedPickerForDecisions: (result: RunResult): GraderOutcome | 'pass' => + toolCallsNamed(result, 'AskUserQuestion').length >= 1 + ? 'pass' + : fail('no AskUserQuestion picker was used for decisions'), pastedLiteralUrl: (expectedUrl: string) => - (result: RunResult): 'pass' | 'fail' => - result.capturedUrls.includes(expectedUrl) || transcriptText(result).includes(expectedUrl) ? 'pass' : 'fail', + (result: RunResult): GraderOutcome | 'pass' => + result.capturedUrls.includes(expectedUrl) || transcriptText(result).includes(expectedUrl) + ? 'pass' + : fail(`expected URL not surfaced to the user: ${expectedUrl}`), descriptionExcludesInfraTokens: (tokens: string[]) => - (result: RunResult): 'pass' | 'fail' => { + (result: RunResult): GraderOutcome | 'pass' => { const description = descriptionText(result).toLowerCase(); + const offending = tokens.filter((token) => description.includes(token.toLowerCase())); - return tokens.some((token) => description.includes(token.toLowerCase())) ? 'fail' : 'pass'; + return offending.length > 0 ? fail(`description mentions infra tokens: ${offending.join(', ')}`) : 'pass'; }, descriptionIncludesTokens: (tokens: string[]) => - (result: RunResult): 'pass' | 'fail' => { + (result: RunResult): GraderOutcome | 'pass' => { const description = descriptionText(result).toLowerCase(); - return tokens.some((token) => description.includes(token.toLowerCase())) ? 'pass' : 'fail'; + return tokens.some((token) => description.includes(token.toLowerCase())) + ? 
'pass' + : fail(`description is missing all expected tokens: ${tokens.join(', ')}`); }, - noConnectOnKeylessWhatsapp: (result: RunResult) => - connectCommands(result).length === 0 && - /dashboard\.novu\.co|dashboard redirect|continue.*dashboard/i.test(transcriptText(result)) + noConnectOnKeylessWhatsapp: (result: RunResult): GraderOutcome | 'pass' => { + if (connectCommands(result).length > 0) { + return fail('ran a connect command on a keyless WhatsApp flow that should redirect to the dashboard'); + } + + return /dashboard\.novu\.co|dashboard redirect|continue.*dashboard/i.test(transcriptText(result)) ? 'pass' - : 'fail', + : fail('did not direct the user to the dashboard'); + }, - confirmedBeforeRun: (result: RunResult) => { + confirmedBeforeRun: (result: RunResult): GraderOutcome | 'pass' => { const approveIndex = result.toolCalls.findIndex( (call) => call.name === 'AskUserQuestion' && @@ -115,32 +141,43 @@ export const catalog = { return 'pass'; } - return approveIndex !== -1 && approveIndex < firstConnectIndex ? 'pass' : 'fail'; + return approveIndex !== -1 && approveIndex < firstConnectIndex + ? 'pass' + : fail('ran connect without an approved confirmation picker beforehand'); }, - qrHostAware: (result: RunResult) => (result.openedFiles.some((file) => file.endsWith('.png')) ? 'pass' : 'fail'), + qrHostAware: (result: RunResult): GraderOutcome | 'pass' => + result.openedFiles.some((file) => file.endsWith('.png')) ? 'pass' : fail('did not open the QR code image'), - reranWithSlackToken: (result: RunResult) => - connectCommands(result).some((cmd) => /--slack-config-token\b/.test(cmd)) ? 'pass' : 'fail', + reranWithSlackToken: (result: RunResult): GraderOutcome | 'pass' => + connectCommands(result).some((cmd) => /--slack-config-token\b/.test(cmd)) + ? 'pass' + : fail('did not re-run connect with --slack-config-token'), - killedFirstConnectShell: (result: RunResult) => (result.killedShellIds.length >= 1 ? 
'pass' : 'fail'), + killedFirstConnectShell: (result: RunResult): GraderOutcome | 'pass' => + result.killedShellIds.length >= 1 ? 'pass' : fail('the first connect shell was never killed'), - readAuthUrlFile: (result: RunResult) => + readAuthUrlFile: (result: RunResult): GraderOutcome | 'pass' => result.toolCalls.some( (call) => call.name === 'Read' && String(call.args.file_path ?? '').includes('novu-connect-auth-url') ) || result.capturedUrls.some((url) => url.includes('/oauth/device')) || transcriptText(result).includes('/oauth/device') ? 'pass' - : 'fail', + : fail('never read the auth-url file or surfaced the /oauth/device URL'), - reportedSuccess: (result: RunResult) => - /your agent is live|agent is live/i.test(transcriptText(result)) ? 'pass' : 'fail', + reportedSuccess: (result: RunResult): GraderOutcome | 'pass' => + /your agent is live|agent is live/i.test(transcriptText(result)) + ? 'pass' + : fail('final report did not confirm the agent is live'), - noConnectCommands: (result: RunResult) => (connectCommands(result).length === 0 ? 'pass' : 'fail'), + noConnectCommands: (result: RunResult): GraderOutcome | 'pass' => + connectCommands(result).length === 0 ? 'pass' : fail('ran a connect command when none was expected'), - usedSecureTokenPath: (result: RunResult) => - connectCommands(result).every((cmd) => !/--slack-config-token\b/.test(cmd)) ? 'pass' : 'fail', + usedSecureTokenPath: (result: RunResult): GraderOutcome | 'pass' => + connectCommands(result).every((cmd) => !/--slack-config-token\b/.test(cmd)) + ? 
'pass' + : fail('passed --slack-config-token inline instead of the secure token path'), }; export const sharedJudgeGraders = defineGraders({ @@ -148,5 +185,5 @@ export const sharedJudgeGraders = defineGraders({ [descriptionText(result), transcriptText(result)].join('\n') ), noInfraMcpSemantic: judge(judgePrompts.noInfraMcpSemantic, (result) => descriptionText(result)), - conclusionFirstReport: judge(judgePrompts.conclusionFirstReport, (result) => transcriptText(result)), + conclusionFirstReport: judge(judgePrompts.conclusionFirstReport, (result) => result.finalText), }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts index 7af9c575425..c6999888600 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts @@ -13,19 +13,74 @@ export function isConnectCommand(command: string): boolean { return /\bnovu(@[\w.-]+)?\s+connect\b/.test(command) || /\bnpx\s+[^\s]*novu[^\s]*\s+connect\b/.test(command); } +/** + * Decode a single shell word, honoring single quotes, double quotes, and backslash + * escapes (including the `'\''` idiom agents use to embed apostrophes). Reading stops + * at the first unquoted whitespace so trailing flags are not absorbed into the value. 
+ */ +function unquoteShellWord(input: string): string { + let out = ''; + let i = 0; + + while (i < input.length) { + const ch = input[i]; + + if (ch === "'") { + i += 1; + while (i < input.length && input[i] !== "'") { + out += input[i]; + i += 1; + } + i += 1; + } else if (ch === '"') { + i += 1; + while (i < input.length && input[i] !== '"') { + if (input[i] === '\\' && i + 1 < input.length) { + i += 1; + } + out += input[i]; + i += 1; + } + i += 1; + } else if (ch === '\\') { + if (i + 1 < input.length) { + out += input[i + 1]; + i += 2; + } else { + i += 1; + } + } else if (/\s/.test(ch)) { + break; + } else { + out += ch; + i += 1; + } + } + + return out; +} + function resolveDescription(command: string, env: Record): string | undefined { - const exportMatch = command.match(/export\s+NOVU_AGENT_DESCRIPTION='([^']*)'/); + const exportMatch = command.match(/export\s+NOVU_AGENT_DESCRIPTION=(.+)/); if (exportMatch?.[1]) { - return exportMatch[1]; + const value = unquoteShellWord(exportMatch[1].trimStart()); + + if (value && !value.includes('$')) { + return value; + } } - const positionalMatch = command.match(/\bconnect\s+(['"])(.*?)\1/); - const positional = positionalMatch?.[2]; + // Only treat a quoted token as the positional description; a leading flag means there is none. + const positionalMatch = command.match(/\bconnect\s+(['"][\s\S]*)/); - // A positional that references the env var (e.g. "$NOVU_AGENT_DESCRIPTION") resolves from env. - if (positional && !positional.includes('$')) { - return positional; + if (positionalMatch?.[1]) { + const positional = unquoteShellWord(positionalMatch[1]); + + // A positional that references the env var (e.g. "$NOVU_AGENT_DESCRIPTION") resolves from env. 
+ if (positional && !positional.includes('$')) { + return positional; + } } return env.NOVU_AGENT_DESCRIPTION; diff --git a/libs/agent-evals/src/suites/agent-onboarding/index.ts b/libs/agent-evals/src/suites/agent-onboarding/index.ts index ea0de18aea9..d4bc953f8b9 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/index.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/index.ts @@ -1,5 +1,5 @@ -import path from 'node:path'; -import { REPO_ROOT, type Suite } from '../../core/types.js'; +import { resolvePackageFile } from '../../core/resolve-package-file.js'; +import type { Suite } from '../../core/types.js'; import { type ConnectFlags, connectParser } from './connect-parser.js'; import { graders as dashboardPromptLoginGraders } from './scenarios/dashboard-prompt-login/graders.js'; import { scenario as dashboardPromptLoginScenario } from './scenarios/dashboard-prompt-login/scenario.js'; @@ -18,7 +18,7 @@ import { scenario as slackInChatRerunScenario } from './scenarios/slack-in-chat- import { graders as telegramSecureQrGraders } from './scenarios/telegram-secure-qr/graders.js'; import { scenario as telegramSecureQrScenario } from './scenarios/telegram-secure-qr/scenario.js'; -export const AGENT_ONBOARDING_DOC_PATH = path.join(REPO_ROOT, 'packages/shared/docs/agent-onboarding.md'); +export const AGENT_ONBOARDING_DOC_PATH = resolvePackageFile('@novu/shared/docs/agent-onboarding.md'); const SYSTEM_PROMPT_PREAMBLE = [ 'You are an AI coding agent executing the Novu agent onboarding playbook exactly.', diff --git a/packages/shared/package.json b/packages/shared/package.json index bdbbf8dc6ac..5eb811fc254 100644 --- a/packages/shared/package.json +++ b/packages/shared/package.json @@ -26,6 +26,7 @@ "types": "dist/cjs/index.d.ts", "files": [ "dist/", + "docs/agent-onboarding.md", "!**/*.spec.*", "!**/*.json", "CHANGELOG.md", @@ -57,7 +58,8 @@ "require": "./dist/cjs/utils/safe-outbound-http.js", "import": "./dist/esm/utils/safe-outbound-http.js", "types": 
"./dist/esm/utils/safe-outbound-http.d.ts" - } + }, + "./docs/agent-onboarding.md": "./docs/agent-onboarding.md" }, "dependencies": { "lru-cache": "^11.5.1" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9d2bf373584..c90e6f5cf8d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -2471,6 +2471,9 @@ importers: '@ai-sdk/anthropic': specifier: ^3.0.10 version: 3.0.13(zod@3.25.76) + '@novu/shared': + specifier: workspace:* + version: link:../../packages/shared ai: specifier: 6.0.50 version: 6.0.50(zod@3.25.76) From a20c1318ffc8861952ccaac65935aa2d87be3ade Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Wed, 17 Jun 2026 15:14:29 +0300 Subject: [PATCH 03/19] refactor(agent-evals): streamline evaluation harness and enhance grading system - Updated the agent-evals harness to utilize vitest for running evaluations. - Introduced new environment variables for LLM judge configuration. - Removed legacy CLI entry point and refactored grading logic to improve clarity and maintainability. - Enhanced grader definitions with human-readable labels for better reporting. - Updated workflows to reflect changes in evaluation execution. 
Co-authored-by: Cursor --- .github/workflows/agent-evals.yml | 11 +- libs/agent-evals/.env.example | 7 + libs/agent-evals/.gitignore | 1 + libs/agent-evals/README.md | 232 ++++++++---------- libs/agent-evals/package.json | 13 +- libs/agent-evals/project.json | 12 + libs/agent-evals/scripts/run-evals.sh | 5 +- libs/agent-evals/src/core/graders.ts | 66 +---- libs/agent-evals/src/core/judge.ts | 16 +- libs/agent-evals/src/core/mock-shell.ts | 10 +- libs/agent-evals/src/core/reporters.ts | 66 ----- libs/agent-evals/src/core/run-agent.ts | 117 --------- libs/agent-evals/src/core/runner.ts | 130 ---------- libs/agent-evals/src/core/types.ts | 26 +- libs/agent-evals/src/index.ts | 129 ---------- libs/agent-evals/src/self-test.ts | 53 ---- .../src/suites/agent-onboarding/adapters.ts | 46 ++++ .../src/suites/agent-onboarding/catalog.ts | 17 +- .../suites/agent-onboarding/graders.test.ts | 68 +++++ .../src/suites/agent-onboarding/harness.ts | 148 +++++++++++ .../src/suites/agent-onboarding/kit.ts | 2 +- .../agent-onboarding/onboarding.eval.ts | 31 +++ .../dashboard-prompt-login/graders.ts | 18 +- .../scenarios/discipline-no-timers/graders.ts | 15 +- .../scenarios/email-handoff/graders.ts | 13 +- .../scenarios/keyless-slack-secure/graders.ts | 29 ++- .../keyless-whatsapp-redirect/graders.ts | 11 +- .../persona-infra-exclusion/graders.ts | 20 +- .../scenarios/slack-in-chat-rerun/graders.ts | 18 +- .../scenarios/telegram-secure-qr/graders.ts | 18 +- libs/agent-evals/src/suites/registry.ts | 16 -- libs/agent-evals/vitest.config.ts | 8 + libs/agent-evals/vitest.evals.config.ts | 24 ++ pnpm-lock.yaml | 211 ++++++++++++++-- 34 files changed, 775 insertions(+), 832 deletions(-) delete mode 100644 libs/agent-evals/src/core/reporters.ts delete mode 100644 libs/agent-evals/src/core/run-agent.ts delete mode 100644 libs/agent-evals/src/core/runner.ts delete mode 100644 libs/agent-evals/src/index.ts delete mode 100644 libs/agent-evals/src/self-test.ts create mode 100644 
libs/agent-evals/src/suites/agent-onboarding/adapters.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/graders.test.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/harness.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/onboarding.eval.ts delete mode 100644 libs/agent-evals/src/suites/registry.ts create mode 100644 libs/agent-evals/vitest.config.ts create mode 100644 libs/agent-evals/vitest.evals.config.ts diff --git a/.github/workflows/agent-evals.yml b/.github/workflows/agent-evals.yml index 139a6c62ae8..4e6bee5bd06 100644 --- a/.github/workflows/agent-evals.yml +++ b/.github/workflows/agent-evals.yml @@ -45,12 +45,5 @@ jobs: - name: Run agent evals env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - run: | - JUDGE_FLAG="" - if [[ "${{ github.event_name }}" == "schedule" || "${{ github.event_name }}" == "workflow_dispatch" ]]; then - if [[ "${{ github.event_name }}" != "workflow_dispatch" || "${{ inputs.enable_judge }}" == "true" ]]; then - JUDGE_FLAG="--judge" - fi - fi - - pnpm --filter @novu/agent-evals start -- --suite agent-onboarding --fail-under 80 ${JUDGE_FLAG} + NOVU_EVAL_JUDGE: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.enable_judge)) && 'true' || 'false' }} + run: pnpm --filter @novu/agent-evals eval diff --git a/libs/agent-evals/.env.example b/libs/agent-evals/.env.example index 8b3e1258073..bea6b6df91e 100644 --- a/libs/agent-evals/.env.example +++ b/libs/agent-evals/.env.example @@ -1 +1,8 @@ ANTHROPIC_API_KEY= + +# Set to true to include LLM judge graders (enabled on scheduled CI) +NOVU_EVAL_JUDGE= + +# Optional model overrides (default: claude-sonnet-4-5) +NOVU_EVAL_MODEL= +NOVU_EVAL_JUDGE_MODEL= diff --git a/libs/agent-evals/.gitignore b/libs/agent-evals/.gitignore index 303f8d046ab..5bf39403e34 100644 --- a/libs/agent-evals/.gitignore +++ b/libs/agent-evals/.gitignore @@ -1,3 +1,4 @@ debug-runs/ scores-*.json +.vitest-evals/ .env diff 
--git a/libs/agent-evals/README.md b/libs/agent-evals/README.md index b8610091e07..06f6724d2d1 100644 --- a/libs/agent-evals/README.md +++ b/libs/agent-evals/README.md @@ -2,163 +2,130 @@ Behavioral eval harness for Novu coding-agent playbooks. Runs a real LLM agent against scripted scenarios with a mocked CLI, then grades whether the agent follows the playbook using deterministic structural checks plus optional LLM-as-judge graders for fuzzy criteria. -The harness is **suite-based**: `src/core/` is playbook-agnostic, and each suite under `src/suites/` plugs in its own system prompt, command parser, scenarios, and grader catalog. The first suite, `agent-onboarding`, tests `@novu/shared/docs/agent-onboarding.md` (the `npx novu connect` flow), resolved via the `@novu/shared` package export. +The harness is **suite-based**: `src/core/` holds the playbook-agnostic simulation layer (mock tools, tape replay, recorder), and each suite under `src/suites/` plugs in its system prompt, command parser, scenarios, and grader catalog. Scoring and reporting are handled by [vitest-evals](https://vitest-evals.sentry.dev/). + +The first suite, `agent-onboarding`, tests `@novu/shared/docs/agent-onboarding.md` (the `npx novu connect` flow), resolved via the `@novu/shared` package export. ## Architecture ### Layer overview -The package splits into three layers: a CLI entrypoint, a playbook-agnostic **core harness**, and pluggable **suites** that supply scenarios, tapes, and graders. 
- ```mermaid flowchart TB - subgraph entry["Entry (src/index.ts)"] - CLI["CLI flags\n(--suite, --scenario, --judge, …)"] - Registry["suites/registry.ts"] + subgraph entry["Entry (vitest)"] + Eval["onboarding.eval.ts\ndescribeEval per scenario"] + Adapters["adapters.ts\ngrader → judge"] end - subgraph core["Core harness (src/core/)"] - Runner["runner.ts\nload → run → grade → score"] - RunAgent["run-agent.ts\nAI SDK tool-calling loop"] + subgraph core["Core simulation (src/core/)"] + Harness["harness.ts\ncreateHarness + AI SDK loop"] Tools["tools.ts\nBash · BashOutput · AskUserQuestion · Read"] MockShell["mock-shell.ts\nTape replay engine"] Recorder["recorder.ts\nRunResult builder"] - Graders["graders.ts\ncontains · matches · judge"] + Graders["graders.ts\ndefineGraders · contains · judge"] Judge["judge.ts\nLLM-as-judge"] - Reporters["reporters.ts\nconsole + scores JSON"] end - subgraph suite["Suite (src/suites/{name}/)"] - SuiteObj["Suite contract\nsystem prompt · parser · hooks"] + subgraph suite["Suite (src/suites/agent-onboarding/)"] + SuiteObj["index.ts\nSuite contract"] Scenarios["scenarios/{id}/\nscenario.ts · graders.ts · project/"] - Parser["CommandParser\n(e.g. connect-parser.ts)"] - Tape["Tape\nscripted CLI stdout chunks"] + Parser["connect-parser.ts"] + Tape["tape.ts"] + Catalog["catalog.ts"] end - CLI --> Registry - Registry --> Runner - Runner --> RunAgent - RunAgent --> Tools + Eval --> Harness + Eval --> Adapters + Adapters --> Graders + Adapters --> Judge + Harness --> Tools Tools --> MockShell Tools --> Recorder - RunAgent --> Recorder - Runner --> Graders - Graders --> Judge - Runner --> Reporters - - SuiteObj --> RunAgent + Harness --> Recorder + SuiteObj --> Harness Parser --> MockShell Tape --> MockShell - Scenarios --> Runner - Scenarios --> Graders + Scenarios --> Eval + Catalog --> Scenarios ``` ### Execution flow -Each scenario runs the real agent against a mocked environment, then grades the recorded behavior. 
+Each scenario is a vitest-evals `describeEval` block: one harness run, then automatic judges score the resulting `RunResult`. ```mermaid sequenceDiagram - participant User as CLI - participant Runner as runner.ts - participant Agent as run-agent.ts + participant Vitest as vitest-evals + participant Harness as harness.ts participant LLM as Anthropic model participant Tools as Harness tools participant Shell as MockShellEngine participant Rec as RunRecorder - participant Grade as graders.ts - participant Out as reporters.ts - - User->>Runner: runAllEvaluations(suite, options) - loop each scenario - Runner->>Agent: runAgentScenario(suite, scenario) - Agent->>Agent: resolveSystemPrompt(playbook doc) - Agent->>LLM: generateText(system + user prompt, tools) - loop tool-calling steps - LLM->>Tools: Bash / BashOutput / AskUserQuestion / Read - alt tracked command (e.g. novu connect) - Tools->>Shell: createShell → replay tape chunks - Shell-->>Tools: scripted stdout - Tools->>Rec: record tracked command, URLs, polls - else AskUserQuestion - Tools->>Rec: pick scriptedAnswers[answerIndex] - else Read fixture - Tools->>Rec: read scenario project/ files - end - Tools-->>LLM: tool result - end - opt followUpMessages / followUpOnOptionId - Agent->>LLM: inject scripted user follow-up - end - Agent->>Rec: build() → RunResult - Runner->>Grade: gradeRun(scenario graders, RunResult) - alt deterministic grader - Grade->>Grade: contains / matches on transcript and toolCalls - else judge grader (--judge) - Grade->>LLM: runJudge(prompt, context) + participant Judges as adapters.ts + + Vitest->>Harness: run(userPrompt) + Harness->>Harness: resolveSystemPrompt(playbook doc) + Harness->>LLM: generateText(system + user prompt, tools) + loop tool-calling steps + LLM->>Tools: Bash / BashOutput / AskUserQuestion / Read + alt tracked command (e.g. 
novu connect) + Tools->>Shell: createShell → replay tape chunks + Shell-->>Tools: scripted stdout + Tools->>Rec: record tracked command, URLs, polls + else AskUserQuestion + Tools->>Rec: pick scriptedAnswers[answerIndex] + else Read fixture + Tools->>Rec: read scenario project/ files end - Grade-->>Runner: pass / fail / skip per grader - Runner->>Runner: scoreFromOutcomes → ScenarioScore + Tools-->>LLM: tool result end - Runner->>Out: printConsoleReport + writeScoresFile - Out-->>User: matrix and scores-{suite}.json + opt followUpMessages / followUpOnOptionId + Harness->>LLM: inject scripted user follow-up + end + Harness->>Rec: build() → RunResult + Harness-->>Vitest: HarnessRun with output + Vitest->>Judges: assess each grader as judge (threshold 0.8) + alt judge grader (NOVU_EVAL_JUDGE) + Judges->>LLM: runJudge(prompt, context) + end + Judges-->>Vitest: pass / fail per judge ``` ### Key concepts | Concept | Role | | --- | --- | -| **Suite** | Plugs a playbook (system prompt), `CommandParser`, scenario list, and optional hooks (`onTrackedCommand`, sentinel URL patterns) into the generic harness. | +| **Suite** | Plugs a playbook (system prompt), `CommandParser`, scenario list, and optional hooks into the harness. | | **Scenario** | One eval case: user prompt, fixture `project/`, scripted user answers, optional CLI **tape**, and follow-up messages. | | **Tape** | Ordered stdout chunks replayed when the agent runs a tracked command; `when(parsed)` can branch on parsed flags. | | **CommandParser** | Decides which shell commands are tracked (e.g. `novu connect`) and parses them for tape selection and validation. | | **RunResult** | Everything the agent did: tool calls, assistant text, captured URLs, polled/killed shells, suite metadata. | -| **Graders** | **Deterministic** checks on `RunResult` structure, or **judge** graders that call a second LLM pass for fuzzy criteria. 
| - -### Mock CLI model - -The harness simulates a Claude Code–like environment without running real `novu connect`: - -```mermaid -stateDiagram-v2 - [*] --> Bash: agent runs command - Bash --> Tracked: parser.matches(command) - Bash --> Untracked: other commands - Tracked --> TapeReplay: scenario.tape supplies chunks - TapeReplay --> Background: run_in_background=true - TapeReplay --> Foreground: synchronous run - Background --> BashOutput: agent polls shell id - BashOutput --> TapeReplay: emit next chunk until exitCode - Untracked --> Stub: generic stdout or reject watchers - note right of TapeReplay - connect-parser validates flags; - validate() can fail the command - end note -``` +| **Graders / judges** | **Deterministic** checks on `RunResult`, or **judge** graders that call a second LLM pass. Adapted to vitest-evals `createJudge` via `adapters.ts`. | ## Structure ``` src/ - core/ # suite-agnostic harness + core/ # suite-agnostic simulation types.ts # Suite contract, RunResult, Tape, CommandParser - run-agent.ts # AI SDK tool-calling loop tools.ts # Bash / BashOutput / AskUserQuestion / Read - mock-shell.ts # tape replay engine (pluggable command parser) + mock-shell.ts # tape replay engine recorder.ts # RunResult builder - graders.ts # defineGraders, contains, matches, judge, gradeRun - judge.ts # LLM-as-judge runner - runner.ts # load -> run -> grade -> score - reporters.ts # console matrix + scores-.json + graders.ts # defineGraders, contains, matches, judge + judge.ts # LLM-as-judge (Anthropic via AI SDK) suites/ - registry.ts # suite id -> Suite - agent-onboarding/ # the connect-flow suite + agent-onboarding/ index.ts # the Suite object - connect-parser.ts # novu connect flag parser + validation - tape.ts # connectTape / buildDefaultTape helpers - catalog.ts # connect grader catalog + judge prompts - kit.ts # stable import surface for scenario files + harness.ts # createHarness + multi-turn agent loop + adapters.ts # grader → vitest-evals judge + 
onboarding.eval.ts # describeEval per scenario + connect-parser.ts + tape.ts + catalog.ts + graders.test.ts # synthetic RunResult unit tests scenarios// # scenario.ts + graders.ts + project/ fixtures +vitest.config.ts # unit tests (*.test.ts) +vitest.evals.config.ts # evals (*.eval.ts) + vitest-evals reporter ``` ## Setup @@ -168,49 +135,54 @@ cp .env.example .env # from libs/agent-evals/ pnpm install ``` -Set `ANTHROPIC_API_KEY` in `.env` before running real evals. Judge graders also use this key when enabled. +Set `ANTHROPIC_API_KEY` in `.env` before running evals. Eval suites skip automatically when the key is missing. -## Local testing +## Local commands -**No API key** — verify the harness without calling any LLM: +**Unit tests** (no API key — synthetic `RunResult` grader checks): ```bash -pnpm --filter @novu/agent-evals test # deterministic grader self-test -pnpm --filter @novu/agent-evals start -- --dry # list scenarios; no agent run -pnpm --filter @novu/agent-evals start -- --smoke --dry +pnpm --filter @novu/agent-evals test ``` -**With API key** — runs the agent (and optionally the judge) against scenarios: +**Evals** (requires `ANTHROPIC_API_KEY`): ```bash -pnpm --filter @novu/agent-evals start -pnpm --filter @novu/agent-evals start -- --scenario keyless-slack-secure -pnpm --filter @novu/agent-evals start -- --smoke # first scenario only -pnpm --filter @novu/agent-evals start -- --judge # enable LLM judge graders -pnpm --filter @novu/agent-evals start -- --fail-under 80 # CI gate +pnpm --filter @novu/agent-evals eval +pnpm --filter @novu/agent-evals eval:watch + +# Single scenario +pnpm --filter @novu/agent-evals exec vitest run --config vitest.evals.config.ts -t keyless-slack-secure + +# Enable LLM judge graders (also enabled on scheduled CI runs) +NOVU_EVAL_JUDGE=true pnpm --filter @novu/agent-evals eval ``` -## Flags +## Environment variables -| Flag | Description | +| Variable | Description | | --- | --- | -| `--suite ` | Suite to run (default: 
`agent-onboarding`) | -| `--scenario ` | Filter evals by id or category | -| `--model ` | Agent model (default: `claude-sonnet-4-5`) | -| `--judge` / `--no-judge` | LLM-as-judge graders (auto-on when `ANTHROPIC_API_KEY` is set) | -| `--judge-model ` | Judge model (defaults to agent model) | -| `--smoke` | First scenario only | -| `--dry` | Print summary only; does not run the agent or call any LLM | -| `--debug` | Save run artifacts to `debug-runs//` | -| `--fail-under ` | Exit non-zero if average score is below threshold | +| `ANTHROPIC_API_KEY` | Required for eval runs (suites skip when unset) | +| `NOVU_EVAL_JUDGE` | Set to `true` or `1` to include LLM judge graders | +| `NOVU_EVAL_MODEL` | Agent model (default: `claude-sonnet-4-5`) | +| `NOVU_EVAL_JUDGE_MODEL` | Judge model (default: `claude-sonnet-4-5`) | +| `NOVU_EVAL_CONCURRENCY` | Max scenarios run in parallel (default: `4`) | +| `NOVU_EVAL_MAX_STEPS` | Max agent steps per scenario run (default: `40`) | + +Scenarios are independent and dominated by live-model latency, so they run concurrently (`sequence.concurrent`). Raise `NOVU_EVAL_CONCURRENCY` for faster runs or lower it if you hit Anthropic rate limits. + +## Threshold semantics + +Each scenario uses `judgeThreshold: 0.8` — the average judge score for that scenario must be ≥ 80%. This is stricter than the old global `--fail-under 80` (which gated on the average across all scenarios): every scenario must pass individually. + +Judge graders run only when `NOVU_EVAL_JUDGE=true` (PR/push CI runs deterministic graders only; scheduled and workflow-dispatch CI enable judges by default). ## Adding a new suite -1. Create `src/suites//` with a `CommandParser`, scenario folders, and a grader catalog. -2. Export a `Suite` object from its `index.ts` (system prompt source, parser, scenarios, optional hooks). -3. Register it in `src/suites/registry.ts`. +1. Create `src/suites//` with a `CommandParser`, scenario folders, grader catalog, and `harness.ts`. +2. 
Export a `Suite` object from `index.ts`. +3. Add `<name>.eval.ts` that loops scenarios and registers `describeEval` blocks. -## Output +## CI -- Console: scenario × grader matrix -- `scores-<suite>.json`: structured results for CI +GitHub Actions workflow `.github/workflows/agent-evals.yml` runs `pnpm --filter @novu/agent-evals eval` on playbook or harness changes, with `NOVU_EVAL_JUDGE` enabled on scheduled runs and on workflow-dispatch runs that set the `enable_judge` input. diff --git a/libs/agent-evals/package.json b/libs/agent-evals/package.json index 5d5d52fded1..2091230f935 100644 --- a/libs/agent-evals/package.json +++ b/libs/agent-evals/package.json @@ -4,12 +4,10 @@ "private": true, "description": "Behavioral eval harness for Novu coding-agent playbooks (suite-based).", "type": "module", - "bin": { - "agent-evals": "./src/index.ts" - }, "scripts": { - "start": "tsx src/index.ts", - "test": "tsx src/self-test.ts", + "eval": "vitest run --config vitest.evals.config.ts", + "eval:watch": "vitest --config vitest.evals.config.ts", + "test": "vitest run --config vitest.config.ts", "check": "biome check .", "check:fix": "biome check --write ." 
}, @@ -22,7 +20,8 @@ }, "devDependencies": { "@types/node": "^22.0.0", - "tsx": "4.16.2", - "typescript": "5.6.2" + "typescript": "5.6.2", + "vitest": "^4.1.8", + "vitest-evals": "0.12.0" } } diff --git a/libs/agent-evals/project.json b/libs/agent-evals/project.json index a27e7a96590..77bcd43f91c 100644 --- a/libs/agent-evals/project.json +++ b/libs/agent-evals/project.json @@ -8,6 +8,18 @@ "options": { "command": "npx biome lint libs/agent-evals" } + }, + "eval": { + "executor": "nx:run-commands", + "options": { + "command": "pnpm --filter @novu/agent-evals eval" + } + }, + "test": { + "executor": "nx:run-commands", + "options": { + "command": "pnpm --filter @novu/agent-evals test" + } } } } diff --git a/libs/agent-evals/scripts/run-evals.sh b/libs/agent-evals/scripts/run-evals.sh index 153a7e6da6a..7bde1159f14 100755 --- a/libs/agent-evals/scripts/run-evals.sh +++ b/libs/agent-evals/scripts/run-evals.sh @@ -4,10 +4,9 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" cd "$ROOT_DIR" -JUDGE_FLAG="" if [[ "${1:-}" == "--judge" ]]; then - JUDGE_FLAG="--judge" + export NOVU_EVAL_JUDGE=true shift fi -pnpm start ${JUDGE_FLAG} -- "$@" +pnpm eval "$@" diff --git a/libs/agent-evals/src/core/graders.ts b/libs/agent-evals/src/core/graders.ts index 2b65106491b..d976ab4d85c 100644 --- a/libs/agent-evals/src/core/graders.ts +++ b/libs/agent-evals/src/core/graders.ts @@ -1,13 +1,17 @@ import { runJudge } from './judge.js'; -import type { GraderDefinition, GraderFn, GraderOutcome, GraderResult, RunResult, ToolCallRecord } from './types.js'; +import type { GraderDefinition, GraderFn, GraderOutcome, RunResult, ToolCallRecord } from './types.js'; /** Helper for graders that want to explain a failure inline. */ export function fail(reason: string): GraderOutcome { return { status: 'fail', reason }; } -function toOutcome(value: GraderResult | GraderOutcome): GraderOutcome { - return typeof value === 'string' ? 
{ status: value } : value; +export function labeled(label: string, input: GraderFn | GraderDefinition): GraderDefinition { + if (typeof input === 'function') { + return { kind: 'deterministic', run: input, label }; + } + + return { ...input, label }; } export function defineGraders>( @@ -60,59 +64,3 @@ export function judge(prompt: string, context: (result: RunResult) => string): G run: async (result) => runJudge(prompt, context(result)), }; } - -export type GradeRunOptions = { - judgeEnabled: boolean; - onGraderStart?: (name: string, kind: GraderDefinition['kind']) => void; - onGraderResult?: (name: string, outcome: GraderOutcome, kind: GraderDefinition['kind']) => void; -}; - -export async function gradeRun( - graders: Record, - result: RunResult, - options: GradeRunOptions -): Promise> { - const outcomes: Record = {}; - const entries = Object.entries(graders); - - for (const [name, definition] of entries) { - if (definition.kind === 'judge' && !options.judgeEnabled) { - outcomes[name] = { status: 'skip' }; - options.onGraderResult?.(name, outcomes[name], definition.kind); - continue; - } - - options.onGraderStart?.(name, definition.kind); - outcomes[name] = toOutcome(await definition.run(result)); - options.onGraderResult?.(name, outcomes[name], definition.kind); - } - - return outcomes; -} - -export function formatGraderStatus(status: string, kind: GraderDefinition['kind'], judgeEnabled: boolean): string { - const isJudgeGrader = kind === 'judge'; - - if (status.toUpperCase() === 'SKIP' && isJudgeGrader && !judgeEnabled) { - return 'SKIP (judge disabled)'; - } - - if (isJudgeGrader && judgeEnabled) { - const label = status === 'evaluating…' ? 'evaluating…' : status.toUpperCase(); - - return `${label} (judge)`; - } - - return status === 'evaluating…' ? 
'evaluating…' : status.toUpperCase(); -} - -export function scoreFromOutcomes(outcomes: Record): number { - const considered = Object.values(outcomes).filter((value) => value.status !== 'skip'); - const passed = considered.filter((value) => value.status === 'pass').length; - - if (considered.length === 0) { - return 0; - } - - return passed / considered.length; -} diff --git a/libs/agent-evals/src/core/judge.ts b/libs/agent-evals/src/core/judge.ts index 702b53dcd5c..ea463340e05 100644 --- a/libs/agent-evals/src/core/judge.ts +++ b/libs/agent-evals/src/core/judge.ts @@ -2,21 +2,17 @@ import { anthropic } from '@ai-sdk/anthropic'; import { generateText } from 'ai'; import type { GraderOutcome, GraderResult } from './types.js'; -let judgeModel = 'claude-sonnet-4-5'; -let judgeEnabled = false; +const DEFAULT_JUDGE_MODEL = 'claude-sonnet-4-5'; -export function configureJudge(options: { enabled: boolean; model?: string }): void { - judgeEnabled = options.enabled; - judgeModel = options.model ?? judgeModel; -} - -export async function runJudge(prompt: string, context: string): Promise { - if (!judgeEnabled || !process.env.ANTHROPIC_API_KEY) { +export async function runJudge(prompt: string, context: string, options?: { model?: string }): Promise { + if (!process.env.ANTHROPIC_API_KEY) { return { status: 'skip' }; } + const model = options?.model ?? process.env.NOVU_EVAL_JUDGE_MODEL ?? 
DEFAULT_JUDGE_MODEL; + const result = await generateText({ - model: anthropic(judgeModel), + model: anthropic(model), prompt: [ 'You are grading an AI agent run against a coding-agent playbook.', 'First, write one sentence of reasoning explaining your verdict.', diff --git a/libs/agent-evals/src/core/mock-shell.ts b/libs/agent-evals/src/core/mock-shell.ts index 38cda72a5dd..867b693538e 100644 --- a/libs/agent-evals/src/core/mock-shell.ts +++ b/libs/agent-evals/src/core/mock-shell.ts @@ -1,11 +1,5 @@ import type { CommandParser, EvalScenario, MockShellState, ParsedCommand, Tape } from './types.js'; -let shellCounter = 0; - -export function resetShellCounter(): void { - shellCounter = 0; -} - function selectTapeChunks(tape: Tape, parsed: TParsed): string[] { const selected: string[] = []; @@ -28,6 +22,7 @@ function selectTapeChunks(tape: Tape, parsed: TParsed): string */ export class MockShellEngine { private shells = new Map>(); + private shellCounter = 0; constructor( private readonly scenario: EvalScenario, @@ -35,7 +30,8 @@ export class MockShellEngine { ) {} createShell(command: string, runInBackground: boolean, env: Record): MockShellState { - const id = `shell-${++shellCounter}`; + this.shellCounter += 1; + const id = `shell-${this.shellCounter}`; const isTracked = this.parser.matches(command); const parsed = isTracked ? 
this.parser.parse(command, env) : null; diff --git a/libs/agent-evals/src/core/reporters.ts b/libs/agent-evals/src/core/reporters.ts deleted file mode 100644 index 7c4e9d1a75f..00000000000 --- a/libs/agent-evals/src/core/reporters.ts +++ /dev/null @@ -1,66 +0,0 @@ -import fs from 'node:fs/promises'; -import path from 'node:path'; -import { formatGraderStatus } from './graders.js'; -import type { GraderDefinition, GraderOutcome, ScenarioScore } from './types.js'; -import { PACKAGE_ROOT } from './types.js'; - -export async function writeScoresFile(suiteId: string, scores: ScenarioScore[]): Promise { - const outputPath = path.join(PACKAGE_ROOT, `scores-${suiteId}.json`); - const payload = scores.map(({ runResult, ...rest }) => ({ - ...rest, - suite: suiteId, - updatedAt: new Date().toISOString(), - })); - - await fs.writeFile(outputPath, JSON.stringify(payload, null, 2), 'utf8'); - - return outputPath; -} - -type GraderProgressReporterOptions = { - totalGraders: number; - judgeEnabled: boolean; -}; - -export function createGraderProgressReporter(options: GraderProgressReporterOptions) { - let graderIndex = 0; - let pendingJudgeLineLength = 0; - - const formatGraderLine = ( - index: number, - name: string, - status: string, - kind: GraderDefinition['kind'], - reason?: string - ) => { - const base = ` • [${index}/${options.totalGraders}] ${name}: ${formatGraderStatus(status, kind, options.judgeEnabled)}`; - - return status === 'fail' && reason ? 
`${base} — ${reason}` : base; - }; - - return { - onGraderStart(name: string, kind: GraderDefinition['kind']) { - if (kind !== 'judge' || !process.stdout.isTTY) { - return; - } - - graderIndex += 1; - const line = formatGraderLine(graderIndex, name, 'evaluating…', kind); - pendingJudgeLineLength = line.length; - process.stdout.write(line); - }, - onGraderResult(name: string, outcome: GraderOutcome, kind: GraderDefinition['kind']) { - if (kind === 'judge' && process.stdout.isTTY && pendingJudgeLineLength > 0) { - const line = formatGraderLine(graderIndex, name, outcome.status, kind, outcome.reason); - const padding = Math.max(0, pendingJudgeLineLength - line.length); - process.stdout.write(`\r${line}${' '.repeat(padding)}\n`); - pendingJudgeLineLength = 0; - - return; - } - - graderIndex += 1; - console.log(formatGraderLine(graderIndex, name, outcome.status, kind, outcome.reason)); - }, - }; -} diff --git a/libs/agent-evals/src/core/run-agent.ts b/libs/agent-evals/src/core/run-agent.ts deleted file mode 100644 index 84467f64403..00000000000 --- a/libs/agent-evals/src/core/run-agent.ts +++ /dev/null @@ -1,117 +0,0 @@ -import fs from 'node:fs/promises'; -import { anthropic } from '@ai-sdk/anthropic'; -import { generateText, type ModelMessage, stepCountIs } from 'ai'; -import { resetShellCounter } from './mock-shell.js'; -import { RunRecorder } from './recorder.js'; -import { createHarnessContext, createHarnessTools } from './tools.js'; -import type { EvalScenario, ParsedCommand, RunResult, Suite } from './types.js'; - -export type RunAgentOptions = { - suite: Suite; - scenario: EvalScenario; - model: string; - maxSteps?: number; -}; - -const DEFAULT_PREAMBLE = [ - 'You are an AI coding agent executing the following playbook exactly.', - 'Follow the playbook precisely. 
Use the provided tools.', - 'You are running in a Claude Code-like environment with Bash, BashOutput, AskUserQuestion, and Read tools.', - 'Read any relevant fixture files in the workspace before acting.', -].join('\n'); - -const docCache = new Map(); - -async function resolveSystemPrompt(suite: Suite): Promise { - const preamble = suite.systemPromptPreamble ?? DEFAULT_PREAMBLE; - - if ('text' in suite.systemPrompt) { - return [preamble, '', suite.systemPrompt.text].join('\n'); - } - - const docPath = suite.systemPrompt.path; - let playbook = docCache.get(docPath); - - if (!playbook) { - playbook = await fs.readFile(docPath, 'utf8'); - docCache.set(docPath, playbook); - } - - return [preamble, '', playbook].join('\n'); -} - -function shouldInjectFollowUp( - result: { text: string; steps: Array<{ toolResults?: Array<{ output?: unknown }> }> }, - suite: Suite, - scenario: EvalScenario -): boolean { - if (!scenario.followUpMessages?.length) { - return false; - } - - if (suite.followUpTextPattern?.test(result.text)) { - return true; - } - - if (!scenario.followUpOnOptionId) { - return false; - } - - return result.steps.some((step) => - step.toolResults?.some((toolResult) => { - const output = toolResult.output as { selectedId?: string } | undefined; - - return output?.selectedId === scenario.followUpOnOptionId; - }) - ); -} - -export async function runAgentScenario(options: RunAgentOptions): Promise { - resetShellCounter(); - - const recorder = new RunRecorder(options.scenario.id, options.scenario.userPrompt); - const context = createHarnessContext(options.suite, options.scenario, recorder); - const tools = createHarnessTools(context); - const system = await resolveSystemPrompt(options.suite as Suite); - - const messages: ModelMessage[] = [{ role: 'user', content: options.scenario.userPrompt }]; - const followUps = [...(options.scenario.followUpMessages ?? [])]; - - // One turn for the initial prompt plus one per scripted follow-up message. 
- const maxTurns = followUps.length + 1; - - for (let turn = 0; turn < maxTurns; turn += 1) { - if (maxTurns > 1) { - console.log(` ↳ ${options.scenario.id}: agent turn ${turn + 1}/${maxTurns}…`); - } - - const result = await generateText({ - model: anthropic(options.model), - system, - messages, - tools, - stopWhen: stepCountIs(options.maxSteps ?? 40), - }); - - console.log( - ` ↳ ${options.scenario.id}: model responded (${result.steps.length} step${result.steps.length === 1 ? '' : 's'})` - ); - - recorder.recordAssistantMessage(result.text); - messages.push(...result.response.messages); - - if (followUps.length > 0 && shouldInjectFollowUp(result, options.suite, options.scenario)) { - const nextMessage = followUps.shift(); - - if (nextMessage) { - messages.push({ role: 'user', content: nextMessage }); - } - - continue; - } - - break; - } - - return recorder.build(); -} diff --git a/libs/agent-evals/src/core/runner.ts b/libs/agent-evals/src/core/runner.ts deleted file mode 100644 index c68ff5f5b10..00000000000 --- a/libs/agent-evals/src/core/runner.ts +++ /dev/null @@ -1,130 +0,0 @@ -import fs from 'node:fs/promises'; -import path from 'node:path'; -import { gradeRun, scoreFromOutcomes } from './graders.js'; -import { configureJudge } from './judge.js'; -import { createGraderProgressReporter } from './reporters.js'; -import { runAgentScenario } from './run-agent.js'; -import type { GraderOutcome, GraderResult, RegisteredScenario, RunnerOptions, ScenarioScore, Suite } from './types.js'; -import { PACKAGE_ROOT } from './types.js'; - -export function filterScenarios(suite: Suite, filter?: string): RegisteredScenario[] { - if (!filter) { - return suite.scenarios; - } - - const normalized = filter.toLowerCase(); - - return suite.scenarios.filter( - (entry) => - entry.scenario.id.toLowerCase().includes(normalized) || entry.scenario.category.toLowerCase().includes(normalized) - ); -} - -async function maybeWriteDebugArtifact( - suite: Suite, - options: RunnerOptions, - 
entry: RegisteredScenario, - score: ScenarioScore -): Promise { - if (!options.debug) { - return; - } - - const debugDir = path.join(PACKAGE_ROOT, 'debug-runs', suite.id, entry.scenario.id); - await fs.mkdir(debugDir, { recursive: true }); - await fs.writeFile(path.join(debugDir, 'score.json'), JSON.stringify(score, null, 2), 'utf8'); -} - -export async function runEvaluation( - suite: Suite, - entry: RegisteredScenario, - options: RunnerOptions -): Promise { - configureJudge({ enabled: options.judge, model: options.judgeModel ?? options.model }); - - const runResult = await runAgentScenario({ suite, scenario: entry.scenario, model: options.model }); - const totalGraders = Object.keys(entry.graders).length; - - console.log(` ↳ ${entry.scenario.id}: grading ${totalGraders} checks${options.judge ? ' (with judge)' : ''}…`); - - const progress = createGraderProgressReporter({ totalGraders, judgeEnabled: options.judge }); - const outcomes: Record = await gradeRun(entry.graders, runResult, { - judgeEnabled: options.judge, - onGraderStart: progress.onGraderStart, - onGraderResult: progress.onGraderResult, - }); - - const graders = Object.fromEntries( - Object.entries(outcomes).map(([name, outcome]) => [name, outcome.status]) - ) as Record; - - const graderReasons = Object.fromEntries( - Object.entries(outcomes) - .filter(([, outcome]) => outcome.status === 'fail' && Boolean(outcome.reason)) - .map(([name, outcome]) => [name, outcome.reason as string]) - ); - - const score = scoreFromOutcomes(outcomes); - - const graderKinds = Object.fromEntries( - Object.entries(entry.graders).map(([name, definition]) => [name, definition.kind]) - ) as Record; - - const scenarioScore: ScenarioScore = { - scenarioId: entry.scenario.id, - category: entry.scenario.category, - model: options.model, - score, - graders, - graderReasons, - graderKinds, - runResult: options.debug ? 
runResult : undefined, - }; - - await maybeWriteDebugArtifact(suite, options, entry, { ...scenarioScore, runResult }); - - return scenarioScore; -} - -export async function runAllEvaluations(suite: Suite, options: RunnerOptions): Promise { - let selected = filterScenarios(suite, options.scenarioFilter); - - if (options.smoke) { - selected = selected.slice(0, 1); - } - - const scores: ScenarioScore[] = []; - const total = selected.length; - - console.log(`Running ${total} scenario${total === 1 ? '' : 's'} for suite "${suite.id}" (model: ${options.model})\n`); - - for (let index = 0; index < selected.length; index += 1) { - const entry = selected[index]; - const position = `[${index + 1}/${total}]`; - const startedAt = Date.now(); - - console.log(`${position} ${entry.scenario.id} — running…`); - - const score = await runEvaluation(suite, entry, options); - scores.push(score); - - const elapsed = ((Date.now() - startedAt) / 1000).toFixed(1); - console.log(`${position} ${entry.scenario.id} — done: ${(score.score * 100).toFixed(1)}% (${elapsed}s)`); - } - - if (total > 1) { - console.log(`Average score: ${(averageScore(scores) * 100).toFixed(1)}%`); - } - - console.log(''); - - return scores; -} - -export function averageScore(scores: ScenarioScore[]): number { - if (scores.length === 0) { - return 0; - } - - return scores.reduce((sum, item) => sum + item.score, 0) / scores.length; -} diff --git a/libs/agent-evals/src/core/types.ts b/libs/agent-evals/src/core/types.ts index deb0fd7dcf5..447caf629f9 100644 --- a/libs/agent-evals/src/core/types.ts +++ b/libs/agent-evals/src/core/types.ts @@ -14,6 +14,8 @@ export type GraderFn = (result: RunResult) => GraderResult | GraderOutcome | Pro export type GraderDefinition = { kind: 'deterministic' | 'judge'; run: GraderFn; + /** Human-readable label shown in eval reports (defaults to the grader key). 
*/ + label?: string; }; export type ToolCallRecord = { @@ -78,30 +80,6 @@ export type RunResult = { metadata: Record; }; -export type ScenarioScore = { - scenarioId: string; - category: string; - model: string; - score: number; - graders: Record; - /** Failure explanations keyed by grader name; only populated for failing graders that supply a reason. */ - graderReasons: Record; - graderKinds: Record; - runResult?: RunResult; -}; - -export type RunnerOptions = { - suite: string; - model: string; - judge: boolean; - judgeModel?: string; - debug: boolean; - dry: boolean; - smoke: boolean; - failUnder?: number; - scenarioFilter?: string; -}; - export type MockShellState = { id: string; command: string; diff --git a/libs/agent-evals/src/index.ts b/libs/agent-evals/src/index.ts deleted file mode 100644 index 78649c189db..00000000000 --- a/libs/agent-evals/src/index.ts +++ /dev/null @@ -1,129 +0,0 @@ -import './load-env.js'; -import { writeScoresFile } from './core/reporters.js'; -import { averageScore, filterScenarios, runAllEvaluations } from './core/runner.js'; -import type { RunnerOptions } from './core/types.js'; -import { DEFAULT_SUITE, getSuite, listSuiteIds } from './suites/registry.js'; - -function parseArgs(argv: string[]): RunnerOptions { - const options: RunnerOptions = { - suite: DEFAULT_SUITE, - model: 'claude-sonnet-4-5', - judge: Boolean(process.env.ANTHROPIC_API_KEY), - debug: false, - dry: false, - smoke: false, - }; - - for (let index = 0; index < argv.length; index += 1) { - const arg = argv[index]; - - if (arg === '--suite' || arg === '-s') { - options.suite = argv[index + 1]; - index += 1; - continue; - } - - if (arg === '--scenario' || arg === '-e') { - options.scenarioFilter = argv[index + 1]; - index += 1; - continue; - } - - if (arg === '--model') { - options.model = argv[index + 1]; - index += 1; - continue; - } - - if (arg === '--judge') { - options.judge = true; - continue; - } - - if (arg === '--no-judge') { - options.judge = false; - 
continue; - } - - if (arg === '--judge-model') { - options.judgeModel = argv[index + 1]; - index += 1; - continue; - } - - if (arg === '--smoke') { - options.smoke = true; - continue; - } - - if (arg === '--debug' || arg === '-d') { - options.debug = true; - continue; - } - - if (arg === '--dry') { - options.dry = true; - continue; - } - - if (arg === '--fail-under') { - options.failUnder = Number(argv[index + 1]); - index += 1; - } - } - - return options; -} - -async function main(): Promise { - const options = parseArgs(process.argv.slice(2)); - const suite = getSuite(options.suite); - - if (!suite) { - console.error(`Unknown suite "${options.suite}". Available: ${listSuiteIds().join(', ')}`); - process.exit(1); - - return; - } - - if (options.dry) { - const selected = filterScenarios(suite, options.scenarioFilter); - const shown = options.smoke ? selected.slice(0, 1) : selected; - - console.log(`${suite.id} eval dry run`); - console.log(`Model: ${options.model}`); - console.log(`Judge: ${options.judge ? 
'enabled' : 'disabled'}`); - console.log(`Scenarios: ${shown.length}`); - - for (const entry of shown) { - console.log(`- ${entry.scenario.id}: ${entry.scenario.description}`); - } - - return; - } - - if (!process.env.ANTHROPIC_API_KEY) { - console.error('ANTHROPIC_API_KEY is required to run agent evals.'); - process.exit(1); - - return; - } - - const scores = await runAllEvaluations(suite, options); - const outputPath = await writeScoresFile(suite.id, scores); - console.log(`Wrote ${outputPath}`); - - if (options.failUnder !== undefined) { - const average = averageScore(scores) * 100; - - if (average < options.failUnder) { - console.error(`Average score ${average.toFixed(1)}% is below fail-under threshold ${options.failUnder}%`); - process.exit(1); - } - } -} - -main().catch((error) => { - console.error(error); - process.exit(1); -}); diff --git a/libs/agent-evals/src/self-test.ts b/libs/agent-evals/src/self-test.ts deleted file mode 100644 index 71add6c41f7..00000000000 --- a/libs/agent-evals/src/self-test.ts +++ /dev/null @@ -1,53 +0,0 @@ -import { gradeRun, scoreFromOutcomes } from './core/graders.js'; -import type { RunResult } from './core/types.js'; -import { graders as keylessWhatsappGraders } from './suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.js'; - -function buildResult(partial: Partial): RunResult { - return { - scenarioId: partial.scenarioId ?? 'test', - userPrompt: partial.userPrompt ?? 'Connect WhatsApp', - toolCalls: partial.toolCalls ?? [], - assistantMessages: partial.assistantMessages ?? [], - finalText: partial.finalText ?? '', - capturedUrls: partial.capturedUrls ?? [], - openedFiles: partial.openedFiles ?? [], - killedShellIds: partial.killedShellIds ?? [], - trackedShellIds: partial.trackedShellIds ?? [], - polledShellIds: partial.polledShellIds ?? [], - trackedCommands: partial.trackedCommands ?? [], - metadata: partial.metadata ?? 
{}, - }; -} - -async function main(): Promise { - const passing = buildResult({ - scenarioId: 'keyless-whatsapp-redirect', - finalText: 'Please continue in https://dashboard.novu.co', - trackedCommands: [], - toolCalls: [{ name: 'AskUserQuestion', args: {}, timestamp: Date.now() }], - }); - - const failing = buildResult({ - scenarioId: 'keyless-whatsapp-redirect', - finalText: 'Running connect now', - trackedCommands: ['npx novu connect --ci --channel whatsapp'], - }); - - const passOutcomes = await gradeRun(keylessWhatsappGraders, passing, { judgeEnabled: false }); - const failOutcomes = await gradeRun(keylessWhatsappGraders, failing, { judgeEnabled: false }); - - if (scoreFromOutcomes(passOutcomes) < 1) { - throw new Error('Expected passing synthetic run to score 1.0'); - } - - if (scoreFromOutcomes(failOutcomes) >= 1) { - throw new Error('Expected failing synthetic run to score below 1.0'); - } - - console.log('Self-test passed: deterministic graders behave as expected.'); -} - -main().catch((error) => { - console.error(error); - process.exit(1); -}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/adapters.ts b/libs/agent-evals/src/suites/agent-onboarding/adapters.ts new file mode 100644 index 00000000000..50a69e6759b --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/adapters.ts @@ -0,0 +1,46 @@ +import { createJudge, type Judge } from 'vitest-evals'; +import type { GraderDefinition, GraderOutcome, GraderResult, RunResult } from '../../core/types.js'; + +function toOutcome(value: GraderResult | GraderOutcome): GraderOutcome { + return typeof value === 'string' ? { status: value } : value; +} + +function outcomeToScore(outcome: GraderOutcome): number { + if (outcome.status === 'skip') { + return 1; + } + + return outcome.status === 'pass' ? 1 : 0; +} + +export function graderToJudge(name: string, definition: GraderDefinition): Judge { + return createJudge(definition.label ?? 
name, async ({ output }) => { + const outcome = toOutcome(await definition.run(output as RunResult)); + + return { + score: outcomeToScore(outcome), + metadata: outcome.reason ? { rationale: outcome.reason, status: outcome.status } : { status: outcome.status }, + }; + }); +} + +export function gradersToJudges( + graders: Record, + options: { judgeEnabled: boolean } +): Judge[] { + const judges: Judge[] = []; + + for (const [name, definition] of Object.entries(graders)) { + if (definition.kind === 'judge' && !options.judgeEnabled) { + continue; + } + + judges.push(graderToJudge(name, definition)); + } + + return judges; +} + +export function isJudgeEnabled(): boolean { + return process.env.NOVU_EVAL_JUDGE === 'true' || process.env.NOVU_EVAL_JUDGE === '1'; +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts index 1677e687d58..adcc99f8139 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts @@ -1,4 +1,4 @@ -import { defineGraders, fail, judge, toolCallsNamed, transcriptText } from '../../core/graders.js'; +import { defineGraders, fail, judge, labeled, toolCallsNamed, transcriptText } from '../../core/graders.js'; import { isForbiddenWatcherCommand } from '../../core/recorder.js'; import type { GraderOutcome, RunResult } from '../../core/types.js'; @@ -181,9 +181,16 @@ export const catalog = { }; export const sharedJudgeGraders = defineGraders({ - personaAudienceFit: judge(judgePrompts.personaAudienceFit, (result) => - [descriptionText(result), transcriptText(result)].join('\n') + personaAudienceFit: labeled( + 'frames the agent for the product end-user audience in domain language', + judge(judgePrompts.personaAudienceFit, (result) => [descriptionText(result), transcriptText(result)].join('\n')) + ), + noInfraMcpSemantic: labeled( + 'avoids naming internal infrastructure in the drafted agent description', + 
judge(judgePrompts.noInfraMcpSemantic, (result) => descriptionText(result)) + ), + conclusionFirstReport: labeled( + 'leads the final report with the CLI result and next action', + judge(judgePrompts.conclusionFirstReport, (result) => result.finalText) ), - noInfraMcpSemantic: judge(judgePrompts.noInfraMcpSemantic, (result) => descriptionText(result)), - conclusionFirstReport: judge(judgePrompts.conclusionFirstReport, (result) => result.finalText), }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/graders.test.ts b/libs/agent-evals/src/suites/agent-onboarding/graders.test.ts new file mode 100644 index 00000000000..589a951b155 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/graders.test.ts @@ -0,0 +1,68 @@ +import { describe, expect, it } from 'vitest'; +import type { RunResult } from '../../core/types.js'; +import { graderToJudge } from './adapters.js'; +import { graders as keylessWhatsappGraders } from './scenarios/keyless-whatsapp-redirect/graders.js'; + +function buildResult(partial: Partial): RunResult { + return { + scenarioId: partial.scenarioId ?? 'test', + userPrompt: partial.userPrompt ?? 'Connect WhatsApp', + toolCalls: partial.toolCalls ?? [], + assistantMessages: partial.assistantMessages ?? [], + finalText: partial.finalText ?? '', + capturedUrls: partial.capturedUrls ?? [], + openedFiles: partial.openedFiles ?? [], + killedShellIds: partial.killedShellIds ?? [], + trackedShellIds: partial.trackedShellIds ?? [], + polledShellIds: partial.polledShellIds ?? [], + trackedCommands: partial.trackedCommands ?? [], + metadata: partial.metadata ?? 
{}, + }; +} + +async function averageScore( + graders: Record unknown }>, + result: RunResult +): Promise { + const judges = Object.entries(graders).map(([name, definition]) => graderToJudge(name, definition)); + const scores = await Promise.all( + judges.map(async (judge) => { + const verdict = await judge.assess({ output: result } as never); + + return verdict.score; + }) + ); + + if (scores.length === 0) { + return 0; + } + + return scores.reduce((sum, score) => sum + score, 0) / scores.length; +} + +describe('keyless-whatsapp-redirect graders', () => { + it('scores a passing synthetic run at 1.0', async () => { + const passing = buildResult({ + scenarioId: 'keyless-whatsapp-redirect', + finalText: 'Please continue in https://dashboard.novu.co', + trackedCommands: [], + toolCalls: [{ name: 'AskUserQuestion', args: {}, timestamp: Date.now() }], + }); + + const score = await averageScore(keylessWhatsappGraders, passing); + + expect(score).toBe(1); + }); + + it('scores a failing synthetic run below 1.0', async () => { + const failing = buildResult({ + scenarioId: 'keyless-whatsapp-redirect', + finalText: 'Running connect now', + trackedCommands: ['npx novu connect --ci --channel whatsapp'], + }); + + const score = await averageScore(keylessWhatsappGraders, failing); + + expect(score).toBeLessThan(1); + }); +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/harness.ts b/libs/agent-evals/src/suites/agent-onboarding/harness.ts new file mode 100644 index 00000000000..04d2cbcfda2 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/harness.ts @@ -0,0 +1,148 @@ +import fs from 'node:fs/promises'; +import { anthropic } from '@ai-sdk/anthropic'; +import { generateText, type ModelMessage, stepCountIs } from 'ai'; +import { createHarness } from 'vitest-evals/harness'; +import { RunRecorder } from '../../core/recorder.js'; +import { createHarnessContext, createHarnessTools } from '../../core/tools.js'; +import type { EvalScenario, ParsedCommand, 
RunResult, Suite } from '../../core/types.js'; + +const DEFAULT_PREAMBLE = [ + 'You are an AI coding agent executing the following playbook exactly.', + 'Follow the playbook precisely. Use the provided tools.', + 'You are running in a Claude Code-like environment with Bash, BashOutput, AskUserQuestion, and Read tools.', + 'Read any relevant fixture files in the workspace before acting.', +].join('\n'); + +const docCache = new Map(); + +async function resolveSystemPrompt(suite: Suite): Promise { + const preamble = suite.systemPromptPreamble ?? DEFAULT_PREAMBLE; + + if ('text' in suite.systemPrompt) { + return [preamble, '', suite.systemPrompt.text].join('\n'); + } + + const docPath = suite.systemPrompt.path; + let playbook = docCache.get(docPath); + + if (!playbook) { + playbook = await fs.readFile(docPath, 'utf8'); + docCache.set(docPath, playbook); + } + + return [preamble, '', playbook].join('\n'); +} + +function shouldInjectFollowUp( + result: { text: string; steps: Array<{ toolResults?: Array<{ output?: unknown }> }> }, + suite: Suite, + scenario: EvalScenario +): boolean { + if (!scenario.followUpMessages?.length) { + return false; + } + + if (suite.followUpTextPattern?.test(result.text)) { + return true; + } + + if (!scenario.followUpOnOptionId) { + return false; + } + + return result.steps.some((step) => + step.toolResults?.some((toolResult) => { + const output = toolResult.output as { selectedId?: string } | undefined; + + return output?.selectedId === scenario.followUpOnOptionId; + }) + ); +} + +function toJsonSafeRunResult(result: RunResult): RunResult { + return JSON.parse( + JSON.stringify(result, (_key, value) => { + if (value === undefined) { + return null; + } + + return value; + }) + ) as RunResult; +} + +export type ScenarioHarnessOptions = { + suite: Suite; + scenario: EvalScenario; + system: string; + model?: string; + maxSteps?: number; +}; + +function resolveMaxSteps(explicit?: number): number { + if (explicit !== undefined) { + return 
explicit; + } + + const fromEnv = Number.parseInt(process.env.NOVU_EVAL_MAX_STEPS ?? '', 10); + + return Number.isFinite(fromEnv) && fromEnv > 0 ? fromEnv : 40; +} + +export function scenarioHarness(options: ScenarioHarnessOptions) { + const modelName = options.model ?? process.env.NOVU_EVAL_MODEL ?? 'claude-sonnet-4-5'; + const maxSteps = resolveMaxSteps(options.maxSteps); + + return createHarness({ + name: `agent-onboarding/${options.scenario.id}`, + run: async ({ input }) => { + const recorder = new RunRecorder(options.scenario.id, input); + const context = createHarnessContext(options.suite, options.scenario, recorder); + const tools = createHarnessTools(context); + const messages: ModelMessage[] = [{ role: 'user', content: input }]; + const followUps = [...(options.scenario.followUpMessages ?? [])]; + const maxTurns = followUps.length + 1; + let lastResult: Awaited> | undefined; + + for (let turn = 0; turn < maxTurns; turn += 1) { + lastResult = await generateText({ + model: anthropic(modelName), + system: options.system, + messages, + tools, + stopWhen: stepCountIs(maxSteps), + }); + + recorder.recordAssistantMessage(lastResult.text); + messages.push(...lastResult.response.messages); + + if (followUps.length > 0 && shouldInjectFollowUp(lastResult, options.suite, options.scenario)) { + const nextMessage = followUps.shift(); + + if (nextMessage) { + messages.push({ role: 'user', content: nextMessage }); + } + + continue; + } + + break; + } + + return { + output: toJsonSafeRunResult(recorder.build()), + usage: { + provider: 'anthropic', + model: modelName, + inputTokens: lastResult?.usage?.inputTokens, + outputTokens: lastResult?.usage?.outputTokens, + totalTokens: lastResult?.usage?.totalTokens, + }, + }; + }, + }); +} + +export async function loadSuiteSystemPrompt(suite: Suite): Promise { + return resolveSystemPrompt(suite); +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/kit.ts b/libs/agent-evals/src/suites/agent-onboarding/kit.ts index 
2005aa95c7a..3d7ac1ca741 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/kit.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/kit.ts @@ -1,5 +1,5 @@ // Stable import surface for scenario files, independent of core/ layout. -export { defineGraders } from '../../core/graders.js'; +export { defineGraders, labeled } from '../../core/graders.js'; export type { EvalScenario, RunResult } from '../../core/types.js'; export { catalog, sharedJudgeGraders } from './catalog.js'; export type { ConnectFlags } from './connect-parser.js'; diff --git a/libs/agent-evals/src/suites/agent-onboarding/onboarding.eval.ts b/libs/agent-evals/src/suites/agent-onboarding/onboarding.eval.ts new file mode 100644 index 00000000000..bfc3ad6b905 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/onboarding.eval.ts @@ -0,0 +1,31 @@ +import '../../load-env.js'; +import { describeEval } from 'vitest-evals'; +import { gradersToJudges, isJudgeEnabled } from './adapters.js'; +import { loadSuiteSystemPrompt, scenarioHarness } from './harness.js'; +import { agentOnboardingSuite } from './index.js'; + +const JUDGE_THRESHOLD = 0.8; +const system = await loadSuiteSystemPrompt(agentOnboardingSuite); + +for (const entry of agentOnboardingSuite.scenarios) { + const harness = scenarioHarness({ + suite: agentOnboardingSuite, + scenario: entry.scenario, + system, + }); + + describeEval( + entry.scenario.id, + { + harness, + judges: gradersToJudges(entry.graders, { judgeEnabled: isJudgeEnabled() }), + judgeThreshold: JUDGE_THRESHOLD, + skipIf: () => !process.env.ANTHROPIC_API_KEY, + }, + (it) => { + it(entry.scenario.description, async ({ run }) => { + await run(entry.scenario.userPrompt); + }); + } + ); +} diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts index dbfc78f487a..2171b9bbd5f 100644 --- 
a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts @@ -1,10 +1,16 @@ -import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; +import { catalog, defineGraders, labeled, sharedJudgeGraders } from '../../kit.js'; export const graders = defineGraders({ - usedLoginWhenDashboardPrompt: catalog.usedLoginWhenDashboardPrompt, - noSecretKeyFlag: catalog.noSecretKeyFlag, - backgroundConnectShell: catalog.backgroundConnectShell, - readAuthUrlFile: catalog.readAuthUrlFile, - reportedSuccess: catalog.reportedSuccess, + usedLoginWhenDashboardPrompt: labeled( + 'uses --login when the user is signed into the dashboard', + catalog.usedLoginWhenDashboardPrompt + ), + noSecretKeyFlag: labeled('does not pass --secret-key or NOVU_SECRET_KEY to connect', catalog.noSecretKeyFlag), + backgroundConnectShell: labeled( + 'runs connect in the background and polls output with BashOutput', + catalog.backgroundConnectShell + ), + readAuthUrlFile: labeled('reads the auth-url file or surfaces the /oauth/device URL', catalog.readAuthUrlFile), + reportedSuccess: labeled('confirms the agent is live in the final report', catalog.reportedSuccess), ...sharedJudgeGraders, }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts index d10a0087628..908d8ecea31 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts @@ -1,12 +1,17 @@ -import { catalog, defineGraders, type RunResult } from '../../kit.js'; +import { catalog, defineGraders, labeled, type RunResult } from '../../kit.js'; function polledAtLeast(result: RunResult, count: number): 'pass' | 'fail' { return 
result.polledShellIds.length >= count ? 'pass' : 'fail'; } export const graders = defineGraders({ - noTimersNoWatchers: catalog.noTimersNoWatchers, - backgroundConnectShell: catalog.backgroundConnectShell, - polledMultipleTimes: (result) => polledAtLeast(result, 3), - reportedSuccess: catalog.reportedSuccess, + noTimersNoWatchers: labeled('does not use timer/watcher commands or tail log files', catalog.noTimersNoWatchers), + backgroundConnectShell: labeled( + 'runs connect in the background and polls output with BashOutput', + catalog.backgroundConnectShell + ), + polledMultipleTimes: labeled('polls the background connect shell at least three times', (result) => + polledAtLeast(result, 3) + ), + reportedSuccess: labeled('confirms the agent is live in the final report', catalog.reportedSuccess), }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/graders.ts index 827a49f1851..e6ffa39d153 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/graders.ts @@ -1,11 +1,14 @@ -import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; +import { catalog, defineGraders, labeled, sharedJudgeGraders } from '../../kit.js'; const mailtoUrl = 'mailto:connect+agent123@inbound.novu.test?subject=Novu%20Connect'; export const graders = defineGraders({ - noSecretKeyFlag: catalog.noSecretKeyFlag, - backgroundConnectShell: catalog.backgroundConnectShell, - pastedMailto: catalog.pastedLiteralUrl(mailtoUrl), - reportedSuccess: catalog.reportedSuccess, + noSecretKeyFlag: labeled('does not pass --secret-key or NOVU_SECRET_KEY to connect', catalog.noSecretKeyFlag), + backgroundConnectShell: labeled( + 'runs connect in the background and polls output with BashOutput', + catalog.backgroundConnectShell + ), + pastedMailto: labeled('surfaces the mailto 
handoff URL to the user', catalog.pastedLiteralUrl(mailtoUrl)), + reportedSuccess: labeled('confirms the agent is live in the final report', catalog.reportedSuccess), ...sharedJudgeGraders, }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/graders.ts index 354d776515e..8294ec6d845 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/graders.ts @@ -1,18 +1,27 @@ -import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; +import { catalog, defineGraders, labeled, sharedJudgeGraders } from '../../kit.js'; const setupUrl = 'https://setup.novu.test/slack/abc123'; const authorizeUrl = 'https://slack.test/oauth/authorize/xyz'; const claimUrl = 'https://dashboard.novu.test/claim/token-abc'; export const graders = defineGraders({ - noSecretKeyFlag: catalog.noSecretKeyFlag, - backgroundConnectShell: catalog.backgroundConnectShell, - usedPickerForDecisions: catalog.usedPickerForDecisions, - confirmedBeforeRun: catalog.confirmedBeforeRun, - usedSecureTokenPath: catalog.usedSecureTokenPath, - pastedSetupUrl: catalog.pastedLiteralUrl(setupUrl), - pastedAuthorizeUrl: catalog.pastedLiteralUrl(authorizeUrl), - reportedClaimLink: catalog.pastedLiteralUrl(claimUrl), - reportedSuccess: catalog.reportedSuccess, + noSecretKeyFlag: labeled('does not pass --secret-key or NOVU_SECRET_KEY to connect', catalog.noSecretKeyFlag), + backgroundConnectShell: labeled( + 'runs connect in the background and polls output with BashOutput', + catalog.backgroundConnectShell + ), + usedPickerForDecisions: labeled( + 'uses AskUserQuestion for channel and token decisions', + catalog.usedPickerForDecisions + ), + confirmedBeforeRun: labeled('confirms with the user before running connect', catalog.confirmedBeforeRun), + 
usedSecureTokenPath: labeled( + 'uses the secure token path instead of passing --slack-config-token inline', + catalog.usedSecureTokenPath + ), + pastedSetupUrl: labeled('surfaces the Slack setup URL to the user', catalog.pastedLiteralUrl(setupUrl)), + pastedAuthorizeUrl: labeled('surfaces the Slack authorize URL to the user', catalog.pastedLiteralUrl(authorizeUrl)), + reportedClaimLink: labeled('surfaces the claim link to the user', catalog.pastedLiteralUrl(claimUrl)), + reportedSuccess: labeled('confirms the agent is live in the final report', catalog.reportedSuccess), ...sharedJudgeGraders, }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts index 4c57f4fd5ed..8578a8ccd00 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts @@ -1,7 +1,10 @@ -import { catalog, defineGraders } from '../../kit.js'; +import { catalog, defineGraders, labeled } from '../../kit.js'; export const graders = defineGraders({ - noConnectCommands: catalog.noConnectCommands, - noConnectOnKeylessWhatsapp: catalog.noConnectOnKeylessWhatsapp, - usedPickerForDecisions: catalog.usedPickerForDecisions, + noConnectCommands: labeled('does not run a connect command', catalog.noConnectCommands), + noConnectOnKeylessWhatsapp: labeled( + 'redirects the user to the dashboard instead of running connect', + catalog.noConnectOnKeylessWhatsapp + ), + usedPickerForDecisions: labeled('uses AskUserQuestion for channel decisions', catalog.usedPickerForDecisions), }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts index 5fb8ee96457..44f37f0debd 100644 --- 
a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts @@ -1,14 +1,14 @@ -import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; +import { catalog, defineGraders, labeled, sharedJudgeGraders } from '../../kit.js'; export const graders = defineGraders({ - descriptionExcludesInfraTokens: catalog.descriptionExcludesInfraTokens([ - 'postgres', - 'resend', - 'mongodb', - 'github', - 'sentry', - ]), - descriptionIncludesAudience: catalog.descriptionIncludesTokens(['staff', 'wine']), - confirmedBeforeRun: catalog.confirmedBeforeRun, + descriptionExcludesInfraTokens: labeled( + 'excludes infrastructure tokens from the drafted agent description', + catalog.descriptionExcludesInfraTokens(['postgres', 'resend', 'mongodb', 'github', 'sentry']) + ), + descriptionIncludesAudience: labeled( + 'includes audience-specific tokens in the drafted agent description', + catalog.descriptionIncludesTokens(['staff', 'wine']) + ), + confirmedBeforeRun: labeled('confirms with the user before running connect', catalog.confirmedBeforeRun), ...sharedJudgeGraders, }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts index 4ae74f03626..92b4524c299 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts @@ -1,10 +1,16 @@ -import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; +import { catalog, defineGraders, labeled, sharedJudgeGraders } from '../../kit.js'; export const graders = defineGraders({ - usedLoginWhenDashboardPrompt: catalog.usedLoginWhenDashboardPrompt, - killedFirstConnectShell: catalog.killedFirstConnectShell, - reranWithSlackToken: 
catalog.reranWithSlackToken, - pastedAuthorizeUrl: catalog.pastedLiteralUrl('https://slack.test/oauth/rerun-token'), - reportedSuccess: catalog.reportedSuccess, + usedLoginWhenDashboardPrompt: labeled( + 'uses --login when the user is signed into the dashboard', + catalog.usedLoginWhenDashboardPrompt + ), + killedFirstConnectShell: labeled('kills the first connect shell before re-running', catalog.killedFirstConnectShell), + reranWithSlackToken: labeled('re-runs connect with --slack-config-token', catalog.reranWithSlackToken), + pastedAuthorizeUrl: labeled( + 'surfaces the Slack authorize URL to the user', + catalog.pastedLiteralUrl('https://slack.test/oauth/rerun-token') + ), + reportedSuccess: labeled('confirms the agent is live in the final report', catalog.reportedSuccess), ...sharedJudgeGraders, }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/graders.ts index 53c78770ba3..a90c3f93bd6 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/graders.ts @@ -1,10 +1,16 @@ -import { catalog, defineGraders, sharedJudgeGraders } from '../../kit.js'; +import { catalog, defineGraders, labeled, sharedJudgeGraders } from '../../kit.js'; export const graders = defineGraders({ - noSecretKeyFlag: catalog.noSecretKeyFlag, - backgroundConnectShell: catalog.backgroundConnectShell, - qrHostAware: catalog.qrHostAware, - pastedSetupUrl: catalog.pastedLiteralUrl('https://setup.novu.test/telegram/abc'), - reportedSuccess: catalog.reportedSuccess, + noSecretKeyFlag: labeled('does not pass --secret-key or NOVU_SECRET_KEY to connect', catalog.noSecretKeyFlag), + backgroundConnectShell: labeled( + 'runs connect in the background and polls output with BashOutput', + catalog.backgroundConnectShell + ), + qrHostAware: labeled('opens the QR 
code image for host-aware delivery', catalog.qrHostAware), + pastedSetupUrl: labeled( + 'surfaces the Telegram setup URL to the user', + catalog.pastedLiteralUrl('https://setup.novu.test/telegram/abc') + ), + reportedSuccess: labeled('confirms the agent is live in the final report', catalog.reportedSuccess), ...sharedJudgeGraders, }); diff --git a/libs/agent-evals/src/suites/registry.ts b/libs/agent-evals/src/suites/registry.ts deleted file mode 100644 index cb28f5b62f6..00000000000 --- a/libs/agent-evals/src/suites/registry.ts +++ /dev/null @@ -1,16 +0,0 @@ -import type { ParsedCommand, Suite } from '../core/types.js'; -import { agentOnboardingSuite } from './agent-onboarding/index.js'; - -export const suites: Record> = { - [agentOnboardingSuite.id]: agentOnboardingSuite as unknown as Suite, -}; - -export const DEFAULT_SUITE = agentOnboardingSuite.id; - -export function getSuite(id: string): Suite | undefined { - return suites[id]; -} - -export function listSuiteIds(): string[] { - return Object.keys(suites); -} diff --git a/libs/agent-evals/vitest.config.ts b/libs/agent-evals/vitest.config.ts new file mode 100644 index 00000000000..d08f3fd0e06 --- /dev/null +++ b/libs/agent-evals/vitest.config.ts @@ -0,0 +1,8 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + include: ['src/**/*.test.ts'], + testTimeout: 30_000, + }, +}); diff --git a/libs/agent-evals/vitest.evals.config.ts b/libs/agent-evals/vitest.evals.config.ts new file mode 100644 index 00000000000..e7382ba544a --- /dev/null +++ b/libs/agent-evals/vitest.evals.config.ts @@ -0,0 +1,24 @@ +import { defineConfig } from 'vitest/config'; + +const concurrency = Number.parseInt(process.env.NOVU_EVAL_CONCURRENCY ?? '', 10); +const maxConcurrency = Number.isFinite(concurrency) && concurrency > 0 ? 
concurrency : 4; + +export default defineConfig({ + test: { + include: ['src/**/*.eval.ts'], + testTimeout: 300_000, + hookTimeout: 60_000, + // vitest-evals/reporter extends VerboseReporter and prints compact, human-readable + // per-grader scores + reasons. The stock 'default' reporter additionally dumps the + // full RunResult JSON inside the threshold AssertionError, so we omit it here. + reporters: ['vitest-evals/reporter'], + // Scenarios are independent and dominated by live-model latency, so run them + // concurrently. maxConcurrency caps in-flight requests to respect API rate limits. + sequence: { concurrent: true }, + maxConcurrency, + env: { + VITEST_EVALS_REPLAY_MODE: process.env.VITEST_EVALS_REPLAY_MODE ?? 'off', + VITEST_EVALS_REPLAY_DIR: '.vitest-evals/recordings', + }, + }, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c90e6f5cf8d..8ea63f1930c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -763,7 +763,7 @@ importers: version: 3.0.51(react@19.2.3)(zod@4.3.5) '@better-auth/sso': specifier: ^1.3.0 - version: 1.4.7(better-auth@1.5.6(d9d7da76424e2ace7367f5b90edcc4b7)) + version: 1.4.7(better-auth@1.5.6(36df9c69bfb921062bcaa789d1b80d99)) '@calcom/embed-react': specifier: 1.5.2 version: 1.5.2(react-dom@19.2.3(react@19.2.3))(react@19.2.3) @@ -973,7 +973,7 @@ importers: version: 6.2.6(react-dom@19.2.3(react@19.2.3))(react@19.2.3) better-auth: specifier: 1.5.6 - version: 1.5.6(d9d7da76424e2ace7367f5b90edcc4b7) + version: 1.5.6(36df9c69bfb921062bcaa789d1b80d99) class-variance-authority: specifier: ^0.7.0 version: 0.7.1 @@ -2130,7 +2130,7 @@ importers: dependencies: '@better-auth/sso': specifier: ^1.4.9 - version: 1.5.6(@better-auth/core@1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0))(@better-auth/utils@0.3.1)(better-auth@1.5.6(d9d7da76424e2ace7367f5b90edcc4b7))(better-call@1.3.2(zod@4.3.6)) + version: 
1.5.6(@better-auth/core@1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0))(@better-auth/utils@0.3.1)(better-auth@1.5.6(36df9c69bfb921062bcaa789d1b80d99))(better-call@1.3.2(zod@4.3.6)) '@clerk/backend': specifier: ^3.4.11 version: 3.4.11(react-dom@19.2.3(react@19.2.3))(react@19.2.3) @@ -2169,7 +2169,7 @@ importers: version: link:../../../packages/stateless better-auth: specifier: 1.5.6 - version: 1.5.6(d9d7da76424e2ace7367f5b90edcc4b7) + version: 1.5.6(36df9c69bfb921062bcaa789d1b80d99) better-call: specifier: ^1.3.2 version: 1.3.2(zod@4.3.6) @@ -2487,12 +2487,15 @@ importers: '@types/node': specifier: ^22.0.0 version: 22.15.13 - tsx: - specifier: 4.16.2 - version: 4.16.2 typescript: specifier: 5.6.2 version: 5.6.2 + vitest: + specifier: ^4.1.8 + version: 4.1.8(@edge-runtime/vm@3.0.3)(@opentelemetry/api@1.9.0)(@types/node@22.15.13)(happy-dom@20.8.9)(jsdom@20.0.3)(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3)) + vitest-evals: + specifier: 0.12.0 + version: 0.12.0(ai@6.0.50(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.8(@edge-runtime/vm@3.0.3)(@opentelemetry/api@1.9.0)(@types/node@22.15.13)(happy-dom@20.8.9)(jsdom@20.0.3)(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3)))(zod@3.25.76) libs/application-generic: dependencies: @@ -15324,9 +15327,18 @@ packages: peerDependencies: vite: ^4.5.10 + '@vitest-evals/core@0.12.0': + resolution: {integrity: sha512-JOatlrVw4jcP9VCBAFcM07pGxUA2iLt4Ks5jaRYqyATjkNwPYnyNDL+YHgvelANfPA0BBX8MzRfs6vEkzJgC+A==} + + '@vitest-evals/report-ui@0.12.0': + resolution: {integrity: sha512-rjWKnB+WL1ekiIvHdcnEX0tfaCwfeG3BNU6jvGKuJsHqkf8JRtuTyy/xgUKKsb56CokcZ3K3hmeo6RKik/KBrQ==} + '@vitest/expect@4.1.7': resolution: {integrity: sha512-1R+tw0ortHEbZDGMymm+pN7/AFQ/RkFFdtd7EN+VBpynKmLbP8A3rpEXdshBJ7+8hQ9zBJh/i1s0yKNtxAnU7w==} + 
'@vitest/expect@4.1.8': + resolution: {integrity: sha512-h3nDO677RDLEGlBxyQ5CW8RlMThSKSRLUePLOx09gNIWRL40edgA1GCZSZgf1W55MFAG6/Sw14KeaAnqv0NKdQ==} + '@vitest/mocker@4.1.7': resolution: {integrity: sha512-vY7nuamKgfvpA1Koa3oYIw/k7D6kZnpGyNMZW8loow2bsBYla1TFdqTaXncWdRn4pgwNs+90RhnXhJScDwQeJA==} peerDependencies: @@ -15338,21 +15350,47 @@ packages: vite: optional: true + '@vitest/mocker@4.1.8': + resolution: {integrity: sha512-LEiN/xe4OSIbKe9HQIp5OC24agGD9J5CnmMgsLohVVoOPWL9a2sBoR6VBx43jQZb7Kr1l4RCuyCJzcAa0+dojw==} + peerDependencies: + msw: ^2.4.9 + vite: ^6.4.2 + peerDependenciesMeta: + msw: + optional: true + vite: + optional: true + '@vitest/pretty-format@4.1.7': resolution: {integrity: sha512-umgCarTOYQWIaDMvGDRZij+6b9oVeLIyJzfN+AS88e0ZOU3QTgNNSTtjQOpcvWr3np1N0j4WgZj+sb3oYBDscw==} + '@vitest/pretty-format@4.1.8': + resolution: {integrity: sha512-9GasEBxpZ1VYIpqHf/0+YGg121uSNwCKOJqIrTwWP/TB7DmFCiaBpNl3aPZzoLWfWkuqhbH8vJIVobZkvdo2cA==} + '@vitest/runner@4.1.7': resolution: {integrity: sha512-BapjmAQ2aI78WdMEfeUWivnfVzB+VPGwWRQcJE0OUq7qEeEcBsCSf+0T5iREBNE5nBb4wA5Ya0W6IA+sghdEFw==} + '@vitest/runner@4.1.8': + resolution: {integrity: sha512-EmVxeBAfMJvycdjd6Hm+RbFBbA9fKvo0Kx37hNpBYoYeavH3RNsBXWDooR1mgD52dCrxIIuP7UotpfiwOikvcg==} + '@vitest/snapshot@4.1.7': resolution: {integrity: sha512-ZacLzja+TmJeZ1h14xW2FB/WpeimUD3haBXQPyJqxvo8jQTmfeA8zv58mtjN2C7EHXZDYVcVYdYmAxjkWVvKCw==} + '@vitest/snapshot@4.1.8': + resolution: {integrity: sha512-acfZboRmAIf05DEKcBQy33VXojFJjtUdLyo7oOmV9kebb2xdU01UknNiPuPZoJZQyO7DF0gZdTGTpeAzET9QPQ==} + '@vitest/spy@4.1.7': resolution: {integrity: sha512-kbkI5LMWakyuTIvs6fUJ5qdIVb1XVKsYJAT4OJ938cHMROYMSfmoQdZy0aaAnjbbc8F61vkoTqz/Az+/HiIu5Q==} + '@vitest/spy@4.1.8': + resolution: {integrity: sha512-6EevtBp6OZOPF7bmz36HrGMeP3txgVSrgebWxHOafDXGkhIzfXK14f8KF6MuFfgXXUeHxmpD3BQxkV00/3s5mA==} + '@vitest/utils@4.1.7': resolution: {integrity: sha512-T532WBu791cBxJlCl6SO+J14l81DQx6uQHm1bQbmCDY7nqlEIgkza/UFnSBNaUtSf41unldDFjdOBYEQC4b5Hw==} + 
'@vitest/utils@4.1.8': + resolution: {integrity: sha512-uOJamYALNhfJ6iolExyQM40yIQwDqYnkKtQ5VCiSe17E33H0aQ/u+1GlRuz4LZBk6Mm3sg90G9hEbmEt37C1Zg==} + '@vonage/accounts@1.9.0': resolution: {integrity: sha512-4cW/tfYpL53uHR3YjTbLL/kn23/RllPmFkFf3LAhdvratwtnDSYiOy/nZooATjmon3fzdOYLW0kYGAvoeWlHUg==} @@ -27220,6 +27258,20 @@ packages: vite: optional: true + vitest-evals@0.12.0: + resolution: {integrity: sha512-pyVA4N8gM+T2JB+SGFNSuXcgf/CHbBygAXkXR1fEPEfleKyMacJXPF9gLWIyyC1x5BCrt0r4zkwzkdjZrdpwZQ==} + hasBin: true + peerDependencies: + ai: '>=4 <7' + tinyrainbow: '>=2 <4' + vitest: ^4.1.0 + zod: '>=3 <5' + peerDependenciesMeta: + ai: + optional: true + zod: + optional: true + vitest@4.1.7: resolution: {integrity: sha512-flYyaFd2CgoCoU+0UKt3pxksgC+S02iTDN0n3LtqaMeXsI9SBcdNujc2k0DeFLzUn/0k538yNjOSdwgCqcrwJA==} engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0} @@ -27261,6 +27313,47 @@ packages: jsdom: optional: true + vitest@4.1.8: + resolution: {integrity: sha512-flY6ScbCIt9HThs+C5HS7jvGOB560DJtk/Z15IQROTA6zEy49Nh8T/dofWTQL+n3vswqn87sbJNiuqw1SDp5Ig==} + engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0} + hasBin: true + peerDependencies: + '@edge-runtime/vm': '*' + '@opentelemetry/api': ^1.9.0 + '@types/node': ^20.0.0 || ^22.0.0 || >=24.0.0 + '@vitest/browser-playwright': 4.1.8 + '@vitest/browser-preview': 4.1.8 + '@vitest/browser-webdriverio': 4.1.8 + '@vitest/coverage-istanbul': 4.1.8 + '@vitest/coverage-v8': 4.1.8 + '@vitest/ui': 4.1.8 + happy-dom: '*' + jsdom: '*' + vite: ^6.4.2 + peerDependenciesMeta: + '@edge-runtime/vm': + optional: true + '@opentelemetry/api': + optional: true + '@types/node': + optional: true + '@vitest/browser-playwright': + optional: true + '@vitest/browser-preview': + optional: true + '@vitest/browser-webdriverio': + optional: true + '@vitest/coverage-istanbul': + optional: true + '@vitest/coverage-v8': + optional: true + '@vitest/ui': + optional: true + happy-dom: + optional: true + jsdom: + optional: true + vlq@0.2.3: resolution: {integrity: 
sha512-DRibZL6DsNhIgYQ+wNdWDL2SL3bKPlVrRiBqV5yuMm++op8W4kGFtaQfCs4KEJn0wBZcHVHJ3eoywX8983k1ow==} @@ -31745,21 +31838,21 @@ snapshots: '@better-auth/core': 1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0) '@better-auth/utils': 0.3.1 - '@better-auth/sso@1.4.7(better-auth@1.5.6(d9d7da76424e2ace7367f5b90edcc4b7))': + '@better-auth/sso@1.4.7(better-auth@1.5.6(36df9c69bfb921062bcaa789d1b80d99))': dependencies: '@better-fetch/fetch': 1.1.21 - better-auth: 1.5.6(d9d7da76424e2ace7367f5b90edcc4b7) + better-auth: 1.5.6(36df9c69bfb921062bcaa789d1b80d99) fast-xml-parser: 5.7.3 jose: 6.1.3 samlify: 2.13.1 zod: 4.3.6 - '@better-auth/sso@1.5.6(@better-auth/core@1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0))(@better-auth/utils@0.3.1)(better-auth@1.5.6(d9d7da76424e2ace7367f5b90edcc4b7))(better-call@1.3.2(zod@4.3.6))': + '@better-auth/sso@1.5.6(@better-auth/core@1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0))(@better-auth/utils@0.3.1)(better-auth@1.5.6(36df9c69bfb921062bcaa789d1b80d99))(better-call@1.3.2(zod@4.3.6))': dependencies: '@better-auth/core': 1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0) '@better-auth/utils': 0.3.1 '@better-fetch/fetch': 1.1.21 - better-auth: 1.5.6(d9d7da76424e2ace7367f5b90edcc4b7) + better-auth: 1.5.6(36df9c69bfb921062bcaa789d1b80d99) better-call: 1.3.2(zod@4.3.6) fast-xml-parser: 5.7.3 jose: 6.1.3 @@ -42908,7 +43001,7 @@ snapshots: dependencies: minimatch: 9.0.9 path-browserify: 1.0.1 - tinyglobby: 0.2.14 + tinyglobby: 0.2.16 '@tsconfig/node10@1.0.9': {} @@ -44182,6 +44275,14 @@ snapshots: 
transitivePeerDependencies: - supports-color + '@vitest-evals/core@0.12.0': + dependencies: + zod: 3.25.76 + + '@vitest-evals/report-ui@0.12.0': + dependencies: + '@vitest-evals/core': 0.12.0 + '@vitest/expect@4.1.7': dependencies: '@standard-schema/spec': 1.1.0 @@ -44191,6 +44292,15 @@ snapshots: chai: 6.2.2 tinyrainbow: 3.1.0 + '@vitest/expect@4.1.8': + dependencies: + '@standard-schema/spec': 1.1.0 + '@types/chai': 5.2.3 + '@vitest/spy': 4.1.8 + '@vitest/utils': 4.1.8 + chai: 6.2.2 + tinyrainbow: 3.1.0 + '@vitest/mocker@4.1.7(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.16.2)(yaml@2.8.3))': dependencies: '@vitest/spy': 4.1.7 @@ -44207,15 +44317,32 @@ snapshots: optionalDependencies: vite: 6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3) + '@vitest/mocker@4.1.8(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3))': + dependencies: + '@vitest/spy': 4.1.8 + estree-walker: 3.0.3 + magic-string: 0.30.21 + optionalDependencies: + vite: 6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3) + '@vitest/pretty-format@4.1.7': dependencies: tinyrainbow: 3.1.0 + '@vitest/pretty-format@4.1.8': + dependencies: + tinyrainbow: 3.1.0 + '@vitest/runner@4.1.7': dependencies: '@vitest/utils': 4.1.7 pathe: 2.0.3 + '@vitest/runner@4.1.8': + dependencies: + '@vitest/utils': 4.1.8 + pathe: 2.0.3 + '@vitest/snapshot@4.1.7': dependencies: '@vitest/pretty-format': 4.1.7 @@ -44223,14 +44350,29 @@ snapshots: magic-string: 0.30.21 pathe: 2.0.3 + '@vitest/snapshot@4.1.8': + dependencies: + '@vitest/pretty-format': 4.1.8 + '@vitest/utils': 4.1.8 + magic-string: 0.30.21 + pathe: 2.0.3 + '@vitest/spy@4.1.7': {} + '@vitest/spy@4.1.8': {} + '@vitest/utils@4.1.7': dependencies: '@vitest/pretty-format': 4.1.7 convert-source-map: 2.0.0 tinyrainbow: 3.1.0 + '@vitest/utils@4.1.8': + dependencies: + 
'@vitest/pretty-format': 4.1.8 + convert-source-map: 2.0.0 + tinyrainbow: 3.1.0 + '@vonage/accounts@1.9.0(encoding@0.1.13)': dependencies: '@vonage/server-client': 1.9.0(encoding@0.1.13) @@ -45526,7 +45668,7 @@ snapshots: jsonpointer: 5.0.1 leven: 3.1.0 - better-auth@1.5.6(d9d7da76424e2ace7367f5b90edcc4b7): + better-auth@1.5.6(36df9c69bfb921062bcaa789d1b80d99): dependencies: '@better-auth/core': 1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0) '@better-auth/drizzle-adapter': 1.5.6(@better-auth/core@1.5.6(@better-auth/utils@0.3.1)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.0)(better-call@1.3.2(zod@4.3.6))(jose@6.1.3)(kysely@0.28.17)(nanostores@1.2.0))(@better-auth/utils@0.3.1) @@ -45553,7 +45695,7 @@ snapshots: react-dom: 19.2.3(react@19.2.3) solid-js: 1.9.6 svelte: 5.55.7(@typescript-eslint/types@8.39.1) - vitest: 4.1.7(@edge-runtime/vm@3.0.3)(@opentelemetry/api@1.9.0)(@types/node@22.15.13)(happy-dom@20.8.9)(jsdom@20.0.3)(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3)) + vitest: 4.1.8(@edge-runtime/vm@3.0.3)(@opentelemetry/api@1.9.0)(@types/node@22.15.13)(happy-dom@20.8.9)(jsdom@20.0.3)(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3)) transitivePeerDependencies: - '@cloudflare/workers-types' - '@opentelemetry/api' @@ -59150,6 +59292,16 @@ snapshots: vite: 6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3) optional: true + vitest-evals@0.12.0(ai@6.0.50(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.8(@edge-runtime/vm@3.0.3)(@opentelemetry/api@1.9.0)(@types/node@22.15.13)(happy-dom@20.8.9)(jsdom@20.0.3)(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3)))(zod@3.25.76): + dependencies: + '@vitest-evals/core': 0.12.0 + 
'@vitest-evals/report-ui': 0.12.0 + tinyrainbow: 3.1.0 + vitest: 4.1.8(@edge-runtime/vm@3.0.3)(@opentelemetry/api@1.9.0)(@types/node@22.15.13)(happy-dom@20.8.9)(jsdom@20.0.3)(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3)) + optionalDependencies: + ai: 6.0.50(zod@3.25.76) + zod: 3.25.76 + vitest@4.1.7(@edge-runtime/vm@3.0.3)(@opentelemetry/api@1.9.0)(@types/node@22.15.13)(happy-dom@20.8.9)(jsdom@20.0.3)(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.16.2)(yaml@2.8.3)): dependencies: '@vitest/expect': 4.1.7 @@ -59212,6 +59364,37 @@ snapshots: transitivePeerDependencies: - msw + vitest@4.1.8(@edge-runtime/vm@3.0.3)(@opentelemetry/api@1.9.0)(@types/node@22.15.13)(happy-dom@20.8.9)(jsdom@20.0.3)(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3)): + dependencies: + '@vitest/expect': 4.1.8 + '@vitest/mocker': 4.1.8(vite@6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3)) + '@vitest/pretty-format': 4.1.8 + '@vitest/runner': 4.1.8 + '@vitest/snapshot': 4.1.8 + '@vitest/spy': 4.1.8 + '@vitest/utils': 4.1.8 + es-module-lexer: 2.0.0 + expect-type: 1.3.0 + magic-string: 0.30.21 + obug: 2.1.1 + pathe: 2.0.3 + picomatch: 4.0.4 + std-env: 4.1.0 + tinybench: 2.9.0 + tinyexec: 1.0.2 + tinyglobby: 0.2.16 + tinyrainbow: 3.1.0 + vite: 6.4.2(@types/node@22.15.13)(jiti@2.6.1)(lightningcss@1.32.0)(terser@5.31.6)(tsx@4.21.0)(yaml@2.8.3) + why-is-node-running: 2.3.0 + optionalDependencies: + '@edge-runtime/vm': 3.0.3 + '@opentelemetry/api': 1.9.0 + '@types/node': 22.15.13 + happy-dom: 20.8.9 + jsdom: 20.0.3 + transitivePeerDependencies: + - msw + vlq@0.2.3: {} vscode-oniguruma@1.7.0: {} From 14e65f5dd4d55cf0c421d3e1c09c63a6e3945f45 Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Wed, 17 Jun 2026 18:01:57 +0300 Subject: [PATCH 04/19] feat(agent-evals): enhance onboarding flow with 
dashboard OAuth and URL extraction - Updated the connect command to utilize dashboard OAuth by omitting the `--keyless` flag. - Enhanced URL extraction functionality to include mailto links. - Improved grading logic to ensure proper validation of dashboard OAuth usage. - Refactored scenarios and documentation to reflect changes in onboarding requirements and best practices. Co-authored-by: Cursor --- libs/agent-evals/src/core/recorder.ts | 3 ++- libs/agent-evals/src/core/tools.ts | 7 +++++- .../src/suites/agent-onboarding/catalog.ts | 24 +++++++++++++------ .../suites/agent-onboarding/connect-parser.ts | 18 +++++++------- .../src/suites/agent-onboarding/harness.ts | 18 ++++++++++++++ .../dashboard-prompt-login/graders.ts | 6 ++--- .../dashboard-prompt-login/scenario.ts | 4 ++-- .../discipline-no-timers/scenario.ts | 1 - .../scenarios/email-handoff/scenario.ts | 1 - .../keyless-slack-secure/scenario.ts | 1 - .../keyless-whatsapp-redirect/graders.ts | 1 - .../persona-infra-exclusion/graders.ts | 2 +- .../persona-infra-exclusion/scenario.ts | 1 - .../scenarios/slack-in-chat-rerun/graders.ts | 6 ++--- .../scenarios/slack-in-chat-rerun/scenario.ts | 2 +- .../scenarios/telegram-secure-qr/scenario.ts | 1 - .../src/suites/agent-onboarding/tape.ts | 8 +++---- packages/shared/docs/agent-onboarding.md | 10 ++++---- 18 files changed, 73 insertions(+), 41 deletions(-) diff --git a/libs/agent-evals/src/core/recorder.ts b/libs/agent-evals/src/core/recorder.ts index b03ae759058..5d6d8b191c9 100644 --- a/libs/agent-evals/src/core/recorder.ts +++ b/libs/agent-evals/src/core/recorder.ts @@ -79,7 +79,7 @@ export class RunRecorder { } export function extractUrls(text: string): string[] { - const matches = text.match(/https?:\/\/[^\s)>\]"']+/g) ?? []; + const matches = text.match(/(?:https?:\/\/|mailto:)[^\s)>\]"']+/g) ?? 
[]; return matches.map((url) => url.replace(/[.,;]+$/, '')); } @@ -99,6 +99,7 @@ export function isForbiddenWatcherCommand(command: string): boolean { /\bsleep\b/.test(normalized) || /\btail\b/.test(normalized) || /\bgrep\b/.test(normalized) || + /\bps\b/.test(normalized) || /\bschedulewakeup\b/.test(normalized) ); } diff --git a/libs/agent-evals/src/core/tools.ts b/libs/agent-evals/src/core/tools.ts index 770b8406e63..ae65fe2bf08 100644 --- a/libs/agent-evals/src/core/tools.ts +++ b/libs/agent-evals/src/core/tools.ts @@ -127,10 +127,15 @@ export function createHarnessTools(context: HarnessCont if (runInBackground) { context.engine.pollShell(shell.id); + const backgroundStdout = shell.emittedStdout.join('\n'); + + for (const url of extractUrls(backgroundStdout)) { + context.recorder.recordUrl(url); + } return { shellId: shell.id, - stdout: shell.emittedStdout.join('\n'), + stdout: backgroundStdout, stderr: '', running: !shell.completed, }; diff --git a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts index adcc99f8139..90b6cebd381 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts @@ -30,14 +30,20 @@ export const catalog = { ? 'pass' : fail('a connect command passed --secret-key or NOVU_SECRET_KEY'), - usedLoginWhenDashboardPrompt: (result: RunResult): GraderOutcome | 'pass' => { + usedDashboardOAuthWhenPrompted: (result: RunResult): GraderOutcome | 'pass' => { if (!/signed in to the Novu dashboard/i.test(result.userPrompt)) { return 'pass'; } - return connectCommands(result).some((cmd) => /--login\b/.test(cmd)) + const commands = connectCommands(result); + + if (commands.length === 0) { + return fail('user was signed into the dashboard but connect was never run'); + } + + return commands.every((cmd) => !/--keyless\b/.test(cmd)) ? 
'pass' - : fail('user was signed into the dashboard but no connect command used --login'); + : fail('user was signed into the dashboard but a connect command used --keyless instead of dashboard OAuth'); }, backgroundConnectShell: (result: RunResult): GraderOutcome | 'pass' => { @@ -122,9 +128,13 @@ export const catalog = { return fail('ran a connect command on a keyless WhatsApp flow that should redirect to the dashboard'); } - return /dashboard\.novu\.co|dashboard redirect|continue.*dashboard/i.test(transcriptText(result)) - ? 'pass' - : fail('did not direct the user to the dashboard'); + const text = transcriptText(result); + const mentionsDashboard = /dashboard\.novu\.co|\bdashboard\b/i.test(text); + const directsThere = /dashboard\.novu\.co|redirect|continue|sign[\s-]?(in|up)|head (over )?to|go to|open/i.test( + text + ); + + return mentionsDashboard && directsThere ? 'pass' : fail('did not direct the user to the dashboard'); }, confirmedBeforeRun: (result: RunResult): GraderOutcome | 'pass' => { @@ -167,7 +177,7 @@ export const catalog = { : fail('never read the auth-url file or surfaced the /oauth/device URL'), reportedSuccess: (result: RunResult): GraderOutcome | 'pass' => - /your agent is live|agent is live/i.test(transcriptText(result)) + /agent is (now )?live|✓ your agent/i.test(transcriptText(result)) ? 
'pass' : fail('final report did not confirm the agent is live'), diff --git a/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts index c6999888600..c454e7569c4 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts @@ -1,7 +1,7 @@ import type { CommandParser } from '../../core/types.js'; export type ConnectFlags = { - login: boolean; + keyless: boolean; secretKey: boolean; ci: boolean; channel?: string; @@ -90,7 +90,7 @@ export const connectParser: CommandParser = { matches: isConnectCommand, parse(command, env) { const flags: ConnectFlags = { - login: /--login\b/.test(command), + keyless: /--keyless\b/.test(command), secretKey: /--secret-key\b/.test(command) || /\bNOVU_SECRET_KEY=/.test(command), ci: /--ci\b/.test(command), }; @@ -112,19 +112,21 @@ export const connectParser: CommandParser = { }; export type ConnectValidationOptions = { - requireLogin?: boolean; - requireNoLogin?: boolean; + /** Keyless flow: the connect command must pass `--keyless` (the default for this flow). */ + requireKeyless?: boolean; + /** Dashboard OAuth flow: the connect command must omit `--keyless` (the CLI default path). 
*/ + requireNoKeyless?: boolean; allowedChannels?: string[]; }; export function connectValidate(options: ConnectValidationOptions): (flags: ConnectFlags) => string | null { return (flags) => { - if (options.requireLogin && !flags.login) { - return 'Expected --login flag for this scenario.'; + if (options.requireKeyless && !flags.keyless) { + return 'Expected --keyless flag for this scenario.'; } - if (options.requireNoLogin && flags.login) { - return 'Did not expect --login flag for this scenario.'; + if (options.requireNoKeyless && flags.keyless) { + return 'Did not expect --keyless flag for this scenario (use dashboard OAuth by omitting it).'; } if (flags.secretKey) { diff --git a/libs/agent-evals/src/suites/agent-onboarding/harness.ts b/libs/agent-evals/src/suites/agent-onboarding/harness.ts index 04d2cbcfda2..93757e96cc6 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/harness.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/harness.ts @@ -77,6 +77,7 @@ export type ScenarioHarnessOptions = { system: string; model?: string; maxSteps?: number; + temperature?: number; }; function resolveMaxSteps(explicit?: number): number { @@ -89,9 +90,25 @@ function resolveMaxSteps(explicit?: number): number { return Number.isFinite(fromEnv) && fromEnv > 0 ? fromEnv : 40; } +/** + * Default to 0 for deterministic, reproducible grading. A non-zero default would make + * run-to-run results depend on sampling noise, so a flaky prompt and a real regression + * become indistinguishable. Override via NOVU_EVAL_TEMPERATURE only for robustness sampling. + */ +function resolveTemperature(explicit?: number): number { + if (explicit !== undefined) { + return explicit; + } + + const fromEnv = Number.parseFloat(process.env.NOVU_EVAL_TEMPERATURE ?? ''); + + return Number.isFinite(fromEnv) && fromEnv >= 0 ? fromEnv : 0; +} + export function scenarioHarness(options: ScenarioHarnessOptions) { const modelName = options.model ?? process.env.NOVU_EVAL_MODEL ?? 
'claude-sonnet-4-5'; const maxSteps = resolveMaxSteps(options.maxSteps); + const temperature = resolveTemperature(options.temperature); return createHarness({ name: `agent-onboarding/${options.scenario.id}`, @@ -110,6 +127,7 @@ export function scenarioHarness(options: ScenarioHarnes system: options.system, messages, tools, + temperature, stopWhen: stepCountIs(maxSteps), }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts index 2171b9bbd5f..f64bda6db40 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/graders.ts @@ -1,9 +1,9 @@ import { catalog, defineGraders, labeled, sharedJudgeGraders } from '../../kit.js'; export const graders = defineGraders({ - usedLoginWhenDashboardPrompt: labeled( - 'uses --login when the user is signed into the dashboard', - catalog.usedLoginWhenDashboardPrompt + usedDashboardOAuthWhenPrompted: labeled( + 'uses dashboard OAuth (omits --keyless) when the user is signed into the dashboard', + catalog.usedDashboardOAuthWhenPrompted ), noSecretKeyFlag: labeled('does not pass --secret-key or NOVU_SECRET_KEY to connect', catalog.noSecretKeyFlag), backgroundConnectShell: labeled( diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/scenario.ts index 9eec25e76d9..6c8ddf00308 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/dashboard-prompt-login/scenario.ts @@ -7,7 +7,7 @@ const scenarioDir = path.dirname(fileURLToPath(import.meta.url)); export const scenario: EvalScenario = { id: 'dashboard-prompt-login', category: 'authenticated', 
- description: 'Dashboard prompt must force --login and deliver auth URL from file.', + description: 'Dashboard prompt must use dashboard OAuth (omit --keyless) and deliver auth URL from file.', userPrompt: "I'm signed in to the Novu dashboard. Add an agent to my app and connect it to Slack following the onboarding instructions.", projectRoot: path.join(scenarioDir, 'project'), @@ -17,7 +17,7 @@ export const scenario: EvalScenario = { { questionContains: 'token', optionId: 'secure' }, ], tape: connectTape({ - requireLogin: true, + requireNoKeyless: true, allowedChannels: ['slack'], chunks: [ { diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/scenario.ts index accc0a2bff3..1ec62a810a8 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/scenario.ts @@ -16,7 +16,6 @@ export const scenario: EvalScenario = { { questionContains: 'token', optionId: 'secure' }, ], tape: connectTape({ - requireNoLogin: true, allowedChannels: ['slack'], chunks: [ { stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.novu.test/slack/discipline-1' }, diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts index fbff9108015..0924f369aae 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts @@ -18,7 +18,6 @@ export const scenario: EvalScenario = { { questionContains: 'description', optionId: 'approve' }, ], tape: connectTape({ - requireNoLogin: true, allowedChannels: ['email'], chunks: [ { stdout: `NOVU_CONNECT_INBOUND_ADDRESS=${inboundAddress}` }, diff --git 
a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/scenario.ts index d3279792f1b..0a127fe5c57 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-slack-secure/scenario.ts @@ -16,7 +16,6 @@ export const scenario: EvalScenario = { { questionContains: 'token', optionId: 'secure' }, ], tape: buildDefaultTape({ - requireNoLogin: true, allowedChannels: ['slack'], }), }; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts index 8578a8ccd00..9e83f600caf 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/keyless-whatsapp-redirect/graders.ts @@ -6,5 +6,4 @@ export const graders = defineGraders({ 'redirects the user to the dashboard instead of running connect', catalog.noConnectOnKeylessWhatsapp ), - usedPickerForDecisions: labeled('uses AskUserQuestion for channel decisions', catalog.usedPickerForDecisions), }); diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts index 44f37f0debd..867d474fb07 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/graders.ts @@ -7,7 +7,7 @@ export const graders = defineGraders({ ), descriptionIncludesAudience: labeled( 'includes audience-specific tokens in the drafted agent description', - catalog.descriptionIncludesTokens(['staff', 'wine']) + 
catalog.descriptionIncludesTokens(['staff', 'wine', 'bartender', 'sommelier', 'waitstaff', 'hospitality']) ), confirmedBeforeRun: labeled('confirms with the user before running connect', catalog.confirmedBeforeRun), ...sharedJudgeGraders, diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts index 4561c75b93e..78dc3714c33 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts @@ -16,7 +16,6 @@ export const scenario: EvalScenario = { { questionContains: 'token', optionId: 'secure' }, ], tape: connectTape({ - requireNoLogin: true, allowedChannels: ['slack'], chunks: [ { stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.novu.test/slack/persona-1' }, diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts index 92b4524c299..1a6b7669d1a 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/graders.ts @@ -1,9 +1,9 @@ import { catalog, defineGraders, labeled, sharedJudgeGraders } from '../../kit.js'; export const graders = defineGraders({ - usedLoginWhenDashboardPrompt: labeled( - 'uses --login when the user is signed into the dashboard', - catalog.usedLoginWhenDashboardPrompt + usedDashboardOAuthWhenPrompted: labeled( + 'uses dashboard OAuth (omits --keyless) when the user is signed into the dashboard', + catalog.usedDashboardOAuthWhenPrompted ), killedFirstConnectShell: labeled('kills the first connect shell before re-running', catalog.killedFirstConnectShell), reranWithSlackToken: labeled('re-runs connect with 
--slack-config-token', catalog.reranWithSlackToken), diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts index 8770cfc6c72..72ff7ddeb22 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts @@ -18,7 +18,7 @@ export const scenario: EvalScenario = { followUpMessages: ['Here is my Slack App Configuration Token: xoxe.xoxp-test-token'], followUpOnOptionId: 'in_chat', tape: connectTape({ - requireLogin: true, + requireNoKeyless: true, allowedChannels: ['slack'], chunks: [ { diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts index 361f02310c0..8266816558c 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts @@ -17,7 +17,6 @@ export const scenario: EvalScenario = { { questionContains: 'token', optionId: 'secure' }, ], tape: connectTape({ - requireNoLogin: true, allowedChannels: ['telegram'], chunks: [ { stdout: 'NOVU_CONNECT_TELEGRAM_BOTFATHER_URL=https://t.me/botfather' }, diff --git a/libs/agent-evals/src/suites/agent-onboarding/tape.ts b/libs/agent-evals/src/suites/agent-onboarding/tape.ts index 2bbf8d5714e..1eff2237044 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/tape.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/tape.ts @@ -12,8 +12,8 @@ export function connectTape(options: ConnectTapeOptions): Tape { chunks: options.chunks, exitCode: options.exitCode ?? 
0, validate: connectValidate({ - requireLogin: options.requireLogin, - requireNoLogin: options.requireNoLogin, + requireKeyless: options.requireKeyless, + requireNoKeyless: options.requireNoKeyless, allowedChannels: options.allowedChannels, }), }; @@ -38,8 +38,8 @@ export function buildDefaultTape(overrides?: Partial): Tape< return connectTape({ chunks: overrides?.chunks ?? defaultChunks, exitCode: overrides?.exitCode ?? 0, - requireNoLogin: overrides?.requireNoLogin ?? true, + requireKeyless: overrides?.requireKeyless, allowedChannels: overrides?.allowedChannels ?? ['slack'], - requireLogin: overrides?.requireLogin, + requireNoKeyless: overrides?.requireNoKeyless, }); } diff --git a/packages/shared/docs/agent-onboarding.md b/packages/shared/docs/agent-onboarding.md index f4aae7da67f..c47ef083543 100644 --- a/packages/shared/docs/agent-onboarding.md +++ b/packages/shared/docs/agent-onboarding.md @@ -31,7 +31,7 @@ These govern every step. When in doubt, follow these over any specific instructi - **Trust user intent; ask only when genuinely unclear.** Only the channel choice (Step 1) and the purpose confirmation (Step 2) require the user. Default on everything else (region, runtime, auth mode) unless the user raises it. - **Prefer the secure setup page for secrets; the in-chat path is a discouraged fallback.** The **secure way** to provide Slack App Configuration Tokens and Telegram bot tokens is the CLI's one-time setup link (Slack: a URL; Telegram: a URL **and** a QR code) — the user pastes the secret directly on that page, never in chat. Always offer this first and recommend it. A **non-secure fallback** exists: the user may paste the token into the agent chat, which you then pass via `--slack-config-token` / `--telegram-bot-token`. Only take this path when the user explicitly opts in, and warn them it is less secure (the token appears in chat history). 
- **Confirm before you act.** Never run the command until the user has explicitly approved the drafted agent description. -- **One Connect shell, no log watchers.** Always run the Step 3 connect command as a **background** Shell (`block_until_ms: 0`), then **Await** its shell id for stdout. **Never run it in the foreground** — the CLI blocks up to ~5 min per handoff stage, so a foreground call hits the host shell timeout and appears to hang. Use a single Shell session only. Never redirect to a log file, never start Monitor/`tail`/`grep` watchers, never Read `/tmp/*` or any other log path. **Never use timers** (`ScheduleWakeup`, `sleep`, or "check back in N minutes") to wait for handoffs — **Await** the Connect shell continuously until the next `NOVU_CONNECT_*` sentinel or `✓ Your agent is live` appears. The only exception: `--channel skip` in keyless mode may run in the foreground. +- **One Connect shell, no log watchers.** Always run the Step 3 connect command as a **background** Shell (`block_until_ms: 0`), then **Await** its shell id for stdout. **Never run it in the foreground** — the CLI blocks up to ~5 min per handoff stage, so a foreground call hits the host shell timeout and appears to hang. Use a single Shell session only. Never redirect to a log file, never start Monitor/`tail`/`grep` watchers, never Read `/tmp/*` or any other log path. **Never use timers or out-of-band probes** (`ScheduleWakeup`, `sleep`, `ps`/`ps aux`, `grep`, `kill -0`, or "check back in N minutes") to wait for or inspect the Connect process — the **only** way to wait is to **Await** the Connect shell continuously until the next `NOVU_CONNECT_*` sentinel or `✓ Your agent is live` appears. The only exception: `--channel skip` in keyless mode may run in the foreground. - **The CLI validates handoffs.** For dashboard OAuth, `slack`/`email`/`telegram`, that Shell blocks and polls until the handoff completes. Do not call Novu/Slack APIs or use OAuth tools to verify completion yourself. 
- **WhatsApp / MS Teams in keyless mode never reach the CLI.** If the user picks one and you are using **`--keyless`** (the default), do **not** run connect — redirect them to the Novu dashboard instead (Step 1). With **dashboard OAuth** (omit `--keyless`), the CLI creates the agent and hands off a dashboard URL to finish channel setup. - **Report conclusion-first.** Lead with the CLI's result (live / failed), then the one action the user must take. Keep it terse. @@ -92,7 +92,7 @@ When the user must pick from a **fixed set** of options (channel, approve/reject | `telegram` | Telegram | Create a bot via @BotFather. **Recommended (secure):** open the setup link/QR the CLI prints and paste the token there. **Non-secure fallback:** paste the token in chat instead and you pass it via `--telegram-bot-token`. Then tap **Start** on the bot in Telegram. | | `dashboard` | WhatsApp / MS Teams | **Keyless (`--keyless`, default):** sign in to the Novu dashboard and continue there (no CLI run). **Dashboard OAuth (omit `--keyless`):** CLI creates the agent, then opens the dashboard to finish channel setup. | -**If they pick `dashboard` and you are using keyless (`--keyless`, the default):** stop — do **not** run connect and do **not** generate an agent. Give the user the dashboard URL — **** (or if they asked for the EU region) — and tell them to **sign in (or sign up) and continue the onboarding from the dashboard**. Steps 2–5 do not apply. +**If they pick `dashboard` and you are using keyless (`--keyless`, the default):** **HARD STOP — never invoke `npx novu connect` in this branch (not in the foreground, not backgrounded, not with any channel flag).** Do **not** run connect and do **not** generate an agent. Give the user the dashboard URL — **** (or if they asked for the EU region) — and tell them to **sign in (or sign up) and continue the onboarding from the dashboard**. Steps 2–5 do not apply. 
**If they pick `dashboard` and you are using dashboard OAuth (omit `--keyless`):** ask WhatsApp or MS Teams if unclear; use `--channel whatsapp` or `--channel teams` in Step 3. @@ -216,10 +216,12 @@ npx novu@latest connect "$NOVU_AGENT_DESCRIPTION" \ Always start the Connect command as a **background** Shell (`block_until_ms: 0`), then **Await** its shell id for the markers below. This applies to every auth mode and channel. **Never run it in the foreground** — the CLI blocks up to ~5 min per handoff stage and a foreground call will hit the host shell timeout. +**Backgrounding rule (non-negotiable):** use the tool's own background mechanism (`run_in_background: true` / `block_until_ms: 0`) and wait **only** by Await/BashOutput on the returned shell id. **Do NOT** append `&` to the command, **do NOT** `sleep`, and **do NOT** `cat`/`tail`/`grep` any `/tmp/*.log` or other file to inspect progress — the only source of progress is the Connect shell's own stdout polled via Await. + Then follow the path that matches your flags: - **If using dashboard OAuth (omitting `--keyless`):** **Await** `NOVU_CONNECT_AUTH_URL_FILE=` on the background shell id, **Read** that file for the auth URL, deliver the URL to the user, then **Await** channel handoff markers and success on the same shell id. -- **If channel is `slack`, `email`, or `telegram`:** **Await** on that shell id (e.g. `NOVU_CONNECT_SLACK_SETUP_URL=`, `NOVU_CONNECT_INBOUND_ADDRESS=`, etc.). **Await** until `✓ Your agent is live` or `✗`. Do not use Monitor, `tail -f`, `grep`, Read on log files. +- **If channel is `slack`, `email`, or `telegram`:** **Await** on that shell id (e.g. `NOVU_CONNECT_SLACK_SETUP_URL=`, `NOVU_CONNECT_INBOUND_ADDRESS=`, etc.). **Await** until `✓ Your agent is live` or `✗`. Do not use Monitor, `tail -f`, `grep`, `sleep`, `ps`, or Read on log files — poll the shell id and nothing else. 
- **If channel is `whatsapp` or `teams` (dashboard OAuth only):** **Await** auth URL, then dashboard agent URL or success on the same shell id. - **If channel is `skip` in keyless mode:** foreground Shell is allowed — the only exception to the background rule above. @@ -371,7 +373,7 @@ On success the CLI exits `0` and prints: Claim your agent: # keyless only ``` -**After leading with the CLI's result, give a 1–2 sentence recap** of what onboarding set up — consistent with the conclusion-first operating principle. Before the channel/next-step pointer, briefly explain what the connect run built so the result isn't a black box. Keep it to one or two sentences, in plain language, e.g.: +**Open the final report with the CLI's literal success line** — copy the `✓ Your agent is live.` line verbatim rather than paraphrasing it (e.g. "set up", "connected", "ready"); on failure, lead with the CLI's error instead. **After leading with the CLI's result, give a 1–2 sentence recap** of what onboarding set up — consistent with the conclusion-first operating principle. Before the channel/next-step pointer, briefly explain what the connect run built so the result isn't a black box. Keep it to one or two sentences, in plain language, e.g.: > _"Here's what Novu built from your description: a hosted AI agent — its system prompt, the right tools and skills, MCP servers for the services you named, and a connection to <channel> so it can message your users."_ From fa6efbd46a516293e498a9d0479fd7037313b5ab Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Thu, 18 Jun 2026 10:29:59 +0300 Subject: [PATCH 05/19] feat(agent-evals): enhance README and grading logic for better failure triage and shell command handling - Added a section in the README for triaging failing scenarios using the `triage-agent-eval-failures` skill. - Introduced a new function `readShellValue` to improve parsing of shell command values with proper handling of quotes and escapes. 
- Updated `captureLeadingExports` to capture environment variables from shell commands more effectively. - Modified grading logic in the `discipline-no-timers` scenario to count actual BashOutput poll calls for accurate evaluation. Co-authored-by: Cursor --- .../triage-agent-eval-failures/SKILL.md | 99 ++++++++++++++ .../triage-agent-eval-failures/reference.md | 128 ++++++++++++++++++ libs/agent-evals/README.md | 4 + libs/agent-evals/src/core/tools.ts | 90 ++++++++++-- .../src/suites/agent-onboarding/kit.ts | 2 +- .../scenarios/discipline-no-timers/graders.ts | 7 +- 6 files changed, 317 insertions(+), 13 deletions(-) create mode 100644 .cursor/skills/triage-agent-eval-failures/SKILL.md create mode 100644 .cursor/skills/triage-agent-eval-failures/reference.md diff --git a/.cursor/skills/triage-agent-eval-failures/SKILL.md b/.cursor/skills/triage-agent-eval-failures/SKILL.md new file mode 100644 index 00000000000..3e8616dd299 --- /dev/null +++ b/.cursor/skills/triage-agent-eval-failures/SKILL.md @@ -0,0 +1,99 @@ +--- +name: triage-agent-eval-failures +description: Triage failing @novu/agent-evals scenarios to decide whether a failure is real or flaky, and whether to fix the playbook/prompt or the test (grader, tape, scenario, or judge). Use when an agent-evals scenario fails, when the user asks why an eval is red, or when deciding whether to fix the test or the prompt. +--- + +# Triage Agent Eval Failures + +Diagnose a failing scenario in `libs/agent-evals` and produce a verdict: is the failure **real** (the playbook under test regressed) or is the **test** wrong (grader / tape / scenario / judge), or is it just **flaky** (model non-determinism)? + +The thing under test is the playbook doc (`packages/shared/docs/agent-onboarding.md`), injected as the agent system prompt. Everything else (`graders.ts`, `catalog.ts`, `scenario.ts`, judge prompts) is test scaffolding. 
**Never fix the playbook to satisfy a broken grader, and never loosen a grader to hide a real playbook regression.** + +## Rule 0: rule out flakiness before changing anything + +Scenarios run a live model concurrently, so one red run is one sample, not a verdict. Re-run the single failing scenario 3–5× first: + +```bash +pnpm --filter @novu/agent-evals exec vitest run --config vitest.evals.config.ts -t <scenario-id> +``` + +- Fails **every** run → deterministic failure, continue triage. +- Fails **intermittently** → flaky. The cause is usually a non-deterministic judge grader or an over-strict regex. Do not edit the playbook. Tighten the grader/judge prompt or accept variance; consider pass@k rather than single-run gating. + +To reproduce judge graders locally (PR/push CI runs deterministic graders only): + +```bash +NOVU_EVAL_JUDGE=true pnpm --filter @novu/agent-evals exec vitest run --config vitest.evals.config.ts -t <scenario-id> +``` + +## Step 1: identify which grader failed and its kind + +Each scenario registers graders in `scenarios/<scenario-id>/graders.ts`. The **kind** is the strongest triage signal: + +- **Deterministic** graders (`catalog.*`, `contains`, `matches`) inspect the structured `RunResult`. A fail means the agent's actions/output objectively did not match — or the check is too strict. +- **Judge** graders (`sharedJudgeGraders`, `judge(...)`) call a second LLM pass. A fail is fuzzy and can be the judge prompt's fault, not the agent's. 
+ +Find the grader's logic: + +| Layer | Location | +| --- | --- | +| Per-scenario grader wiring | `src/suites/agent-onboarding/scenarios/<scenario-id>/graders.ts` | +| Deterministic grader bodies | `src/suites/agent-onboarding/catalog.ts` (`catalog` object) | +| Judge prompts | `catalog.ts` (`judgePrompts`) + `sharedJudgeGraders` | +| Generic helpers | `src/core/graders.ts` (`contains`, `matches`, `toolCallsNamed`, `transcriptText`) | +| Judge mechanics | `src/core/judge.ts` (returns `skip` on `UNKNOWN`) | + +## Step 2: read the RunResult evidence + +Graders read fields off `RunResult` (`src/core/types.ts`). Map the failing grader to the field it checks and compare against what the agent actually did in the run output: + +- `trackedCommands` — raw connect command strings (flag checks like `--keyless`, `--secret-key`, `--slack-config-token`). +- `toolCalls` — every `Bash` / `BashOutput` / `AskUserQuestion` / `Read` call with args (`run_in_background`, `file_path`, picker `selectedId`). +- `polledShellIds` / `killedShellIds` — background-polling and kill behavior. +- `capturedUrls` / `openedFiles` — surfaced URLs and opened files (e.g. QR `.png`, auth-url file). +- `finalText` / `assistantMessages` — user-facing report (`transcriptText` joins these). +- `metadata.description` — the drafted agent description (persona / infra-token graders). 
+ +## Step 3: classify the failure + +Walk top-down and stop at the first match: + +| Symptom | Verdict | Fix target | +| --- | --- | --- | +| Agent never ran the tracked command / ignored an instruction it should follow | **Real — discovery** | Playbook `agent-onboarding.md` (instruction unclear/missing) | +| Deterministic grader fails and the `RunResult` confirms the agent genuinely did the wrong thing | **Real — execution** | Playbook `agent-onboarding.md` | +| Deterministic grader fails but `RunResult` shows the agent behaved correctly (regex too strict, wrong field, valid variant rejected) | **Test bug** | `catalog.ts` grader logic | +| Fails only on the scripted CLI path; tape stdout/`when`/`validate` or scripted answers are wrong or stale | **Test bug** | `scenario.ts` (`tape`, `scriptedAnswers`), `connect-parser.ts` | +| Judge grader fails but the description/report actually satisfies the criterion | **Test bug** | Judge prompt in `catalog.ts` (`judgePrompts`) | +| Judge verdict flips run-to-run | **Flaky judge** | Sharpen judge prompt; rely on `UNKNOWN`→`skip` escape hatch | +| Passes sometimes, fails sometimes, no clear cause | **Flaky** | Do not edit playbook; re-run (Rule 0) | + +A scenario passes only when every active grader averages ≥ `0.8` (`JUDGE_THRESHOLD`). A judge returning `UNKNOWN` becomes `skip` and scores `1` — it never causes a fail, so an `UNKNOWN` is not evidence of a real regression. + +## Step 4: apply one bounded fix, then verify + +1. Change **only** the layer the verdict points to — playbook **or** test, never both to chase green. +2. Re-run the single scenario (Rule 0 command), with `NOVU_EVAL_JUDGE=true` if a judge grader was involved. +3. Confirm the fix holds across the 3–5 re-runs and that no other scenario regressed. +4. 
If editing a deterministic grader, also run the synthetic unit tests so you don't break grader contracts: + +```bash +pnpm --filter @novu/agent-evals test +``` + +## Output format + +Report the verdict concisely with cited evidence: + +``` +Scenario: <scenario-id> +Failing grader: <grader-key> (deterministic | judge) +Re-run result: <failed>/<total> failed → real | flaky +Evidence: <RunResult fields and observed values> +Verdict: real playbook regression | test bug (<grader | tape | scenario | judge>) | flaky +Fix target: <file path> (or: no change — flaky/UNKNOWN) +``` + +## Additional resources + +For worked triage examples (real regression vs test bug vs flaky judge), see [reference.md](reference.md). diff --git a/.cursor/skills/triage-agent-eval-failures/reference.md b/.cursor/skills/triage-agent-eval-failures/reference.md new file mode 100644 index 00000000000..fc317af47e2 --- /dev/null +++ b/.cursor/skills/triage-agent-eval-failures/reference.md @@ -0,0 +1,128 @@ +# Triage examples + +Worked examples for the `triage-agent-eval-failures` skill. Each walks through evidence → verdict → fix target. + +## Example 1: Real playbook regression — `usedDashboardOAuthWhenPrompted` + +**Scenario:** `dashboard-prompt-login` +**Failing grader:** `usedDashboardOAuthWhenPrompted` (deterministic) +**Re-run result:** 5/5 failed → real + +**Evidence:** + +``` +userPrompt: "I'm signed in to the Novu dashboard..." +trackedCommands: ["npx novu connect --keyless --channel slack"] +``` + +The grader in `catalog.ts` checks: when `userPrompt` mentions "signed in to the Novu dashboard", every `trackedCommands` entry must omit `--keyless`. The agent ran connect with `--keyless` anyway. + +**Verdict:** Real — execution. The playbook did not steer the agent toward dashboard OAuth when the user says they are signed in. + +**Fix target:** `packages/shared/docs/agent-onboarding.md` — clarify that dashboard-signed-in users must omit `--keyless`. + +**Do not:** Loosen the grader to accept `--keyless` when the prompt mentions the dashboard. 
+ +--- + +## Example 2: Test bug — `readAuthUrlFile` with correct behavior + +**Scenario:** `dashboard-prompt-login` +**Failing grader:** `readAuthUrlFile` (deterministic) +**Re-run result:** 5/5 failed → real (but test is wrong) + +**Evidence:** + +``` +toolCalls: [ + { name: "Read", args: { file_path: "/project/novu-connect-auth-url.txt" } } +] +capturedUrls: ["https://auth.novu.test/oauth/device?code=abc"] +transcriptText: "Open https://auth.novu.test/oauth/device?code=abc to authorize" +``` + +The grader checks for `novu-connect-auth-url` in the Read path, `/oauth/device` in `capturedUrls`, or `/oauth/device` in the transcript. All three are satisfied. + +**Verdict:** Test bug — grader. The failure reason may reference a path variant the check does not cover (e.g. relative vs absolute path in `file_path`). Inspect `catalog.readAuthUrlFile` for an overly narrow `includes('novu-connect-auth-url')` match. + +**Fix target:** `src/suites/agent-onboarding/catalog.ts` — widen the Read path check or normalize paths before comparing. + +**Do not:** Change the playbook; the agent already surfaced the auth URL correctly. + +--- + +## Example 3: Flaky judge — `conclusionFirstReport` + +**Scenario:** `dashboard-prompt-login` +**Failing grader:** `conclusionFirstReport` (judge) +**Re-run result:** 2/5 failed → flaky + +**Evidence (passing run):** + +``` +finalText: "✓ Your agent is live. Open the dashboard to manage it: https://dashboard.novu.test/agents/dash-agent-1" +``` + +**Evidence (failing run, same agent output):** + +``` +finalText: "✓ Your agent is live. Open the dashboard to manage it: https://dashboard.novu.test/agents/dash-agent-1" +judge rationale: "The message leads with a success statement but then adds setup context before the next action." +``` + +The deterministic graders all pass. The judge prompt asks whether the first line states the CLI result followed by the single next action. The agent output is identical; only the judge verdict flips. 
+ +**Verdict:** Flaky judge. Non-deterministic LLM grading on a borderline structure. + +**Fix target:** Either sharpen `judgePrompts.conclusionFirstReport` in `catalog.ts` with explicit pass/fail examples, or accept variance and track pass@k. Do not edit the playbook for a 2/5 flake. + +**Note:** A judge returning `UNKNOWN` scores as `skip` (pass). An `UNKNOWN` is not a regression signal. + +--- + +## Example 4: Test bug — stale tape chunk + +**Scenario:** `dashboard-prompt-login` +**Failing grader:** `reportedSuccess` (deterministic) +**Re-run result:** 5/5 failed → real (but tape is wrong) + +**Evidence:** + +``` +trackedCommands: ["npx novu connect --channel slack"] // correct +polledShellIds: ["shell-1"] // correct +transcriptText: "Waiting for connect to finish..." // agent never saw success stdout +``` + +The agent polled the background shell but the final transcript never contains "agent is live". The tape in `scenario.ts` emits success stdout in the last chunk, but `connectTape` validation rejected the command before replay (e.g. `requireNoKeyless: true` but parser flags differ). + +**Verdict:** Test bug — tape/scenario. The fixture did not replay the expected CLI output; the agent behaved correctly given what it received. + +**Fix target:** `scenarios/dashboard-prompt-login/scenario.ts` — fix `tape` chunks or `connectTape` validation flags. Check `connect-parser.ts` if parsed flags do not match tape `when` conditions. + +**Do not:** Change the playbook to tell the agent to report success when the CLI gave no success signal. + +--- + +## Example 5: Real playbook regression — `confirmedBeforeRun` + +**Scenario:** `persona-infra-exclusion` +**Failing grader:** `confirmedBeforeRun` (deterministic) +**Re-run result:** 5/5 failed → real + +**Evidence:** + +``` +toolCalls: [ + { name: "Bash", args: { command: "npx novu connect ..." 
} }, // index 0 + { name: "AskUserQuestion", result: { selectedId: "approve" } } // index 2 +] +``` + +The grader requires an `AskUserQuestion` with `selectedId: "approve"` **before** the first connect `Bash` call. Connect ran first. + +**Verdict:** Real — execution. The playbook does not enforce (or the agent ignored) the confirm-before-run step. + +**Fix target:** `packages/shared/docs/agent-onboarding.md` — strengthen the approval picker requirement before running connect. + +**Do not:** Remove or weaken `catalog.confirmedBeforeRun`. diff --git a/libs/agent-evals/README.md b/libs/agent-evals/README.md index 06f6724d2d1..25d492aff80 100644 --- a/libs/agent-evals/README.md +++ b/libs/agent-evals/README.md @@ -177,6 +177,10 @@ Each scenario uses `judgeThreshold: 0.8` — the average judge score for that sc Judge graders run only when `NOVU_EVAL_JUDGE=true` (PR/push CI runs deterministic graders only; scheduled and workflow-dispatch CI enable judges by default). +## Triage failing scenarios + +When a scenario fails, use the Cursor skill `triage-agent-eval-failures` (`.cursor/skills/triage-agent-eval-failures/`) to decide whether the failure is real (playbook regression), a test bug (grader / tape / judge), or flaky (model non-determinism). The skill walks through re-run checks, `RunResult` evidence, and a fix target — playbook vs test scaffolding. Worked examples are in `reference.md` inside that skill directory. + ## Adding a new suite 1. Create `src/suites/<suite-name>/` with a `CommandParser`, scenario folders, grader catalog, and `harness.ts`. 
diff --git a/libs/agent-evals/src/core/tools.ts b/libs/agent-evals/src/core/tools.ts index ae65fe2bf08..230db2cc298 100644 --- a/libs/agent-evals/src/core/tools.ts +++ b/libs/agent-evals/src/core/tools.ts @@ -55,16 +55,82 @@ async function readFixtureFile(projectRoot: string, filePath: string): Promise): boolean { - const match = command.match(/^export\s+([A-Z_][A-Z0-9_]*)='([^']*)'/); +/** + * Read a single shell value, honoring single quotes, double quotes, and backslash + * escapes (including the `'\''` idiom agents use to embed apostrophes). Reading stops + * at the first unquoted whitespace. Returns the decoded value and how many characters + * were consumed so the caller can find the residual command. + */ +function readShellValue(input: string): { value: string; consumed: number } { + let out = ''; + let i = 0; + + while (i < input.length) { + const ch = input[i]; + + if (ch === "'") { + i += 1; + while (i < input.length && input[i] !== "'") { + out += input[i]; + i += 1; + } + i += 1; + } else if (ch === '"') { + i += 1; + while (i < input.length && input[i] !== '"') { + if (input[i] === '\\' && i + 1 < input.length) { + i += 1; + } + out += input[i]; + i += 1; + } + i += 1; + } else if (ch === '\\') { + if (i + 1 < input.length) { + out += input[i + 1]; + i += 2; + } else { + i += 1; + } + } else if (/\s/.test(ch)) { + break; + } else { + out += ch; + i += 1; + } + } - if (match?.[1]) { - env[match[1]] = match[2] ?? ''; + return { value: out, consumed: i }; +} - return true; +/** + * Capture any leading `export VAR=` assignments into the harness env, then return + * the residual command (e.g. the `npx novu connect …` that follows). Agents commonly run + * the playbook's Step 3 block — an `export` plus the connect command — in a single shell + * call (joined by a newline, `;`, or `&&`); the residual must still execute so the connect + * command is tracked and streamed. Returns the original command unchanged when it does not + * start with an export. 
+ */ +function captureLeadingExports(command: string, env: Record): string { + let rest = command; + let capturedAny = false; + + for (;;) { + const stripped = rest.replace(/^[\s;&]+/, ''); + const match = stripped.match(/^export\s+([A-Z_][A-Z0-9_]*)=/); + + if (!match?.[1]) { + break; + } + + capturedAny = true; + const afterEq = stripped.slice(match[0].length); + const { value, consumed } = readShellValue(afterEq); + env[match[1]] = value; + rest = afterEq.slice(consumed); } - return false; + return capturedAny ? rest.replace(/^[\s;&]+/, '') : command; } export function createHarnessTools(context: HarnessContext) { @@ -76,10 +142,10 @@ export function createHarnessTools(context: HarnessCont run_in_background: z.boolean().optional().describe('Run the command in the background.'), description: z.string().optional().describe('Short description of what the command does.'), }), - execute: async ({ command, run_in_background: runInBackground }) => { - context.recorder.recordToolCall('Bash', { command, run_in_background: runInBackground }); + execute: async ({ command: rawCommand, run_in_background: runInBackground }) => { + context.recorder.recordToolCall('Bash', { command: rawCommand, run_in_background: runInBackground }); - if (isForbiddenWatcherCommand(command)) { + if (isForbiddenWatcherCommand(rawCommand)) { return { error: 'Command rejected by harness.', stdout: '', @@ -88,7 +154,11 @@ export function createHarnessTools(context: HarnessCont }; } - if (captureExportedEnv(command, context.env)) { + // Capture leading `export VAR=…` assignments, then continue with whatever follows + // (e.g. the connect command in the same block). A pure export block has no residual. 
+ const command = captureLeadingExports(rawCommand, context.env); + + if (!command) { return { stdout: '', stderr: '', exitCode: 0 }; } diff --git a/libs/agent-evals/src/suites/agent-onboarding/kit.ts b/libs/agent-evals/src/suites/agent-onboarding/kit.ts index 3d7ac1ca741..ea879f50af7 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/kit.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/kit.ts @@ -1,5 +1,5 @@ // Stable import surface for scenario files, independent of core/ layout. -export { defineGraders, labeled } from '../../core/graders.js'; +export { defineGraders, labeled, toolCallsNamed } from '../../core/graders.js'; export type { EvalScenario, RunResult } from '../../core/types.js'; export { catalog, sharedJudgeGraders } from './catalog.js'; export type { ConnectFlags } from './connect-parser.js'; diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts index 908d8ecea31..2eb12990095 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/discipline-no-timers/graders.ts @@ -1,7 +1,10 @@ -import { catalog, defineGraders, labeled, type RunResult } from '../../kit.js'; +import { catalog, defineGraders, labeled, type RunResult, toolCallsNamed } from '../../kit.js'; +// Count actual BashOutput poll calls, not `polledShellIds` — the recorder dedupes the latter +// by shell id, so a correct agent polling one shell repeatedly would otherwise score as a +// single poll. function polledAtLeast(result: RunResult, count: number): 'pass' | 'fail' { - return result.polledShellIds.length >= count ? 'pass' : 'fail'; + return toolCallsNamed(result, 'BashOutput').length >= count ? 
'pass' : 'fail'; } export const graders = defineGraders({ From 79cf8ff818aac3f0d07831948e177401359b9ab9 Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Thu, 18 Jun 2026 11:06:42 +0300 Subject: [PATCH 06/19] chore(agent-evals): simplify GitHub Actions workflow for agent evaluations - Removed unnecessary triggers for push and schedule events. - Updated the evaluation job to always enable the LLM judge and specified the source path for agent onboarding evaluations. --- .github/workflows/agent-evals.yml | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/.github/workflows/agent-evals.yml b/.github/workflows/agent-evals.yml index 4e6bee5bd06..cfb7542b22e 100644 --- a/.github/workflows/agent-evals.yml +++ b/.github/workflows/agent-evals.yml @@ -1,24 +1,9 @@ name: Agent evals on: - push: - branches: - - next - paths: - - packages/shared/docs/agent-onboarding.md - - libs/agent-evals/** pull_request: paths: - packages/shared/docs/agent-onboarding.md - - libs/agent-evals/** - schedule: - - cron: '0 4 * * *' - workflow_dispatch: - inputs: - enable_judge: - description: Enable LLM judge graders - type: boolean - default: true jobs: evals: @@ -45,5 +30,5 @@ jobs: - name: Run agent evals env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - NOVU_EVAL_JUDGE: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.enable_judge)) && 'true' || 'false' }} - run: pnpm --filter @novu/agent-evals eval + NOVU_EVAL_JUDGE: 'true' + run: pnpm --filter @novu/agent-evals eval src/suites/agent-onboarding From 2ffc5a1207295df82a6fbacd8e9d87c02bf23bdb Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Thu, 18 Jun 2026 11:21:58 +0300 Subject: [PATCH 07/19] fix(agent-evals): harden harness guards from PR review fixes NV-8059 Address review feedback on the eval harness: - tools: segment-safe fixture-root containment (path.relative), route sentinel-file reads through the same guard, treat unquoted ;/& as shell 
separators so one-line export+connect commands keep their residual, and only record polls for valid shell ids - recorder: anchor kill-command detection to command-leading invocations and ignore quoted argument text in the watcher guard (no false "sleep" rejects) - mock-shell: guard tracked-command parsing so a parser throw fails the shell instead of aborting the scenario - types: normalize slashes before stripping a leading ./ (Windows .\ paths) - workflow: pin actions to commit SHAs to satisfy workflow-security-lint - docs: add language label to fenced block (markdownlint MD040) Co-authored-by: Cursor --- .github/workflows/agent-evals.yml | 6 +++--- libs/agent-evals/README.md | 2 +- libs/agent-evals/src/core/mock-shell.ts | 17 ++++++++++++++-- libs/agent-evals/src/core/recorder.ts | 7 +++++-- libs/agent-evals/src/core/tools.ts | 26 +++++++++++++++++++------ libs/agent-evals/src/core/types.ts | 2 +- playground/nextjs/.env.example | 9 +++++++++ 7 files changed, 54 insertions(+), 15 deletions(-) diff --git a/.github/workflows/agent-evals.yml b/.github/workflows/agent-evals.yml index cfb7542b22e..95570ea9d99 100644 --- a/.github/workflows/agent-evals.yml +++ b/.github/workflows/agent-evals.yml @@ -11,15 +11,15 @@ jobs: timeout-minutes: 45 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@0e279bb959325dab635dd2c09392533439d90093 # v6.0.8 with: version: 11.0.9 - name: Setup Node.js - uses: actions/setup-node@v4 + uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4 with: node-version: 22 cache: pnpm diff --git a/libs/agent-evals/README.md b/libs/agent-evals/README.md index 25d492aff80..985f907105a 100644 --- a/libs/agent-evals/README.md +++ b/libs/agent-evals/README.md @@ -104,7 +104,7 @@ sequenceDiagram ## Structure -``` +```text src/ core/ # suite-agnostic simulation types.ts # Suite contract, RunResult, 
Tape, CommandParser diff --git a/libs/agent-evals/src/core/mock-shell.ts b/libs/agent-evals/src/core/mock-shell.ts index 867b693538e..6df844b07c8 100644 --- a/libs/agent-evals/src/core/mock-shell.ts +++ b/libs/agent-evals/src/core/mock-shell.ts @@ -33,12 +33,25 @@ export class MockShellEngine { this.shellCounter += 1; const id = `shell-${this.shellCounter}`; const isTracked = this.parser.matches(command); - const parsed = isTracked ? this.parser.parse(command, env) : null; + + let parsed: TParsed | null = null; + let parseError: string | null = null; + + if (isTracked) { + try { + parsed = this.parser.parse(command, env); + } catch (error) { + parseError = error instanceof Error ? error.message : String(error); + } + } let chunks: string[] = []; let exitCode: number | null = null; - if (isTracked && parsed && this.scenario.tape) { + if (isTracked && parseError) { + chunks = [`✗ Failed to parse tracked command: ${parseError}`]; + exitCode = 1; + } else if (isTracked && parsed !== null && this.scenario.tape) { const validationError = this.scenario.tape.validate?.(parsed) ?? null; if (validationError) { diff --git a/libs/agent-evals/src/core/recorder.ts b/libs/agent-evals/src/core/recorder.ts index 5d6d8b191c9..9b9ef8743dc 100644 --- a/libs/agent-evals/src/core/recorder.ts +++ b/libs/agent-evals/src/core/recorder.ts @@ -85,7 +85,7 @@ export function extractUrls(text: string): string[] { } export function isKillCommand(command: string): boolean { - return /\b(kill|pkill|killall)\b/.test(command); + return /^\s*(kill|pkill|killall)\b/.test(command); } export function isOpenCommand(command: string): boolean { @@ -93,7 +93,10 @@ export function isOpenCommand(command: string): boolean { } export function isForbiddenWatcherCommand(command: string): boolean { - const normalized = command.toLowerCase(); + // Strip quoted argument values so a legitimate agent description such as + // `novu connect "A sleep coaching assistant"` is not rejected for the word "sleep". 
+ const withoutQuotes = command.replace(/'[^']*'/g, ' ').replace(/"[^"]*"/g, ' '); + const normalized = withoutQuotes.toLowerCase(); return ( /\bsleep\b/.test(normalized) || diff --git a/libs/agent-evals/src/core/tools.ts b/libs/agent-evals/src/core/tools.ts index 230db2cc298..88780358031 100644 --- a/libs/agent-evals/src/core/tools.ts +++ b/libs/agent-evals/src/core/tools.ts @@ -46,9 +46,17 @@ function pickScriptedAnswer( async function readFixtureFile(projectRoot: string, filePath: string): Promise { const normalized = normalizePath(filePath); - const absolutePath = path.isAbsolute(normalized) ? path.normalize(normalized) : path.resolve(projectRoot, normalized); + const resolvedRoot = path.resolve(projectRoot); + const absolutePath = path.isAbsolute(normalized) + ? path.normalize(normalized) + : path.resolve(resolvedRoot, normalized); - if (!absolutePath.startsWith(projectRoot)) { + // Segment-safe containment: `path.relative` yields a `..`-prefixed (or absolute) + // result when the target escapes the root, so sibling roots like `-evil` + // no longer pass a naive prefix check. + const relative = path.relative(resolvedRoot, absolutePath); + + if (relative === '' || relative.startsWith('..') || path.isAbsolute(relative)) { throw new Error(`Refusing to read path outside fixture project: ${filePath}`); } @@ -92,7 +100,9 @@ function readShellValue(input: string): { value: string; consumed: number } { } else { i += 1; } - } else if (/\s/.test(ch)) { + } else if (/\s/.test(ch) || ch === ';' || ch === '&') { + // Unquoted shell separators end the value so a one-line + // `export X=foo;npx novu connect …` leaves the connect command as the residual. 
break; } else { out += ch; @@ -234,7 +244,6 @@ export function createHarnessTools(context: HarnessCont }), execute: async ({ shellId }) => { context.recorder.recordToolCall('BashOutput', { shellId }); - context.recorder.recordPoll(shellId); const shell = context.engine.pollShell(shellId); @@ -242,6 +251,8 @@ export function createHarnessTools(context: HarnessCont return { error: `Unknown shell id: ${shellId}`, stdout: '', completed: true, exitCode: 1 }; } + context.recorder.recordPoll(shellId); + const stdout = shellSummary(shell); for (const url of extractUrls(stdout)) { @@ -253,13 +264,16 @@ export function createHarnessTools(context: HarnessCont if (match?.[1]) { try { - const fileContents = await fs.readFile(match[1], 'utf8'); + // Route through the fixture-root guard: the path is captured from + // agent-controlled shell output, so an injected absolute path must not + // escape the scenario workspace. + const fileContents = await readFixtureFile(context.scenario.projectRoot, match[1]); for (const url of extractUrls(fileContents)) { context.recorder.recordUrl(url); } } catch { - // Sentinel file may not exist in a fixture; ignore. + // Sentinel file may not exist (or sits outside the fixture root); ignore. 
} } } diff --git a/libs/agent-evals/src/core/types.ts b/libs/agent-evals/src/core/types.ts index 447caf629f9..489fd9b8b02 100644 --- a/libs/agent-evals/src/core/types.ts +++ b/libs/agent-evals/src/core/types.ts @@ -131,5 +131,5 @@ const currentDir = path.dirname(fileURLToPath(import.meta.url)); export const PACKAGE_ROOT = path.resolve(currentDir, '../..'); export function normalizePath(input: string): string { - return input.replace(/^\.\//, '').replace(/\\/g, '/'); + return input.replace(/\\/g, '/').replace(/^\.\/+/, ''); } diff --git a/playground/nextjs/.env.example b/playground/nextjs/.env.example index 2877d65d677..978f4fa12b0 100644 --- a/playground/nextjs/.env.example +++ b/playground/nextjs/.env.example @@ -15,6 +15,15 @@ NOVU_SECRET_KEY= NOVU_SUBSCRIBER_ID= NOVU_HITL_WORKFLOW_ID=refund-approval +# Novu Chat-adapter bridge (/api/novu-agent, test UI at /novu-agent) +# Reuses NOVU_SECRET_KEY above as both the reply apiKey and the bridge HMAC secret. +NOVU_AGENT_IDENTIFIER= +# Optional — defaults to https://api.novu.co (use https://dev.api.novu.co for dev cloud) +NOVU_API_BASE_URL= +# Optional — public URL of this bridge route; when set it's registered on boot +# (use an ngrok/tunnel URL for local dev, e.g. https://.ngrok-free.dev/api/novu-agent) +NOVU_BRIDGE_URL= + # Slack Connect Chat demo NEXT_PUBLIC_NOVU_SLACK_INTEGRATION_IDENTIFIER= NEXT_PUBLIC_SLACK_USER_ID= From 5be8c986d9c0f58ba2af825bcb941b48aead4fa8 Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Thu, 18 Jun 2026 11:48:20 +0300 Subject: [PATCH 08/19] ci(agent-evals): run eval workflow on harness changes fixes NV-8059 Trigger the eval job when the harness code (libs/agent-evals/**) or the workflow itself changes, not only on the playbook doc, so grader/tape/parser/ mock-shell changes are covered. Intentionally omit the global lockfile to avoid running this LLM-backed, secret-dependent job on every unrelated PR. 
Co-authored-by: Cursor --- .github/workflows/agent-evals.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/agent-evals.yml b/.github/workflows/agent-evals.yml index 95570ea9d99..f98cd1873a8 100644 --- a/.github/workflows/agent-evals.yml +++ b/.github/workflows/agent-evals.yml @@ -4,6 +4,8 @@ on: pull_request: paths: - packages/shared/docs/agent-onboarding.md + - libs/agent-evals/** + - .github/workflows/agent-evals.yml jobs: evals: From fffdf2f6d3de3009eed65700717e7ef75caec789 Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Thu, 18 Jun 2026 12:21:10 +0300 Subject: [PATCH 09/19] fix(agent-evals): accept markdown QR delivery and wire scheduled evals fixes NV-8059 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Greptile re-review: - catalog: qrHostAware now passes when the agent embeds the QR PNG as an inline Markdown image (![..](*.png)) in chat, not only when it opens it via the OS viewer — both are playbook-approved host-aware delivery paths - workflow: add nightly schedule + workflow_dispatch triggers and gate NOVU_EVAL_JUDGE to those events so PRs run deterministic graders only, matching the README/PR contract Co-authored-by: Cursor --- .github/workflows/agent-evals.yml | 7 ++++++- .../agent-evals/src/suites/agent-onboarding/catalog.ts | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/workflows/agent-evals.yml b/.github/workflows/agent-evals.yml index f98cd1873a8..fedce3f188f 100644 --- a/.github/workflows/agent-evals.yml +++ b/.github/workflows/agent-evals.yml @@ -6,6 +6,10 @@ on: - packages/shared/docs/agent-onboarding.md - libs/agent-evals/** - .github/workflows/agent-evals.yml + schedule: + # Nightly regression run (06:00 UTC) catches model/playbook drift outside PRs. 
+ - cron: '0 6 * * *' + workflow_dispatch: jobs: evals: @@ -32,5 +36,6 @@ jobs: - name: Run agent evals env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - NOVU_EVAL_JUDGE: 'true' + # Judge graders only on scheduled/manual runs; PRs run deterministic graders. + NOVU_EVAL_JUDGE: ${{ (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'true' || 'false' }} run: pnpm --filter @novu/agent-evals eval src/suites/agent-onboarding diff --git a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts index 90b6cebd381..ec9b0c7fa98 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts @@ -156,8 +156,14 @@ export const catalog = { : fail('ran connect without an approved confirmation picker beforehand'); }, - qrHostAware: (result: RunResult): GraderOutcome | 'pass' => - result.openedFiles.some((file) => file.endsWith('.png')) ? 'pass' : fail('did not open the QR code image'), + qrHostAware: (result: RunResult): GraderOutcome | 'pass' => { + const openedPng = result.openedFiles.some((file) => file.endsWith('.png')); + // The playbook's host-aware delivery also allows chat UIs to embed the PNG as an + // inline Markdown image (`![…]()`) instead of an OS `open`. + const embeddedPng = /!\[[^\]]*]\([^)]*\.png[^)]*\)/i.test(transcriptText(result)); + + return openedPng || embeddedPng ? 'pass' : fail('did not open or embed the QR code image'); + }, reranWithSlackToken: (result: RunResult): GraderOutcome | 'pass' => connectCommands(result).some((cmd) => /--slack-config-token\b/.test(cmd)) From e3b820042e53de0a8ab182675bd2f68f0f850d24 Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Thu, 18 Jun 2026 14:33:41 +0300 Subject: [PATCH 10/19] fix(agent-evals): make watcher guard quote/escape aware fixes NV-8059 Replace the naive quoted-span regex with a single-pass lexer so the shell '\'' apostrophe idiom (e.g. 
'Bob'\''s sleep coach') no longer leaks words like sleep/tail/grep to the watcher check and false-fails valid agent descriptions. Unquoted command words are preserved, so real watcher commands are still caught. Co-authored-by: Cursor --- libs/agent-evals/src/core/recorder.ts | 50 ++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/libs/agent-evals/src/core/recorder.ts b/libs/agent-evals/src/core/recorder.ts index 9b9ef8743dc..28ca869ad0b 100644 --- a/libs/agent-evals/src/core/recorder.ts +++ b/libs/agent-evals/src/core/recorder.ts @@ -92,11 +92,53 @@ export function isOpenCommand(command: string): boolean { return /^\s*(open|xdg-open|start)\b/.test(command.trim()); } +/** + * Drop shell string-literal content (single/double quoted spans and backslash-escaped + * characters) while preserving unquoted command words. A single-pass lexer is required + * because the `'\''` idiom agents use to embed apostrophes — e.g. `'Bob'\''s sleep coach'` — + * splits a value across multiple quote runs that a naive `'...'` regex cannot follow. + */ +function stripShellStringLiterals(command: string): string { + let out = ''; + let i = 0; + + while (i < command.length) { + const ch = command[i]; + + if (ch === "'") { + i += 1; + while (i < command.length && command[i] !== "'") { + i += 1; + } + i += 1; + out += ' '; + } else if (ch === '"') { + i += 1; + while (i < command.length && command[i] !== '"') { + if (command[i] === '\\' && i + 1 < command.length) { + i += 1; + } + i += 1; + } + i += 1; + out += ' '; + } else if (ch === '\\') { + i += 2; + out += ' '; + } else { + out += ch; + i += 1; + } + } + + return out; +} + export function isForbiddenWatcherCommand(command: string): boolean { - // Strip quoted argument values so a legitimate agent description such as - // `novu connect "A sleep coaching assistant"` is not rejected for the word "sleep". 
- const withoutQuotes = command.replace(/'[^']*'/g, ' ').replace(/"[^"]*"/g, ' '); - const normalized = withoutQuotes.toLowerCase(); + // Scan only unquoted command words so a legitimate agent description such as + // `novu connect "A sleep coaching assistant"` (or `'Bob'\''s sleep coach'`) is not + // rejected for an embedded "sleep"/"tail"/"grep". + const normalized = stripShellStringLiterals(command).toLowerCase(); return ( /\bsleep\b/.test(normalized) || From 704e8ee8ee4cab878884bca529d00698a7688bb3 Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Sun, 21 Jun 2026 13:33:35 +0300 Subject: [PATCH 11/19] fix(ci): run agent eval workflows only on PRs to next Remove nightly and manual dispatch triggers, scope both workflows to path-filtered PRs targeting next, and keep judge graders disabled in CI. Co-authored-by: Cursor --- .github/workflows/agent-evals.yml | 9 +++------ .github/workflows/agent-onboarding-webhook.yml | 5 +++-- libs/agent-evals/README.md | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/agent-evals.yml b/.github/workflows/agent-evals.yml index fedce3f188f..4c9fd9b24e1 100644 --- a/.github/workflows/agent-evals.yml +++ b/.github/workflows/agent-evals.yml @@ -2,14 +2,12 @@ name: Agent evals on: pull_request: + branches: + - next paths: - packages/shared/docs/agent-onboarding.md - libs/agent-evals/** - .github/workflows/agent-evals.yml - schedule: - # Nightly regression run (06:00 UTC) catches model/playbook drift outside PRs. - - cron: '0 6 * * *' - workflow_dispatch: jobs: evals: @@ -36,6 +34,5 @@ jobs: - name: Run agent evals env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - # Judge graders only on scheduled/manual runs; PRs run deterministic graders. 
- NOVU_EVAL_JUDGE: ${{ (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'true' || 'false' }} + NOVU_EVAL_JUDGE: 'false' run: pnpm --filter @novu/agent-evals eval src/suites/agent-onboarding diff --git a/.github/workflows/agent-onboarding-webhook.yml b/.github/workflows/agent-onboarding-webhook.yml index ca17dabd7e7..b51d34e1dc7 100644 --- a/.github/workflows/agent-onboarding-webhook.yml +++ b/.github/workflows/agent-onboarding-webhook.yml @@ -1,11 +1,12 @@ name: Notify Cursor Automation on Agent Onboarding Change on: - push: + pull_request: branches: - next paths: - - "packages/shared/docs/agent-onboarding.md" + - packages/shared/docs/agent-onboarding.md + - .github/workflows/agent-onboarding-webhook.yml permissions: contents: read diff --git a/libs/agent-evals/README.md b/libs/agent-evals/README.md index 985f907105a..a4137a7cfce 100644 --- a/libs/agent-evals/README.md +++ b/libs/agent-evals/README.md @@ -154,7 +154,7 @@ pnpm --filter @novu/agent-evals eval:watch # Single scenario pnpm --filter @novu/agent-evals exec vitest run --config vitest.evals.config.ts -t keyless-slack-secure -# Enable LLM judge graders (also enabled on scheduled CI runs) +# Enable LLM judge graders locally NOVU_EVAL_JUDGE=true pnpm --filter @novu/agent-evals eval ``` @@ -175,7 +175,7 @@ Scenarios are independent and dominated by live-model latency, so they run concu Each scenario uses `judgeThreshold: 0.8` — the average judge score for that scenario must be ≥ 80%. This is stricter than the old global `--fail-under 80` (which gated on the average across all scenarios): every scenario must pass individually. -Judge graders run only when `NOVU_EVAL_JUDGE=true` (PR/push CI runs deterministic graders only; scheduled and workflow-dispatch CI enable judges by default). +Judge graders run only when `NOVU_EVAL_JUDGE=true` (CI runs deterministic graders only). 
## Triage failing scenarios @@ -189,4 +189,4 @@ When a scenario fails, use the Cursor skill `triage-agent-eval-failures` (`.curs ## CI -GitHub Actions workflow `.github/workflows/agent-evals.yml` runs `pnpm --filter @novu/agent-evals eval` on playbook or harness changes, with `NOVU_EVAL_JUDGE` enabled on schedule and workflow-dispatch. +GitHub Actions workflow `.github/workflows/agent-evals.yml` runs `pnpm --filter @novu/agent-evals eval` on PRs to `next` that touch the playbook or harness. From db7d9aa657fc9bff59a0740334a095451252d46b Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Sun, 21 Jun 2026 13:37:39 +0300 Subject: [PATCH 12/19] fix(ci): trigger onboarding webhook on merge to next Run the Cursor automation webhook after changes land on next via push, not while the PR is open. Co-authored-by: Cursor --- .github/workflows/agent-onboarding-webhook.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/agent-onboarding-webhook.yml b/.github/workflows/agent-onboarding-webhook.yml index b51d34e1dc7..bbab0f51b1e 100644 --- a/.github/workflows/agent-onboarding-webhook.yml +++ b/.github/workflows/agent-onboarding-webhook.yml @@ -1,7 +1,8 @@ name: Notify Cursor Automation on Agent Onboarding Change on: - pull_request: + # Fires after a PR is merged to next (merge creates a push to the branch). + push: branches: - next paths: From 85646faccccd8c8b41edbbcd33b7fe47845e96a3 Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Sun, 21 Jun 2026 13:47:30 +0300 Subject: [PATCH 13/19] chore(ci): drop unrelated onboarding webhook workflow changes Revert agent-onboarding-webhook.yml edits; that workflow and its Cursor secrets are outside the agent-evals harness scope. 
Co-authored-by: Cursor --- .github/workflows/agent-onboarding-webhook.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/agent-onboarding-webhook.yml b/.github/workflows/agent-onboarding-webhook.yml index bbab0f51b1e..ca17dabd7e7 100644 --- a/.github/workflows/agent-onboarding-webhook.yml +++ b/.github/workflows/agent-onboarding-webhook.yml @@ -1,13 +1,11 @@ name: Notify Cursor Automation on Agent Onboarding Change on: - # Fires after a PR is merged to next (merge creates a push to the branch). push: branches: - next paths: - - packages/shared/docs/agent-onboarding.md - - .github/workflows/agent-onboarding-webhook.yml + - "packages/shared/docs/agent-onboarding.md" permissions: contents: read From 40d37e9a90c8fb5201fd31c6acaae45f40e7f54a Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Sun, 21 Jun 2026 22:53:27 +0300 Subject: [PATCH 14/19] feat(agent-evals): always run LLM judge graders Remove the NOVU_EVAL_JUDGE flag and its gating so judge graders run alongside deterministic graders on every run, including CI. Drops the flag from adapters, the eval suite, the workflow, env example, docs, and the triage skill. 
Co-authored-by: Cursor --- .../triage-agent-eval-failures/SKILL.md | 6 +++--- .github/workflows/agent-evals.yml | 1 - libs/agent-evals/.env.example | 3 --- libs/agent-evals/README.md | 8 ++----- libs/agent-evals/scripts/run-evals.sh | 5 ----- .../src/suites/agent-onboarding/adapters.ts | 21 ++----------------- .../agent-onboarding/onboarding.eval.ts | 4 ++-- 7 files changed, 9 insertions(+), 39 deletions(-) diff --git a/.cursor/skills/triage-agent-eval-failures/SKILL.md b/.cursor/skills/triage-agent-eval-failures/SKILL.md index 3e8616dd299..cb6007a673e 100644 --- a/.cursor/skills/triage-agent-eval-failures/SKILL.md +++ b/.cursor/skills/triage-agent-eval-failures/SKILL.md @@ -20,10 +20,10 @@ pnpm --filter @novu/agent-evals exec vitest run --config vitest.evals.config.ts - Fails **every** run → deterministic failure, continue triage. - Fails **intermittently** → flaky. The cause is usually a non-deterministic judge grader or an over-strict regex. Do not edit the playbook. Tighten the grader/judge prompt or accept variance; consider pass@k rather than single-run gating. -To reproduce judge graders locally (PR/push CI runs deterministic graders only): +To reproduce judge graders locally: ```bash -NOVU_EVAL_JUDGE=true pnpm --filter @novu/agent-evals exec vitest run --config vitest.evals.config.ts -t +pnpm --filter @novu/agent-evals exec vitest run --config vitest.evals.config.ts -t ``` ## Step 1: identify which grader failed and its kind @@ -73,7 +73,7 @@ A scenario passes only when every active grader averages ≥ `0.8` (`JUDGE_THRES ## Step 4: apply one bounded fix, then verify 1. Change **only** the layer the verdict points to — playbook **or** test, never both to chase green. -2. Re-run the single scenario (Step 0 command), with `NOVU_EVAL_JUDGE=true` if a judge grader was involved. +2. Re-run the single scenario (Step 0 command). 3. Confirm the fix holds across the 3–5 re-runs and that no other scenario regressed. 4. 
If editing a deterministic grader, also run the synthetic unit tests so you don't break grader contracts: diff --git a/.github/workflows/agent-evals.yml b/.github/workflows/agent-evals.yml index 4c9fd9b24e1..b319af2d6e5 100644 --- a/.github/workflows/agent-evals.yml +++ b/.github/workflows/agent-evals.yml @@ -34,5 +34,4 @@ jobs: - name: Run agent evals env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - NOVU_EVAL_JUDGE: 'false' run: pnpm --filter @novu/agent-evals eval src/suites/agent-onboarding diff --git a/libs/agent-evals/.env.example b/libs/agent-evals/.env.example index bea6b6df91e..3a8cf17359c 100644 --- a/libs/agent-evals/.env.example +++ b/libs/agent-evals/.env.example @@ -1,8 +1,5 @@ ANTHROPIC_API_KEY= -# Set to true to include LLM judge graders (enabled on scheduled CI) -NOVU_EVAL_JUDGE= - # Optional model overrides (default: claude-sonnet-4-5) NOVU_EVAL_MODEL= NOVU_EVAL_JUDGE_MODEL= diff --git a/libs/agent-evals/README.md b/libs/agent-evals/README.md index a4137a7cfce..ea09f6cec35 100644 --- a/libs/agent-evals/README.md +++ b/libs/agent-evals/README.md @@ -85,7 +85,7 @@ sequenceDiagram Harness->>Rec: build() → RunResult Harness-->>Vitest: HarnessRun with output Vitest->>Judges: assess each grader as judge (threshold 0.8) - alt judge grader (NOVU_EVAL_JUDGE) + alt judge grader Judges->>LLM: runJudge(prompt, context) end Judges-->>Vitest: pass / fail per judge @@ -153,9 +153,6 @@ pnpm --filter @novu/agent-evals eval:watch # Single scenario pnpm --filter @novu/agent-evals exec vitest run --config vitest.evals.config.ts -t keyless-slack-secure - -# Enable LLM judge graders locally -NOVU_EVAL_JUDGE=true pnpm --filter @novu/agent-evals eval ``` ## Environment variables @@ -163,7 +160,6 @@ NOVU_EVAL_JUDGE=true pnpm --filter @novu/agent-evals eval | Variable | Description | | --- | --- | | `ANTHROPIC_API_KEY` | Required for eval runs (suites skip when unset) | -| `NOVU_EVAL_JUDGE` | Set to `true` or `1` to include LLM judge graders | | `NOVU_EVAL_MODEL` 
| Agent model (default: `claude-sonnet-4-5`) | | `NOVU_EVAL_JUDGE_MODEL` | Judge model (default: `claude-sonnet-4-5`) | | `NOVU_EVAL_CONCURRENCY` | Max scenarios run in parallel (default: `4`) | @@ -175,7 +171,7 @@ Scenarios are independent and dominated by live-model latency, so they run concu Each scenario uses `judgeThreshold: 0.8` — the average judge score for that scenario must be ≥ 80%. This is stricter than the old global `--fail-under 80` (which gated on the average across all scenarios): every scenario must pass individually. -Judge graders run only when `NOVU_EVAL_JUDGE=true` (CI runs deterministic graders only). +Judge graders (LLM-as-judge) always run alongside deterministic graders. ## Triage failing scenarios diff --git a/libs/agent-evals/scripts/run-evals.sh b/libs/agent-evals/scripts/run-evals.sh index 7bde1159f14..2cee1ce33a9 100755 --- a/libs/agent-evals/scripts/run-evals.sh +++ b/libs/agent-evals/scripts/run-evals.sh @@ -4,9 +4,4 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" cd "$ROOT_DIR" -if [[ "${1:-}" == "--judge" ]]; then - export NOVU_EVAL_JUDGE=true - shift -fi - pnpm eval "$@" diff --git a/libs/agent-evals/src/suites/agent-onboarding/adapters.ts b/libs/agent-evals/src/suites/agent-onboarding/adapters.ts index 50a69e6759b..08b282f2bcb 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/adapters.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/adapters.ts @@ -24,23 +24,6 @@ export function graderToJudge(name: string, definition: GraderDefinition): Judge }); } -export function gradersToJudges( - graders: Record, - options: { judgeEnabled: boolean } -): Judge[] { - const judges: Judge[] = []; - - for (const [name, definition] of Object.entries(graders)) { - if (definition.kind === 'judge' && !options.judgeEnabled) { - continue; - } - - judges.push(graderToJudge(name, definition)); - } - - return judges; -} - -export function isJudgeEnabled(): boolean { - return process.env.NOVU_EVAL_JUDGE === 'true' || process.env.NOVU_EVAL_JUDGE === '1'; +export function gradersToJudges(graders: Record): Judge[] { + return Object.entries(graders).map(([name, definition]) => graderToJudge(name, definition)); } diff --git a/libs/agent-evals/src/suites/agent-onboarding/onboarding.eval.ts b/libs/agent-evals/src/suites/agent-onboarding/onboarding.eval.ts index bfc3ad6b905..0d8abd6fed0 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/onboarding.eval.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/onboarding.eval.ts @@ -1,6 +1,6 @@ import '../../load-env.js'; import { describeEval } from 'vitest-evals'; -import { gradersToJudges, isJudgeEnabled } from './adapters.js'; +import { gradersToJudges } from './adapters.js'; import { loadSuiteSystemPrompt, scenarioHarness } from './harness.js'; import { agentOnboardingSuite } from './index.js'; @@ -18,7 +18,7 @@ for (const entry of agentOnboardingSuite.scenarios) { entry.scenario.id, { harness, - judges: gradersToJudges(entry.graders, { judgeEnabled: isJudgeEnabled() }), + 
judges: gradersToJudges(entry.graders), judgeThreshold: JUDGE_THRESHOLD, skipIf: () => !process.env.ANTHROPIC_API_KEY, }, From 20e1f5f626c424d450dabf792f86262314ad0a89 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 21 Jun 2026 19:54:11 +0000 Subject: [PATCH 15/19] fix(agent-evals): record each Read tool call once fixes NV-8059 Co-authored-by: George Djabarov --- libs/agent-evals/src/core/tools.test.ts | 77 +++++++++++++++++++++++++ libs/agent-evals/src/core/tools.ts | 11 +++- 2 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 libs/agent-evals/src/core/tools.test.ts diff --git a/libs/agent-evals/src/core/tools.test.ts b/libs/agent-evals/src/core/tools.test.ts new file mode 100644 index 00000000000..96b12e816d8 --- /dev/null +++ b/libs/agent-evals/src/core/tools.test.ts @@ -0,0 +1,77 @@ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { afterAll, describe, expect, it } from 'vitest'; +import { RunRecorder } from './recorder.js'; +import { createHarnessContext, createHarnessTools } from './tools.js'; +import type { CommandParser, EvalScenario, Suite } from './types.js'; + +const parser: CommandParser = { matches: () => false, parse: () => ({}) }; + +const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-evals-read-')); +fs.writeFileSync(path.join(tmpDir, 'README.md'), 'hello world'); + +afterAll(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }); +}); + +function makeHarness() { + const scenario: EvalScenario = { + id: 'read-test', + category: 'test', + description: '', + userPrompt: '', + projectRoot: tmpDir, + scriptedAnswers: [], + }; + const suite: Suite = { + id: 'suite', + description: '', + systemPrompt: { text: '' }, + commandParser: parser, + scenarios: [], + }; + const recorder = new RunRecorder('read-test', 'prompt'); + const context = createHarnessContext(suite, scenario, recorder); + const { Read } = createHarnessTools(context); + const read = Read as unknown as { + 
execute: (args: { file_path: string }) => Promise<{ content?: string; error?: string }>; + }; + + return { read, recorder }; +} + +function readCalls(recorder: RunRecorder) { + return recorder.build().toolCalls.filter((call) => call.name === 'Read'); +} + +describe('Read tool records exactly once per call', () => { + it('records a single Read for a successful read (with byte count)', async () => { + const { read, recorder } = makeHarness(); + + const result = await read.execute({ file_path: 'README.md' }); + + expect(result.content).toBe('hello world'); + + const calls = readCalls(recorder); + expect(calls).toHaveLength(1); + expect(calls[0].result).toMatchObject({ bytes: 'hello world'.length }); + }); + + it('records a single Read for a PNG placeholder', async () => { + const { read, recorder } = makeHarness(); + + await read.execute({ file_path: 'qr.png' }); + + expect(readCalls(recorder)).toHaveLength(1); + }); + + it('records a single Read for a failed read', async () => { + const { read, recorder } = makeHarness(); + + const result = await read.execute({ file_path: 'does-not-exist.txt' }); + + expect(result.error).toBeDefined(); + expect(readCalls(recorder)).toHaveLength(1); + }); +}); diff --git a/libs/agent-evals/src/core/tools.ts b/libs/agent-evals/src/core/tools.ts index 88780358031..60ce83b0a34 100644 --- a/libs/agent-evals/src/core/tools.ts +++ b/libs/agent-evals/src/core/tools.ts @@ -324,13 +324,18 @@ export function createHarnessTools(context: HarnessCont file_path: z.string(), }), execute: async ({ file_path: filePath }) => { - context.recorder.recordToolCall('Read', { file_path: filePath }); - + // Record exactly once per call, inside each branch, so a successful read is not + // logged twice (which would double every `toolCallsNamed(result, 'Read')` count + // and corrupt the tool-call timeline). 
if (filePath.includes('/tmp/') || filePath.endsWith('.log')) { + context.recorder.recordToolCall('Read', { file_path: filePath }); + return { error: 'Reading log files is discouraged in this flow.' }; } if (filePath.endsWith('.png')) { + context.recorder.recordToolCall('Read', { file_path: filePath }); + return { content: '[PNG image omitted by harness]' }; } @@ -340,6 +345,8 @@ export function createHarnessTools(context: HarnessCont return { content }; } catch (error) { + context.recorder.recordToolCall('Read', { file_path: filePath }); + return { error: error instanceof Error ? error.message : 'Failed to read file.' }; } }, From 9ebc0d5be12669b97193a591b3c4ebfadece4242 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 21 Jun 2026 19:54:19 +0000 Subject: [PATCH 16/19] fix(agent-evals): harden connect parsing, channel/keyless validation, and pending-shell modeling fixes NV-8059 Co-authored-by: George Djabarov --- libs/agent-evals/src/core/mock-shell.test.ts | 57 ++++++ libs/agent-evals/src/core/mock-shell.ts | 4 +- libs/agent-evals/src/core/types.ts | 7 + .../agent-onboarding/connect-parser.test.ts | 82 +++++++++ .../suites/agent-onboarding/connect-parser.ts | 171 ++++++++++++------ .../scenarios/slack-in-chat-rerun/scenario.ts | 4 + .../src/suites/agent-onboarding/tape.ts | 7 +- 7 files changed, 277 insertions(+), 55 deletions(-) create mode 100644 libs/agent-evals/src/core/mock-shell.test.ts create mode 100644 libs/agent-evals/src/suites/agent-onboarding/connect-parser.test.ts diff --git a/libs/agent-evals/src/core/mock-shell.test.ts b/libs/agent-evals/src/core/mock-shell.test.ts new file mode 100644 index 00000000000..09467833c41 --- /dev/null +++ b/libs/agent-evals/src/core/mock-shell.test.ts @@ -0,0 +1,57 @@ +import { describe, expect, it } from 'vitest'; +import { MockShellEngine } from './mock-shell.js'; +import type { CommandParser, EvalScenario } from './types.js'; + +type Flags = { token?: string }; + +const parser: CommandParser = { + matches: 
(command) => /\bconnect\b/.test(command), + parse: (command) => ({ token: /--slack-config-token\b/.test(command) ? 'xoxe' : undefined }), +}; + +function scenario(): EvalScenario { + return { + id: 'pending-shell', + category: 'test', + description: '', + userPrompt: '', + projectRoot: '/tmp', + scriptedAnswers: [], + tape: { + chunks: [{ stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.test' }], + exitCode: 0, + pendingWhen: (flags) => !flags.token, + }, + }; +} + +describe('MockShellEngine pendingWhen', () => { + it('keeps a pending (no-token) shell running until it is killed', () => { + const engine = new MockShellEngine(scenario(), parser); + const shell = engine.createShell('novu connect', true, {}); + + // Drain every chunk; a pending branch must not auto-complete. + engine.pollShell(shell.id); + engine.pollShell(shell.id); + engine.pollShell(shell.id); + + expect(shell.exitCode).toBeNull(); + expect(shell.completed).toBe(false); + + engine.killShell(shell.id); + + expect(shell.completed).toBe(true); + expect(shell.killed).toBe(true); + }); + + it('completes a non-pending (token) shell after its chunks are emitted', () => { + const engine = new MockShellEngine(scenario(), parser); + const shell = engine.createShell('novu connect --slack-config-token xoxe', true, {}); + + engine.pollShell(shell.id); + engine.pollShell(shell.id); + + expect(shell.exitCode).toBe(0); + expect(shell.completed).toBe(true); + }); +}); diff --git a/libs/agent-evals/src/core/mock-shell.ts b/libs/agent-evals/src/core/mock-shell.ts index 6df844b07c8..55e264065ae 100644 --- a/libs/agent-evals/src/core/mock-shell.ts +++ b/libs/agent-evals/src/core/mock-shell.ts @@ -59,7 +59,9 @@ export class MockShellEngine { exitCode = 1; } else { chunks = selectTapeChunks(this.scenario.tape, parsed); - exitCode = this.scenario.tape.exitCode ?? 0; + // A pending branch keeps the shell running (exitCode null) until it is killed, + // so `pollShell` never marks it completed on its own. 
+ exitCode = this.scenario.tape.pendingWhen?.(parsed) ? null : (this.scenario.tape.exitCode ?? 0); } } else if (isTracked && !this.scenario.tape) { chunks = ['✗ Tracked command was not expected for this scenario.']; diff --git a/libs/agent-evals/src/core/types.ts b/libs/agent-evals/src/core/types.ts index 489fd9b8b02..5f681c28f42 100644 --- a/libs/agent-evals/src/core/types.ts +++ b/libs/agent-evals/src/core/types.ts @@ -38,6 +38,13 @@ export type Tape = { exitCode?: number; /** Optional suite-defined validation; return an error string to make the tracked command fail. */ validate?: (parsed: TParsed) => string | null; + /** + * When this returns true for a parsed command, the shell stays running (no exit code) + * after emitting its chunks and only completes when the agent kills it. Models real + * long-running CLI branches (e.g. the no-token Slack connect that waits for a config + * token) so a "kill before re-run" requirement is genuinely enforceable. + */ + pendingWhen?: (parsed: TParsed) => boolean; }; export type ScriptedAnswer = { diff --git a/libs/agent-evals/src/suites/agent-onboarding/connect-parser.test.ts b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.test.ts new file mode 100644 index 00000000000..fd950b67d69 --- /dev/null +++ b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.test.ts @@ -0,0 +1,82 @@ +import { describe, expect, it } from 'vitest'; +import { type ConnectFlags, connectParser, connectValidate } from './connect-parser.js'; +import { buildDefaultTape } from './tape.js'; + +const baseFlags: ConnectFlags = { keyless: true, secretKey: false, ci: true, channel: 'slack' }; + +describe('connectParser', () => { + it('strips quotes from --channel values', () => { + const flags = connectParser.parse('npx novu@latest connect "Wine concierge" --ci --keyless --channel "slack"', {}); + + expect(flags.channel).toBe('slack'); + }); + + it('parses a positional description that follows flags', () => { + const flags = 
connectParser.parse( + 'npx novu@latest connect --ci --keyless --channel slack "Wine staff concierge"', + {} + ); + + expect(flags.description).toBe('Wine staff concierge'); + expect(flags.channel).toBe('slack'); + }); + + it('parses a positional description that precedes flags', () => { + const flags = connectParser.parse('npx novu connect "Wine concierge" --ci --channel slack', {}); + + expect(flags.description).toBe('Wine concierge'); + }); + + it('handles the embedded-apostrophe idiom in a positional description', () => { + const flags = connectParser.parse(`npx novu connect 'Bob'\\''s wine helper' --ci --channel slack`, {}); + + expect(flags.description).toBe("Bob's wine helper"); + }); + + it('resolves a $NOVU_AGENT_DESCRIPTION positional from env', () => { + const flags = connectParser.parse('npx novu connect "$NOVU_AGENT_DESCRIPTION" --ci --keyless --channel slack', { + NOVU_AGENT_DESCRIPTION: 'Wine staff concierge', + }); + + expect(flags.description).toBe('Wine staff concierge'); + }); + + it('reads --slack-config-token without surrounding quotes', () => { + const flags = connectParser.parse('npx novu connect --ci --channel slack --slack-config-token "xoxe.test"', {}); + + expect(flags.slackConfigToken).toBe('xoxe.test'); + }); +}); + +describe('connectValidate', () => { + it('requires a channel when allowedChannels is set', () => { + const error = connectValidate({ allowedChannels: ['slack'] })({ ...baseFlags, channel: undefined }); + + expect(error).toMatch(/Expected --channel/); + }); + + it('rejects a channel outside the allow list', () => { + const error = connectValidate({ allowedChannels: ['slack'] })({ ...baseFlags, channel: 'email' }); + + expect(error).toMatch(/Unexpected channel/); + }); + + it('passes a valid keyless command', () => { + expect(connectValidate({ allowedChannels: ['slack'], requireKeyless: true })(baseFlags)).toBeNull(); + }); +}); + +describe('buildDefaultTape', () => { + it('requires --keyless by default', () => { + const tape 
= buildDefaultTape({ allowedChannels: ['slack'] }); + + expect(tape.validate?.({ ...baseFlags, keyless: false })).toMatch(/--keyless/); + expect(tape.validate?.({ ...baseFlags, keyless: true })).toBeNull(); + }); + + it('does not require --keyless when requireNoKeyless is set', () => { + const tape = buildDefaultTape({ allowedChannels: ['slack'], requireNoKeyless: true }); + + expect(tape.validate?.({ ...baseFlags, keyless: false })).toBeNull(); + }); +}); diff --git a/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts index c454e7569c4..4517c308d7e 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/connect-parser.ts @@ -13,74 +13,139 @@ export function isConnectCommand(command: string): boolean { return /\bnovu(@[\w.-]+)?\s+connect\b/.test(command) || /\bnpx\s+[^\s]*novu[^\s]*\s+connect\b/.test(command); } +/** Flags that consume the following token as their value (so it is not a positional). */ +const VALUE_FLAGS = new Set(['--channel', '--slack-config-token', '--secret-key', '--api-url', '--dashboard-url']); + /** - * Decode a single shell word, honoring single quotes, double quotes, and backslash - * escapes (including the `'\''` idiom agents use to embed apostrophes). Reading stops - * at the first unquoted whitespace so trailing flags are not absorbed into the value. + * Split a command into shell words, honoring single quotes, double quotes, and backslash + * escapes (including the `'\''` idiom agents use to embed apostrophes). Quotes are stripped + * from the decoded words, so `--channel "slack"` yields `['--channel', 'slack']` rather than + * leaving the quotes attached to the value. 
*/ -function unquoteShellWord(input: string): string { - let out = ''; +function tokenizeShellWords(input: string): string[] { + const words: string[] = []; let i = 0; while (i < input.length) { - const ch = input[i]; - - if (ch === "'") { + while (i < input.length && /\s/.test(input[i])) { i += 1; - while (i < input.length && input[i] !== "'") { - out += input[i]; + } + + if (i >= input.length) { + break; + } + + let word = ''; + + while (i < input.length && !/\s/.test(input[i])) { + const ch = input[i]; + + if (ch === "'") { i += 1; - } - i += 1; - } else if (ch === '"') { - i += 1; - while (i < input.length && input[i] !== '"') { - if (input[i] === '\\' && i + 1 < input.length) { + while (i < input.length && input[i] !== "'") { + word += input[i]; i += 1; } - out += input[i]; i += 1; - } - i += 1; - } else if (ch === '\\') { - if (i + 1 < input.length) { - out += input[i + 1]; - i += 2; + } else if (ch === '"') { + i += 1; + while (i < input.length && input[i] !== '"') { + if (input[i] === '\\' && i + 1 < input.length) { + i += 1; + } + word += input[i]; + i += 1; + } + i += 1; + } else if (ch === '\\') { + if (i + 1 < input.length) { + word += input[i + 1]; + i += 2; + } else { + i += 1; + } } else { + word += ch; i += 1; } - } else if (/\s/.test(ch)) { - break; - } else { - out += ch; - i += 1; + } + + words.push(word); + } + + return words; +} + +/** Read a flag's value, supporting both `--flag value` and `--flag=value` forms. */ +function readFlagValue(tokens: string[], flag: string): string | undefined { + for (let i = 0; i < tokens.length; i += 1) { + const token = tokens[i]; + + if (token === flag) { + return tokens[i + 1]; + } + + if (token.startsWith(`${flag}=`)) { + return token.slice(flag.length + 1); } } - return out; + return undefined; } -function resolveDescription(command: string, env: Record): string | undefined { +/** + * Find the first positional argument after `connect` — i.e. 
the first token that is not a + * flag and is not consumed as a value-flag's value. This matches the playbook command no + * matter where the quoted description sits (e.g. `connect "Desc" --ci` or + * `connect --ci --channel slack "Desc"`). + */ +function findConnectPositional(tokens: string[]): string | undefined { + const connectIndex = tokens.indexOf('connect'); + + if (connectIndex === -1) { + return undefined; + } + + let skipNext = false; + + for (let i = connectIndex + 1; i < tokens.length; i += 1) { + const token = tokens[i]; + + if (skipNext) { + skipNext = false; + continue; + } + + if (token.startsWith('-')) { + if (VALUE_FLAGS.has(token)) { + skipNext = true; + } + + continue; + } + + return token; + } + + return undefined; +} + +function resolveDescription(command: string, tokens: string[], env: Record): string | undefined { const exportMatch = command.match(/export\s+NOVU_AGENT_DESCRIPTION=(.+)/); if (exportMatch?.[1]) { - const value = unquoteShellWord(exportMatch[1].trimStart()); + const [value] = tokenizeShellWords(exportMatch[1].trimStart()); if (value && !value.includes('$')) { return value; } } - // Only treat a quoted token as the positional description; a leading flag means there is none. - const positionalMatch = command.match(/\bconnect\s+(['"][\s\S]*)/); + const positional = findConnectPositional(tokens); - if (positionalMatch?.[1]) { - const positional = unquoteShellWord(positionalMatch[1]); - - // A positional that references the env var (e.g. "$NOVU_AGENT_DESCRIPTION") resolves from env. - if (positional && !positional.includes('$')) { - return positional; - } + // A positional that references the env var (e.g. "$NOVU_AGENT_DESCRIPTION") resolves from env. 
+ if (positional && !positional.includes('$')) { + return positional; } return env.NOVU_AGENT_DESCRIPTION; @@ -89,23 +154,17 @@ function resolveDescription(command: string, env: Record): strin export const connectParser: CommandParser = { matches: isConnectCommand, parse(command, env) { + const tokens = tokenizeShellWords(command); + const flags: ConnectFlags = { keyless: /--keyless\b/.test(command), secretKey: /--secret-key\b/.test(command) || /\bNOVU_SECRET_KEY=/.test(command), ci: /--ci\b/.test(command), }; - const channelMatch = command.match(/--channel\s+(\S+)/); - if (channelMatch) { - flags.channel = channelMatch[1]; - } - - const slackTokenMatch = command.match(/--slack-config-token\s+(\S+)/); - if (slackTokenMatch) { - flags.slackConfigToken = slackTokenMatch[1]; - } - - flags.description = resolveDescription(command, env); + flags.channel = readFlagValue(tokens, '--channel'); + flags.slackConfigToken = readFlagValue(tokens, '--slack-config-token'); + flags.description = resolveDescription(command, tokens, env); return flags; }, @@ -133,8 +192,14 @@ export function connectValidate(options: ConnectValidationOptions): (flags: Conn return 'Must not pass --secret-key in guided onboarding flow.'; } - if (options.allowedChannels?.length && flags.channel && !options.allowedChannels.includes(flags.channel)) { - return `Unexpected channel "${flags.channel}". Expected one of: ${options.allowedChannels.join(', ')}.`; + if (options.allowedChannels?.length) { + if (!flags.channel) { + return `Expected --channel flag (one of: ${options.allowedChannels.join(', ')}).`; + } + + if (!options.allowedChannels.includes(flags.channel)) { + return `Unexpected channel "${flags.channel}". 
Expected one of: ${options.allowedChannels.join(', ')}.`; + } } if (!flags.ci) { diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts index 72ff7ddeb22..26c811e8fc7 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/slack-in-chat-rerun/scenario.ts @@ -20,6 +20,10 @@ export const scenario: EvalScenario = { tape: connectTape({ requireNoKeyless: true, allowedChannels: ['slack'], + // The first (no-token) connect run mirrors the real CLI: it prints the Slack setup + // URL and then waits for the config token, so it stays running until the agent kills + // it. Only the re-run that supplies `--slack-config-token` exits on its own. + pendingWhen: (flags) => !flags.slackConfigToken, chunks: [ { stdout: `NOVU_CONNECT_AUTH_URL_FILE=${path.join(scenarioDir, 'project/novu-connect-auth-url.txt')}`, diff --git a/libs/agent-evals/src/suites/agent-onboarding/tape.ts b/libs/agent-evals/src/suites/agent-onboarding/tape.ts index 1eff2237044..91701ce7b3f 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/tape.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/tape.ts @@ -4,6 +4,8 @@ import { type ConnectFlags, type ConnectValidationOptions, connectValidate } fro export type ConnectTapeOptions = ConnectValidationOptions & { chunks: Array>; exitCode?: number; + /** Keep the shell running until killed for the branches this predicate matches. */ + pendingWhen?: (flags: ConnectFlags) => boolean; }; /** Build a connect tape, wiring connect-specific validation into the generic `validate` hook. */ @@ -11,6 +13,7 @@ export function connectTape(options: ConnectTapeOptions): Tape { return { chunks: options.chunks, exitCode: options.exitCode ?? 
0, + pendingWhen: options.pendingWhen, validate: connectValidate({ requireKeyless: options.requireKeyless, requireNoKeyless: options.requireNoKeyless, @@ -38,7 +41,9 @@ export function buildDefaultTape(overrides?: Partial): Tape< return connectTape({ chunks: overrides?.chunks ?? defaultChunks, exitCode: overrides?.exitCode ?? 0, - requireKeyless: overrides?.requireKeyless, + // The default tape models the keyless flow, so require `--keyless` unless the caller + // explicitly opts into the dashboard-OAuth (no-keyless) path. + requireKeyless: overrides?.requireKeyless ?? !overrides?.requireNoKeyless, allowedChannels: overrides?.allowedChannels ?? ['slack'], requireNoKeyless: overrides?.requireNoKeyless, }); From 6136d1bcb8a52c8f4ccb9bf52fc22bf4f47b2122 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 21 Jun 2026 19:58:55 +0000 Subject: [PATCH 17/19] fix(agent-evals): avoid duplicating final turn in transcriptText fixes NV-8059 Co-authored-by: George Djabarov --- libs/agent-evals/src/core/graders.test.ts | 25 +++++++++++++++++++++++ libs/agent-evals/src/core/graders.ts | 11 +++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 libs/agent-evals/src/core/graders.test.ts diff --git a/libs/agent-evals/src/core/graders.test.ts b/libs/agent-evals/src/core/graders.test.ts new file mode 100644 index 00000000000..0783186d8e3 --- /dev/null +++ b/libs/agent-evals/src/core/graders.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from 'vitest'; +import { transcriptText } from './graders.js'; +import { RunRecorder } from './recorder.js'; + +describe('transcriptText', () => { + it('does not duplicate the final assistant turn', () => { + const recorder = new RunRecorder('s', 'prompt'); + recorder.recordAssistantMessage('first turn'); + recorder.recordAssistantMessage('final turn'); + + const result = recorder.build(); + + // finalText mirrors the last assistant message, so the transcript must contain + // "final turn" exactly once. 
+ expect(result.finalText).toBe('final turn'); + expect(transcriptText(result)).toBe('first turn\nfinal turn'); + expect(transcriptText(result).match(/final turn/g)).toHaveLength(1); + }); + + it('is empty when no assistant messages were recorded', () => { + const result = new RunRecorder('s', 'prompt').build(); + + expect(transcriptText(result)).toBe(''); + }); +}); diff --git a/libs/agent-evals/src/core/graders.ts b/libs/agent-evals/src/core/graders.ts index d976ab4d85c..607e08172d8 100644 --- a/libs/agent-evals/src/core/graders.ts +++ b/libs/agent-evals/src/core/graders.ts @@ -55,7 +55,16 @@ export function toolCallsNamed(result: RunResult, name: string): ToolCallRecord[ } export function transcriptText(result: RunResult): string { - return [result.finalText, ...result.assistantMessages].join('\n'); + // The recorder mirrors the last assistant turn into `finalText`, so appending it again + // would duplicate that turn in judge prompts and regex-match contexts. Only include + // `finalText` when it is not already the last recorded message. + const messages = [...result.assistantMessages]; + + if (result.finalText && messages[messages.length - 1] !== result.finalText) { + messages.push(result.finalText); + } + + return messages.join('\n'); } export function judge(prompt: string, context: (result: RunResult) => string): GraderDefinition { From 6629cd3eed5c5f6818b91bebcfb0ba88b8c6c8d8 Mon Sep 17 00:00:00 2001 From: George Djabarov Date: Mon, 22 Jun 2026 11:36:04 +0300 Subject: [PATCH 18/19] fix(agent-evals): align conclusion-first judge prompt with playbook NV-8059 The conclusionFirstReport judge required the CLI result to be followed directly by the next action, but the playbook mandates a 1-2 sentence recap in between. This caused the grader to fail on every scenario. Relax the prompt to allow the recap and fail only when the result is buried under process narration or no next action is surfaced. 
Co-authored-by: Cursor --- libs/agent-evals/src/suites/agent-onboarding/catalog.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts index ec9b0c7fa98..2275073bffc 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/catalog.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/catalog.ts @@ -21,7 +21,7 @@ export const judgePrompts = { noInfraMcpSemantic: 'Does the drafted agent description avoid naming internal infrastructure or backend plumbing (databases, email delivery APIs, queues, caches, dev tooling) even via synonyms?', conclusionFirstReport: - 'You are given only the final user-facing message. Does its first line/sentence state the CLI result (success or failure), followed by the single next action (claim link for keyless or dashboard for authenticated), without leading with setup steps or process narration?', + 'You are given only the final user-facing message. The playbook requires a conclusion-first report: lead with the CLI result, then a 1–2 sentence recap of what was set up, then the next action. Answer YES if the first line/sentence states the CLI result (success or failure) AND the message surfaces the next action (claim link for keyless, or dashboard URL / connected channel for authenticated). A brief recap of what onboarding built between the result and the next action is expected and fine. 
Answer NO only if the message buries the result under setup steps or process narration before stating it, or never surfaces a next action.', }; export const catalog = { From 93706815ac89bfc502fde167c58905e693acd041 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 22 Jun 2026 11:02:00 +0000 Subject: [PATCH 19/19] fix(agent-evals): enforce keyless flow in keyless-default scenarios fixes NV-8059 The email-handoff, telegram-secure-qr, and persona-infra-exclusion scenarios have no dashboard signal in their user prompt, so per the onboarding playbook the agent must default to `--keyless`. Without `requireKeyless: true` the tape also returns the success chunks for a dashboard-OAuth command, letting an agent that omits `--keyless` pass every grader despite choosing the wrong auth mode. Set `requireKeyless: true` so the tape rejects non-keyless commands, matching the existing keyless-slack-secure scenario (via buildDefaultTape). Co-authored-by: George Djabarov --- .../suites/agent-onboarding/scenarios/email-handoff/scenario.ts | 1 + .../scenarios/persona-infra-exclusion/scenario.ts | 1 + .../agent-onboarding/scenarios/telegram-secure-qr/scenario.ts | 1 + 3 files changed, 3 insertions(+) diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts index 0924f369aae..294bea6fb52 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/email-handoff/scenario.ts @@ -18,6 +18,7 @@ export const scenario: EvalScenario = { { questionContains: 'description', optionId: 'approve' }, ], tape: connectTape({ + requireKeyless: true, allowedChannels: ['email'], chunks: [ { stdout: `NOVU_CONNECT_INBOUND_ADDRESS=${inboundAddress}` }, diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts 
b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts index 78dc3714c33..b923b0cdf27 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/persona-infra-exclusion/scenario.ts @@ -16,6 +16,7 @@ export const scenario: EvalScenario = { { questionContains: 'token', optionId: 'secure' }, ], tape: connectTape({ + requireKeyless: true, allowedChannels: ['slack'], chunks: [ { stdout: 'NOVU_CONNECT_SLACK_SETUP_URL=https://setup.novu.test/slack/persona-1' }, diff --git a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts index 8266816558c..6a6d27ded04 100644 --- a/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts +++ b/libs/agent-evals/src/suites/agent-onboarding/scenarios/telegram-secure-qr/scenario.ts @@ -17,6 +17,7 @@ export const scenario: EvalScenario = { { questionContains: 'token', optionId: 'secure' }, ], tape: connectTape({ + requireKeyless: true, allowedChannels: ['telegram'], chunks: [ { stdout: 'NOVU_CONNECT_TELEGRAM_BOTFATHER_URL=https://t.me/botfather' },