Eval

refactor: de-brand terminology, add --dry-run and batch instructions #18

Workflow file for this run

	name: Eval

	# Triggers: manual dispatch (with optional overrides), pushes to main, and
	# pull requests. Push and pull_request runs are limited to changes in the
	# instruction files or the eval configuration listed under `paths`.
	on:
	  workflow_dispatch:
	    inputs:
	      model:
	        description: "Model for responses (default: repo config)"
	        required: false
	      judge-model:
	        description: "Model for judging (default: repo config)"
	        required: false
	      fail-threshold:
	        description: "Fail if pass rate (%) is below this value"
	        required: false
	        default: "50"
	  push:
	    branches: [main]
	    paths:
	      - ".github/copilot-instructions.md"
	      # `**` spans any number of directories and `*.instructions.md` matches
	      # any file with that suffix (e.g. .github/instructions/x.instructions.md).
	      # NOTE(review): confirm this is the intended set of instruction files.
	      - ".github/**/*.instructions.md"
	      - "agentrc.eval.json"
	  pull_request:
	    types: [opened, synchronize, reopened, labeled]
	    paths:
	      - ".github/copilot-instructions.md"
	      # Same instruction-file pattern as the push trigger above.
	      - ".github/**/*.instructions.md"
	      - "agentrc.eval.json"

	# At most one run per git ref: a newer run for the same ref cancels the
	# one that is still in progress.
	concurrency:
	  group: "eval-${{ github.ref }}"
	  cancel-in-progress: true

	# Scopes granted to the workflow token: read access to repository contents
	# and write access to pull requests (the report step creates or updates a
	# pull request comment).
	permissions:
	  pull-requests: write
	  contents: read

	jobs:
	  # Single job: build the CLI, run the eval suite, publish the results as an
	  # artifact / job summary / PR comment, then fail the job if the eval step
	  # reported failure.
	  eval:
	    name: Run Evals
	    runs-on: ubuntu-latest
	    timeout-minutes: 30
	    steps:
	      - uses: actions/checkout@v6

	      - uses: actions/setup-node@v6
	        with:
	          # Quoted so the version is always read as a string.
	          node-version: "22"
	          cache: npm

	      - run: npm ci

	      - run: npm run build

	      - name: Install Copilot CLI
	        run: npm install -g @github/copilot
	        env:
	          GH_TOKEN: ${{ secrets.COPILOT_TOKEN }}

	      - name: Verify Copilot CLI
	        run: copilot --version
	        env:
	          GH_TOKEN: ${{ secrets.COPILOT_TOKEN }}

	      - name: Run evals
	        id: eval
	        # Keep the job going on failure so results are still uploaded and
	        # reported; the final step turns a failed outcome into a failed job.
	        continue-on-error: true
	        # An explicitly requested bash shell runs with `-eo pipefail`, so a
	        # non-zero exit from the eval command is not masked by the pipe
	        # into `tee`. Without it the step outcome would always be success.
	        shell: bash
	        env:
	          GH_TOKEN: ${{ secrets.COPILOT_TOKEN }}
	          # Inputs go through the environment instead of being expanded into
	          # the script text, so their values are never parsed as shell code.
	          # They are empty on push / pull_request events.
	          INPUT_MODEL: ${{ inputs.model }}
	          INPUT_JUDGE_MODEL: ${{ inputs.judge-model }}
	          INPUT_FAIL_THRESHOLD: ${{ inputs.fail-threshold || '50' }}
	        run: |
	          mkdir -p .agentrc/evals

	          # Build the argument list as an array so each value stays a single
	          # argument regardless of its content.
	          args=(--json --output .agentrc/evals/results.json)

	          if [ -n "$INPUT_MODEL" ]; then
	            args+=(--model "$INPUT_MODEL")
	          fi
	          if [ -n "$INPUT_JUDGE_MODEL" ]; then
	            args+=(--judge-model "$INPUT_JUDGE_MODEL")
	          fi

	          args+=(--fail-level "$INPUT_FAIL_THRESHOLD")

	          node dist/index.js eval "${args[@]}" 2>&1 | tee .agentrc/evals/eval.log

	      - name: Upload eval results
	        if: always()
	        uses: actions/upload-artifact@v7
	        with:
	          name: eval-results
	          path: .agentrc/evals/
	          if-no-files-found: warn

	      - name: Report eval results
	        if: always()
	        uses: actions/github-script@v8
	        env:
	          EVAL_OUTCOME: ${{ steps.eval.outcome }}
	        with:
	          script: |
	            const fs = require('fs');

	            const RESULTS_PATH = '.agentrc/evals/results.json';
	            const isPR = context.eventName === 'pull_request';
	            let summary = '';
	            let hasResults = false;

	            // Skip reporting when the eval step crashed (no results produced)
	            if (!fs.existsSync(RESULTS_PATH)) {
	              const evalOutcome = process.env.EVAL_OUTCOME;
	              core.warning(`Eval step finished with outcome "${evalOutcome}" — no results file produced.`);
	              // Don't post an unhelpful comment to the PR
	              return;
	            }

	            // Shared by the overview table and the per-case details so both
	            // render the same icon for the same verdict.
	            const verdictIcon = v => v === 'pass' ? '✅' : v === 'fail' ? '❌' : '⚠️';
	            const fmtMs = ms => ms < 1000 ? `${ms}ms` : `${(ms/1000).toFixed(1)}s`;
	            const fmtTok = n => n >= 1000 ? `${(n/1000).toFixed(1)}k` : String(n || 0);

	            try {
	              const raw = fs.readFileSync(RESULTS_PATH, 'utf8');
	              const data = JSON.parse(raw);
	              hasResults = true;
	              const results = data.results || [];
	              const total = results.length;
	              const passed = results.filter(r => r.verdict === 'pass').length;
	              const failed = results.filter(r => r.verdict === 'fail').length;
	              const passRate = total > 0 ? Math.round((passed / total) * 100) : 0;
	              const duration = data.runMetrics?.durationMs
	                ? `${(data.runMetrics.durationMs / 1000).toFixed(1)}s`
	                : 'N/A';

	              // An empty result set is flagged as a warning rather than a pass.
	              const icon = total === 0 ? '⚠️' : passed === total ? '✅' : failed > 0 ? '❌' : '⚠️';
	              summary += `## ${icon} AgentRC Eval: ${passed}/${total} pass (${passRate}%)\n\n`;
	              summary += `> ${duration} · model \`${data.model}\` · judge \`${data.judgeModel}\`\n\n`;

	              summary += `| Case | Verdict | Score | Rationale |\n`;
	              summary += `|------|---------|-------|-----------|\n`;
	              for (const r of results) {
	                // Escape pipes and flatten newlines so the rationale stays in one table cell.
	                const rationale = (r.rationale || '').replace(/\|/g, '\\|').replace(/\n/g, ' ').slice(0, 200);
	                summary += `| \`${r.id}\` | ${verdictIcon(r.verdict)} ${r.verdict || 'unknown'} | ${r.score ?? '-'} | ${rationale} |\n`;
	              }

	              // Per-case response details in collapsed sections
	              summary += '\n### Details\n\n';
	              for (const r of results) {
	                const m = r.metrics || {};
	                const wi = m.withInstructions || {};
	                const wo = m.withoutInstructions || {};

	                summary += `<details><summary><code>${r.id}</code> · ${verdictIcon(r.verdict)} ${r.score ?? 0}/100</summary>\n\n`;
	                summary += `Prompt: ${r.prompt}\n\n`;
	                summary += `Expected: ${r.expectation}\n\n`;
	                if (r.rationale) summary += `Judge: ${r.rationale}\n\n`;

	                summary += `| Metric | Without instructions | With instructions |\n`;
	                summary += `|--------|---------------------|-------------------|\n`;
	                summary += `| Time | ${fmtMs(wo.durationMs || 0)} | ${fmtMs(wi.durationMs || 0)} |\n`;
	                summary += `| Tokens | ${fmtTok(wo.tokenUsage?.totalTokens)} | ${fmtTok(wi.tokenUsage?.totalTokens)} |\n`;
	                summary += `| Tool calls | ${wo.toolCalls?.count || 0} | ${wi.toolCalls?.count || 0} |\n`;
	                summary += `\n</details>\n\n`;
	              }
	            } catch (err) {
	              summary += `## ⚠️ AgentRC Eval\n\nFailed to parse eval results: ${err.message}\n`;
	            }

	            // Write to Actions job summary (visible in run UI and PR checks tab)
	            await core.summary.addRaw(summary).write();

	            // Only post/update PR comment when we have actual results
	            if (isPR && hasResults) {
	              const marker = '<!-- agentrc-eval-results -->';
	              const body = marker + '\n' + summary;

	              // Walk every page of comments so an earlier marker comment is
	              // found (and updated) even on long PR threads.
	              const comments = await github.paginate(github.rest.issues.listComments, {
	                owner: context.repo.owner,
	                repo: context.repo.repo,
	                issue_number: context.issue.number,
	                per_page: 100,
	              });

	              const existing = comments.find(c => c.body?.includes(marker));
	              if (existing) {
	                await github.rest.issues.updateComment({
	                  owner: context.repo.owner,
	                  repo: context.repo.repo,
	                  comment_id: existing.id,
	                  body,
	                });
	              } else {
	                await github.rest.issues.createComment({
	                  owner: context.repo.owner,
	                  repo: context.repo.repo,
	                  issue_number: context.issue.number,
	                  body,
	                });
	              }
	            }

	      - name: Fail on threshold
	        if: steps.eval.outcome == 'failure'
	        run: |
	          # A results file means the eval ran to completion and exited
	          # non-zero; no file means it failed before writing any results.
	          if [ -f .agentrc/evals/results.json ]; then
	            echo "::error::Eval pass rate fell below threshold"
	          else
	            echo "::error::Eval run failed before producing results; see eval.log in the eval-results artifact"
	          fi
	          exit 1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

refactor: de-brand terminology, add --dry-run and batch instructions #18

Workflow file

refactor: de-brand terminology, add --dry-run and batch instructions #18

Uh oh!

Workflow file for this run