eval

fix: pin docker builder to bookworm to resolve glibc mismatch #45

Workflow file for this run

	name: eval

	on:
	pull_request:

	concurrency:
	group: ${{ github.workflow }}-${{ github.head_ref \|\| github.run_id }}
	cancel-in-progress: true

	permissions:
	contents: read
	pull-requests: write

	jobs:
	eval:
	name: run eval
	runs-on: ubuntu-latest
	steps:
	- name: Checkout heimdall-rs (PR branch)
	uses: actions/checkout@v4

	- name: Checkout heimdall-eval
	uses: actions/checkout@v4
	with:
	repository: Jon-Becker/heimdall-eval
	path: heimdall-eval
	token: ${{ secrets.EVAL_REPO_TOKEN }}

	- name: Install Rust
	uses: dtolnay/rust-toolchain@stable

	- name: Cache Rust dependencies
	uses: Swatinem/rust-cache@v2
	with:
	cache-on-failure: true

	- name: Build heimdall
	run: cargo build --release --bin heimdall

	- name: Install Foundry
	uses: foundry-rs/foundry-toolchain@v1

	- name: Add heimdall to PATH
	run: echo "${{ github.workspace }}/target/release" >> $GITHUB_PATH

	- name: Run evals
	working-directory: heimdall-eval
	run: make run-all

	- name: Run AI evaluation
	working-directory: heimdall-eval
	env:
	ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
	run: \|
	npm install -g @anthropic-ai/claude-code
	make eval-all

	- name: Comment on PR
	uses: actions/github-script@v7
	with:
	script: \|
	const fs = require('fs');
	const path = require('path');
	const evalsJson = JSON.parse(fs.readFileSync('heimdall-eval/heimdall/evals.json', 'utf8'));

	const entries = Object.entries(evalsJson);
	const cfgScores = entries.map(([_, v]) => v.cfg).filter(x => x != null);
	const decompScores = entries.map(([_, v]) => v.decompilation).filter(x => x != null);

	const avgCfg = cfgScores.length ? Math.floor(cfgScores.reduce((a, b) => a + b, 0) / cfgScores.length) : 'N/A';
	const avgDecomp = decompScores.length ? Math.floor(decompScores.reduce((a, b) => a + b, 0) / decompScores.length) : 'N/A';

	const icon = avgDecomp >= 70 ? ':white_check_mark:' : (avgDecomp >= 50 ? ':warning:' : ':x:');

	// Identify low-scoring evals for detailed breakdown
	const lowScoringEntries = entries.filter(([_, scores]) => {
	const cfg = scores.cfg ?? null;
	const decomp = scores.decompilation ?? null;
	return (cfg != null && cfg < 70) \|\| (decomp != null && decomp < 70);
	});

	let tableRows = entries.map(([name, scores]) => {
	const cfg = scores.cfg ?? 'N/A';
	const decomp = scores.decompilation ?? 'N/A';
	return `\| ${name} \| ${cfg} \| ${decomp} \|`;
	}).join('\n');

	// Collect detailed breakdowns for low-scoring evals
	let detailsSections = [];
	for (const [name, scores] of lowScoringEntries) {
	const cfg = scores.cfg ?? null;
	const decomp = scores.decompilation ?? null;

	let details = [`### ${name} (CFG: ${cfg ?? 'N/A'}, Decompilation: ${decomp ?? 'N/A'})\n`];

	// Read decompilation eval details if score < 70
	if (decomp != null && decomp < 70) {
	const evalPath = path.join('heimdall-eval/heimdall', name, 'eval.json');
	if (fs.existsSync(evalPath)) {
	try {
	const evalData = JSON.parse(fs.readFileSync(evalPath, 'utf8'));
	details.push(`Decompilation\n\`\`\`json\n${JSON.stringify(evalData, null, 2)}\n\`\`\`\n`);
	} catch (e) {
	details.push(`Decompilation - Could not read details\n`);
	}
	}
	}

	// Read CFG eval details if score < 70
	if (cfg != null && cfg < 70) {
	const cfgEvalPath = path.join('heimdall-eval/heimdall', name, 'cfg_eval.json');
	if (fs.existsSync(cfgEvalPath)) {
	try {
	const cfgEvalData = JSON.parse(fs.readFileSync(cfgEvalPath, 'utf8'));
	details.push(`CFG\n\`\`\`json\n${JSON.stringify(cfgEvalData, null, 2)}\n\`\`\`\n`);
	} catch (e) {
	details.push(`CFG - Could not read details\n`);
	}
	}
	}

	if (details.length > 1) {
	detailsSections.push(details.join('\n'));
	}
	}

	let detailsBlock = '';
	if (detailsSections.length > 0) {
	detailsBlock = `\n\n<details>\n<summary>:warning: ${lowScoringEntries.length} eval(s) scoring <70%</summary>\n\n${detailsSections.join('\n---\n\n')}\n</details>`;
	}

	const body = `## ${icon} Eval Report for ${context.sha}

	\| Test Case \| CFG \| Decompilation \|
	\|-----------\|-----\|---------------\|
	${tableRows}
	\| Average \| ${avgCfg} \| ${avgDecomp} \|${detailsBlock}`;

	// Delete old eval comments
	const { data: comments } = await github.rest.issues.listComments({
	owner: context.repo.owner,
	repo: context.repo.repo,
	issue_number: context.issue.number,
	});

	const evalComments = comments.filter(
	c => c.user.login === 'github-actions[bot]' && c.body.includes('Eval Report for')
	);

	for (const comment of evalComments) {
	await github.rest.issues.deleteComment({
	owner: context.repo.owner,
	repo: context.repo.repo,
	comment_id: comment.id,
	});
	}

	// Create new comment
	await github.rest.issues.createComment({
	owner: context.repo.owner,
	repo: context.repo.repo,
	issue_number: context.issue.number,
	body: body
	});

	- name: Upload eval artifacts
	uses: actions/upload-artifact@v4
	with:
	name: eval-results
	path: \|
	heimdall-eval/heimdall/*/eval.json
	heimdall-eval/heimdall/*/cfg_eval.json
	heimdall-eval/heimdall/evals.json

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

fix: pin docker builder to bookworm to resolve glibc mismatch #45

Workflow file

fix: pin docker builder to bookworm to resolve glibc mismatch #45

Uh oh!

Workflow file for this run