fix: pin docker builder to bookworm to resolve glibc mismatch #45
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Eval workflow: on every pull request, build heimdall from the PR branch, run
# the heimdall-eval Makefile targets against it, post the resulting scores as a
# PR comment and upload the raw result files as an artifact.
name: eval

on:
  pull_request:

# One run per PR branch: a newer push cancels the run still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

permissions:
  contents: read
  # Needed by the "Comment on PR" step to list, delete and create PR comments.
  pull-requests: write

jobs:
  eval:
    name: run eval
    runs-on: ubuntu-latest
    steps:
      - name: Checkout heimdall-rs (PR branch)
        uses: actions/checkout@v4

      # The eval suite lives in a separate repository, checked out into
      # ./heimdall-eval using the EVAL_REPO_TOKEN secret.
      - name: Checkout heimdall-eval
        uses: actions/checkout@v4
        with:
          repository: Jon-Becker/heimdall-eval
          path: heimdall-eval
          token: ${{ secrets.EVAL_REPO_TOKEN }}

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Cache Rust dependencies
        uses: Swatinem/rust-cache@v2
        with:
          cache-on-failure: true

      - name: Build heimdall
        run: cargo build --release --bin heimdall

      - name: Install Foundry
        uses: foundry-rs/foundry-toolchain@v1

      # Make the freshly built binary available to the following steps.
      - name: Add heimdall to PATH
        run: echo "${{ github.workspace }}/target/release" >> $GITHUB_PATH

      - name: Run evals
        working-directory: heimdall-eval
        run: make run-all

      - name: Run AI evaluation
        working-directory: heimdall-eval
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          npm install -g @anthropic-ai/claude-code
          make eval-all

      # Reads heimdall-eval/heimdall/evals.json (a map of test-case name to
      # { cfg, decompilation } scores), renders a markdown report and replaces
      # any previous report comment on the PR with it.
      - name: Comment on PR
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            // Directory holding evals.json and the per-test-case result files.
            const EVAL_ROOT = 'heimdall-eval/heimdall';
            // Scores strictly below this value get a detailed breakdown.
            const LOW_SCORE_THRESHOLD = 70;
            // Text used to recognise report comments posted by earlier runs.
            const REPORT_MARKER = 'Eval Report for';
            // GitHub rejects issue comments whose body is longer than this.
            const MAX_COMMENT_LENGTH = 65536;

            const evalsJson = JSON.parse(fs.readFileSync(path.join(EVAL_ROOT, 'evals.json'), 'utf8'));
            const entries = Object.entries(evalsJson);

            // Floor of the mean, or 'N/A' when there is nothing to average.
            const average = (scores) =>
              scores.length ? Math.floor(scores.reduce((a, b) => a + b, 0) / scores.length) : 'N/A';
            const isLow = (score) => score != null && score < LOW_SCORE_THRESHOLD;

            const cfgScores = entries.map(([, v]) => v.cfg).filter((x) => x != null);
            const decompScores = entries.map(([, v]) => v.decompilation).filter((x) => x != null);
            const avgCfg = average(cfgScores);
            const avgDecomp = average(decompScores);

            // The headline icon follows the decompilation average. Without any
            // decompilation score the average is the string 'N/A', which must not
            // be compared numerically (it would always render as a failure).
            let icon;
            if (decompScores.length === 0) {
              icon = ':grey_question:';
            } else if (avgDecomp >= LOW_SCORE_THRESHOLD) {
              icon = ':white_check_mark:';
            } else if (avgDecomp >= 50) {
              icon = ':warning:';
            } else {
              icon = ':x:';
            }

            const tableRows = entries
              .map(([name, scores]) => `| ${name} | ${scores.cfg ?? 'N/A'} | ${scores.decompilation ?? 'N/A'} |`)
              .join('\n');

            // Returns the markdown section for one result file, null when the
            // file does not exist, or a short notice when it cannot be parsed.
            const readDetails = (label, filePath) => {
              if (!fs.existsSync(filePath)) {
                return null;
              }
              try {
                const data = JSON.parse(fs.readFileSync(filePath, 'utf8'));
                return `**${label}**\n\`\`\`json\n${JSON.stringify(data, null, 2)}\n\`\`\`\n`;
              } catch (e) {
                core.warning(`Could not read ${filePath}: ${e.message}`);
                return `**${label}** - Could not read details\n`;
              }
            };

            // Collect detailed breakdowns for low-scoring evals.
            const lowScoringEntries = entries.filter(
              ([, scores]) => isLow(scores.cfg) || isLow(scores.decompilation)
            );
            const detailsSections = [];
            for (const [name, scores] of lowScoringEntries) {
              const cfg = scores.cfg ?? null;
              const decomp = scores.decompilation ?? null;
              const sections = [];
              if (isLow(decomp)) {
                sections.push(readDetails('Decompilation', path.join(EVAL_ROOT, name, 'eval.json')));
              }
              if (isLow(cfg)) {
                sections.push(readDetails('CFG', path.join(EVAL_ROOT, name, 'cfg_eval.json')));
              }
              const available = sections.filter((section) => section != null);
              // Only emit a heading when at least one breakdown exists for it.
              if (available.length > 0) {
                const heading = `### ${name} (CFG: ${cfg ?? 'N/A'}, Decompilation: ${decomp ?? 'N/A'})\n`;
                detailsSections.push([heading, ...available].join('\n'));
              }
            }

            let detailsBlock = '';
            if (detailsSections.length > 0) {
              detailsBlock = `\n\n<details>\n<summary>:warning: ${lowScoringEntries.length} eval(s) scoring <${LOW_SCORE_THRESHOLD}%</summary>\n\n${detailsSections.join('\n---\n\n')}\n</details>`;
            }

            // Assemble the comment; the rows line is skipped when there are no
            // test cases so the table is not broken by an empty line.
            const renderBody = (trailer) => [
              `## ${icon} Eval Report for ${context.sha}`,
              '| Test Case | CFG | Decompilation |',
              '|-----------|-----|---------------|',
              ...(tableRows ? [tableRows] : []),
              `| **Average** | **${avgCfg}** | **${avgDecomp}** |${trailer}`,
            ].join('\n');

            let body = renderBody(detailsBlock);
            if (body.length > MAX_COMMENT_LENGTH) {
              // Fall back to the summary table alone rather than failing the step.
              core.warning(`Eval report is ${body.length} characters long; omitting detailed breakdowns.`);
              body = renderBody(
                `\n\n:warning: ${lowScoringEntries.length} eval(s) scoring <${LOW_SCORE_THRESHOLD}% - breakdowns omitted because the report is too long for a comment; see the \`eval-results\` artifact.`
              );
            }

            // Delete old eval comments. Paginate so that reports beyond the
            // first page of a long PR conversation are found as well.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });
            const evalComments = comments.filter(
              (c) => c.user?.login === 'github-actions[bot]' && c.body?.includes(REPORT_MARKER)
            );
            for (const comment of evalComments) {
              await github.rest.issues.deleteComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: comment.id,
              });
            }

            // Create new comment
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body,
            });

      # Keep the raw results even when an earlier step (for example the PR
      # comment) failed, so they can still be inspected.
      - name: Upload eval artifacts
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: |
            heimdall-eval/heimdall/*/eval.json
            heimdall-eval/heimdall/*/cfg_eval.json
            heimdall-eval/heimdall/evals.json