Skip to content

Commit 3e16208

Browse files
authored
feat(ci): include low eval breakdowns (#678)
* feat: eval breakdowns * fix: eval output
1 parent 4103af2 commit 3e16208

1 file changed

Lines changed: 53 additions & 1 deletion

File tree

.github/workflows/eval.yml

Lines changed: 53 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ jobs:
6060
with:
6161
script: |
6262
const fs = require('fs');
63+
const path = require('path');
6364
const evalsJson = JSON.parse(fs.readFileSync('heimdall-eval/heimdall/evals.json', 'utf8'));
6465
6566
const entries = Object.entries(evalsJson);
@@ -71,18 +72,69 @@ jobs:
7172
7273
const icon = avgDecomp >= 70 ? ':white_check_mark:' : (avgDecomp >= 50 ? ':warning:' : ':x:');
7374
75+
// Identify low-scoring evals for detailed breakdown
76+
const lowScoringEntries = entries.filter(([_, scores]) => {
77+
const cfg = scores.cfg ?? null;
78+
const decomp = scores.decompilation ?? null;
79+
return (cfg != null && cfg < 70) || (decomp != null && decomp < 70);
80+
});
81+
7482
let tableRows = entries.map(([name, scores]) => {
7583
const cfg = scores.cfg ?? 'N/A';
7684
const decomp = scores.decompilation ?? 'N/A';
7785
return `| ${name} | ${cfg} | ${decomp} |`;
7886
}).join('\n');
7987
88+
// Collect detailed breakdowns for low-scoring evals
89+
let detailsSections = [];
90+
for (const [name, scores] of lowScoringEntries) {
91+
const cfg = scores.cfg ?? null;
92+
const decomp = scores.decompilation ?? null;
93+
94+
let details = [`### ${name} (CFG: ${cfg ?? 'N/A'}, Decompilation: ${decomp ?? 'N/A'})\n`];
95+
96+
// Read decompilation eval details if score < 70
97+
if (decomp != null && decomp < 70) {
98+
const evalPath = path.join('heimdall-eval/heimdall', name, 'eval.json');
99+
if (fs.existsSync(evalPath)) {
100+
try {
101+
const evalData = JSON.parse(fs.readFileSync(evalPath, 'utf8'));
102+
details.push(`**Decompilation**\n\`\`\`json\n${JSON.stringify(evalData, null, 2)}\n\`\`\`\n`);
103+
} catch (e) {
104+
details.push(`**Decompilation** - Could not read details\n`);
105+
}
106+
}
107+
}
108+
109+
// Read CFG eval details if score < 70
110+
if (cfg != null && cfg < 70) {
111+
const cfgEvalPath = path.join('heimdall-eval/heimdall', name, 'cfg_eval.json');
112+
if (fs.existsSync(cfgEvalPath)) {
113+
try {
114+
const cfgEvalData = JSON.parse(fs.readFileSync(cfgEvalPath, 'utf8'));
115+
details.push(`**CFG**\n\`\`\`json\n${JSON.stringify(cfgEvalData, null, 2)}\n\`\`\`\n`);
116+
} catch (e) {
117+
details.push(`**CFG** - Could not read details\n`);
118+
}
119+
}
120+
}
121+
122+
if (details.length > 1) {
123+
detailsSections.push(details.join('\n'));
124+
}
125+
}
126+
127+
let detailsBlock = '';
128+
if (detailsSections.length > 0) {
129+
detailsBlock = `\n\n<details>\n<summary>:warning: ${lowScoringEntries.length} eval(s) scoring &lt;70%</summary>\n\n${detailsSections.join('\n---\n\n')}\n</details>`;
130+
}
131+
80132
const body = `## ${icon} Eval Report for ${context.sha}
81133
82134
| Test Case | CFG | Decompilation |
83135
|-----------|-----|---------------|
84136
${tableRows}
85-
| **Average** | **${avgCfg}** | **${avgDecomp}** |`;
137+
| **Average** | **${avgCfg}** | **${avgDecomp}** |${detailsBlock}`;
86138
87139
// Delete old eval comments
88140
const { data: comments } = await github.rest.issues.listComments({

0 commit comments

Comments
 (0)