6060 with :
6161 script : |
6262 const fs = require('fs');
63+ const path = require('path');
// Read the eval score map from disk and parse it. Each key is an eval name and
// each value is that eval's scores object (read below as `cfg` / `decompilation`).
const evalsJsonText = fs.readFileSync('heimdall-eval/heimdall/evals.json', 'utf8');
const evalsJson = JSON.parse(evalsJsonText);

// Flatten the map into [name, scores] pairs for the filter/map passes that follow.
const entries = Object.entries(evalsJson);
@@ -71,18 +72,69 @@ jobs:
7172
// Headline emoji for the report, chosen from the average decompilation score:
// 70 or more is a pass, 50 or more is a warning, anything else is a failure.
const pickStatusIcon = (score) => {
  if (score >= 70) {
    return ':white_check_mark:';
  }
  if (score >= 50) {
    return ':warning:';
  }
  return ':x:';
};
const icon = pickStatusIcon(avgDecomp);
7374
// Per-eval scores strictly below this percentage get a detailed breakdown.
const LOW_SCORE_THRESHOLD = 70;

// Directory that holds one sub-directory of detail files per eval.
const EVAL_DETAILS_ROOT = 'heimdall-eval/heimdall';

// NOTE(review): GitHub is understood to reject comment bodies longer than
// 65536 characters - confirm against the API before relying on the exact value.
const MAX_COMMENT_BODY_LENGTH = 65536;

/**
 * Tells whether a score is present and below the low-score threshold.
 *
 * @param {number|null|undefined} score - Score taken from a scores object.
 * @returns {boolean} True when the score is non-nullish and below the threshold.
 */
function isLowScore(score) {
  return score != null && score < LOW_SCORE_THRESHOLD;
}

/**
 * Renders one detail file of an eval as a labelled, fenced JSON block.
 *
 * @param {string} label - Heading shown in bold above the block.
 * @param {string} evalName - Eval name, used as the sub-directory name.
 * @param {string} fileName - Detail file to read inside that sub-directory.
 * @returns {string|null} Markdown for the block, a short failure note when the
 *   file exists but cannot be read or parsed, or null when the file is absent.
 */
function renderEvalDetails(label, evalName, fileName) {
  const detailsPath = path.join(EVAL_DETAILS_ROOT, evalName, fileName);
  if (!fs.existsSync(detailsPath)) {
    // Best effort: an eval without a detail file is left out of the breakdown.
    return null;
  }

  try {
    const data = JSON.parse(fs.readFileSync(detailsPath, 'utf8'));
    return `**${label}**\n\`\`\`json\n${JSON.stringify(data, null, 2)}\n\`\`\`\n`;
  } catch (err) {
    // Surface the reason instead of discarding it, so a broken detail file can
    // be diagnosed straight from the report.
    return `**${label}** - Could not read details (${err.message})\n`;
  }
}

// Identify low-scoring evals for detailed breakdown
const lowScoringEntries = entries.filter(
  ([, scores]) => isLowScore(scores.cfg) || isLowScore(scores.decompilation),
);

// One table row per eval; a missing score is rendered as N/A.
const tableRows = entries.map(([name, scores]) => {
  const cfg = scores.cfg ?? 'N/A';
  const decomp = scores.decompilation ?? 'N/A';
  return `| ${name} | ${cfg} | ${decomp} |`;
}).join('\n');

// Collect detailed breakdowns for low-scoring evals
const detailsSections = [];
for (const [name, scores] of lowScoringEntries) {
  const cfg = scores.cfg ?? null;
  const decomp = scores.decompilation ?? null;

  // Decompilation details first, then CFG; only for the scores that are low.
  const parts = [
    isLowScore(decomp) ? renderEvalDetails('Decompilation', name, 'eval.json') : null,
    isLowScore(cfg) ? renderEvalDetails('CFG', name, 'cfg_eval.json') : null,
  ].filter((part) => part !== null);

  // Skip evals for which no detail file was found at all.
  if (parts.length > 0) {
    const heading = `### ${name} (CFG: ${cfg ?? 'N/A'}, Decompilation: ${decomp ?? 'N/A'})\n`;
    detailsSections.push([heading, ...parts].join('\n'));
  }
}

// Collapsible block holding every collected section; empty when there is none.
const lowScoreSummary = `:warning: ${lowScoringEntries.length} eval(s) scoring <${LOW_SCORE_THRESHOLD}%`;
const detailsBlock = detailsSections.length > 0
  ? `\n\n<details>\n<summary>${lowScoreSummary}</summary>\n\n${detailsSections.join('\n---\n\n')}\n</details>`
  : '';

const reportTable = `## ${icon} Eval Report for ${context.sha}

| Test Case | CFG | Decompilation |
|-----------|-----|---------------|
${tableRows}
| **Average** | **${avgCfg}** | **${avgDecomp}** |`;

// Raw JSON dumps can be large. If the full report would not fit in one comment,
// keep the score table and replace the breakdown with a short notice rather
// than losing the whole report to a rejected request.
const fitsInComment = reportTable.length + detailsBlock.length <= MAX_COMMENT_BODY_LENGTH;
const body = fitsInComment
  ? `${reportTable}${detailsBlock}`
  : `${reportTable}\n\n${lowScoreSummary} (detailed breakdown omitted: it would exceed the comment size limit)`;
86138
87139 // Delete old eval comments
88140 const { data: comments } = await github.rest.issues.listComments({
0 commit comments