Skip to content

Commit 6e9e3dc

Browse files
committed
Add solve_s@i metric
1 parent aac5e5d commit 6e9e3dc

File tree

2 files changed

+115
-0
lines changed

2 files changed

+115
-0
lines changed

index.html

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,18 @@
210210
color: white;
211211
border-color: #0066cc;
212212
}
213+
/* Centre grouped headers, sortable headers and all data cells. */
.leaderboard-table th[colspan],
.leaderboard-table th[data-column],
.leaderboard-table td {
  text-align: center;
}

/* Model names stay left-aligned; the extra class selector makes this rule
   more specific than the generic td rule above, so it wins. */
.leaderboard-table td.model-name {
  text-align: left;
}
213225

214226
/* Media queries for responsive design */
215227
@media screen and (max-width: 768px) {
@@ -376,6 +388,30 @@ <h2>📊 Leaderboard (<span id="current-temp">T=0</span>)</h2>
376388
* GPT-o4-mini:high was measured under T=1 since o-series models only support T=1.
377389
</p>
378390

391+
<!-- solve_s@i section: static header markup only; the rows of #solve-metrics
     are appended to the empty <tbody> by loadSolveMetrics(). -->
<h2>📊 solve<sub>s</sub>@i Metrics (T=0)</h2>
392+
<table class="leaderboard-table" id="solve-metrics">
393+
<thead>
394+
<!-- First header row: the model column (spanning both header rows) and one
     three-column group per stage. The grouped headers carry no data-column
     attribute, so the sort click handler is not attached to them. -->
<tr>
395+
<th rowspan="2" data-column="Model">Model</th>
396+
<th colspan="3">Stage 3 (Verification)</th>
397+
<th colspan="3">Stage 2 (Solution Gen.)</th>
398+
<th colspan="3">Stage 1 (Execution)</th>
399+
</tr>
400+
<!-- Second header row: each data-column value is used verbatim as the row
     key when sorting, so it must match a column name in
     results/solve_metrics_summary.csv. The sort-desc class marks the column
     the table is sorted by on first render. -->
<tr>
401+
<th data-column="solve@10 stage3" class="sort-desc">@10</th>
402+
<th data-column="solve@5 stage3">@5</th>
403+
<th data-column="solve@1 stage3">@1</th>
404+
<th data-column="solve@10 stage2">@10</th>
405+
<th data-column="solve@5 stage2">@5</th>
406+
<th data-column="solve@1 stage2">@1</th>
407+
<th data-column="solve@10 stage1">@10</th>
408+
<th data-column="solve@5 stage1">@5</th>
409+
<th data-column="solve@1 stage1">@1</th>
410+
</tr>
411+
</thead>
412+
<tbody></tbody>
413+
</table>
414+
379415
<h2>📚 Citation</h2>
380416
<div class="citation-box">@article{chen2025heurigym,
381417
title={HeuriGym: An Agentic Benchmark for LLM-Crafted Heuristics in Combinatorial Optimization},
@@ -644,8 +680,77 @@ <h2>📚 Citation</h2>
644680
d3.csv("results/model_metadata.csv").then(data => {
645681
metadata = data;
646682
loadData(currentTemperature);
683+
loadSolveMetrics();
647684
});
648685

686+
/**
 * Load the solve_s@i summary CSV and render it into the `#solve-metrics` table.
 *
 * Rows are sorted by "solve@10 stage3" (descending) on first render. Clicking a
 * header that carries a `data-column` attribute re-sorts by that column: a
 * header currently marked `sort-asc` flips to descending, any other header
 * sorts ascending. Rows whose value in the sorted column is missing or not a
 * number are always placed last.
 *
 * Reads the script-level `metadata` rows (Model -> URL) to build the model
 * links, so it must run after they have been loaded. The sorted rows are also
 * published as `window.solveMetricsData`.
 *
 * @returns {Promise<void>} Settles once the table has been rendered. A failed
 *   load or render is logged to the console rather than rethrown.
 */
function loadSolveMetrics() {
  // Column order of the rendered table; must match the header cells and the CSV.
  const metricColumns = [
    "solve@10 stage3", "solve@5 stage3", "solve@1 stage3",
    "solve@10 stage2", "solve@5 stage2", "solve@1 stage2",
    "solve@10 stage1", "solve@5 stage1", "solve@1 stage1",
  ];
  const defaultSortColumn = "solve@10 stage3";

  return d3.csv("results/solve_metrics_summary.csv").then(data => {
    const tbody = d3.select("#solve-metrics tbody");

    // Index model URLs once instead of scanning `metadata` for every row.
    // The first entry for a model wins, matching Array.prototype.find.
    const urlByModel = new Map();
    for (const entry of metadata) {
      if (!urlByModel.has(entry.Model)) {
        urlByModel.set(entry.Model, entry.URL);
      }
    }

    // Default order: best stage-3 solve@10 first.
    data.sort((a, b) => compareSolveMetricRows(a, b, defaultSortColumn, true));

    // Kept for anything outside this function that reads the loaded rows.
    window.solveMetricsData = data;

    const renderRows = rows => {
      tbody.selectAll("tr").remove();

      rows.forEach(row => {
        const tr = tbody.append("tr");

        tr.append("td")
          .attr("class", "model-name")
          .append("a")
          .attr("href", urlByModel.get(row.Model) || "#")
          .attr("target", "_blank")
          // Prevent the opened page from reaching back through window.opener.
          .attr("rel", "noopener noreferrer")
          .text(row.Model);

        metricColumns.forEach(column => {
          tr.append("td").text(formatSolveMetric(row[column]));
        });
      });
    };

    renderRows(data);

    // A regular function is required here: d3 binds `this` to the clicked <th>.
    d3.selectAll("#solve-metrics th[data-column]").on("click", function () {
      const column = this.getAttribute("data-column");
      // Read the current state before the indicators are cleared below.
      const descending = this.classList.contains("sort-asc");

      const sortedRows = [...data].sort(
        (a, b) => compareSolveMetricRows(a, b, column, descending)
      );

      d3.selectAll("#solve-metrics th")
        .classed("sort-asc", false)
        .classed("sort-desc", false);
      d3.select(this).classed(descending ? "sort-desc" : "sort-asc", true);

      renderRows(sortedRows);
    });
  }).catch(error => {
    console.error("Failed to load solve metrics:", error);
  });
}

/**
 * Convert a raw CSV cell into a number.
 *
 * @param {*} raw - Cell value as delivered by the CSV parser (normally a string).
 * @returns {number} The numeric value, or NaN when the cell is absent, blank
 *   or not a finite number.
 */
function parseSolveMetric(raw) {
  if (raw == null || String(raw).trim() === "") {
    return NaN;
  }
  const value = Number(raw);
  return Number.isFinite(value) ? value : NaN;
}

/**
 * Format a fractional metric (0..1) as a percentage with two decimals.
 *
 * @param {*} raw - Raw CSV cell value.
 * @returns {string} E.g. "46.79%", or "N/A" when the value is missing/invalid.
 */
function formatSolveMetric(raw) {
  const value = parseSolveMetric(raw);
  return Number.isNaN(value) ? "N/A" : `${(value * 100).toFixed(2)}%`;
}

/**
 * Comparator for two solve-metric rows on a single column.
 *
 * @param {Object} a - First row.
 * @param {Object} b - Second row.
 * @param {string} column - Row key to compare; "Model" compares as text,
 *   every other column as a number.
 * @param {boolean} descending - Sort direction.
 * @returns {number} Negative, zero or positive, as expected by Array.prototype.sort.
 */
function compareSolveMetricRows(a, b, column, descending) {
  if (column === "Model") {
    return descending
      ? b[column].localeCompare(a[column])
      : a[column].localeCompare(b[column]);
  }

  const left = parseSolveMetric(a[column]);
  const right = parseSolveMetric(b[column]);
  const leftMissing = Number.isNaN(left);
  const rightMissing = Number.isNaN(right);

  // Missing values go last in both directions, which also keeps the
  // comparator consistent (NaN comparisons would otherwise be undefined).
  if (leftMissing || rightMissing) {
    return Number(leftMissing) - Number(rightMissing);
  }
  return descending ? right - left : left - right;
}
753+
649754
function copyCitation() {
650755
const citationText = `@article{chen2025heurigym,
651756
title={HeuriGym: An Agentic Benchmark for LLM-Crafted Heuristics in Combinatorial Optimization},

results/solve_metrics_summary.csv

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Model,solve@10 stage3,solve@5 stage3,solve@1 stage3,solve@10 stage2,solve@5 stage2,solve@1 stage2,solve@10 stage1,solve@5 stage1,solve@1 stage1
2+
DeepSeek-V3-0304,0.46788990825688076,0.42660550458715596,0.14220183486238533,0.8761467889908257,0.8302752293577982,0.6605504587155964,1.0,1.0,0.908256880733945
3+
DeepSeek-R1,0.7339449541284404,0.7293577981651376,0.44036697247706424,0.8807339449541285,0.8807339449541285,0.6055045871559633,1.0,1.0,0.7155963302752294
4+
Gemini-2.5-Flash-0417,0.6743119266055045,0.5825688073394495,0.25229357798165136,0.8394495412844036,0.7935779816513762,0.5642201834862385,1.0,1.0,0.7293577981651376
5+
Gemini-2.5-Pro-0506,0.6513761467889908,0.6422018348623854,0.2018348623853211,0.8944954128440367,0.8899082568807339,0.42660550458715596,1.0,1.0,0.5137614678899083
6+
LLaMA-4-Maverick-17B-128E-Instruct,0.3577981651376147,0.3348623853211009,0.05963302752293578,0.8486238532110092,0.7431192660550459,0.08256880733944955,0.8532110091743119,0.8532110091743119,0.13302752293577982
7+
LLaMA-3.3-70B-Instruct,0.3394495412844037,0.3394495412844037,0.20642201834862386,0.7844036697247706,0.7844036697247706,0.4036697247706422,0.9954128440366973,0.9954128440366973,0.6192660550458715
8+
Qwen3-235B-A22B,0.45871559633027525,0.4541284403669725,0.3853211009174312,0.8623853211009175,0.8302752293577982,0.5596330275229358,1.0,1.0,0.7064220183486238
9+
Claude-3.7-Sonnet,0.6009174311926605,0.5871559633027523,0.09174311926605505,0.9770642201834863,0.9770642201834863,0.41284403669724773,1.0,1.0,0.6009174311926605
10+
GPT-o4-mini:high,0.7477064220183486,0.6972477064220184,0.5321100917431193,1.0,1.0,0.9311926605504587,1.0,1.0,1.0

0 commit comments

Comments
 (0)