Add solve_s@i metric

chhzh123 · chhzh123 · commit 6e9e3dce3716 · 2025-06-16T06:26:44.000Z
diff --git a/index.html b/index.html
@@ -210,6 +210,18 @@
       color: white;
       border-color: #0066cc;
     }
+    /* Center spanning headers, headers carrying a data-column */
+    /* attribute, and every data cell of a leaderboard table. */
+    .leaderboard-table th[colspan],
+    .leaderboard-table th[data-column],
+    .leaderboard-table td {
+      text-align: center;
+    }
+
+    /* Model-name cells override the centering and stay left-aligned. */
+    .leaderboard-table td.model-name {
+      text-align: left;
+    }
 
     /* Media queries for responsive design */
     @media screen and (max-width: 768px) {
@@ -376,6 +388,30 @@ <h2>📊 Leaderboard (<span id="current-temp">T=0</span>)</h2>
       * GPT-o4-mini:high was measured under T=1 since o-series models only support T=1.
     </p>
 
+    <h2>📊 solve<sub>s</sub>@i Metrics (T=0)</h2>
+    <table class="leaderboard-table" id="solve-metrics"> <!-- body rows are rendered by loadSolveMetrics() -->
+      <thead>
+        <tr>
+          <th rowspan="2" data-column="Model">Model</th> <!-- data-column is the key the click handler sorts by -->
+          <th colspan="3">Stage 3 (Verification)</th>
+          <th colspan="3">Stage 2 (Solution Gen.)</th>
+          <th colspan="3">Stage 1 (Execution)</th>
+        </tr>
+        <tr>
+          <th data-column="solve@10 stage3" class="sort-desc">@10</th> <!-- pre-marked: the table loads sorted by this column, descending -->
+          <th data-column="solve@5 stage3">@5</th>
+          <th data-column="solve@1 stage3">@1</th>
+          <th data-column="solve@10 stage2">@10</th>
+          <th data-column="solve@5 stage2">@5</th>
+          <th data-column="solve@1 stage2">@1</th>
+          <th data-column="solve@10 stage1">@10</th>
+          <th data-column="solve@5 stage1">@5</th>
+          <th data-column="solve@1 stage1">@1</th>
+        </tr>
+      </thead>
+      <tbody></tbody> <!-- filled from results/solve_metrics_summary.csv at load time -->
+    </table>
+
     <h2>📚 Citation</h2>
     <div class="citation-box">@article{chen2025heurigym,
     title={HeuriGym: An Agentic Benchmark for LLM-Crafted Heuristics in Combinatorial Optimization},
@@ -644,8 +680,77 @@ <h2>📚 Citation</h2>
     d3.csv("results/model_metadata.csv").then(data => {
       metadata = data;
       loadData(currentTemperature);
+      loadSolveMetrics();
     });
 
+    /**
+     * Load results/solve_metrics_summary.csv and render the sortable
+     * solve_s@i table (#solve-metrics). CSV values are fractions and are
+     * shown as percentages; default order is stage 3 @10, descending.
+     * Reads the page-level `metadata` array to look up model URLs.
+     */
+    function loadSolveMetrics() {
+      // Column keys in display order; they match the CSV header names.
+      const metrics = [
+        "solve@10 stage3", "solve@5 stage3", "solve@1 stage3",
+        "solve@10 stage2", "solve@5 stage2", "solve@1 stage2",
+        "solve@10 stage1", "solve@5 stage1", "solve@1 stage1"
+      ];
+      // Show missing or non-numeric cells as "N/A" rather than "NaN%".
+      const formatPercent = value => {
+        const fraction = Number.parseFloat(value);
+        return Number.isFinite(fraction) ? `${(fraction * 100).toFixed(2)}%` : "N/A";
+      };
+
+      d3.csv("results/solve_metrics_summary.csv").then(data => {
+        const tbody = d3.select("#solve-metrics tbody");
+
+        // Default order matches the header pre-marked with "sort-desc".
+        data.sort((a, b) => d3.descending(+a["solve@10 stage3"], +b["solve@10 stage3"]));
+
+        // Still published on window, as before, in case other code reads it.
+        window.solveMetricsData = data;
+
+        function updateTable(sortedData) {
+          tbody.selectAll("tr").remove();
+          sortedData.forEach(d => {
+            const row = tbody.append("tr");
+            row.append("td").attr("class", "model-name").append("a")
+              .attr("href", metadata.find(m => m.Model === d.Model)?.URL || "#")
+              .attr("target", "_blank")
+              .attr("rel", "noopener noreferrer") // new tab must not reach window.opener
+              .text(d.Model);
+            metrics.forEach(metric => row.append("td").text(formatPercent(d[metric])));
+          });
+        }
+
+        // Initial render in the default order.
+        updateTable(data);
+
+        // Clicking a header toggles it: ascending first, then descending.
+        d3.selectAll("#solve-metrics th[data-column]").on("click", function() {
+          const column = this.getAttribute("data-column");
+          const isAscending = this.classList.contains("sort-asc");
+          const compare = column === "Model"
+            ? (a, b) => a[column].localeCompare(b[column])
+            : (a, b) => d3.ascending(+a[column], +b[column]);
+          // Copy before sorting so the loaded order is never mutated.
+          const sortedData = [...data].sort(
+            (a, b) => (isAscending ? compare(b, a) : compare(a, b)));
+
+          // Only the clicked header keeps a sort indicator.
+          d3.selectAll("#solve-metrics th")
+            .classed("sort-asc", false)
+            .classed("sort-desc", false);
+          d3.select(this).classed(isAscending ? "sort-desc" : "sort-asc", true);
+          updateTable(sortedData);
+        });
+      }).catch(error => {
+        // Without this the table silently stays empty on a failed load.
+        console.error("Failed to load or render solve metrics:", error);
+      });
+    }
+
     function copyCitation() {
       const citationText = `@article{chen2025heurigym,
   title={HeuriGym: An Agentic Benchmark for LLM-Crafted Heuristics in Combinatorial Optimization},
diff --git a/results/solve_metrics_summary.csv b/results/solve_metrics_summary.csv
@@ -0,0 +1,10 @@
+Model,solve@10 stage3,solve@5 stage3,solve@1 stage3,solve@10 stage2,solve@5 stage2,solve@1 stage2,solve@10 stage1,solve@5 stage1,solve@1 stage1
+DeepSeek-V3-0304,0.46788990825688076,0.42660550458715596,0.14220183486238533,0.8761467889908257,0.8302752293577982,0.6605504587155964,1.0,1.0,0.908256880733945
+DeepSeek-R1,0.7339449541284404,0.7293577981651376,0.44036697247706424,0.8807339449541285,0.8807339449541285,0.6055045871559633,1.0,1.0,0.7155963302752294
+Gemini-2.5-Flash-0417,0.6743119266055045,0.5825688073394495,0.25229357798165136,0.8394495412844036,0.7935779816513762,0.5642201834862385,1.0,1.0,0.7293577981651376
+Gemini-2.5-Pro-0506,0.6513761467889908,0.6422018348623854,0.2018348623853211,0.8944954128440367,0.8899082568807339,0.42660550458715596,1.0,1.0,0.5137614678899083
+LLaMA-4-Maverick-17B-128E-Instruct,0.3577981651376147,0.3348623853211009,0.05963302752293578,0.8486238532110092,0.7431192660550459,0.08256880733944955,0.8532110091743119,0.8532110091743119,0.13302752293577982
+LLaMA-3.3-70B-Instruct,0.3394495412844037,0.3394495412844037,0.20642201834862386,0.7844036697247706,0.7844036697247706,0.4036697247706422,0.9954128440366973,0.9954128440366973,0.6192660550458715
+Qwen3-235B-A22B,0.45871559633027525,0.4541284403669725,0.3853211009174312,0.8623853211009175,0.8302752293577982,0.5596330275229358,1.0,1.0,0.7064220183486238
+Claude-3.7-Sonnet,0.6009174311926605,0.5871559633027523,0.09174311926605505,0.9770642201834863,0.9770642201834863,0.41284403669724773,1.0,1.0,0.6009174311926605
+GPT-o4-mini:high,0.7477064220183486,0.6972477064220184,0.5321100917431193,1.0,1.0,0.9311926605504587,1.0,1.0,1.0