Update DeepPlanning leaderboard for v1.1 dataset

zhangyingerjelly · tuhahaha · commit 31a4d36d1236 · 2026-03-04T16:13:57.000+08:00
diff --git a/qwen-agent-docs/website/content/en/benchmarks/deepplanning/index.mdx b/qwen-agent-docs/website/content/en/benchmarks/deepplanning/index.mdx
@@ -134,6 +134,25 @@ DeepPlanning evaluates three critical agentic abilities:
 
 ---
 
+<h2 className="text-center text-3xl font-semibold mt-8 mb-2">📢 Change Log</h2>
+<div className="flex justify-center mb-6">
+  <div className="h-1 w-16 bg-gradient-to-r from-orange-500 to-red-500 rounded"></div>
+</div>
+
+<div className="max-w-4xl mx-auto mb-8">
+  <div className="border border-orange-200 dark:border-orange-800 rounded-lg bg-orange-50/50 dark:bg-orange-950/30 p-5">
+    <h4 className="font-semibold text-lg mb-3 text-orange-800 dark:text-orange-300">v1.1 (2026-03-03)</h4>
+    <ul className="list-disc list-inside space-y-1.5 text-sm text-gray-700 dark:text-gray-300 leading-relaxed">
+      <li>Updated several tasks in the <strong>Shopping Planning</strong> benchmark and corrected erroneous answer annotations for a subset of questions. The dataset is available at <a href="https://huggingface.co/datasets/Qwen/DeepPlanning" target="_blank" className="text-blue-600 dark:text-blue-400 underline">Qwen/DeepPlanning</a> on Hugging Face.</li>
+      <li>Added new models to the leaderboard: <strong>Claude-4.6-Opus</strong>, <strong>Qwen-3.5-Plus</strong>, <strong>GLM-5</strong>, <strong>Seed-2.0-pro-high</strong>, <strong>Kimi-K2.5 (w/ thinking)</strong>.</li>
+    </ul>
+    <h4 className="font-semibold text-lg mt-4 mb-3 text-gray-600 dark:text-gray-400">v1.0 (2026-01)</h4>
+    <ul className="list-disc list-inside space-y-1.5 text-sm text-gray-500 dark:text-gray-400 leading-relaxed">
+      <li>Initial release of the DeepPlanning benchmark with Travel Planning and Shopping Planning domains.</li>
+    </ul>
+  </div>
+</div>
+
 <Leaderboard />
 
 ---
diff --git a/qwen-agent-docs/website/src/components/leaderboard.tsx b/qwen-agent-docs/website/src/components/leaderboard.tsx
@@ -1,27 +1,56 @@
 "use client";
 
-import React from "react";
+import React, { useState } from "react";
 
 interface ModelScore {
   model: string;
   icon: string;
   isThinking: boolean;
   avgAcc: number;
   travel: {
-    csScore: number;
-    psScore: number;
+    csScore: number | null;
+    psScore: number | null;
     compScore: number;
     caseAcc: number;
   };
   shopping: {
     matchScore: number;
     caseAcc: number;
   };
+  note?: string;
 }
 
-// All Models Data
-const allModels: ModelScore[] = [
+type VersionKey = "v1.1" | "v1.0";
+
+// ── v1.1 Data (default) ─────────────────────────────────────────────
+const allModelsV1_1: ModelScore[] = [
   // Thinking Models
+  { model: "Anthropic/Claude-4.6-Opus (max)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 58.85, travel: { csScore: 86.1, psScore: 80.3, compScore: 83.2, caseAcc: 61.5 }, shopping: { matchScore: 85.3, caseAcc: 56.2 } },
+  { model: "OpenAI/GPT-5.2-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 48.2, travel: { csScore: 88.5, psScore: 83.3, compScore: 85.8, caseAcc: 35.0 }, shopping: { matchScore: 88.4, caseAcc: 61.4 } },
+  { model: "Alibaba/Qwen-3.5-Plus (w/o thinking)", icon: "/icons/icon_qwen.png", isThinking: false, avgAcc: 37.6, travel: { csScore: 83.6, psScore: 79.9, compScore: 81.6, caseAcc: 26.3 }, shopping: { matchScore: 82.4, caseAcc: 48.9 } },
+  { model: "Anthropic/Claude-4.5-Opus (w/ thinking)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 37.05, travel: { csScore: 79.3, psScore: 70.9, compScore: 75.1, caseAcc: 22.7 }, shopping: { matchScore: 83.7, caseAcc: 51.4 } },
+  { model: "Alibaba/Qwen-3.5-Plus (w/ thinking)", icon: "/icons/icon_qwen.png", isThinking: true, avgAcc: 35.85, travel: { csScore: 76.8, psScore: 75.4, compScore: 76.2, caseAcc: 25.0 }, shopping: { matchScore: 82.1, caseAcc: 46.7 } },
+  { model: "Google/Gemini-3-Flash-Preview", icon: "/icons/ico_gemini.png", isThinking: true, avgAcc: 33.75, travel: { csScore: 67.1, psScore: 57.7, compScore: 62.4, caseAcc: 5.9 }, shopping: { matchScore: 86.9, caseAcc: 61.6 } },
+  { model: "OpenAI/GPT-5-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 30.5, travel: { csScore: 78.7, psScore: 65.9, compScore: 72.3, caseAcc: 18.9 }, shopping: { matchScore: 68.7, caseAcc: 42.1 } },
+  { model: "Alibaba/Qwen3-Max (w/ thinking)", icon: "/icons/icon_qwen.png", isThinking: true, avgAcc: 29.7, travel: { csScore: 64.0, psScore: 61.7, compScore: 62.8, caseAcc: 13.8 }, shopping: { matchScore: 80.6, caseAcc: 45.6 } },
+  { model: "Google/Gemini-3-Pro-Preview", icon: "/icons/ico_gemini.png", isThinking: true, avgAcc: 27.35, travel: { csScore: 58.4, psScore: 25.1, compScore: 41.8, caseAcc: 0.7 }, shopping: { matchScore: 83.4, caseAcc: 54.0 } },
+  { model: "DeepSeek-AI/DeepSeek-V3.2 (w/ thinking)", icon: "/icons/icon_dpsk.png", isThinking: true, avgAcc: 27.35, travel: { csScore: 47.4, psScore: 35.0, compScore: 41.2, caseAcc: 0.7 }, shopping: { matchScore: 84.0, caseAcc: 54.0 } },
+  { model: "Anthropic/Claude-4.5-Sonnet (w/ thinking)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 26.8, travel: { csScore: 65.2, psScore: 58.4, compScore: 61.8, caseAcc: 7.6 }, shopping: { matchScore: 78.0, caseAcc: 46.0 } },
+  { model: "Anthropic/Claude-4.5-Opus (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 26.35, travel: { csScore: 67.5, psScore: 58.8, compScore: 63.1, caseAcc: 6.7 }, shopping: { matchScore: 81.0, caseAcc: 46.0 } },
+  { model: "ByteDance/Seed-2.0-pro-high", icon: "/icons/icon_seed.png", isThinking: true, avgAcc: 21.55, travel: { csScore: 56.0, psScore: 60.6, compScore: 58.3, caseAcc: 2.1 }, shopping: { matchScore: 76.7, caseAcc: 41.0 } },
+  { model: "xAI/Grok-4.1-fast (reasoning)", icon: "/icons/icon_x.png", isThinking: true, avgAcc: 19.15, travel: { csScore: 57.1, psScore: 37.7, compScore: 47.4, caseAcc: 2.7 }, shopping: { matchScore: 73.2, caseAcc: 35.6 } },
+  { model: "DeepSeek-AI/DeepSeek-V3.2 (w/o thinking)", icon: "/icons/icon_dpsk.png", isThinking: false, avgAcc: 19.0, travel: { csScore: 37.4, psScore: 12.1, compScore: 24.7, caseAcc: 0.0 }, shopping: { matchScore: 76.0, caseAcc: 38.0 } },
+  { model: "Anthropic/Claude-4.5-Sonnet (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 16.05, travel: { csScore: 53.4, psScore: 42.8, compScore: 48.1, caseAcc: 1.1 }, shopping: { matchScore: 71.0, caseAcc: 31.0 } },
+  { model: "Alibaba/Qwen3-Max (w/o thinking)", icon: "/icons/icon_qwen.png", isThinking: false, avgAcc: 15.45, travel: { csScore: 36.7, psScore: 30.7, compScore: 31.8, caseAcc: 0.8 }, shopping: { matchScore: 72.3, caseAcc: 30.1 } },
+  { model: "Z.ai/GLM-5 (w/ thinking)", icon: "/icons/icon_glm.png", isThinking: true, avgAcc: 14.55, travel: { csScore: 44.3, psScore: 42.3, compScore: 43.3, caseAcc: 0.4 }, shopping: { matchScore: 72.2, caseAcc: 28.7 } },
+  { model: "Moonshot-AI/Kimi-K2.5 (w/ thinking)", icon: "/icons/icon_kimi.png", isThinking: true, avgAcc: 14.35, travel: { csScore: 47.8, psScore: 43.7, compScore: 45.8, caseAcc: 0.4 }, shopping: { matchScore: 71.9, caseAcc: 28.3 } },
+  { model: "OpenAI/o4-mini", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 13.65, travel: { csScore: 58.0, psScore: 36.6, compScore: 47.2, caseAcc: 3.0 }, shopping: { matchScore: 62.5, caseAcc: 24.3 } },
+  { model: "OpenAI/GPT-5.2-none", icon: "/icons/icon_openai.png", isThinking: false, avgAcc: 6.75, travel: { csScore: 54.3, psScore: 29.9, compScore: 42.1, caseAcc: 0.4 }, shopping: { matchScore: 59.4, caseAcc: 13.1 } },
+  { model: "xAI/Grok-4.1-fast (non-reasoning)", icon: "/icons/icon_x.png", isThinking: false, avgAcc: 5.2, travel: { csScore: 39.6, psScore: 19.7, compScore: 29.6, caseAcc: 0.0 }, shopping: { matchScore: 52.9, caseAcc: 10.4 } },
+];
+
+// ── v1.0 Data ────────────────────────────────────────────────────────
+const allModelsV1_0: ModelScore[] = [
   { model: "OpenAI/GPT-5.2-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 44.6, travel: { csScore: 88.5, psScore: 83.3, compScore: 85.8, caseAcc: 35.0 }, shopping: { matchScore: 84.8, caseAcc: 54.2 } },
   { model: "Anthropic/Claude-4.5-Opus (w/ thinking)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 33.9, travel: { csScore: 79.3, psScore: 70.9, compScore: 75.1, caseAcc: 22.7 }, shopping: { matchScore: 80.0, caseAcc: 45.0 } },
   { model: "OpenAI/GPT-5-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 31.6, travel: { csScore: 78.7, psScore: 65.9, compScore: 72.3, caseAcc: 18.9 }, shopping: { matchScore: 80.4, caseAcc: 44.2 } },
@@ -38,7 +67,6 @@ const allModels: ModelScore[] = [
   { model: "Z.ai/GLM-4.7 (w/ thinking)", icon: "/icons/icon_glm.png", isThinking: true, avgAcc: 14.0, travel: { csScore: 44.0, psScore: 44.6, compScore: 44.3, caseAcc: 0.4 }, shopping: { matchScore: 72.5, caseAcc: 27.5 } },
   { model: "OpenAI/o4-mini", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 12.4, travel: { csScore: 58.0, psScore: 36.6, compScore: 47.2, caseAcc: 3.0 }, shopping: { matchScore: 69.1, caseAcc: 21.7 } },
   { model: "Moonshot-AI/Kimi-K2-thinking", icon: "/icons/icon_kimi.png", isThinking: true, avgAcc: 12.1, travel: { csScore: 45.2, psScore: 32.5, compScore: 38.9, caseAcc: 0.0 }, shopping: { matchScore: 65.8, caseAcc: 24.2 } },
-  // Non-thinking Models
   { model: "Anthropic/Claude-4.5-Opus (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 26.3, travel: { csScore: 67.5, psScore: 58.8, compScore: 63.1, caseAcc: 6.7 }, shopping: { matchScore: 82.2, caseAcc: 45.8 } },
   { model: "Anthropic/Claude-4.5-Sonnet (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 17.2, travel: { csScore: 53.4, psScore: 42.8, compScore: 48.1, caseAcc: 1.1 }, shopping: { matchScore: 75.8, caseAcc: 33.3 } },
   { model: "Alibaba/Qwen3-Max (w/o thinking)", icon: "/icons/icon_qwen.png", isThinking: false, avgAcc: 12.8, travel: { csScore: 36.7, psScore: 30.7, compScore: 31.8, caseAcc: 0.8 }, shopping: { matchScore: 70.2, caseAcc: 24.7 } },
@@ -50,6 +78,11 @@ const allModels: ModelScore[] = [
   { model: "xAI/Grok-4.1-fast (non-reasoning)", icon: "/icons/icon_x.png", isThinking: false, avgAcc: 3.0, travel: { csScore: 39.6, psScore: 19.7, compScore: 29.6, caseAcc: 0.0 }, shopping: { matchScore: 50.1, caseAcc: 5.9 } },
 ];
 
+const versionData: Record<VersionKey, ModelScore[]> = {
+  "v1.1": allModelsV1_1,
+  "v1.0": allModelsV1_0,
+};
+
 function RankBadge({ rank }: { rank: number }) {
   if (rank === 1) {
     return (
@@ -105,7 +138,6 @@ function sortByScore(models: ModelScore[]): ModelScore[] {
   return [...models].sort((a, b) => b.avgAcc - a.avgAcc);
 }
 
-// Find best values for highlighting
 function findBestValues(models: ModelScore[]) {
   const best = {
     avgAcc: 0,
@@ -114,8 +146,8 @@ function findBestValues(models: ModelScore[]) {
   };
   models.forEach((m) => {
     if (m.avgAcc > best.avgAcc) best.avgAcc = m.avgAcc;
-    if (m.travel.csScore > best.travel.csScore) best.travel.csScore = m.travel.csScore;
-    if (m.travel.psScore > best.travel.psScore) best.travel.psScore = m.travel.psScore;
+    if (m.travel.csScore !== null && m.travel.csScore > best.travel.csScore) best.travel.csScore = m.travel.csScore;
+    if (m.travel.psScore !== null && m.travel.psScore > best.travel.psScore) best.travel.psScore = m.travel.psScore;
     if (m.travel.compScore > best.travel.compScore) best.travel.compScore = m.travel.compScore;
     if (m.travel.caseAcc > best.travel.caseAcc) best.travel.caseAcc = m.travel.caseAcc;
     if (m.shopping.matchScore > best.shopping.matchScore) best.shopping.matchScore = m.shopping.matchScore;
@@ -124,7 +156,14 @@ function findBestValues(models: ModelScore[]) {
   return best;
 }
 
-function ScoreCell({ value, isBest }: { value: number; isBest: boolean }) {
+function ScoreCell({ value, isBest }: { value: number | null; isBest: boolean }) {
+  if (value === null) {
+    return (
+      <td className="px-2 py-2.5 text-center text-sm text-gray-400 dark:text-gray-500 italic">
+        —
+      </td>
+    );
+  }
   return (
     <td className={`px-2 py-2.5 text-center text-sm ${isBest ? "font-bold text-gray-900 dark:text-white" : "text-gray-600 dark:text-gray-400"}`}>
       {value.toFixed(1)}
@@ -133,8 +172,12 @@ function ScoreCell({ value, isBest }: { value: number; isBest: boolean }) {
 }
 
 export function Leaderboard() {
-  const sortedModels = sortByScore(allModels);
-  const best = findBestValues(allModels);
+  const versions: VersionKey[] = ["v1.1", "v1.0"];
+  const [activeVersion, setActiveVersion] = useState<VersionKey>("v1.1");
+
+  const currentModels = versionData[activeVersion];
+  const sortedModels = sortByScore(currentModels);
+  const best = findBestValues(currentModels);
 
   return (
     <div className="my-8">
@@ -144,10 +187,34 @@ export function Leaderboard() {
       </h2>
 
       {/* Subtitle */}
-      <p className="text-center text-sm text-gray-500 dark:text-gray-400 mb-6">
-        Comprehensive evaluation results on DeepPlanning. Results are averaged over four runs. <strong>Bold</strong> indicates the best result.
+      <p className="text-center text-sm text-gray-500 dark:text-gray-400 mb-4">
+        Comprehensive evaluation results on DeepPlanning <strong>{activeVersion}</strong>. Results are averaged over four runs. <strong>Bold</strong> indicates the best result.
       </p>
 
+      {/* Version Toggle */}
+      <div className="flex justify-center mb-6">
+        <div className="inline-flex rounded-lg border border-gray-200 dark:border-gray-700 bg-gray-50 dark:bg-gray-800 p-0.5">
+          {versions.map((v) => (
+            <button
+              key={v}
+              onClick={() => setActiveVersion(v)}
+              className={`px-4 py-1.5 text-sm font-medium rounded-md transition-all ${
+                activeVersion === v
+                  ? "bg-white dark:bg-gray-700 text-gray-900 dark:text-white shadow-sm"
+                  : "text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-300"
+              }`}
+            >
+              {v}
+              {v === "v1.1" && (
+                <span className="ml-1.5 text-[10px] font-semibold px-1.5 py-0.5 rounded-full bg-green-100 dark:bg-green-900 text-green-700 dark:text-green-300">
+                  Latest
+                </span>
+              )}
+            </button>
+          ))}
+        </div>
+      </div>
+
       {/* Table */}
       <div className="overflow-x-auto rounded-lg border border-gray-200 dark:border-gray-700 shadow-sm">
         <table className="w-full text-sm border-collapse bg-white dark:bg-gray-900">
@@ -190,6 +257,11 @@ export function Leaderboard() {
                   <div className="flex items-center gap-2">
                     <ModelIcon icon={item.icon} />
                     <span className="font-medium text-gray-800 dark:text-gray-200">{item.model}</span>
+                    {item.note && (
+                      <span className="text-[10px] px-1.5 py-0.5 rounded bg-amber-100 dark:bg-amber-900 text-amber-700 dark:text-amber-300" title={item.note}>
+                        *
+                      </span>
+                    )}
                   </div>
                 </td>
                 <ScoreCell value={item.avgAcc} isBest={item.avgAcc === best.avgAcc} />
@@ -209,6 +281,11 @@ export function Leaderboard() {
       <p className="mt-3 text-xs text-gray-500 dark:text-gray-400 text-center">
         CS Score = Commonsense Score | PS Score = Personalized Score | Comp Score = Composite Score | Case Acc. = Case Accuracy | Match Score = Match Score. <strong>Bold</strong> values indicate best performance per category.
       </p>
+      {activeVersion === "v1.1" && sortedModels.some((m) => m.note) && (
+        <p className="mt-1 text-xs text-amber-600 dark:text-amber-400 text-center">
+          * Some scores are still being evaluated. &ldquo;—&rdquo; indicates pending results.
+        </p>
+      )}
     </div>
   );
 }