Skip to content

Commit 31a4d36

Browse files
zhangyingerjellytuhahaha
authored and committed
Update DeepPlanning leaderboard for v1.1 dataset
1 parent 8bfb20e commit 31a4d36

2 files changed

Lines changed: 110 additions & 14 deletions

File tree

qwen-agent-docs/website/content/en/benchmarks/deepplanning/index.mdx

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,25 @@ DeepPlanning evaluates three critical agentic abilities:
134134

135135
---
136136

137+
<h2 className="text-center text-3xl font-semibold mt-8 mb-2">📢 Change Log</h2>
138+
<div className="flex justify-center mb-6">
139+
<div className="h-1 w-16 bg-gradient-to-r from-orange-500 to-red-500 rounded"></div>
140+
</div>
141+
142+
<div className="max-w-4xl mx-auto mb-8">
143+
<div className="border border-orange-200 dark:border-orange-800 rounded-lg bg-orange-50/50 dark:bg-orange-950/30 p-5">
144+
<h4 className="font-semibold text-lg mb-3 text-orange-800 dark:text-orange-300">v1.1 (2026-03-03)</h4>
145+
<ul className="list-disc list-inside space-y-1.5 text-sm text-gray-700 dark:text-gray-300 leading-relaxed">
146+
<li>Updated several tasks in the <strong>Shopping Planning</strong> benchmark and corrected erroneous answer annotations for a subset of questions. Dataset available at <a href="https://huggingface.co/datasets/Qwen/DeepPlanning" target="_blank" className="text-blue-600 dark:text-blue-400 underline">Qwen/DeepPlanning</a> on Hugging Face.</li>
147+
<li>Added new models to the leaderboard: <strong>Claude-4.6-Opus</strong>, <strong>Qwen-3.5-Plus</strong>, <strong>GLM-5</strong>, <strong>Seed-2.0-pro-high</strong>, <strong>Kimi-K2.5-thinking</strong>.</li>
148+
</ul>
149+
<h4 className="font-semibold text-lg mt-4 mb-3 text-gray-600 dark:text-gray-400">v1.0 (2026-01)</h4>
150+
<ul className="list-disc list-inside space-y-1.5 text-sm text-gray-500 dark:text-gray-400 leading-relaxed">
151+
<li>Initial release of the DeepPlanning benchmark with Travel Planning and Shopping Planning domains.</li>
152+
</ul>
153+
</div>
154+
</div>
155+
137156
<Leaderboard />
138157

139158
---

qwen-agent-docs/website/src/components/leaderboard.tsx

Lines changed: 91 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,56 @@
11
"use client";
22

3-
import React from "react";
3+
import React, { useState } from "react";
44

55
interface ModelScore {
66
model: string;
77
icon: string;
88
isThinking: boolean;
99
avgAcc: number;
1010
travel: {
11-
csScore: number;
12-
psScore: number;
11+
csScore: number | null;
12+
psScore: number | null;
1313
compScore: number;
1414
caseAcc: number;
1515
};
1616
shopping: {
1717
matchScore: number;
1818
caseAcc: number;
1919
};
20+
note?: string;
2021
}
2122

22-
// All Models Data
23-
const allModels: ModelScore[] = [
23+
type VersionKey = "v1.1" | "v1.0";
24+
25+
// ── v1.1 Data (default) ─────────────────────────────────────────────
26+
const allModelsV1_1: ModelScore[] = [
2427
// Thinking and non-thinking models, ordered by avgAcc (descending)
28+
{ model: "Anthropic/Claude-4.6-Opus (max)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 58.85, travel: { csScore: 86.1, psScore: 80.3, compScore: 83.2, caseAcc: 61.5 }, shopping: { matchScore: 85.3, caseAcc: 56.2 } },
29+
{ model: "OpenAI/GPT-5.2-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 48.2, travel: { csScore: 88.5, psScore: 83.3, compScore: 85.8, caseAcc: 35.0 }, shopping: { matchScore: 88.4, caseAcc: 61.4 } },
30+
{ model: "Alibaba/Qwen-3.5-Plus (w/o thinking)", icon: "/icons/icon_qwen.png", isThinking: false, avgAcc: 37.6, travel: { csScore: 83.6, psScore: 79.9, compScore: 81.6, caseAcc: 26.3 }, shopping: { matchScore: 82.4, caseAcc: 48.9 } },
31+
{ model: "Anthropic/Claude-4.5-Opus (w/ thinking)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 37.05, travel: { csScore: 79.3, psScore: 70.9, compScore: 75.1, caseAcc: 22.7 }, shopping: { matchScore: 83.7, caseAcc: 51.4 } },
32+
{ model: "Alibaba/Qwen-3.5-Plus (w/ thinking)", icon: "/icons/icon_qwen.png", isThinking: true, avgAcc: 35.85, travel: { csScore: 76.8, psScore: 75.4, compScore: 76.2, caseAcc: 25.0 }, shopping: { matchScore: 82.1, caseAcc: 46.7 } },
33+
{ model: "Google/Gemini-3-Flash-Preview", icon: "/icons/ico_gemini.png", isThinking: true, avgAcc: 33.75, travel: { csScore: 67.1, psScore: 57.7, compScore: 62.4, caseAcc: 5.9 }, shopping: { matchScore: 86.9, caseAcc: 61.6 } },
34+
{ model: "OpenAI/GPT-5-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 30.5, travel: { csScore: 78.7, psScore: 65.9, compScore: 72.3, caseAcc: 18.9 }, shopping: { matchScore: 68.7, caseAcc: 42.1 } },
35+
{ model: "Alibaba/Qwen3-Max (w/ thinking)", icon: "/icons/icon_qwen.png", isThinking: true, avgAcc: 29.7, travel: { csScore: 64.0, psScore: 61.7, compScore: 62.8, caseAcc: 13.8 }, shopping: { matchScore: 80.6, caseAcc: 45.6 } },
36+
{ model: "Google/Gemini-3-Pro-Preview", icon: "/icons/ico_gemini.png", isThinking: true, avgAcc: 27.35, travel: { csScore: 58.4, psScore: 25.1, compScore: 41.8, caseAcc: 0.7 }, shopping: { matchScore: 83.4, caseAcc: 54.0 } },
37+
{ model: "DeepSeek-AI/DeepSeek-V3.2 (w/ thinking)", icon: "/icons/icon_dpsk.png", isThinking: true, avgAcc: 27.35, travel: { csScore: 47.4, psScore: 35.0, compScore: 41.2, caseAcc: 0.7 }, shopping: { matchScore: 84.0, caseAcc: 54.0 } },
38+
{ model: "Anthropic/Claude-4.5-Sonnet (w/ thinking)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 26.8, travel: { csScore: 65.2, psScore: 58.4, compScore: 61.8, caseAcc: 7.6 }, shopping: { matchScore: 78.0, caseAcc: 46.0 } },
39+
{ model: "Anthropic/Claude-4.5-Opus (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 26.35, travel: { csScore: 67.5, psScore: 58.8, compScore: 63.1, caseAcc: 6.7 }, shopping: { matchScore: 81.0, caseAcc: 46.0 } },
40+
{ model: "ByteDance/Seed-2.0-pro-high", icon: "/icons/icon_seed.png", isThinking: true, avgAcc: 21.55, travel: { csScore: 56.0, psScore: 60.6, compScore: 58.3, caseAcc: 2.1 }, shopping: { matchScore: 76.7, caseAcc: 41.0 } },
41+
{ model: "xAI/Grok-4.1-fast (reasoning)", icon: "/icons/icon_x.png", isThinking: true, avgAcc: 19.15, travel: { csScore: 57.1, psScore: 37.7, compScore: 47.4, caseAcc: 2.7 }, shopping: { matchScore: 73.2, caseAcc: 35.6 } },
42+
{ model: "DeepSeek-AI/DeepSeek-V3.2 (w/o thinking)", icon: "/icons/icon_dpsk.png", isThinking: false, avgAcc: 19.0, travel: { csScore: 37.4, psScore: 12.1, compScore: 24.7, caseAcc: 0.0 }, shopping: { matchScore: 76.0, caseAcc: 38.0 } },
43+
{ model: "Anthropic/Claude-4.5-Sonnet (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 16.05, travel: { csScore: 53.4, psScore: 42.8, compScore: 48.1, caseAcc: 1.1 }, shopping: { matchScore: 71.0, caseAcc: 31.0 } },
44+
{ model: "Alibaba/Qwen3-Max (w/o thinking)", icon: "/icons/icon_qwen.png", isThinking: false, avgAcc: 15.45, travel: { csScore: 36.7, psScore: 30.7, compScore: 31.8, caseAcc: 0.8 }, shopping: { matchScore: 72.3, caseAcc: 30.1 } },
45+
{ model: "Z.ai/GLM-5 (w/ thinking)", icon: "/icons/icon_glm.png", isThinking: true, avgAcc: 14.55, travel: { csScore: 44.3, psScore: 42.3, compScore: 43.3, caseAcc: 0.4 }, shopping: { matchScore: 72.2, caseAcc: 28.7 } },
46+
{ model: "Moonshot-AI/Kimi-K2.5 (w/ thinking)", icon: "/icons/icon_kimi.png", isThinking: true, avgAcc: 14.35, travel: { csScore: 47.8, psScore: 43.7, compScore: 45.8, caseAcc: 0.4 }, shopping: { matchScore: 71.9, caseAcc: 28.3 } },
47+
{ model: "OpenAI/o4-mini", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 13.65, travel: { csScore: 58.0, psScore: 36.6, compScore: 47.2, caseAcc: 3.0 }, shopping: { matchScore: 62.5, caseAcc: 24.3 } },
48+
{ model: "OpenAI/GPT-5.2-none", icon: "/icons/icon_openai.png", isThinking: false, avgAcc: 6.75, travel: { csScore: 54.3, psScore: 29.9, compScore: 42.1, caseAcc: 0.4 }, shopping: { matchScore: 59.4, caseAcc: 13.1 } },
49+
{ model: "xAI/Grok-4.1-fast (non-reasoning)", icon: "/icons/icon_x.png", isThinking: false, avgAcc: 5.2, travel: { csScore: 39.6, psScore: 19.7, compScore: 29.6, caseAcc: 0.0 }, shopping: { matchScore: 52.9, caseAcc: 10.4 } },
50+
];
51+
52+
// ── v1.0 Data ────────────────────────────────────────────────────────
53+
const allModelsV1_0: ModelScore[] = [
2554
{ model: "OpenAI/GPT-5.2-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 44.6, travel: { csScore: 88.5, psScore: 83.3, compScore: 85.8, caseAcc: 35.0 }, shopping: { matchScore: 84.8, caseAcc: 54.2 } },
2655
{ model: "Anthropic/Claude-4.5-Opus (w/ thinking)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 33.9, travel: { csScore: 79.3, psScore: 70.9, compScore: 75.1, caseAcc: 22.7 }, shopping: { matchScore: 80.0, caseAcc: 45.0 } },
2756
{ model: "OpenAI/GPT-5-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 31.6, travel: { csScore: 78.7, psScore: 65.9, compScore: 72.3, caseAcc: 18.9 }, shopping: { matchScore: 80.4, caseAcc: 44.2 } },
@@ -38,7 +67,6 @@ const allModels: ModelScore[] = [
3867
{ model: "Z.ai/GLM-4.7 (w/ thinking)", icon: "/icons/icon_glm.png", isThinking: true, avgAcc: 14.0, travel: { csScore: 44.0, psScore: 44.6, compScore: 44.3, caseAcc: 0.4 }, shopping: { matchScore: 72.5, caseAcc: 27.5 } },
3968
{ model: "OpenAI/o4-mini", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 12.4, travel: { csScore: 58.0, psScore: 36.6, compScore: 47.2, caseAcc: 3.0 }, shopping: { matchScore: 69.1, caseAcc: 21.7 } },
4069
{ model: "Moonshot-AI/Kimi-K2-thinking", icon: "/icons/icon_kimi.png", isThinking: true, avgAcc: 12.1, travel: { csScore: 45.2, psScore: 32.5, compScore: 38.9, caseAcc: 0.0 }, shopping: { matchScore: 65.8, caseAcc: 24.2 } },
41-
// Non-thinking Models
4270
{ model: "Anthropic/Claude-4.5-Opus (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 26.3, travel: { csScore: 67.5, psScore: 58.8, compScore: 63.1, caseAcc: 6.7 }, shopping: { matchScore: 82.2, caseAcc: 45.8 } },
4371
{ model: "Anthropic/Claude-4.5-Sonnet (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 17.2, travel: { csScore: 53.4, psScore: 42.8, compScore: 48.1, caseAcc: 1.1 }, shopping: { matchScore: 75.8, caseAcc: 33.3 } },
4472
{ model: "Alibaba/Qwen3-Max (w/o thinking)", icon: "/icons/icon_qwen.png", isThinking: false, avgAcc: 12.8, travel: { csScore: 36.7, psScore: 30.7, compScore: 31.8, caseAcc: 0.8 }, shopping: { matchScore: 70.2, caseAcc: 24.7 } },
@@ -50,6 +78,11 @@ const allModels: ModelScore[] = [
5078
{ model: "xAI/Grok-4.1-fast (non-reasoning)", icon: "/icons/icon_x.png", isThinking: false, avgAcc: 3.0, travel: { csScore: 39.6, psScore: 19.7, compScore: 29.6, caseAcc: 0.0 }, shopping: { matchScore: 50.1, caseAcc: 5.9 } },
5179
];
5280

81+
const versionData: Record<VersionKey, ModelScore[]> = {
82+
"v1.1": allModelsV1_1,
83+
"v1.0": allModelsV1_0,
84+
};
85+
5386
function RankBadge({ rank }: { rank: number }) {
5487
if (rank === 1) {
5588
return (
@@ -105,7 +138,6 @@ function sortByScore(models: ModelScore[]): ModelScore[] {
105138
return [...models].sort((a, b) => b.avgAcc - a.avgAcc);
106139
}
107140

108-
// Find best values for highlighting
109141
function findBestValues(models: ModelScore[]) {
110142
const best = {
111143
avgAcc: 0,
@@ -114,8 +146,8 @@ function findBestValues(models: ModelScore[]) {
114146
};
115147
models.forEach((m) => {
116148
if (m.avgAcc > best.avgAcc) best.avgAcc = m.avgAcc;
117-
if (m.travel.csScore > best.travel.csScore) best.travel.csScore = m.travel.csScore;
118-
if (m.travel.psScore > best.travel.psScore) best.travel.psScore = m.travel.psScore;
149+
if (m.travel.csScore !== null && m.travel.csScore > best.travel.csScore) best.travel.csScore = m.travel.csScore;
150+
if (m.travel.psScore !== null && m.travel.psScore > best.travel.psScore) best.travel.psScore = m.travel.psScore;
119151
if (m.travel.compScore > best.travel.compScore) best.travel.compScore = m.travel.compScore;
120152
if (m.travel.caseAcc > best.travel.caseAcc) best.travel.caseAcc = m.travel.caseAcc;
121153
if (m.shopping.matchScore > best.shopping.matchScore) best.shopping.matchScore = m.shopping.matchScore;
@@ -124,7 +156,14 @@ function findBestValues(models: ModelScore[]) {
124156
return best;
125157
}
126158

127-
function ScoreCell({ value, isBest }: { value: number; isBest: boolean }) {
159+
function ScoreCell({ value, isBest }: { value: number | null; isBest: boolean }) {
160+
if (value === null) {
161+
return (
162+
<td className="px-2 py-2.5 text-center text-sm text-gray-400 dark:text-gray-500 italic">
163+
—
164+
</td>
165+
);
166+
}
128167
return (
129168
<td className={`px-2 py-2.5 text-center text-sm ${isBest ? "font-bold text-gray-900 dark:text-white" : "text-gray-600 dark:text-gray-400"}`}>
130169
{value.toFixed(1)}
@@ -133,8 +172,12 @@ function ScoreCell({ value, isBest }: { value: number; isBest: boolean }) {
133172
}
134173

135174
export function Leaderboard() {
136-
const sortedModels = sortByScore(allModels);
137-
const best = findBestValues(allModels);
175+
const versions: VersionKey[] = ["v1.1", "v1.0"];
176+
const [activeVersion, setActiveVersion] = useState<VersionKey>("v1.1");
177+
178+
const currentModels = versionData[activeVersion];
179+
const sortedModels = sortByScore(currentModels);
180+
const best = findBestValues(currentModels);
138181

139182
return (
140183
<div className="my-8">
@@ -144,10 +187,34 @@ export function Leaderboard() {
144187
</h2>
145188

146189
{/* Subtitle */}
147-
<p className="text-center text-sm text-gray-500 dark:text-gray-400 mb-6">
148-
Comprehensive evaluation results on DeepPlanning. Results are averaged over four runs. <strong>Bold</strong> indicates the best result.
190+
<p className="text-center text-sm text-gray-500 dark:text-gray-400 mb-4">
191+
Comprehensive evaluation results on DeepPlanning <strong>{activeVersion}</strong>. Results are averaged over four runs. <strong>Bold</strong> indicates the best result.
149192
</p>
150193

194+
{/* Version Toggle */}
195+
<div className="flex justify-center mb-6">
196+
<div className="inline-flex rounded-lg border border-gray-200 dark:border-gray-700 bg-gray-50 dark:bg-gray-800 p-0.5">
197+
{versions.map((v) => (
198+
<button
199+
key={v}
200+
onClick={() => setActiveVersion(v)}
201+
className={`px-4 py-1.5 text-sm font-medium rounded-md transition-all ${
202+
activeVersion === v
203+
? "bg-white dark:bg-gray-700 text-gray-900 dark:text-white shadow-sm"
204+
: "text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-300"
205+
}`}
206+
>
207+
{v}
208+
{v === "v1.1" && (
209+
<span className="ml-1.5 text-[10px] font-semibold px-1.5 py-0.5 rounded-full bg-green-100 dark:bg-green-900 text-green-700 dark:text-green-300">
210+
Latest
211+
</span>
212+
)}
213+
</button>
214+
))}
215+
</div>
216+
</div>
217+
151218
{/* Table */}
152219
<div className="overflow-x-auto rounded-lg border border-gray-200 dark:border-gray-700 shadow-sm">
153220
<table className="w-full text-sm border-collapse bg-white dark:bg-gray-900">
@@ -190,6 +257,11 @@ export function Leaderboard() {
190257
<div className="flex items-center gap-2">
191258
<ModelIcon icon={item.icon} />
192259
<span className="font-medium text-gray-800 dark:text-gray-200">{item.model}</span>
260+
{item.note && (
261+
<span className="text-[10px] px-1.5 py-0.5 rounded bg-amber-100 dark:bg-amber-900 text-amber-700 dark:text-amber-300" title={item.note}>
262+
*
263+
</span>
264+
)}
193265
</div>
194266
</td>
195267
<ScoreCell value={item.avgAcc} isBest={item.avgAcc === best.avgAcc} />
@@ -209,6 +281,11 @@ export function Leaderboard() {
209281
<p className="mt-3 text-xs text-gray-500 dark:text-gray-400 text-center">
210282
CS Score = Commonsense Score | PS Score = Personalized Score | Comp Score = Composite Score | Case Acc. = Case Accuracy | Match Score = Match Score. <strong>Bold</strong> values indicate best performance per category.
211283
</p>
284+
{activeVersion === "v1.1" && sortedModels.some((m) => m.note) && (
285+
<p className="mt-1 text-xs text-amber-600 dark:text-amber-400 text-center">
286+
* Some scores are still being evaluated. &ldquo;—&rdquo; indicates pending results.
287+
</p>
288+
)}
212289
</div>
213290
);
214291
}

0 commit comments

Comments
 (0)