11"use client" ;
22
3- import React from "react" ;
3+ import React , { useState } from "react" ;
44
/**
 * One leaderboard row: a single model's scores on the travel and shopping
 * benchmarks.
 */
interface ModelScore {
  /** Display name rendered in the model column. */
  model: string;
  /** Icon path handed to the row's icon component. */
  icon: string;
  /** Whether this entry is a thinking/reasoning variant of the model. */
  isThinking: boolean;
  /** Ranking key: rows are sorted by this value in descending order. */
  avgAcc: number;
  travel: {
    /** Commonsense score; `null` entries are skipped when computing the best value. */
    csScore: number | null;
    /** Personalized score; `null` entries are skipped when computing the best value. */
    psScore: number | null;
    /** Composite score. */
    compScore: number;
    /** Case accuracy. */
    caseAcc: number;
  };
  shopping: {
    /** Match score. */
    matchScore: number;
    /** Case accuracy. */
    caseAcc: number;
  };
  /** Optional footnote; when present the row shows a `*` marker with this text as its tooltip. */
  note?: string;
}
2122
22- // All Models Data
23- const allModels : ModelScore [ ] = [
/** Identifiers of the benchmark versions that have leaderboard data. */
type VersionKey = "v1.1" | "v1.0";
24+
25+ // ── v1.1 Data (default) ─────────────────────────────────────────────
/**
 * Leaderboard rows for benchmark version v1.1.
 *
 * Thinking and non-thinking variants are interleaved; the `isThinking` flag on
 * each row is the source of truth. Rows are listed in descending `avgAcc`
 * order, although consumers re-sort before rendering.
 */
const allModelsV1_1: ModelScore[] = [
  { model: "Anthropic/Claude-4.6-Opus (max)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 58.85, travel: { csScore: 86.1, psScore: 80.3, compScore: 83.2, caseAcc: 61.5 }, shopping: { matchScore: 85.3, caseAcc: 56.2 } },
  { model: "OpenAI/GPT-5.2-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 48.2, travel: { csScore: 88.5, psScore: 83.3, compScore: 85.8, caseAcc: 35.0 }, shopping: { matchScore: 88.4, caseAcc: 61.4 } },
  { model: "Alibaba/Qwen-3.5-Plus (w/o thinking)", icon: "/icons/icon_qwen.png", isThinking: false, avgAcc: 37.6, travel: { csScore: 83.6, psScore: 79.9, compScore: 81.6, caseAcc: 26.3 }, shopping: { matchScore: 82.4, caseAcc: 48.9 } },
  { model: "Anthropic/Claude-4.5-Opus (w/ thinking)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 37.05, travel: { csScore: 79.3, psScore: 70.9, compScore: 75.1, caseAcc: 22.7 }, shopping: { matchScore: 83.7, caseAcc: 51.4 } },
  { model: "Alibaba/Qwen-3.5-Plus (w/ thinking)", icon: "/icons/icon_qwen.png", isThinking: true, avgAcc: 35.85, travel: { csScore: 76.8, psScore: 75.4, compScore: 76.2, caseAcc: 25.0 }, shopping: { matchScore: 82.1, caseAcc: 46.7 } },
  { model: "Google/Gemini-3-Flash-Preview", icon: "/icons/ico_gemini.png", isThinking: true, avgAcc: 33.75, travel: { csScore: 67.1, psScore: 57.7, compScore: 62.4, caseAcc: 5.9 }, shopping: { matchScore: 86.9, caseAcc: 61.6 } },
  { model: "OpenAI/GPT-5-high", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 30.5, travel: { csScore: 78.7, psScore: 65.9, compScore: 72.3, caseAcc: 18.9 }, shopping: { matchScore: 68.7, caseAcc: 42.1 } },
  { model: "Alibaba/Qwen3-Max (w/ thinking)", icon: "/icons/icon_qwen.png", isThinking: true, avgAcc: 29.7, travel: { csScore: 64.0, psScore: 61.7, compScore: 62.8, caseAcc: 13.8 }, shopping: { matchScore: 80.6, caseAcc: 45.6 } },
  { model: "Google/Gemini-3-Pro-Preview", icon: "/icons/ico_gemini.png", isThinking: true, avgAcc: 27.35, travel: { csScore: 58.4, psScore: 25.1, compScore: 41.8, caseAcc: 0.7 }, shopping: { matchScore: 83.4, caseAcc: 54.0 } },
  { model: "DeepSeek-AI/DeepSeek-V3.2 (w/ thinking)", icon: "/icons/icon_dpsk.png", isThinking: true, avgAcc: 27.35, travel: { csScore: 47.4, psScore: 35.0, compScore: 41.2, caseAcc: 0.7 }, shopping: { matchScore: 84.0, caseAcc: 54.0 } },
  { model: "Anthropic/Claude-4.5-Sonnet (w/ thinking)", icon: "/icons/icon_anthropic.png", isThinking: true, avgAcc: 26.8, travel: { csScore: 65.2, psScore: 58.4, compScore: 61.8, caseAcc: 7.6 }, shopping: { matchScore: 78.0, caseAcc: 46.0 } },
  { model: "Anthropic/Claude-4.5-Opus (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 26.35, travel: { csScore: 67.5, psScore: 58.8, compScore: 63.1, caseAcc: 6.7 }, shopping: { matchScore: 81.0, caseAcc: 46.0 } },
  { model: "ByteDance/Seed-2.0-pro-high", icon: "/icons/icon_seed.png", isThinking: true, avgAcc: 21.55, travel: { csScore: 56.0, psScore: 60.6, compScore: 58.3, caseAcc: 2.1 }, shopping: { matchScore: 76.7, caseAcc: 41.0 } },
  { model: "xAI/Grok-4.1-fast (reasoning)", icon: "/icons/icon_x.png", isThinking: true, avgAcc: 19.15, travel: { csScore: 57.1, psScore: 37.7, compScore: 47.4, caseAcc: 2.7 }, shopping: { matchScore: 73.2, caseAcc: 35.6 } },
  { model: "DeepSeek-AI/DeepSeek-V3.2 (w/o thinking)", icon: "/icons/icon_dpsk.png", isThinking: false, avgAcc: 19.0, travel: { csScore: 37.4, psScore: 12.1, compScore: 24.7, caseAcc: 0.0 }, shopping: { matchScore: 76.0, caseAcc: 38.0 } },
  { model: "Anthropic/Claude-4.5-Sonnet (w/o thinking)", icon: "/icons/icon_anthropic.png", isThinking: false, avgAcc: 16.05, travel: { csScore: 53.4, psScore: 42.8, compScore: 48.1, caseAcc: 1.1 }, shopping: { matchScore: 71.0, caseAcc: 31.0 } },
  { model: "Alibaba/Qwen3-Max (w/o thinking)", icon: "/icons/icon_qwen.png", isThinking: false, avgAcc: 15.45, travel: { csScore: 36.7, psScore: 30.7, compScore: 31.8, caseAcc: 0.8 }, shopping: { matchScore: 72.3, caseAcc: 30.1 } },
  { model: "Z.ai/GLM-5 (w/ thinking)", icon: "/icons/icon_glm.png", isThinking: true, avgAcc: 14.55, travel: { csScore: 44.3, psScore: 42.3, compScore: 43.3, caseAcc: 0.4 }, shopping: { matchScore: 72.2, caseAcc: 28.7 } },
  { model: "Moonshot-AI/Kimi-K2.5 (w/ thinking)", icon: "/icons/icon_kimi.png", isThinking: true, avgAcc: 14.35, travel: { csScore: 47.8, psScore: 43.7, compScore: 45.8, caseAcc: 0.4 }, shopping: { matchScore: 71.9, caseAcc: 28.3 } },
  { model: "OpenAI/o4-mini", icon: "/icons/icon_openai.png", isThinking: true, avgAcc: 13.65, travel: { csScore: 58.0, psScore: 36.6, compScore: 47.2, caseAcc: 3.0 }, shopping: { matchScore: 62.5, caseAcc: 24.3 } },
  { model: "OpenAI/GPT-5.2-none", icon: "/icons/icon_openai.png", isThinking: false, avgAcc: 6.75, travel: { csScore: 54.3, psScore: 29.9, compScore: 42.1, caseAcc: 0.4 }, shopping: { matchScore: 59.4, caseAcc: 13.1 } },
  { model: "xAI/Grok-4.1-fast (non-reasoning)", icon: "/icons/icon_x.png", isThinking: false, avgAcc: 5.2, travel: { csScore: 39.6, psScore: 19.7, compScore: 29.6, caseAcc: 0.0 }, shopping: { matchScore: 52.9, caseAcc: 10.4 } },
];
51+
52+ // ── v1.0 Data ────────────────────────────────────────────────────────
53+ const allModelsV1_0 : ModelScore [ ] = [
2554 { model : "OpenAI/GPT-5.2-high" , icon : "/icons/icon_openai.png" , isThinking : true , avgAcc : 44.6 , travel : { csScore : 88.5 , psScore : 83.3 , compScore : 85.8 , caseAcc : 35.0 } , shopping : { matchScore : 84.8 , caseAcc : 54.2 } } ,
2655 { model : "Anthropic/Claude-4.5-Opus (w/ thinking)" , icon : "/icons/icon_anthropic.png" , isThinking : true , avgAcc : 33.9 , travel : { csScore : 79.3 , psScore : 70.9 , compScore : 75.1 , caseAcc : 22.7 } , shopping : { matchScore : 80.0 , caseAcc : 45.0 } } ,
2756 { model : "OpenAI/GPT-5-high" , icon : "/icons/icon_openai.png" , isThinking : true , avgAcc : 31.6 , travel : { csScore : 78.7 , psScore : 65.9 , compScore : 72.3 , caseAcc : 18.9 } , shopping : { matchScore : 80.4 , caseAcc : 44.2 } } ,
@@ -38,7 +67,6 @@ const allModels: ModelScore[] = [
3867 { model : "Z.ai/GLM-4.7 (w/ thinking)" , icon : "/icons/icon_glm.png" , isThinking : true , avgAcc : 14.0 , travel : { csScore : 44.0 , psScore : 44.6 , compScore : 44.3 , caseAcc : 0.4 } , shopping : { matchScore : 72.5 , caseAcc : 27.5 } } ,
3968 { model : "OpenAI/o4-mini" , icon : "/icons/icon_openai.png" , isThinking : true , avgAcc : 12.4 , travel : { csScore : 58.0 , psScore : 36.6 , compScore : 47.2 , caseAcc : 3.0 } , shopping : { matchScore : 69.1 , caseAcc : 21.7 } } ,
4069 { model : "Moonshot-AI/Kimi-K2-thinking" , icon : "/icons/icon_kimi.png" , isThinking : true , avgAcc : 12.1 , travel : { csScore : 45.2 , psScore : 32.5 , compScore : 38.9 , caseAcc : 0.0 } , shopping : { matchScore : 65.8 , caseAcc : 24.2 } } ,
41- // Non-thinking Models
4270 { model : "Anthropic/Claude-4.5-Opus (w/o thinking)" , icon : "/icons/icon_anthropic.png" , isThinking : false , avgAcc : 26.3 , travel : { csScore : 67.5 , psScore : 58.8 , compScore : 63.1 , caseAcc : 6.7 } , shopping : { matchScore : 82.2 , caseAcc : 45.8 } } ,
4371 { model : "Anthropic/Claude-4.5-Sonnet (w/o thinking)" , icon : "/icons/icon_anthropic.png" , isThinking : false , avgAcc : 17.2 , travel : { csScore : 53.4 , psScore : 42.8 , compScore : 48.1 , caseAcc : 1.1 } , shopping : { matchScore : 75.8 , caseAcc : 33.3 } } ,
4472 { model : "Alibaba/Qwen3-Max (w/o thinking)" , icon : "/icons/icon_qwen.png" , isThinking : false , avgAcc : 12.8 , travel : { csScore : 36.7 , psScore : 30.7 , compScore : 31.8 , caseAcc : 0.8 } , shopping : { matchScore : 70.2 , caseAcc : 24.7 } } ,
@@ -50,6 +78,11 @@ const allModels: ModelScore[] = [
5078 { model : "xAI/Grok-4.1-fast (non-reasoning)" , icon : "/icons/icon_x.png" , isThinking : false , avgAcc : 3.0 , travel : { csScore : 39.6 , psScore : 19.7 , compScore : 29.6 , caseAcc : 0.0 } , shopping : { matchScore : 50.1 , caseAcc : 5.9 } } ,
5179] ;
5280
/**
 * Lookup from benchmark version to its leaderboard rows. Typing it as
 * `Record<VersionKey, …>` makes the compiler require an entry for every
 * version in the union.
 */
const versionData: Record<VersionKey, ModelScore[]> = {
  "v1.1": allModelsV1_1,
  "v1.0": allModelsV1_0,
};
85+
5386function RankBadge ( { rank } : { rank : number } ) {
5487 if ( rank === 1 ) {
5588 return (
@@ -105,7 +138,6 @@ function sortByScore(models: ModelScore[]): ModelScore[] {
105138 return [ ...models ] . sort ( ( a , b ) => b . avgAcc - a . avgAcc ) ;
106139}
107140
108- // Find best values for highlighting
109141function findBestValues ( models : ModelScore [ ] ) {
110142 const best = {
111143 avgAcc : 0 ,
@@ -114,8 +146,8 @@ function findBestValues(models: ModelScore[]) {
114146 } ;
115147 models . forEach ( ( m ) => {
116148 if ( m . avgAcc > best . avgAcc ) best . avgAcc = m . avgAcc ;
117- if ( m . travel . csScore > best . travel . csScore ) best . travel . csScore = m . travel . csScore ;
118- if ( m . travel . psScore > best . travel . psScore ) best . travel . psScore = m . travel . psScore ;
149+ if ( m . travel . csScore !== null && m . travel . csScore > best . travel . csScore ) best . travel . csScore = m . travel . csScore ;
150+ if ( m . travel . psScore !== null && m . travel . psScore > best . travel . psScore ) best . travel . psScore = m . travel . psScore ;
119151 if ( m . travel . compScore > best . travel . compScore ) best . travel . compScore = m . travel . compScore ;
120152 if ( m . travel . caseAcc > best . travel . caseAcc ) best . travel . caseAcc = m . travel . caseAcc ;
121153 if ( m . shopping . matchScore > best . shopping . matchScore ) best . shopping . matchScore = m . shopping . matchScore ;
@@ -124,7 +156,14 @@ function findBestValues(models: ModelScore[]) {
124156 return best ;
125157}
126158
127- function ScoreCell ( { value, isBest } : { value : number ; isBest : boolean } ) {
159+ function ScoreCell ( { value, isBest } : { value : number | null ; isBest : boolean } ) {
160+ if ( value === null ) {
161+ return (
162+ < td className = "px-2 py-2.5 text-center text-sm text-gray-400 dark:text-gray-500 italic" >
163+ —
164+ </ td >
165+ ) ;
166+ }
128167 return (
129168 < td className = { `px-2 py-2.5 text-center text-sm ${ isBest ? "font-bold text-gray-900 dark:text-white" : "text-gray-600 dark:text-gray-400" } ` } >
130169 { value . toFixed ( 1 ) }
@@ -133,8 +172,12 @@ function ScoreCell({ value, isBest }: { value: number; isBest: boolean }) {
133172}
134173
135174export function Leaderboard ( ) {
136- const sortedModels = sortByScore ( allModels ) ;
137- const best = findBestValues ( allModels ) ;
175+ const versions : VersionKey [ ] = [ "v1.1" , "v1.0" ] ;
176+ const [ activeVersion , setActiveVersion ] = useState < VersionKey > ( "v1.1" ) ;
177+
178+ const currentModels = versionData [ activeVersion ] ;
179+ const sortedModels = sortByScore ( currentModels ) ;
180+ const best = findBestValues ( currentModels ) ;
138181
139182 return (
140183 < div className = "my-8" >
@@ -144,10 +187,34 @@ export function Leaderboard() {
144187 </ h2 >
145188
146189 { /* Subtitle */ }
147- < p className = "text-center text-sm text-gray-500 dark:text-gray-400 mb-6 " >
148- Comprehensive evaluation results on DeepPlanning. Results are averaged over four runs. < strong > Bold</ strong > indicates the best result.
190+ < p className = "text-center text-sm text-gray-500 dark:text-gray-400 mb-4 " >
191+ Comprehensive evaluation results on DeepPlanning < strong > { activeVersion } </ strong > . Results are averaged over four runs. < strong > Bold</ strong > indicates the best result.
149192 </ p >
150193
194+ { /* Version Toggle */ }
195+ < div className = "flex justify-center mb-6" >
196+ < div className = "inline-flex rounded-lg border border-gray-200 dark:border-gray-700 bg-gray-50 dark:bg-gray-800 p-0.5" >
197+ { versions . map ( ( v ) => (
198+ < button
199+ key = { v }
200+ onClick = { ( ) => setActiveVersion ( v ) }
201+ className = { `px-4 py-1.5 text-sm font-medium rounded-md transition-all ${
202+ activeVersion === v
203+ ? "bg-white dark:bg-gray-700 text-gray-900 dark:text-white shadow-sm"
204+ : "text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-300"
205+ } `}
206+ >
207+ { v }
208+ { v === "v1.1" && (
209+ < span className = "ml-1.5 text-[10px] font-semibold px-1.5 py-0.5 rounded-full bg-green-100 dark:bg-green-900 text-green-700 dark:text-green-300" >
210+ Latest
211+ </ span >
212+ ) }
213+ </ button >
214+ ) ) }
215+ </ div >
216+ </ div >
217+
151218 { /* Table */ }
152219 < div className = "overflow-x-auto rounded-lg border border-gray-200 dark:border-gray-700 shadow-sm" >
153220 < table className = "w-full text-sm border-collapse bg-white dark:bg-gray-900" >
@@ -190,6 +257,11 @@ export function Leaderboard() {
190257 < div className = "flex items-center gap-2" >
191258 < ModelIcon icon = { item . icon } />
192259 < span className = "font-medium text-gray-800 dark:text-gray-200" > { item . model } </ span >
260+ { item . note && (
261+ < span className = "text-[10px] px-1.5 py-0.5 rounded bg-amber-100 dark:bg-amber-900 text-amber-700 dark:text-amber-300" title = { item . note } >
262+ *
263+ </ span >
264+ ) }
193265 </ div >
194266 </ td >
195267 < ScoreCell value = { item . avgAcc } isBest = { item . avgAcc === best . avgAcc } />
@@ -209,6 +281,11 @@ export function Leaderboard() {
209281 < p className = "mt-3 text-xs text-gray-500 dark:text-gray-400 text-center" >
210282 CS Score = Commonsense Score | PS Score = Personalized Score | Comp Score = Composite Score | Case Acc. = Case Accuracy | Match Score = Match Score. < strong > Bold</ strong > values indicate best performance per category.
211283 </ p >
284+ { activeVersion === "v1.1" && sortedModels . some ( ( m ) => m . note ) && (
285+ < p className = "mt-1 text-xs text-amber-600 dark:text-amber-400 text-center" >
286+ * Some scores are still being evaluated. “—” indicates pending results.
287+ </ p >
288+ ) }
212289 </ div >
213290 ) ;
214291}
0 commit comments