@@ -32,9 +32,10 @@ function dateTime(value) {
3232 return new Date ( value ) . toLocaleString ( ) ;
3333}
3434
35- function metricCard ( title , value , detail ) {
35+ function metricCard ( title , value , detail , tooltip ) {
36+ const tip = tooltip ? ` data-tooltip="${ escapeHtml ( tooltip ) } "` : "" ;
3637 return `
37- <article class="metric-card">
38+ <article class="metric-card"${ tip } >
3839 <p class="metric-title">${ title } </p>
3940 <p class="metric-value">${ value } </p>
4041 <p class="metric-detail">${ detail } </p>
@@ -107,47 +108,56 @@ function renderCards(data) {
107108 metricCard (
108109 "Direct" ,
109110 pct ( overall . direct . pct ) ,
110- `${ overall . direct . correct } /${ overall . direct . total } `
111+ `${ overall . direct . correct } /${ overall . direct . total } ` ,
112+ "Accuracy when the model answers the question directly (open-ended, no answer choices provided)."
111113 ) ,
112114 metricCard (
113115 "MCQ with refusal" ,
114116 pct ( overall . mcq_with_refusal . pct ) ,
115- `${ overall . mcq_with_refusal . correct } /${ overall . mcq_with_refusal . total } `
117+ `${ overall . mcq_with_refusal . correct } /${ overall . mcq_with_refusal . total } ` ,
118+ "Accuracy on multiple-choice questions where \"I don't know\" is included as an answer option."
116119 ) ,
117120 metricCard (
118121 "MCQ without refusal" ,
119122 pct ( overall . mcq_without_refusal . pct ) ,
120- `${ overall . mcq_without_refusal . correct } /${ overall . mcq_without_refusal . total } `
123+ `${ overall . mcq_without_refusal . correct } /${ overall . mcq_without_refusal . total } ` ,
124+ "Accuracy on multiple-choice questions without an \"I don't know\" option, forcing a best guess."
121125 ) ,
122126 metricCard (
123127 "MCQ Lift" ,
124128 pp ( h . mcq_lift_pp ) ,
125- `Direct \u2192 MCQ w/o refusal`
129+ `Direct \u2192 MCQ w/o refusal` ,
130+ "Percentage-point gain when switching from direct (open-ended) to MCQ without refusal. Shows how much answer choices help the model."
126131 ) ,
127132 metricCard (
128133 "Refusal Gap" ,
129134 `${ h . refusal_gap_pp } pp` ,
130- `MCQ w/o \u2192 MCQ w/ refusal`
135+ `MCQ w/o \u2192 MCQ w/ refusal` ,
136+ "Percentage-point drop from MCQ without refusal to MCQ with refusal. Measures how often the model opts for \"I don't know\" when given the chance."
131137 ) ,
132138 metricCard (
133139 "MCQ rescue rate" ,
134140 pct ( rescue . rescued_pct ) ,
135- `${ rescue . rescued } /${ rescue . direct_wrong } direct misses rescued`
141+ `${ rescue . rescued } /${ rescue . direct_wrong } direct misses rescued` ,
142+ "Of questions answered wrong in direct mode, the percentage that were answered correctly in MCQ without refusal mode."
136143 ) ,
137144 metricCard (
138145 "Best repeat" ,
139146 pct ( h . best_repeat_pct ) ,
140- h . best_repeat_label
147+ h . best_repeat_label ,
148+ "The highest MCQ without refusal accuracy achieved by any single repeat run."
141149 ) ,
142150 metricCard (
143151 "Always-correct questions" ,
144152 pct ( consistency . always_correct_pct ) ,
145- `${ consistency . always_correct } /${ data . totals . questions } questions`
153+ `${ consistency . always_correct } /${ data . totals . questions } questions` ,
154+ "Questions answered correctly in MCQ without refusal mode across every single repeat run."
146155 ) ,
147156 metricCard (
148157 "Task groups at 100%" ,
149158 pct ( h . task_groups_at_100_pct ) ,
150- `${ h . task_groups_at_100 } /${ h . total_task_groups } task groups`
159+ `${ h . task_groups_at_100 } /${ h . total_task_groups } task groups` ,
160+ "Task groups where every question was answered correctly in MCQ without refusal across all runs."
151161 ) ,
152162 ] . join ( "" ) ;
153163
@@ -674,3 +684,27 @@ init().catch((error) => {
674684 header . textContent = "Failed to load benchmark data." ;
675685 console . error ( error ) ;
676686} ) ;
687+
// --- Floating tooltip for metric cards ---
//
// One shared ".tip" element is appended to <body> and reused for every element
// that carries a data-tooltip attribute. Listeners are delegated on `document`,
// so elements rendered after this code runs are covered without re-binding.
(function () {
  const tip = document.createElement("div");
  tip.className = "tip";
  document.body.appendChild(tip);

  /**
   * Finds the nearest ancestor-or-self element that declares a tooltip.
   *
   * @param {EventTarget|null} node - Event target or relatedTarget; may be
   *   null or a non-Element (e.g. when the pointer enters from outside the page).
   * @returns {Element|null} The owning [data-tooltip] element, or null.
   */
  const tooltipOwner = (node) =>
    node instanceof Element ? node.closest("[data-tooltip]") : null;

  document.addEventListener("mouseover", (e) => {
    const card = tooltipOwner(e.target);
    if (!card) return;
    // mouseover bubbles from every descendant. If the pointer came from
    // elsewhere inside the same card, the tooltip is already shown and
    // positioned, so skip the redundant text update and layout read.
    if (tooltipOwner(e.relatedTarget) === card) return;

    tip.textContent = card.dataset.tooltip;
    // getBoundingClientRect() is viewport-relative.
    // NOTE(review): this presumes the stylesheet makes .tip `position: fixed` — confirm.
    const rect = card.getBoundingClientRect();
    tip.style.left = `${rect.left + rect.width / 2}px`;
    tip.style.top = `${rect.top - 8}px`;
    // Centre horizontally on the card and lift the tooltip fully above it.
    tip.style.transform = "translate(-50%, -100%)";
    tip.classList.add("visible");
  });

  document.addEventListener("mouseout", (e) => {
    const card = tooltipOwner(e.target);
    if (!card) return;
    // Only hide when the pointer actually leaves the card. Moving between
    // children of the same card also fires mouseout, and hiding there would
    // toggle the tooltip off and straight back on.
    if (tooltipOwner(e.relatedTarget) === card) return;
    tip.classList.remove("visible");
  });
})();
0 commit comments