niklasfrick · niklasfrick · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/frontend/src/__tests__/EngineCardSpecDecode.test.tsx b/frontend/src/__tests__/EngineCardSpecDecode.test.tsx
@@ -0,0 +1,108 @@
+import { describe, it, expect } from 'vitest'
+import { render, screen } from '@testing-library/react'
+import { EngineCard } from '../components/engines/EngineCard'
+import type { EngineMetrics, EngineSnapshot } from '../types/metrics'
+
+/** Minimal but complete EngineMetrics fixture: spec-decode fields are null by default, and `overrides` replaces any field. */
+function metrics(overrides: Partial<EngineMetrics> = {}): EngineMetrics {
+  return {
+    tokens_per_sec: 100,
+    avg_tokens_per_sec: 80,
+    per_request_tps: 50,
+    ttft_ms: 120,
+    active_requests: 1,
+    queued_requests: 0,
+    kv_cache_percent: 40,
+    kv_cache_is_estimated: false,
+    total_requests: 10,
+    e2e_latency_ms: 500,
+    prompt_tokens_per_sec: 200,
+    avg_prompt_tokens_per_sec: 180,
+    per_request_prompt_tps: 70,
+    swapped_requests: 0,
+    prefix_cache_hit_rate: 30,
+    queue_time_ms: 50,
+    inter_token_latency_ms: 25,
+    preemptions_total: 0,
+    total_prompt_tokens: 1000,
+    total_generation_tokens: 2000,
+    prefix_cache_queries_total: 5000,
+    avg_batch_size: 4,
+    ttft_percentiles: null,
+    itl_percentiles: null,
+    e2e_percentiles: null,
+    ttft_goodput_pct: null,
+    itl_goodput_pct: null,
+    e2e_goodput_pct: null,
+    ttft_buckets: null,
+    itl_buckets: null,
+    e2e_buckets: null,
+    tpot_ms: 28,
+    tpot_percentiles: null,
+    tpot_goodput_pct: null,
+    tpot_buckets: null,
+    spec_decode_draft_tokens_total: null, // the six spec_decode_* fields stay null unless a test overrides them
+    spec_decode_accepted_tokens_total: null,
+    spec_decode_drafts_total: null,
+    spec_decode_acceptance_rate: null,
+    spec_decode_acceptance_rate_live: null,
+    spec_decode_mean_acceptance_length: null,
+    ...overrides, // spread last so caller-supplied values win over the defaults above
+  }
+}
+
+function snapshot(m: EngineMetrics | null): EngineSnapshot { // fixture: every field is a fixed literal except `metrics`
+  return {
+    engine_type: 'Vllm',
+    endpoint: 'http://localhost:8000',
+    status: { type: 'Running' },
+    model: {
+      name: 'test/model',
+      parameter_size: null,
+      quantization: null,
+      precision: null,
+      tensor_type: null,
+      model_type: null,
+      pipeline_tag: null,
+    },
+    metrics: m, // passed through untouched; may be null
+    recent_requests: [],
+    deployment_mode: 'Docker',
+  }
+}
+
+describe('EngineCard speculative-decoding section', () => {
+  it('always renders the renamed cache card header', () => {
+    render(<EngineCard engine={snapshot(metrics())} />) // default fixture: all spec-decode fields null
+    expect(screen.getByText('Cache & Speculative Decoding')).toBeTruthy()
+  })
+
+  it('hides the spec-decode section when the model has no spec-decode metrics', () => {
+    render(<EngineCard engine={snapshot(metrics())} />)
+    expect(screen.queryByText('Speculative Decoding')).toBeNull() // string matchers are full-text by default, so the card header is not a hit
+    expect(screen.queryByText('Accept Len')).toBeNull()
+  })
+
+  it('renders TAR, acceptance length, and accepted/draft counters when enabled', () => {
+    render(
+      <EngineCard
+        engine={snapshot(
+          metrics({
+            spec_decode_draft_tokens_total: 100_000, // non-null and > 0, which is what EngineCard gates the section on
+            spec_decode_accepted_tokens_total: 72_000,
+            spec_decode_drafts_total: 24_000,
+            spec_decode_acceptance_rate: 72, // lifetime TAR: the '72' readout asserted below
+            spec_decode_acceptance_rate_live: 68, // live TAR: the '68% live' suffix asserted below
+            spec_decode_mean_acceptance_length: 3,
+          }),
+        )}
+      />,
+    )
+    expect(screen.getByText('Speculative Decoding')).toBeTruthy()
+    expect(screen.getByText('72')).toBeTruthy()
+    expect(screen.getByText(/68% live/)).toBeTruthy()
+    expect(screen.getByText('Accept Len')).toBeTruthy()
+    expect(screen.getByText('Accepted')).toBeTruthy()
+    expect(screen.getByText('Draft')).toBeTruthy()
+  })
+})
diff --git a/frontend/src/__tests__/HBar.test.tsx b/frontend/src/__tests__/HBar.test.tsx
@@ -0,0 +1,48 @@
+import { describe, it, expect } from 'vitest'
+import { render, screen } from '@testing-library/react'
+import { HBar } from '../components/gauges/HBar'
+import type { GaugeSegment } from '../components/gauges/ArcGauge'
+
+describe('HBar', () => {
+  it('renders a single value with label and unit', () => {
+    render(<HBar value={42} label="GPU Util" unit="%" />)
+    expect(screen.getByText('GPU Util')).toBeTruthy()
+    expect(screen.getByText('42')).toBeTruthy()
+    expect(screen.getByText('%')).toBeTruthy()
+  })
+
+  it('fills the bar proportionally to value/max', () => {
+    render(<HBar value={30} max={120} label="X" unit="W" />)
+    const fill = screen.getByTestId('hbar-fill') as HTMLElement
+    // 30/120 = 25%
+    expect(fill.style.width).toBe('25%')
+  })
+
+  it('clamps the fill width to [0, 100]%', () => {
+    render(<HBar value={500} max={100} label="X" unit="%" />) // overshoot case only; the 0% floor is not exercised here
+    expect((screen.getByTestId('hbar-fill') as HTMLElement).style.width).toBe('100%')
+  })
+
+  it('prefers displayValue over value for the readout', () => {
+    render(<HBar value={75} displayValue={150} label="GPU Power" unit="W" />) // the two differ so the readout reveals which one is shown
+    expect(screen.getByText('150')).toBeTruthy()
+  })
+
+  it('renders stacked segments with a legend and no single-value fill', () => {
+    // Mirrors how the Memory card calls it: an explicit `value` (used %) plus
+    // segments for the stacked breakdown.
+    const segments: GaugeSegment[] = [
+      { value: 25, total: 100, color: '#76B900', label: 'GPU: 25' },
+      { value: 25, total: 100, color: '#3B82F6', label: 'CPU: 25' },
+      { value: 50, total: 100, color: '#27272A', label: 'Free: 50' },
+    ]
+    render(<HBar value={50} label="" unit="%" segments={segments} />)
+    // No single-value fill is rendered in segment mode.
+    expect(screen.queryByTestId('hbar-fill')).toBeNull()
+    // Legend labels present.
+    expect(screen.getByText('GPU: 25')).toBeTruthy()
+    expect(screen.getByText('Free: 50')).toBeTruthy()
+    // Readout uses the explicit value (used %), matching ArcGauge precedence.
+    expect(screen.getByText('50')).toBeTruthy()
+  })
+})
diff --git a/frontend/src/__tests__/format.test.ts b/frontend/src/__tests__/format.test.ts
@@ -1,5 +1,5 @@
 import { describe, it, expect } from 'vitest'
-import { formatBytes, formatGiB, formatCompactTokens } from '../lib/format'
+import { formatBytes, formatGiB, formatCompactTokens, formatAcceptanceLength } from '../lib/format'
 
 const GIB = 1_073_741_824
 const MIB = 1_048_576
@@ -53,3 +53,18 @@ describe('formatCompactTokens', () => {
     expect(formatCompactTokens(3.4e12)).toBe('3.4T')
   })
 })
+
+describe('formatAcceptanceLength', () => {
+  it('renders -- for null, negative, or non-finite', () => {
+    expect(formatAcceptanceLength(null)).toBe('--')
+    expect(formatAcceptanceLength(-1)).toBe('--')
+    expect(formatAcceptanceLength(Number.NaN)).toBe('--')
+    expect(formatAcceptanceLength(Number.POSITIVE_INFINITY)).toBe('--')
+  })
+
+  it('formats accepted-tokens-per-draft to two decimals', () => {
+    expect(formatAcceptanceLength(3)).toBe('3.00') // integers are padded to two decimals
+    expect(formatAcceptanceLength(3.4167)).toBe('3.42') // rounded, not truncated (truncation would give 3.41)
+    expect(formatAcceptanceLength(0)).toBe('0.00') // zero is a valid length, unlike the negative case above
+  })
+})
diff --git a/frontend/src/components/engines/EngineCard.tsx b/frontend/src/components/engines/EngineCard.tsx
@@ -10,6 +10,7 @@ import {
   KvBar,
   TrendArrow,
   GoodputTile,
+  SpecDecodeSection,
   computeTrend,
   fmtVal,
   fmtInt,
@@ -134,6 +135,17 @@ export function EngineCard({
   const prefixCacheHit = v('prefix_cache_hit_rate')
   const prefixCacheQueries = v('prefix_cache_queries_total')
   const preemptions = v('preemptions_total')
+  // Speculative decoding — present only when the served model has it enabled.
+  const specDraftTokens = v('spec_decode_draft_tokens_total')
+  const specAcceptedTokens = v('spec_decode_accepted_tokens_total')
+  const specAcceptanceRate = v('spec_decode_acceptance_rate')
+  const specAcceptanceRateLive = v('spec_decode_acceptance_rate_live')
+  const specMeanAcceptanceLength = v('spec_decode_mean_acceptance_length')
+  // Show the section only once the engine has actually drafted tokens. The
+  // counter is present (non-null) whenever spec decoding is configured, but it
+  // sits at 0 on a freshly-started idle engine — gating on >0 keeps the card
+  // from showing an all-dashes section until the metrics carry real values.
+  const hasSpecDecode = specDraftTokens !== null && specDraftTokens > 0
   const engineKey = `${engine.engine_type}-${engine.endpoint}`
   const modelName = engine.model?.name ?? null
   const { thresholds: slo, setThresholds: setSlo, reset: resetSlo, isCustomized: sloCustomized } =
@@ -218,7 +230,7 @@ export function EngineCard({
       ) : (
         <>
           {/* ── Grouped metrics with trend arrows — 6 categories ── */}
-          <div className="grid grid-cols-2 md:grid-cols-3 xl:grid-cols-6 gap-2 py-1">
+          <div className="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-6 gap-2 py-1">
             {/* Prefill Throughput */}
             <div className="bg-white/[0.02] rounded-md px-3 py-2.5 2xl:px-4 2xl:py-3 min-w-0">
               <div className="text-[11px] 2xl:text-xs min-[1920px]:text-sm font-semibold text-zinc-300 tracking-tight mb-1.5 truncate">Prompt Processing / Prefill Throughput</div>
@@ -289,9 +301,9 @@ export function EngineCard({
               </div>
             </div>
 
-            {/* Cache */}
+            {/* Cache & Speculative Decoding */}
             <div className="bg-white/[0.02] rounded-md px-3 py-2.5 2xl:px-4 2xl:py-3 min-w-0">
-              <div className="text-[11px] 2xl:text-xs min-[1920px]:text-sm font-semibold text-zinc-300 tracking-tight mb-1.5 truncate">Cache</div>
+              <div className="text-[11px] 2xl:text-xs min-[1920px]:text-sm font-semibold text-zinc-300 tracking-tight mb-1.5 truncate">Cache &amp; Speculative Decoding</div>
               <div className="grid grid-cols-2 gap-1.5">
                 <div className="flex flex-col gap-0.5 min-w-0">
                   <span className="text-[10px] font-medium text-zinc-400 uppercase tracking-wider truncate">KV Cache</span>
@@ -316,6 +328,15 @@ export function EngineCard({
                   className="text-lg xl:text-xl 2xl:text-2xl min-[1920px]:text-3xl min-[2560px]:text-4xl font-bold text-zinc-100 font-mono tabular-nums leading-none"
                 />
               </div>
+              {hasSpecDecode && (
+                <SpecDecodeSection
+                  acceptanceRate={specAcceptanceRate}
+                  acceptanceRateLive={specAcceptanceRateLive}
+                  meanAcceptanceLength={specMeanAcceptanceLength}
+                  acceptedTokens={specAcceptedTokens}
+                  draftTokens={specDraftTokens}
+                />
+              )}
             </div>
           </div>
 
@@ -324,7 +345,7 @@ export function EngineCard({
            *   1 Prefill · 2 Decode · 3 Latency · 4 SLO Goodput · 5 Requests · 6 Cache
            * E2E sits under SLO Goodput (col 4); Requests under col 5; KV under Cache (col 6). */}
           {showCharts && chartData && (
-            <div className="grid grid-cols-2 md:grid-cols-3 xl:grid-cols-6 gap-3 pt-1">
+            <div className="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-6 gap-3 pt-1">
               <TimeSeriesChart
                 title="Prefill Throughput (tok/s)"
                 hideTooltipLabel

diff --git a/frontend/src/components/engines/EngineCardPrimitives.tsx b/frontend/src/components/engines/EngineCardPrimitives.tsx
@@ -5,7 +5,7 @@
  */
 
 import type { LatencyPercentiles } from '@/types/metrics'
-import { formatCompactTokens } from '@/lib/format'
+import { formatCompactTokens, formatAcceptanceLength } from '@/lib/format'
 import { AnimatedCounter } from './AnimatedCounter'
 
 export interface ChartDataPoint {
@@ -166,6 +166,107 @@ export function fmtVal(v: number | null, fmt: (n: number) => string): string {
   return v === null ? '--' : fmt(v)
 }
 
+interface SpecDecodeSectionProps {
+  /** Lifetime token acceptance rate (TAR), 0-100. Drives the TAR color; null renders `--`. */
+  acceptanceRate: number | null
+  /** Live (windowed) TAR, 0-100. Rounded and shown inline after the TAR label when non-null. */
+  acceptanceRateLive: number | null
+  /** Mean accepted tokens per draft attempt; formatted with formatAcceptanceLength. */
+  meanAcceptanceLength: number | null
+  /** Cumulative accepted tokens; rendered by AnimatedCounter with formatCompactTokens. */
+  acceptedTokens: number | null
+  /** Cumulative drafted tokens; rendered by AnimatedCounter with formatCompactTokens. */
+  draftTokens: number | null
+}
+
+/**
+ * Speculative-decoding sub-section of the Cache card. The component does no
+ * gating itself: EngineCard mounts it only once the draft-token counter is
+ * non-null and > 0. Surfaces the token acceptance rate (lifetime + live), mean
+ * acceptance length, and the cumulative accepted/draft token counters (via
+ * AnimatedCounter). Higher acceptance is better, so the lifetime TAR is colored
+ * green (≥70%), yellow (≥40%), else red; the live rate never affects the color.
+ */
+export function SpecDecodeSection({
+  acceptanceRate,
+  acceptanceRateLive,
+  meanAcceptanceLength,
+  acceptedTokens,
+  draftTokens,
+}: SpecDecodeSectionProps) {
+  const tarColor =
+    acceptanceRate === null
+      ? 'text-zinc-100' // no lifetime rate yet: neutral color to go with the '--' readout
+      : acceptanceRate >= 70 // bands compare the unrounded rate, so 69.6 displays "70" yet is yellow
+        ? 'text-[#76B900]'
+        : acceptanceRate >= 40
+          ? 'text-yellow-400'
+          : 'text-red-400'
+  return (
+    <div className="flex flex-col gap-0.5 mt-2 pt-2 border-t border-white/[0.04] min-w-0">
+      <span className="text-[10px] 2xl:text-xs min-[1920px]:text-sm font-medium text-zinc-400 uppercase tracking-wider truncate">
+        Speculative Decoding
+      </span>
+      <div className="grid grid-cols-2 gap-1.5 mt-0.5">
+        <div className="flex flex-col gap-0.5 min-w-0">
+          <span className="text-[10px] font-medium text-zinc-400 uppercase tracking-wider truncate">
+            TAR
+            {acceptanceRateLive !== null && (
+              <span className="ml-1 normal-case tracking-normal text-zinc-600">
+                {Math.round(acceptanceRateLive)}% live
+              </span>
+            )}
+          </span>
+          <div className="flex items-baseline">
+            <span
+              className={`text-lg xl:text-xl 2xl:text-2xl min-[1920px]:text-3xl min-[2560px]:text-4xl font-bold font-mono tabular-nums leading-none ${tarColor}`}
+            >
+              {acceptanceRate === null ? '--' : Math.round(acceptanceRate)}
+            </span>
+            <span className="text-xs text-zinc-500 ml-1">%</span>
+          </div>
+        </div>
+        <MetricTile
+          label="Accept Len"
+          value={fmtVal(meanAcceptanceLength, formatAcceptanceLength)}
+          unit="tok/draft"
+        />
+      </div>
+      {/* Cumulative accepted/draft tokens. Each value gets its own line (label
+          above) so the abbreviated counters never clip on narrow cards, while
+          smaller fonts than the headline numbers keep the section compact. */}
+      <div className="grid grid-cols-2 gap-1.5 mt-1.5 min-w-0">
+        <div className="flex flex-col gap-0.5 min-w-0">
+          <span className="text-[10px] font-medium text-zinc-400 uppercase tracking-wider truncate">
+            Accepted
+          </span>
+          <div className="flex items-baseline min-w-0">
+            <AnimatedCounter
+              value={acceptedTokens}
+              format={formatCompactTokens}
+              className="text-sm xl:text-base 2xl:text-lg min-[1920px]:text-xl font-bold font-mono tabular-nums leading-none text-zinc-100 truncate"
+            />
+            <span className="text-[10px] ml-1 text-zinc-500">tok</span>
+          </div>
+        </div>
+        <div className="flex flex-col gap-0.5 min-w-0">
+          <span className="text-[10px] font-medium text-zinc-400 uppercase tracking-wider truncate">
+            Draft
+          </span>
+          <div className="flex items-baseline min-w-0">
+            <AnimatedCounter
+              value={draftTokens}
+              format={formatCompactTokens}
+              className="text-sm xl:text-base 2xl:text-lg min-[1920px]:text-xl font-bold font-mono tabular-nums leading-none text-zinc-100 truncate"
+            />
+            <span className="text-[10px] ml-1 text-zinc-500">tok</span>
+          </div>
+        </div>
+      </div>
+    </div>
+  )
+}
+
 export function fmtInt(v: number | null): string {
   return v === null ? '--' : String(Math.round(v))
 }