From 6fd2dd342e4ad1bf3ea1d492a85e6034bd7ccc1d Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 13:14:48 +0200
Subject: [PATCH 01/20] [benchmark] Add custom ScalarMetric and DiscreteMetric
 primitives

---
 .../BenchmarkComparisonReportView.tsx         | 65 +++++++++---
 .../src/components/DailyBenchmarkChart.tsx    | 11 ++-
 .../benchmark/compareBenchmarkReports.test.ts | 93 +++++++++++++++++
 .../lib/benchmark/compareBenchmarkReports.ts  | 84 +++++++++++++++-
 .../src/lib/benchmark/types.ts                | 16 +++
 .../src/lib/ciReports/benchmarkReport.ts      |  6 +-
 .../src/views/BenchmarkDetails.tsx            |  1 +
 packages/benchmark/README.md                  | 56 +++++++++++
 packages/benchmark/src/DiscreteMetric.ts      | 20 ++++
 packages/benchmark/src/Metric.ts              | 91 +++++++++++++++++
 packages/benchmark/src/ScalarMetric.ts        | 39 ++++++++
 packages/benchmark/src/ciReport.ts            | 23 ++++-
 packages/benchmark/src/index.tsx              | 10 ++
 packages/benchmark/src/reporter.test.ts       | 99 +++++++++++++++++++
 packages/benchmark/src/reporter.ts            | 70 +++++++++++--
 packages/benchmark/src/stats.test.ts          | 23 ++++-
 packages/benchmark/src/stats.ts               | 21 ++++
 .../benchmark/src/taskMetaAugmentation.ts     |  4 +-
 packages/benchmark/src/types.ts               | 57 +++++++++++
 test/performance/tests/metric.bench.tsx       | 44 +++++++++
 20 files changed, 804 insertions(+), 29 deletions(-)
 create mode 100644 packages/benchmark/src/DiscreteMetric.ts
 create mode 100644 packages/benchmark/src/Metric.ts
 create mode 100644 packages/benchmark/src/ScalarMetric.ts
 create mode 100644 test/performance/tests/metric.bench.tsx

diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
index afb076129..cae212d02 100644
--- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
+++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
@@ -19,7 +19,7 @@ import {
   type DiffValue,
   type BenchmarkDiffSeverity,
 } from '@/lib/benchmark/compareBenchmarkReports';
-import type { BenchmarkReport } from '@/lib/benchmark/types';
+import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types';
 import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters';
 
 const SEVERITY_COLOR: Record<BenchmarkDiffSeverity, string> = {
@@ -72,13 +72,36 @@ function computeEntryBar(
   return computeDiffBar(comparison.duration, range.min, range.max);
 }
 
-function FormattedDiffMs({ diff, percent = false }: { diff: DiffValue; percent?: boolean }) {
+/** Formats a metric value with its `Intl.NumberFormatOptions`, falling back to milliseconds. */
+function formatMetricNumber(value: number, format?: Intl.NumberFormatOptions): string {
+  if (format) {
+    return new Intl.NumberFormat(undefined, format).format(value);
+  }
+  return formatMs(value);
+}
+
+function formatSignedDiff(value: number, format?: Intl.NumberFormatOptions): string {
+  if (format) {
+    return new Intl.NumberFormat(undefined, { signDisplay: 'exceptZero', ...format }).format(value);
+  }
+  return formatDiffMs(value);
+}
+
+function FormattedDiffMs({
+  diff,
+  percent = false,
+  format,
+}: {
+  diff: DiffValue;
+  percent?: boolean;
+  format?: Intl.NumberFormatOptions;
+}) {
   if (diff.absoluteDiff === 0) {
     return '\u2014';
   }
   return (
     <React.Fragment>
-      {formatDiffMs(diff.absoluteDiff)}
+      {formatSignedDiff(diff.absoluteDiff, format)}
       {percent && (
         <React.Fragment>
           {' '}
@@ -91,12 +114,20 @@ function FormattedDiffMs({ diff, percent = false }: { diff: DiffValue; percent?:
   );
 }
 
-function DiffCell({ diff, sx }: { diff: DiffValue; sx?: object }) {
+function DiffCell({
+  diff,
+  sx,
+  format,
+}: {
+  diff: DiffValue;
+  sx?: object;
+  format?: Intl.NumberFormatOptions;
+}) {
   const color = SEVERITY_COLOR[diff.severity];
   return (
     <Tooltip title={diff.hint} arrow>
       <TableCell align="right" sx={{ color, ...sx }}>
-        <FormattedDiffMs diff={diff} percent />
+        <FormattedDiffMs diff={diff} percent format={format} />
       </TableCell>
     </Tooltip>
   );
@@ -114,6 +145,7 @@ interface ComparisonTableRow {
   valueFill?: number;
   valueColor?: string;
   diffBar?: { left: number; width: number; color: string };
+  format?: Intl.NumberFormatOptions;
 }
 
 const NAME_CELL_SX = {
@@ -181,7 +213,7 @@ function ComparisonTable({
               <TableCell align="right">{row.outliers}</TableCell>
               {hasBase && (
                 <TableCell align="right">
-                  {row.base != null ? formatMs(row.base) : '\u2014'}
+                  {row.base != null ? formatMetricNumber(row.base, row.format) : '\u2014'}
                 </TableCell>
               )}
               {(() => {
@@ -196,10 +228,10 @@ function ComparisonTable({
                     : undefined;
                   return row.removed ? (
                     <TableCell align="right" sx={{ color: 'success.main', ...diffBarSx }}>
-                      <FormattedDiffMs diff={row.comparison} percent />
+                      <FormattedDiffMs diff={row.comparison} percent format={row.format} />
                     </TableCell>
                   ) : (
-                    <DiffCell diff={row.comparison} sx={diffBarSx} />
+                    <DiffCell diff={row.comparison} sx={diffBarSx} format={row.format} />
                   );
                 }
                 return <TableCell align="right">{'\u2014'}</TableCell>;
@@ -406,9 +438,9 @@ function BenchmarkAccordion({
                     '\u2014'
                   ) : (
                     <React.Fragment>
-                      {formatMs(row.value)}{' '}
+                      {formatMetricNumber(row.value, row.format)}{' '}
                       <Typography component="span" variant="caption" color="text.secondary">
-                        ±{formatMs(row.stdDev)}
+                        ±{formatMetricNumber(row.stdDev, row.format)}
                       </Typography>
                     </React.Fragment>
                   ),
@@ -416,6 +448,7 @@ function BenchmarkAccordion({
                   comparison: row.diff,
                   base: row.diff?.base,
                   removed: row.removed,
+                  format: row.format,
                 };
               })}
             />
@@ -433,10 +466,18 @@ function BenchmarkAccordion({
 interface BenchmarkComparisonReportViewProps {
   value: BenchmarkReport;
   base: BenchmarkReport | null;
+  definitions?: Record<string, MetricDefinition>;
 }
 
-export function BenchmarkComparisonReportView({ value, base }: BenchmarkComparisonReportViewProps) {
-  const comparisonReport = React.useMemo(() => compareBenchmarkReports(value, base), [value, base]);
+export function BenchmarkComparisonReportView({
+  value,
+  base,
+  definitions,
+}: BenchmarkComparisonReportViewProps) {
+  const comparisonReport = React.useMemo(
+    () => compareBenchmarkReports(value, base, definitions),
+    [value, base, definitions],
+  );
 
   const globalMaxDuration = React.useMemo(() => {
     let max = 0;
diff --git a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
index 27e09ec1f..40e9a5d21 100644
--- a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
+++ b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
@@ -10,7 +10,7 @@ import Tabs from '@mui/material/Tabs';
 import Tab from '@mui/material/Tab';
 import { BarChartPro } from '@mui/x-charts-pro/BarChartPro';
 import { useXScale, useDrawingArea } from '@mui/x-charts-pro/hooks';
-import type { BenchmarkReport } from '@/lib/benchmark/types';
+import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types';
 import { formatMs } from '@/utils/formatters';
 import { useMasterCommits, type GitHubCommit } from '../hooks/useMasterCommits';
 import { useCiReports } from '../hooks/useCiReports';
@@ -81,6 +81,7 @@ interface CommitReportData {
   timestamp: number;
   commit: GitHubCommit;
   report: BenchmarkReport | null;
+  metricDefinitions?: Record<string, MetricDefinition>;
 }
 
 function collectBenchmarkNames(chartData: CommitReportData[]): string[] {
@@ -126,6 +127,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
         timestamp,
         commit,
         report: reports[commit.sha]?.report ?? null,
+        metricDefinitions: reports[commit.sha]?.metricDefinitions,
       })),
     [commits, reports],
   );
@@ -302,6 +304,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
     return {
       value: reportData.report,
       base: baselineData?.report ?? null,
+      definitions: reportData.metricDefinitions,
       valueCommit: reportData.commit,
       baseCommit: baselineData?.commit ?? null,
     };
@@ -533,7 +536,11 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
                     ? `Comparing baseline ${inlinePair.baseCommit.sha.substring(0, 7)} → report ${inlinePair.valueCommit.sha.substring(0, 7)}`
                     : `Report ${inlinePair.valueCommit.sha.substring(0, 7)}`}
                 </Typography>
-                <BenchmarkComparisonReportView value={inlinePair.value} base={inlinePair.base} />
+                <BenchmarkComparisonReportView
+                  value={inlinePair.value}
+                  base={inlinePair.base}
+                  definitions={inlinePair.definitions}
+                />
               </Box>
             )}
             {activeTab === 'comparison' && !inlinePair && (
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
index 87370da5b..c666fe82a 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
@@ -1,7 +1,32 @@
 import { describe, it, expect } from 'vitest';
 import { compareBenchmarkReports } from './compareBenchmarkReports';
+import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition } from './types';
 import { makeReport, makeReportFromConfig } from './test-fixtures';
 
+function reportWithMetrics(metrics: Record<string, number>): BenchmarkReport {
+  const entry: BenchmarkReportEntry = {
+    iterations: 10,
+    totalDuration: 0,
+    renders: [],
+    metrics: Object.fromEntries(
+      Object.entries(metrics).map(([name, mean]) => [name, { mean, stdDev: 0, outliers: 0 }]),
+    ),
+  };
+  return { Bench: entry };
+}
+
+const definitions: Record<string, MetricDefinition> = {
+  scalar_alarm: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', threshold: 0.1 } },
+  scalar_higher: { kind: 'scalar', alarm: { direction: 'higherIsBetter', threshold: 0.1 } },
+  scalar_info: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } },
+  discrete_alarm: { kind: 'discrete', alarm: { direction: 'lowerIsBetter' } },
+};
+
+function metricEntry(current: BenchmarkReport, base: BenchmarkReport, metricName: string) {
+  const result = compareBenchmarkReports(current, base, definitions);
+  return result.entries[0].metrics.find((metric) => metric.name === metricName)!;
+}
+
 describe('compareBenchmarkReports', () => {
   it('marks diffs within ±20% as neutral noise', () => {
     const result = compareBenchmarkReports(
@@ -183,4 +208,72 @@ describe('compareBenchmarkReports', () => {
       expect(order).toEqual(['DurationRegression', 'FewerRenders']);
     });
   });
+
+  describe('custom metrics', () => {
+    it('keeps an informational metric (no alarm) neutral even on a large change', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_info: 200 }),
+        reportWithMetrics({ scalar_info: 100 }),
+        'scalar_info',
+      );
+      expect(metric.diff.severity).toBe('neutral');
+      expect(metric.format).toEqual({ style: 'unit', unit: 'byte' });
+    });
+
+    it('flags a scalar alarm regression beyond its own threshold', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_alarm: 120 }), // +20%, threshold is 10%
+        reportWithMetrics({ scalar_alarm: 100 }),
+        'scalar_alarm',
+      );
+      expect(metric.diff.severity).toBe('error');
+      expect(metric.diff.hint).toContain('Regression');
+    });
+
+    it('honors higherIsBetter so an increase is an improvement', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_higher: 120 }),
+        reportWithMetrics({ scalar_higher: 100 }),
+        'scalar_higher',
+      );
+      expect(metric.diff.severity).toBe('success');
+      expect(metric.diff.hint).toContain('Improvement');
+    });
+
+    it('compares discrete alarms exactly — any change is flagged', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ discrete_alarm: 4 }),
+        reportWithMetrics({ discrete_alarm: 3 }),
+        'discrete_alarm',
+      );
+      expect(metric.diff.severity).toBe('error');
+      expect(metric.diff.hint).toBe('Regression: +1');
+    });
+
+    it('keeps metrics without a definition on the default noise-band behavior', () => {
+      // No definition for `paint:default`, so it uses the global ±20% noise band.
+      const withinNoise = metricEntry(
+        reportWithMetrics({ 'paint:default': 110 }),
+        reportWithMetrics({ 'paint:default': 100 }),
+        'paint:default',
+      );
+      expect(withinNoise.diff.severity).toBe('neutral');
+
+      const regression = metricEntry(
+        reportWithMetrics({ 'paint:default': 130 }),
+        reportWithMetrics({ 'paint:default': 100 }),
+        'paint:default',
+      );
+      expect(regression.diff.severity).toBe('error');
+    });
+
+    it('resolves a sub-series definition by its base metric name', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ 'scalar_alarm#large': 120 }),
+        reportWithMetrics({ 'scalar_alarm#large': 100 }),
+        'scalar_alarm#large',
+      );
+      expect(metric.diff.severity).toBe('error');
+    });
+  });
 });
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
index af0aa2ca4..810f46748 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
@@ -1,5 +1,5 @@
 import { formatDiffMs, percentFormatter } from '@/utils/formatters';
-import type { BenchmarkReport, BenchmarkReportEntry, RenderStats } from './types';
+import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition, RenderStats } from './types';
 
 const NOISE_THRESHOLD = 0.2;
 
@@ -21,6 +21,8 @@ export interface ComparisonEntry {
   outliers: number;
   diff: DiffValue;
   removed: boolean;
+  /** Display formatting for custom metrics; absent for renders and built-in (paint) metrics. */
+  format?: Intl.NumberFormatOptions;
 }
 
 export interface ComparisonItem {
@@ -153,21 +155,92 @@ function compareRenders(
   return entries;
 }
 
+/** Strips a `#sub-series` suffix to recover the metric name used to look up its definition. */
+function baseMetricName(key: string): string {
+  const hashIndex = key.indexOf('#');
+  return hashIndex === -1 ? key : key.slice(0, hashIndex);
+}
+
+/**
+ * Diff for a custom metric, honoring its definition: a metric without an `alarm` is
+ * informational (always neutral), a `discrete` metric compares exactly, and a `scalar` metric
+ * uses its own noise band and direction. Metrics without a definition (e.g. paint) keep the
+ * default `makeDiffValue` behavior and never reach this function.
+ */
+function makeMetricDiff(
+  current: number | null,
+  base: number | null,
+  definition: MetricDefinition,
+): DiffValue {
+  if (base === null) {
+    return { current, base, absoluteDiff: 0, relativeDiff: 0, severity: 'neutral', hint: 'New' };
+  }
+
+  const currentVal = current ?? 0;
+  const absoluteDiff = currentVal - base;
+  const relativeDiff = base !== 0 ? absoluteDiff / base : 0;
+
+  const { alarm, kind } = definition;
+  const isDiscrete = kind === 'discrete';
+  const diffStr = isDiscrete
+    ? `${absoluteDiff > 0 ? '+' : ''}${absoluteDiff}`
+    : `${formatDiffMs(absoluteDiff)} (${percentFormatter.format(relativeDiff)})`;
+
+  // Informational metric: show the change, never flag it.
+  if (!alarm) {
+    return {
+      current,
+      base,
+      absoluteDiff,
+      relativeDiff,
+      severity: 'neutral',
+      hint: absoluteDiff === 0 ? 'No change' : diffStr,
+    };
+  }
+
+  const withinNoise = isDiscrete
+    ? absoluteDiff === 0
+    : Math.abs(relativeDiff) <= (alarm.threshold ?? NOISE_THRESHOLD);
+  const direction = alarm.direction ?? 'lowerIsBetter';
+  const isRegression = direction === 'lowerIsBetter' ? absoluteDiff > 0 : absoluteDiff < 0;
+
+  let severity: BenchmarkDiffSeverity = 'neutral';
+  if (!withinNoise && absoluteDiff !== 0) {
+    severity = isRegression ? 'error' : 'success';
+  }
+
+  let hint: string;
+  if (absoluteDiff === 0) {
+    hint = 'No change';
+  } else if (withinNoise) {
+    hint = `Within noise: ${diffStr}`;
+  } else {
+    hint = `${isRegression ? 'Regression' : 'Improvement'}: ${diffStr}`;
+  }
+
+  return { current, base, absoluteDiff, relativeDiff, severity, hint };
+}
+
 function compareMetrics(
   currentMetrics: Record<string, { mean: number; stdDev: number; outliers: number }>,
   baseEntry: BenchmarkReportEntry | undefined,
+  definitions: Record<string, MetricDefinition> | undefined,
 ): ComparisonEntry[] {
   const entries: ComparisonEntry[] = [];
 
   for (const [name, stats] of Object.entries(currentMetrics)) {
+    const definition = definitions?.[baseMetricName(name)];
     const baseStats = baseEntry?.metrics[name];
     entries.push({
       name,
       value: stats.mean,
       stdDev: stats.stdDev,
       outliers: stats.outliers,
-      diff: makeDiffValue(stats.mean, baseStats?.mean ?? null),
+      diff: definition
+        ? makeMetricDiff(stats.mean, baseStats?.mean ?? null, definition)
+        : makeDiffValue(stats.mean, baseStats?.mean ?? null),
       removed: false,
+      format: definition?.format,
     });
   }
 
@@ -175,6 +248,7 @@ function compareMetrics(
   if (baseEntry) {
     for (const [name, baseStats] of Object.entries(baseEntry.metrics)) {
       if (!(name in currentMetrics)) {
+        const definition = definitions?.[baseMetricName(name)];
         entries.push({
           name,
           value: 0,
@@ -182,6 +256,7 @@ function compareMetrics(
           outliers: 0,
           diff: makeDiffValue(null, baseStats.mean),
           removed: true,
+          format: definition?.format,
         });
       }
     }
@@ -227,6 +302,7 @@ function compareItems(a: ComparisonItem, b: ComparisonItem): number {
 export function compareBenchmarkReports(
   current: BenchmarkReport,
   base: BenchmarkReport | null,
+  definitions?: Record<string, MetricDefinition>,
 ): BenchmarkComparisonReport {
   const effectiveBase = base ?? {};
   const entries: ComparisonItem[] = [];
@@ -251,7 +327,7 @@ export function compareBenchmarkReports(
         ? makeCountDiffValue(entry.renders.length, baseEntry.renders.length)
         : undefined,
       renders: compareRenders(entry.renders, baseEntry),
-      metrics: compareMetrics(entry.metrics, baseEntry),
+      metrics: compareMetrics(entry.metrics, baseEntry, definitions),
       iterations: entry.iterations,
     });
 
@@ -280,7 +356,7 @@ export function compareBenchmarkReports(
       name,
       duration,
       renders: compareRenders([], baseEntry),
-      metrics: compareMetrics({}, baseEntry),
+      metrics: compareMetrics({}, baseEntry, definitions),
       iterations: 0,
     });
 
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/types.ts b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
index ec29672c9..5935592e1 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/types.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
@@ -31,6 +31,21 @@ export interface BenchmarkReportEntry {
 
 export type BenchmarkReport = Record<string, BenchmarkReportEntry>;
 
+export type MetricDirection = 'lowerIsBetter' | 'higherIsBetter';
+
+export interface MetricAlarm {
+  direction?: MetricDirection;
+  threshold?: number;
+}
+
+/** Per-metric config for custom metrics, hoisted to the top level of the report (keyed by name). */
+export interface MetricDefinition {
+  kind: 'scalar' | 'discrete';
+  format?: Intl.NumberFormatOptions;
+  alarm?: MetricAlarm;
+  group?: string;
+}
+
 export interface BenchmarkBaseUpload {
   version: 1;
   timestamp: number;
@@ -40,6 +55,7 @@ export interface BenchmarkBaseUpload {
   prNumber?: number;
   branch: string;
   report: BenchmarkReport;
+  metricDefinitions?: Record<string, MetricDefinition>;
 }
 
 export interface BenchmarkUpload extends BenchmarkBaseUpload {
diff --git a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
index b1eb97f7a..026049805 100644
--- a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
+++ b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
@@ -45,7 +45,11 @@ export async function generateBenchmarkReport(
     markdownContent += `_:information_source: Using benchmark from parent commit ${actualBaseCommit} (fallback from merge base ${mergeBaseCommit})._\n\n`;
   }
 
-  const comparison = compareBenchmarkReports(headReport.report, baseReport);
+  const comparison = compareBenchmarkReports(
+    headReport.report,
+    baseReport,
+    headReport.metricDefinitions,
+  );
 
   const detailsUrl = new URL(`${DASHBOARD_ORIGIN}/benchmark-details/${repo}`);
   detailsUrl.searchParams.set('sha', commitSha);
diff --git a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
index a558ffe01..178bda254 100644
--- a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
+++ b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
@@ -143,6 +143,7 @@ export default function BenchmarkDetails() {
           <BenchmarkComparisonReportView
             value={report.report}
             base={effectiveBase?.report ?? null}
+            definitions={report.metricDefinitions}
           />
         )}
       </Paper>
diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index f4c980bcd..3cfedb79f 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -92,6 +92,60 @@ This produces a `paint:my-component` metric alongside the automatic `paint:defau
 
 `waitForElementTiming` accepts an optional `timeout` in milliseconds (default: 5000). Pass `0` or `Infinity` to rely on the test timeout instead.
 
+### Custom metrics
+
+Record your own measurements — a timing, a count, anything measured inside or outside React — from a plain `it()` loop. There are two primitives:
+
+- `ScalarMetric` — a continuous value (timings, sizes). Aggregated as mean ± standard deviation with IQR outlier removal, and compared against a baseline with a relative noise band. It also offers a `console.time`-style timing helper.
+- `DiscreteMetric` — a count of events. Compared as an exact integer (any change is significant) and formatted as a whole number.
+
+```tsx
+import { it } from 'vitest';
+import { ScalarMetric, DiscreteMetric } from '@mui/internal-benchmark';
+
+const duration = new ScalarMetric({
+  name: 'work_duration',
+  format: { style: 'unit', unit: 'millisecond' }, // Intl.NumberFormatOptions
+  alarm: { direction: 'lowerIsBetter', threshold: 0.1 }, // flag regressions over 10%
+});
+
+const clicks = new DiscreteMetric({ name: 'button_clicks' });
+
+it('measures work', () => {
+  for (let i = 0; i < 100; i += 1) {
+    duration.time();
+    runWork();
+    duration.timeEnd(); // records the elapsed milliseconds
+
+    clicks.record(countClicks()); // a discrete count per run
+  }
+});
+```
+
+A metric is declared once (typically at module scope) and reused across tests and iterations. `record()` attaches the value to whichever test is running, so the same instance works in any `it()`.
+
+#### Metric configuration
+
+- `name` — the metric's report key (**required**).
+- `format` — an [`Intl.NumberFormatOptions`](https://developer.mozilla.org/en-US/docs/Web/API/Intl/NumberFormat/NumberFormat) object used to display the value.
+- `alarm` — opts the metric into regression flagging. Omit it and the metric is informational (its diff is shown but never flagged). Holds:
+  - `direction` — `'lowerIsBetter'` (default) or `'higherIsBetter'`.
+  - `threshold` — the relative noise band (e.g. `0.1` = 10%) beyond which a change is a regression. Scalar metrics only; discrete metrics compare exactly.
+
+#### Sub-series
+
+Pass `record(value, { id })` to split one metric into labeled sub-series, reported as `name#id`. For `ScalarMetric.time()`/`timeEnd()`, pass a label that maps to the same `id`:
+
+```tsx
+const phase = new ScalarMetric({ name: 'render_phase' });
+
+phase.time('header');
+renderHeader();
+phase.timeEnd('header'); // -> render_phase#header
+```
+
+Custom metrics are aggregated in the browser and only the resulting stats cross to the runner, so the amount of data is independent of how many values you record.
+
 ### Options
 
 ```tsx
@@ -148,5 +202,7 @@ The feature is opt-in — without `BENCHMARK_BASELINE_PATH` (or the `baselinePat
 
 - `benchmark` — define a benchmark test case
 - `ElementTiming` — invisible marker component for paint timing (renders a `<span>` tracked by the Element Timing API)
+- `ScalarMetric` — record a continuous custom measurement (with a `console.time`-style timing helper)
+- `DiscreteMetric` — record a discrete custom count
 - `createBenchmarkVitestConfig` — create a Vitest config with browser benchmarking defaults
 - `BenchmarkReporter` — Vitest reporter that collects and outputs benchmark results
diff --git a/packages/benchmark/src/DiscreteMetric.ts b/packages/benchmark/src/DiscreteMetric.ts
new file mode 100644
index 000000000..90cec67c5
--- /dev/null
+++ b/packages/benchmark/src/DiscreteMetric.ts
@@ -0,0 +1,20 @@
+import { Metric } from './Metric';
+import type { MetricConfig, MetricKind } from './types';
+
+/**
+ * A discrete count of events or occurrences. Compared against a baseline as an exact integer
+ * (any change is significant — there is no noise band), and formatted as a whole number by default.
+ *
+ * ```ts
+ * const clicks = new DiscreteMetric({ name: 'button_clicks' });
+ * clicks.record(countClicks());
+ * ```
+ */
+export class DiscreteMetric extends Metric {
+  readonly kind: MetricKind = 'discrete';
+
+  constructor(config: MetricConfig | string) {
+    const resolved = typeof config === 'string' ? { name: config } : config;
+    super({ ...resolved, format: resolved.format ?? { maximumFractionDigits: 0 } });
+  }
+}
diff --git a/packages/benchmark/src/Metric.ts b/packages/benchmark/src/Metric.ts
new file mode 100644
index 000000000..921c10ba3
--- /dev/null
+++ b/packages/benchmark/src/Metric.ts
@@ -0,0 +1,91 @@
+import { onTestFinished, TestRunner, type RunnerTestCase } from 'vitest';
+import type { MetricConfig, MetricKind, MetricReport, MetricSampleStats } from './types';
+import { aggregateSamples } from './stats';
+// Import for TaskMeta augmentation side effect
+import './taskMetaAugmentation';
+
+interface SeriesAccumulator {
+  kind: MetricKind;
+  config: MetricConfig;
+  /** Raw samples per sub-series id (`''` is the base series), kept in browser memory only. */
+  series: Map<string, number[]>;
+}
+
+type TestAccumulator = Map<string, SeriesAccumulator>;
+
+// Raw samples never cross the browser→runner boundary. They accumulate here per test (keyed by
+// the running task so a module-scoped metric shared across tests never mixes data) and are
+// aggregated into compact stats by an `onTestFinished` hook before being written to `task.meta`.
+const accumulators = new WeakMap<RunnerTestCase, TestAccumulator>();
+
+function flush(test: RunnerTestCase, accumulator: TestAccumulator): void {
+  const store: Record<string, MetricReport> = {};
+  for (const [name, entry] of accumulator) {
+    const series: Record<string, MetricSampleStats> = {};
+    for (const [seriesId, samples] of entry.series) {
+      series[seriesId] = { ...aggregateSamples(samples), count: samples.length };
+    }
+    store[name] = { kind: entry.kind, config: entry.config, series };
+  }
+  test.meta.benchmarkMetrics = store;
+}
+
+export interface MetricRecordOptions {
+  /** Sub-series label. Recorded under `${name}#${id}` in the report; omit for the base series. */
+  id?: string;
+}
+
+/**
+ * Base class for custom benchmark metrics. Use `ScalarMetric` or `DiscreteMetric`.
+ *
+ * Records are tied to the test that is running when `record()` is called (resolved via
+ * `getCurrentTest()`), so a single instance can be declared at module scope and reused across
+ * tests and loop iterations, inside or outside React.
+ */
+export abstract class Metric {
+  abstract readonly kind: MetricKind;
+
+  readonly name: string;
+
+  protected readonly config: MetricConfig;
+
+  constructor(config: MetricConfig | string) {
+    this.config = typeof config === 'string' ? { name: config } : config;
+    this.name = this.config.name;
+  }
+
+  /**
+   * Records a single measured value. Samples accumulate in browser memory and are aggregated
+   * once when the test finishes. Pass `options.id` to split into a labeled sub-series.
+   */
+  record(value: number, options?: MetricRecordOptions): void {
+    const test = TestRunner.getCurrentTest<RunnerTestCase | undefined>();
+    if (!test) {
+      throw new Error(
+        `${this.constructor.name}.record() must be called inside a running benchmark test.`,
+      );
+    }
+
+    let accumulator = accumulators.get(test);
+    if (!accumulator) {
+      const created: TestAccumulator = new Map();
+      accumulator = created;
+      accumulators.set(test, created);
+      onTestFinished(() => flush(test, created));
+    }
+
+    let entry = accumulator.get(this.name);
+    if (!entry) {
+      entry = { kind: this.kind, config: this.config, series: new Map() };
+      accumulator.set(this.name, entry);
+    }
+
+    const seriesId = options?.id ?? '';
+    let samples = entry.series.get(seriesId);
+    if (!samples) {
+      samples = [];
+      entry.series.set(seriesId, samples);
+    }
+    samples.push(value);
+  }
+}
diff --git a/packages/benchmark/src/ScalarMetric.ts b/packages/benchmark/src/ScalarMetric.ts
new file mode 100644
index 000000000..9b1c02ac8
--- /dev/null
+++ b/packages/benchmark/src/ScalarMetric.ts
@@ -0,0 +1,39 @@
+import { Metric } from './Metric';
+import type { MetricKind } from './types';
+
+/**
+ * A continuous measurement (timings, sizes, …). Samples are aggregated with mean ± standard
+ * deviation and IQR outlier removal, and compared against a baseline with a relative noise band.
+ *
+ * Besides `record()`, it offers a `console.time`-style timing helper:
+ *
+ * ```ts
+ * const metric = new ScalarMetric({ name: 'render', format: { style: 'unit', unit: 'millisecond' } });
+ * metric.time();
+ * doWork();
+ * metric.timeEnd(); // records the elapsed milliseconds
+ * ```
+ */
+export class ScalarMetric extends Metric {
+  readonly kind: MetricKind = 'scalar';
+
+  private readonly pending = new Map<string, number>();
+
+  /** Starts a timer. Pass a `label` to time a sub-series; it maps to `record`'s `id`. */
+  time(label?: string): void {
+    this.pending.set(label ?? '', performance.now());
+  }
+
+  /** Stops the timer started by `time(label)` and records the elapsed milliseconds. */
+  timeEnd(label?: string): void {
+    const key = label ?? '';
+    const start = this.pending.get(key);
+    if (start === undefined) {
+      throw new Error(
+        `${this.name}.timeEnd(${label ? `"${label}"` : ''}) was called without a matching time().`,
+      );
+    }
+    this.pending.delete(key);
+    this.record(performance.now() - start, label !== undefined ? { id: label } : undefined);
+  }
+}
diff --git a/packages/benchmark/src/ciReport.ts b/packages/benchmark/src/ciReport.ts
index 682739639..a8d832645 100644
--- a/packages/benchmark/src/ciReport.ts
+++ b/packages/benchmark/src/ciReport.ts
@@ -66,7 +66,27 @@ const benchmarkReportEntrySchema = z.object({
 
 const benchmarkReportSchema = z.record(z.string(), benchmarkReportEntrySchema);
 
-const benchmarkBaseUploadSchema = ciReportUploadSchema('benchmark', 1, benchmarkReportSchema);
+const metricDefinitionSchema = z.object({
+  kind: z.enum(['scalar', 'discrete']),
+  format: z.custom<Intl.NumberFormatOptions>().optional(),
+  alarm: z
+    .object({
+      direction: z.enum(['lowerIsBetter', 'higherIsBetter']).optional(),
+      threshold: z.number().optional(),
+    })
+    .optional(),
+  group: z.string().optional(),
+});
+
+const benchmarkBaseUploadSchema = ciReportUploadSchema(
+  'benchmark',
+  1,
+  benchmarkReportSchema,
+).extend({
+  // Per-metric config for custom metrics, hoisted to the top level (keyed by metric name) so
+  // it is stored once rather than duplicated in every report entry. Optional and additive.
+  metricDefinitions: z.record(z.string(), metricDefinitionSchema).optional(),
+});
 
 export const benchmarkUploadSchema = benchmarkBaseUploadSchema.extend({
   base: benchmarkBaseUploadSchema.optional(),
@@ -74,6 +94,7 @@ export const benchmarkUploadSchema = benchmarkBaseUploadSchema.extend({
 
 export type RenderStats = z.infer<typeof renderStatsSchema>;
 export type MetricStats = z.infer<typeof metricStatsSchema>;
+export type BenchmarkMetricDefinition = z.infer<typeof metricDefinitionSchema>;
 export type BenchmarkReportEntry = z.infer<typeof benchmarkReportEntrySchema>;
 export type BenchmarkReport = z.infer<typeof benchmarkReportSchema>;
 export type BenchmarkBaseUpload = z.infer<typeof benchmarkBaseUploadSchema>;
diff --git a/packages/benchmark/src/index.tsx b/packages/benchmark/src/index.tsx
index b9e77223e..41c65cbd9 100644
--- a/packages/benchmark/src/index.tsx
+++ b/packages/benchmark/src/index.tsx
@@ -14,7 +14,17 @@ interface PerformanceElementTiming extends PerformanceEntry {
 }
 
 export type { RenderEvent, BenchmarkMetric, IterationData, InteractionContext } from './types';
+export type {
+  MetricKind,
+  MetricDirection,
+  MetricAlarm,
+  MetricConfig,
+  MetricDefinition,
+} from './types';
 export { ElementTiming } from './ElementTiming';
+export { Metric, type MetricRecordOptions } from './Metric';
+export { ScalarMetric } from './ScalarMetric';
+export { DiscreteMetric } from './DiscreteMetric';
 
 function BenchProfiler({
   captures,
diff --git a/packages/benchmark/src/reporter.test.ts b/packages/benchmark/src/reporter.test.ts
index 88f7f96b8..ec86476b5 100644
--- a/packages/benchmark/src/reporter.test.ts
+++ b/packages/benchmark/src/reporter.test.ts
@@ -1,5 +1,6 @@
 import * as os from 'node:os';
 import * as path from 'node:path';
+import * as fs from 'node:fs/promises';
 import { describe, it, expect, vi } from 'vitest';
 import type { TestCase } from 'vitest/node';
 import type { RenderEvent, IterationData } from './types';
@@ -323,4 +324,102 @@ describe('BenchmarkReporter', () => {
       consoleSpy.mockRestore();
     });
   });
+
+  describe('custom metrics', () => {
+    it('merges aggregated custom metrics into the report and hoists definitions', async () => {
+      const outputPath = path.join(os.tmpdir(), `benchmark-custom-metrics-${process.pid}.json`);
+      const reporter = new BenchmarkReporter({ outputPath });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(
+        mockTestCase({
+          fullName: 'custom only',
+          meta: {
+            benchmarkName: 'custom only',
+            benchmarkMetrics: {
+              fib_duration: {
+                kind: 'scalar',
+                config: {
+                  name: 'fib_duration',
+                  format: { maximumFractionDigits: 3 },
+                  alarm: { direction: 'lowerIsBetter', threshold: 0.25 },
+                },
+                series: { '': { mean: 1.5, stdDev: 0.1, outliers: 0, count: 50 } },
+              },
+              fib_phase: {
+                kind: 'scalar',
+                config: { name: 'fib_phase' },
+                series: {
+                  small: { mean: 0.2, stdDev: 0.01, outliers: 0, count: 50 },
+                  large: { mean: 3.4, stdDev: 0.2, outliers: 1, count: 50 },
+                },
+              },
+            },
+          },
+          state: 'passed',
+        }),
+      );
+
+      await reporter.onTestRunEnd();
+      const written = JSON.parse(await fs.readFile(outputPath, 'utf8'));
+
+      // Metric-only test still produces an entry; iteration count derived from samples.
+      expect(written.report['custom only'].iterations).toBe(50);
+      expect(written.report['custom only'].renders).toEqual([]);
+
+      // Base series keyed by name; sub-series keyed `name#id`.
+      expect(written.report['custom only'].metrics).toMatchObject({
+        fib_duration: { mean: 1.5, stdDev: 0.1, outliers: 0 },
+        'fib_phase#small': { mean: 0.2 },
+        'fib_phase#large': { mean: 3.4, outliers: 1 },
+      });
+
+      // The sample count must not leak into the per-entry metric stats.
+      expect(written.report['custom only'].metrics.fib_duration).not.toHaveProperty('count');
+
+      // Config is hoisted once, keyed by metric name.
+      expect(written.metricDefinitions).toMatchObject({
+        fib_duration: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', threshold: 0.25 } },
+        fib_phase: { kind: 'scalar' },
+      });
+
+      await fs.rm(outputPath, { force: true });
+      consoleSpy.mockRestore();
+    });
+
+    it('merges custom metrics alongside React render iterations', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-combined-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(
+        mockTestCase({
+          fullName: 'combined',
+          meta: {
+            benchmarkName: 'combined',
+            benchmarkIterations: [
+              iteration([event('App', 'mount', 0, 10)]),
+              iteration([event('App', 'mount', 0, 12)]),
+            ],
+            benchmarkMetrics: {
+              clicks: {
+                kind: 'discrete',
+                config: { name: 'clicks', format: { maximumFractionDigits: 0 } },
+                series: { '': { mean: 3, stdDev: 0, outliers: 0, count: 2 } },
+              },
+            },
+          },
+          state: 'passed',
+        }),
+      );
+
+      const output = consoleSpy.mock.calls.map((call) => call[0]).join('\n');
+      // Both the React render table and the custom metric are printed.
+      expect(output).toContain('combined');
+      expect(output).toContain('clicks');
+
+      consoleSpy.mockRestore();
+    });
+  });
 });
diff --git a/packages/benchmark/src/reporter.ts b/packages/benchmark/src/reporter.ts
index 8e2d03d9c..15b27d784 100644
--- a/packages/benchmark/src/reporter.ts
+++ b/packages/benchmark/src/reporter.ts
@@ -1,7 +1,7 @@
 import * as path from 'node:path';
 import * as fs from 'node:fs/promises';
 import type { Reporter, TestCase } from 'vitest/node';
-import type { RenderEvent, IterationData } from './types';
+import type { RenderEvent, IterationData, MetricReport, MetricDefinition } from './types';
 import type { BenchmarkBaseUpload, BenchmarkReportEntry } from './ciReport';
 import { benchmarkUploadSchema, getCiMetadata } from './ciReport';
 import { calculateMean, calculateStdDev, quantile, isOutlier } from './stats';
@@ -190,10 +190,52 @@ function printDurationMatrix(name: string, report: BenchmarkReportEntry, footer:
   );
 }
 
+/** Strips a `#sub-series` suffix to recover the metric name used to look up its definition. */
+function baseMetricName(key: string): string {
+  const hashIndex = key.indexOf('#');
+  return hashIndex === -1 ? key : key.slice(0, hashIndex);
+}
+
+function formatMetricValue(value: number, definition?: MetricDefinition): string {
+  if (definition?.format) {
+    return new Intl.NumberFormat(undefined, definition.format).format(value);
+  }
+  return value.toFixed(2);
+}
+
+/**
+ * Merges aggregated custom metrics (already stats, not raw samples) into a report entry, keyed
+ * `name` or `name#id` for sub-series, and collects each metric's config into the shared
+ * top-level definitions. For a metric-only test, derives the iteration count from the samples.
+ */
+function mergeCustomMetrics(
+  report: BenchmarkReportEntry,
+  customMetrics: Record<string, MetricReport>,
+  definitions: Record<string, MetricDefinition>,
+): void {
+  let maxCount = 0;
+  for (const [metricName, metric] of Object.entries(customMetrics)) {
+    for (const [seriesId, stats] of Object.entries(metric.series)) {
+      const key = seriesId === '' ? metricName : `${metricName}#${seriesId}`;
+      report.metrics[key] = { mean: stats.mean, stdDev: stats.stdDev, outliers: stats.outliers };
+      maxCount = Math.max(maxCount, stats.count);
+    }
+    definitions[metricName] = {
+      kind: metric.kind,
+      ...(metric.config.format ? { format: metric.config.format } : {}),
+      ...(metric.config.alarm ? { alarm: metric.config.alarm } : {}),
+    };
+  }
+  if (report.iterations === 0) {
+    report.iterations = maxCount;
+  }
+}
+
 function printMetricsTable(
   name: string,
   metrics: Record<string, { mean: number; stdDev: number; outliers: number }>,
   iterationCount: number,
+  definitions: Record<string, MetricDefinition>,
 ): void {
   const entries = Object.entries(metrics);
   if (entries.length === 0) {
@@ -201,10 +243,12 @@ function printMetricsTable(
   }
 
   const rows: string[][] = entries.map(([metricName, stats]) => {
-    const iqrStr = `${stats.mean.toFixed(2)}±${stats.stdDev.toFixed(2)}`;
+    const definition = definitions[baseMetricName(metricName)];
+    const iqrStr = `${formatMetricValue(stats.mean, definition)}±${formatMetricValue(stats.stdDev, definition)}`;
     const cv = stats.mean > 0 ? (stats.stdDev / stats.mean) * 100 : 0;
+    const label = definition?.alarm ? `${metricName} ⚠` : metricName;
     return [
-      metricName.slice(0, LABEL_WIDTH).padStart(LABEL_WIDTH),
+      label.slice(0, LABEL_WIDTH).padStart(LABEL_WIDTH),
       cyan(iqrStr.padStart(STAT_WIDTH)),
       colorCV(cv),
       stats.outliers > 0 ? yellow(String(stats.outliers).padStart(4)) : dim('0'.padStart(4)),
@@ -214,7 +258,7 @@ function printMetricsTable(
   printTable(
     [
       { header: 'Metric', width: LABEL_WIDTH },
-      { header: 'Mean±σ (ms)', width: STAT_WIDTH },
+      { header: 'Mean±σ', width: STAT_WIDTH },
       { header: 'Var%', width: CV_WIDTH },
       { header: 'Out', width: 4 },
     ],
@@ -242,6 +286,8 @@ async function loadBaselineReport(baselinePath: string): Promise<BenchmarkBaseUp
 class BenchmarkReporter implements Reporter {
   private benchmarks: Record<string, BenchmarkReportEntry> = {};
 
+  private metricDefinitions: Record<string, MetricDefinition> = {};
+
   private outputPath: string;
 
   private upload: boolean;
@@ -266,14 +312,21 @@ class BenchmarkReporter implements Reporter {
 
     const meta = testCase.meta();
     const iterations = meta.benchmarkIterations;
+    const customMetrics = meta.benchmarkMetrics;
 
-    if (!iterations) {
+    if (!iterations && !customMetrics) {
       console.warn(yellow(`  No iterations recorded for: ${testCase.fullName}`));
       return;
     }
 
     const name = meta.benchmarkName ?? testCase.fullName;
-    const report = generateReportFromIterations(iterations);
+    const report = iterations
+      ? generateReportFromIterations(iterations)
+      : { iterations: 0, totalDuration: 0, renders: [], metrics: {} };
+
+    if (customMetrics) {
+      mergeCustomMetrics(report, customMetrics, this.metricDefinitions);
+    }
 
     this.benchmarks[name] = report;
 
@@ -284,7 +337,7 @@ class BenchmarkReporter implements Reporter {
 
     printDurationMatrix(`${name} — React`, report, summary);
 
-    printMetricsTable(name, report.metrics, report.iterations);
+    printMetricsTable(name, report.metrics, report.iterations, this.metricDefinitions);
   }
 
   async onTestRunEnd(): Promise<void> {
@@ -303,11 +356,14 @@ class BenchmarkReporter implements Reporter {
 
     const baseline = this.baselinePath ? await loadBaselineReport(this.baselinePath) : undefined;
 
+    const hasMetricDefinitions = Object.keys(this.metricDefinitions).length > 0;
+
     const results = {
       version: 1 as const,
       reportType: 'benchmark' as const,
       ...(await getCiMetadata()),
       report: this.benchmarks,
+      ...(hasMetricDefinitions ? { metricDefinitions: this.metricDefinitions } : {}),
       ...(baseline ? { base: baseline } : {}),
     };
 
diff --git a/packages/benchmark/src/stats.test.ts b/packages/benchmark/src/stats.test.ts
index 13c387122..cd6dd8b94 100644
--- a/packages/benchmark/src/stats.test.ts
+++ b/packages/benchmark/src/stats.test.ts
@@ -1,5 +1,5 @@
 import { describe, it, expect } from 'vitest';
-import { calculateMean, calculateStdDev, quantile, isOutlier } from './stats';
+import { calculateMean, calculateStdDev, quantile, isOutlier, aggregateSamples } from './stats';
 
 describe('calculateMean', () => {
   it('returns the mean of values', () => {
@@ -80,3 +80,24 @@ describe('isOutlier', () => {
     expect(isOutlier(35, 10, 20)).toBe(false);
   });
 });
+
+describe('aggregateSamples', () => {
+  it('computes the mean, standard deviation, and zero outliers for a clean series', () => {
+    const result = aggregateSamples([2, 4, 6]);
+    expect(result.mean).toBe(4);
+    expect(result.outliers).toBe(0);
+    expect(result.stdDev).toBeCloseTo(Math.sqrt(8 / 3));
+  });
+
+  it('removes IQR outliers before computing the mean', () => {
+    const result = aggregateSamples([10, 10, 10, 10, 10, 1000]);
+    expect(result.mean).toBe(10);
+    expect(result.outliers).toBe(1);
+  });
+
+  it('falls back to all values when filtering would remove everything', () => {
+    const result = aggregateSamples([5]);
+    expect(result.mean).toBe(5);
+    expect(result.outliers).toBe(0);
+  });
+});
diff --git a/packages/benchmark/src/stats.ts b/packages/benchmark/src/stats.ts
index d8d3e2a27..3d1ffca7f 100644
--- a/packages/benchmark/src/stats.ts
+++ b/packages/benchmark/src/stats.ts
@@ -27,3 +27,24 @@ export function isOutlier(value: number, q1: number, q3: number): boolean {
   const iqr = q3 - q1;
   return value < q1 - 1.5 * iqr || value > q3 + 1.5 * iqr;
 }
+
+/**
+ * Aggregates a series of samples into a mean, standard deviation, and outlier count using
+ * IQR-based outlier removal. Falls back to the raw values when filtering would remove
+ * everything. This is the shared aggregation core for custom metrics.
+ */
+export function aggregateSamples(values: number[]): {
+  mean: number;
+  stdDev: number;
+  outliers: number;
+} {
+  const sorted = [...values].sort((first, second) => first - second);
+  const q1 = quantile(sorted, 0.25);
+  const q3 = quantile(sorted, 0.75);
+  const filtered = values.filter((value) => !isOutlier(value, q1, q3));
+  const used = filtered.length > 0 ? filtered : values;
+
+  const mean = calculateMean(used);
+  const stdDev = calculateStdDev(used, mean);
+  return { mean, stdDev, outliers: values.length - used.length };
+}
diff --git a/packages/benchmark/src/taskMetaAugmentation.ts b/packages/benchmark/src/taskMetaAugmentation.ts
index 58d922d51..eee8aa740 100644
--- a/packages/benchmark/src/taskMetaAugmentation.ts
+++ b/packages/benchmark/src/taskMetaAugmentation.ts
@@ -1,10 +1,12 @@
 /// <reference types="vitest" />
 
-import type { IterationData } from './types';
+import type { IterationData, MetricReport } from './types';
 
 declare module 'vitest' {
   interface TaskMeta {
     benchmarkName?: string;
     benchmarkIterations?: IterationData[];
+    /** Custom metrics recorded via `ScalarMetric`/`DiscreteMetric`, keyed by metric name. */
+    benchmarkMetrics?: Record<string, MetricReport>;
   }
 }
diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts
index 3156cc275..e2a530ece 100644
--- a/packages/benchmark/src/types.ts
+++ b/packages/benchmark/src/types.ts
@@ -34,3 +34,60 @@ export interface InteractionContext {
    */
   waitForElementTiming: (identifier: string, timeout?: number) => Promise<void>;
 }
+
+/**
+ * Whether a custom metric measures a continuous value or a discrete count.
+ * - `scalar` — continuous measurements (timings, sizes); compared with a relative noise band.
+ * - `discrete` — counts/events; compared as exact integers.
+ */
+export type MetricKind = 'scalar' | 'discrete';
+
+/** Which direction of change counts as a regression for a metric in alarm mode. */
+export type MetricDirection = 'lowerIsBetter' | 'higherIsBetter';
+
+export interface MetricAlarm {
+  /** Defaults to `lowerIsBetter`. */
+  direction?: MetricDirection;
+  /**
+   * Relative noise band (e.g. `0.1` = 10%) beyond which a change is flagged as a regression.
+   * Scalar metrics only — discrete metrics compare exactly and ignore this.
+   */
+  threshold?: number;
+}
+
+export interface MetricConfig {
+  name: string;
+  /** Display formatting applied by the reporter and dashboard via `Intl.NumberFormat`. */
+  format?: Intl.NumberFormatOptions;
+  /**
+   * Regression judgment. Its presence opts the metric into alarming; when omitted the metric
+   * is informational (the diff is shown but never flagged).
+   */
+  alarm?: MetricAlarm;
+}
+
+/** Aggregated stats for a single metric sub-series, as they cross the browser→runner boundary. */
+export interface MetricSampleStats {
+  mean: number;
+  stdDev: number;
+  outliers: number;
+  /** Number of recorded samples (used to derive iteration counts; stripped from the report). */
+  count: number;
+}
+
+/** A custom metric's aggregated data attached to `task.meta`, keyed by metric name. */
+export interface MetricReport {
+  kind: MetricKind;
+  config: MetricConfig;
+  /** Aggregated stats keyed by sub-series id (`''` is the base series). */
+  series: Record<string, MetricSampleStats>;
+}
+
+/** Per-metric configuration hoisted to the top level of the report (keyed by metric name). */
+export interface MetricDefinition {
+  kind: MetricKind;
+  format?: Intl.NumberFormatOptions;
+  alarm?: MetricAlarm;
+  /** Reserved for the future render/paint migration; unused today. */
+  group?: string;
+}
diff --git a/test/performance/tests/metric.bench.tsx b/test/performance/tests/metric.bench.tsx
new file mode 100644
index 000000000..c1a4508ed
--- /dev/null
+++ b/test/performance/tests/metric.bench.tsx
@@ -0,0 +1,44 @@
+import { it } from 'vitest';
+import { ScalarMetric, DiscreteMetric } from '@mui/internal-benchmark';
+
+function fib(n: number): number {
+  return n < 2 ? n : fib(n - 1) + fib(n - 2);
+}
+
+// A scalar timing metric with an alarm: regressions beyond 25% are flagged.
+const duration = new ScalarMetric({
+  name: 'fib_duration',
+  format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
+  alarm: { direction: 'lowerIsBetter', threshold: 0.25 },
+});
+
+// A discrete count metric (informational), compared as an exact integer.
+const evenResults = new DiscreteMetric({ name: 'fib_even_results' });
+
+it('custom scalar + discrete metrics', () => {
+  for (let run = 0; run < 50; run += 1) {
+    duration.time();
+    const result = fib(22);
+    duration.timeEnd();
+
+    evenResults.record(result % 2 === 0 ? 1 : 0);
+  }
+});
+
+// A second scalar metric demonstrating labeled sub-series via `time`/`timeEnd` labels.
+const phases = new ScalarMetric({
+  name: 'fib_phase',
+  format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
+});
+
+it('sub-series via labels', () => {
+  for (let run = 0; run < 50; run += 1) {
+    phases.time('small');
+    fib(18);
+    phases.timeEnd('small'); // -> fib_phase#small
+
+    phases.time('large');
+    fib(24);
+    phases.timeEnd('large'); // -> fib_phase#large
+  }
+});

From 5fe6d878ddc34e4d28c2dfb8b90598dbb662819b Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 13:52:17 +0200
Subject: [PATCH 02/20] [benchmark] Simplify: reuse aggregateSamples, cache
 metric formatters

---
 .../BenchmarkComparisonReportView.tsx         | 24 +++++----------
 .../src/utils/formatters.ts                   | 26 ++++++++++++++++
 packages/benchmark/src/reporter.ts            | 30 ++++---------------
 3 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
index cae212d02..985733b95 100644
--- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
+++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
@@ -20,7 +20,12 @@ import {
   type BenchmarkDiffSeverity,
 } from '@/lib/benchmark/compareBenchmarkReports';
 import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types';
-import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters';
+import {
+  formatMs,
+  formatMetricNumber,
+  formatMetricDiff,
+  percentFormatter,
+} from '@/utils/formatters';
 
 const SEVERITY_COLOR: Record<BenchmarkDiffSeverity, string> = {
   error: 'error.main',
@@ -72,21 +77,6 @@ function computeEntryBar(
   return computeDiffBar(comparison.duration, range.min, range.max);
 }
 
-/** Formats a metric value with its `Intl.NumberFormatOptions`, falling back to milliseconds. */
-function formatMetricNumber(value: number, format?: Intl.NumberFormatOptions): string {
-  if (format) {
-    return new Intl.NumberFormat(undefined, format).format(value);
-  }
-  return formatMs(value);
-}
-
-function formatSignedDiff(value: number, format?: Intl.NumberFormatOptions): string {
-  if (format) {
-    return new Intl.NumberFormat(undefined, { signDisplay: 'exceptZero', ...format }).format(value);
-  }
-  return formatDiffMs(value);
-}
-
 function FormattedDiffMs({
   diff,
   percent = false,
@@ -101,7 +91,7 @@ function FormattedDiffMs({
   }
   return (
     <React.Fragment>
-      {formatSignedDiff(diff.absoluteDiff, format)}
+      {formatMetricDiff(diff.absoluteDiff, format)}
       {percent && (
         <React.Fragment>
           {' '}
diff --git a/apps/code-infra-dashboard/src/utils/formatters.ts b/apps/code-infra-dashboard/src/utils/formatters.ts
index 27dd11347..107514d2a 100644
--- a/apps/code-infra-dashboard/src/utils/formatters.ts
+++ b/apps/code-infra-dashboard/src/utils/formatters.ts
@@ -22,6 +22,32 @@ export function formatDiffMs(value: number): string {
   return `${sign}${durationFormatter.format(value)} ms`;
 }
 
+// `Intl.NumberFormat` construction is comparatively expensive, so cache one instance per unique
+// format spec — these formatters run for every metric cell on every render.
+const numberFormatterCache = new Map<string, Intl.NumberFormat>();
+
+function getNumberFormatter(format: Intl.NumberFormatOptions): Intl.NumberFormat {
+  const key = JSON.stringify(format);
+  let formatter = numberFormatterCache.get(key);
+  if (!formatter) {
+    formatter = new Intl.NumberFormat(undefined, format);
+    numberFormatterCache.set(key, formatter);
+  }
+  return formatter;
+}
+
+/** Formats a metric value with its `Intl.NumberFormatOptions`, falling back to milliseconds. */
+export function formatMetricNumber(value: number, format?: Intl.NumberFormatOptions): string {
+  return format ? getNumberFormatter(format).format(value) : formatMs(value);
+}
+
+/** Formats a signed metric diff with its format, falling back to a millisecond diff. */
+export function formatMetricDiff(value: number, format?: Intl.NumberFormatOptions): string {
+  return format
+    ? getNumberFormatter({ signDisplay: 'exceptZero', ...format }).format(value)
+    : formatDiffMs(value);
+}
+
 interface ColumnDefinition {
   field: string;
   header?: string;
diff --git a/packages/benchmark/src/reporter.ts b/packages/benchmark/src/reporter.ts
index 15b27d784..f76160ad8 100644
--- a/packages/benchmark/src/reporter.ts
+++ b/packages/benchmark/src/reporter.ts
@@ -4,15 +4,13 @@ import type { Reporter, TestCase } from 'vitest/node';
 import type { RenderEvent, IterationData, MetricReport, MetricDefinition } from './types';
 import type { BenchmarkBaseUpload, BenchmarkReportEntry } from './ciReport';
 import { benchmarkUploadSchema, getCiMetadata } from './ciReport';
-import { calculateMean, calculateStdDev, quantile, isOutlier } from './stats';
+import { calculateMean, aggregateSamples } from './stats';
 import { dim, red, green, yellow, cyan, printTable, fileUrl } from './format';
 import { uploadCiReport } from './upload';
 import { syncPrComment } from './syncPrComment';
 // Import for TaskMeta augmentation side effect
 import './taskMetaAugmentation';
 
-const byNumeric = (a: number, b: number) => a - b;
-
 function getEventKey(event: RenderEvent): string {
   return `${event.id}:${event.phase}`;
 }
@@ -35,16 +33,7 @@ function aggregateMetrics(
 
   const result: Record<string, { mean: number; stdDev: number; outliers: number }> = {};
   for (const [name, values] of metricValues) {
-    // Apply IQR filtering
-    const sorted = [...values].sort(byNumeric);
-    const q1 = quantile(sorted, 0.25);
-    const q3 = quantile(sorted, 0.75);
-    const filtered = values.filter((d) => !isOutlier(d, q1, q3));
-    const used = filtered.length > 0 ? filtered : values;
-
-    const mean = calculateMean(used);
-    const stdDev = calculateStdDev(used, mean);
-    result[name] = { mean, stdDev, outliers: values.length - used.length };
+    result[name] = aggregateSamples(values);
   }
   return result;
 }
@@ -74,14 +63,7 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep
   for (let index = 0; index < expectedLength; index += 1) {
     const durations = iterations.map((iteration) => iteration.renders[index].actualDuration);
 
-    const sorted = [...durations].sort(byNumeric);
-    const q1 = quantile(sorted, 0.25);
-    const q3 = quantile(sorted, 0.75);
-    const filtered = durations.filter((d) => !isOutlier(d, q1, q3));
-    const used = filtered.length > 0 ? filtered : durations;
-
-    const iqrMean = calculateMean(used);
-    const iqrStdDev = calculateStdDev(used, iqrMean);
+    const { mean: iqrMean, stdDev: iqrStdDev, outliers } = aggregateSamples(durations);
     const coefficientOfVariation = iqrMean > 0 ? iqrStdDev / iqrMean : 0;
 
     if (iqrMean > 1 && coefficientOfVariation > 0.1) {
@@ -96,7 +78,7 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep
       event: firstIteration.renders[index],
       iqrMean,
       iqrStdDev,
-      outliers: durations.length - used.length,
+      outliers,
     });
   }
 
@@ -222,8 +204,8 @@ function mergeCustomMetrics(
     }
     definitions[metricName] = {
       kind: metric.kind,
-      ...(metric.config.format ? { format: metric.config.format } : {}),
-      ...(metric.config.alarm ? { alarm: metric.config.alarm } : {}),
+      format: metric.config.format,
+      alarm: metric.config.alarm,
     };
   }
   if (report.iterations === 0) {

From f51afe2d1b6ab3bf4c4cbbec6338aa3e163ad3c4 Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 13:58:59 +0200
Subject: [PATCH 03/20] [benchmark] Drop unused reserved 'group' field from
 MetricDefinition

---
 apps/code-infra-dashboard/src/lib/benchmark/types.ts | 1 -
 packages/benchmark/src/ciReport.ts                   | 1 -
 packages/benchmark/src/types.ts                      | 2 --
 3 files changed, 4 deletions(-)

diff --git a/apps/code-infra-dashboard/src/lib/benchmark/types.ts b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
index 5935592e1..c1b9cf7ad 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/types.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
@@ -43,7 +43,6 @@ export interface MetricDefinition {
   kind: 'scalar' | 'discrete';
   format?: Intl.NumberFormatOptions;
   alarm?: MetricAlarm;
-  group?: string;
 }
 
 export interface BenchmarkBaseUpload {
diff --git a/packages/benchmark/src/ciReport.ts b/packages/benchmark/src/ciReport.ts
index a8d832645..6a59ea4cb 100644
--- a/packages/benchmark/src/ciReport.ts
+++ b/packages/benchmark/src/ciReport.ts
@@ -75,7 +75,6 @@ const metricDefinitionSchema = z.object({
       threshold: z.number().optional(),
     })
     .optional(),
-  group: z.string().optional(),
 });
 
 const benchmarkBaseUploadSchema = ciReportUploadSchema(
diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts
index e2a530ece..91a6b0856 100644
--- a/packages/benchmark/src/types.ts
+++ b/packages/benchmark/src/types.ts
@@ -88,6 +88,4 @@ export interface MetricDefinition {
   kind: MetricKind;
   format?: Intl.NumberFormatOptions;
   alarm?: MetricAlarm;
-  /** Reserved for the future render/paint migration; unused today. */
-  group?: string;
 }

From 0eeb9915fe00a5ae2050c83ef3c6708bd38c5eb2 Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 14:35:58 +0200
Subject: [PATCH 04/20] [benchmark] Add warning severity tier to custom-metric
 alarms

---
 .../BenchmarkComparisonReportView.tsx         |  2 +
 .../src/lib/benchmark/buildMarkdownReport.ts  |  1 +
 .../benchmark/compareBenchmarkReports.test.ts | 54 +++++++++++++++++--
 .../lib/benchmark/compareBenchmarkReports.ts  | 53 +++++++++++++-----
 .../src/lib/benchmark/types.ts                |  5 +-
 packages/benchmark/README.md                  |  6 ++-
 packages/benchmark/src/ciReport.ts            |  3 +-
 packages/benchmark/src/reporter.test.ts       |  7 ++-
 packages/benchmark/src/types.ts               | 12 +++--
 test/performance/tests/metric.bench.tsx       |  4 +-
 10 files changed, 118 insertions(+), 29 deletions(-)

diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
index 985733b95..7b10306c0 100644
--- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
+++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
@@ -29,6 +29,7 @@ import {
 
 const SEVERITY_COLOR: Record<BenchmarkDiffSeverity, string> = {
   error: 'error.main',
+  warning: 'warning.main',
   success: 'success.main',
   neutral: 'text.secondary',
 };
@@ -37,6 +38,7 @@ const MIN_BAR_WIDTH_PX = 4;
 
 const DIFF_BAR_COLORS: Record<BenchmarkDiffSeverity, string> = {
   error: 'var(--mui-palette-error-main)',
+  warning: 'var(--mui-palette-warning-main)',
   success: 'var(--mui-palette-success-main)',
   neutral: 'var(--mui-palette-action-disabled)',
 };
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
index c21135cc2..d96375004 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
@@ -8,6 +8,7 @@ export interface BuildMarkdownReportOptions {
 
 const SEVERITY_PREFIX: Record<string, string> = {
   error: '🔺',
+  warning: '⚠️',
   success: '▼',
 };
 
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
index c666fe82a..72904e588 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
@@ -16,10 +16,12 @@ function reportWithMetrics(metrics: Record<string, number>): BenchmarkReport {
 }
 
 const definitions: Record<string, MetricDefinition> = {
-  scalar_alarm: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', threshold: 0.1 } },
-  scalar_higher: { kind: 'scalar', alarm: { direction: 'higherIsBetter', threshold: 0.1 } },
+  scalar_alarm: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', error: 0.1 } },
+  scalar_tiered: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 } },
+  scalar_higher: { kind: 'scalar', alarm: { direction: 'higherIsBetter', error: 0.1 } },
   scalar_info: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } },
   discrete_alarm: { kind: 'discrete', alarm: { direction: 'lowerIsBetter' } },
+  discrete_tiered: { kind: 'discrete', alarm: { direction: 'lowerIsBetter', warn: 1, error: 3 } },
 };
 
 function metricEntry(current: BenchmarkReport, base: BenchmarkReport, metricName: string) {
@@ -220,9 +222,9 @@ describe('compareBenchmarkReports', () => {
       expect(metric.format).toEqual({ style: 'unit', unit: 'byte' });
     });
 
-    it('flags a scalar alarm regression beyond its own threshold', () => {
+    it('flags a scalar alarm regression beyond its error band', () => {
       const metric = metricEntry(
-        reportWithMetrics({ scalar_alarm: 120 }), // +20%, threshold is 10%
+        reportWithMetrics({ scalar_alarm: 120 }), // +20%, error band is 10%
         reportWithMetrics({ scalar_alarm: 100 }),
         'scalar_alarm',
       );
@@ -230,6 +232,50 @@ describe('compareBenchmarkReports', () => {
       expect(metric.diff.hint).toContain('Regression');
     });
 
+    it('flags a scalar regression between warn and error bands as a warning', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_tiered: 115 }), // +15%: past warn (10%), within error (25%)
+        reportWithMetrics({ scalar_tiered: 100 }),
+        'scalar_tiered',
+      );
+      expect(metric.diff.severity).toBe('warning');
+      expect(metric.diff.hint).toContain('Warning');
+    });
+
+    it('escalates a scalar regression past the error band to error', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_tiered: 130 }), // +30%: past error (25%)
+        reportWithMetrics({ scalar_tiered: 100 }),
+        'scalar_tiered',
+      );
+      expect(metric.diff.severity).toBe('error');
+    });
+
+    it('keeps a scalar change within the warn band neutral', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_tiered: 105 }), // +5%: within warn
+        reportWithMetrics({ scalar_tiered: 100 }),
+        'scalar_tiered',
+      );
+      expect(metric.diff.severity).toBe('neutral');
+    });
+
+    it('tiers discrete metrics by absolute count delta, inclusive of the band', () => {
+      const warn = metricEntry(
+        reportWithMetrics({ discrete_tiered: 4 }), // +1: meets warn (1), below error (3)
+        reportWithMetrics({ discrete_tiered: 3 }),
+        'discrete_tiered',
+      );
+      expect(warn.diff.severity).toBe('warning');
+
+      const error = metricEntry(
+        reportWithMetrics({ discrete_tiered: 6 }), // +3: meets error (3)
+        reportWithMetrics({ discrete_tiered: 3 }),
+        'discrete_tiered',
+      );
+      expect(error.diff.severity).toBe('error');
+    });
+
     it('honors higherIsBetter so an increase is an improvement', () => {
       const metric = metricEntry(
         reportWithMetrics({ scalar_higher: 120 }),
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
index 810f46748..201ac0ced 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
@@ -3,7 +3,7 @@ import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition, RenderSta
 
 const NOISE_THRESHOLD = 0.2;
 
-export type BenchmarkDiffSeverity = 'error' | 'success' | 'neutral';
+export type BenchmarkDiffSeverity = 'error' | 'warning' | 'success' | 'neutral';
 
 export interface DiffValue {
   current: number | null;
@@ -36,8 +36,9 @@ export interface ComparisonItem {
 
 const SEVERITY_RANK: Record<BenchmarkDiffSeverity, number> = {
   error: 0,
-  success: 1,
-  neutral: 2,
+  warning: 1,
+  success: 2,
+  neutral: 3,
 };
 
 export interface BenchmarkComparisonReport {
@@ -162,10 +163,11 @@ function baseMetricName(key: string): string {
 }
 
 /**
- * Diff for a custom metric, honoring its definition: a metric without an `alarm` is
- * informational (always neutral), a `discrete` metric compares exactly, and a `scalar` metric
- * uses its own noise band and direction. Metrics without a definition (e.g. paint) keep the
- * default `makeDiffValue` behavior and never reach this function.
+ * Diff for a custom metric, honoring its definition. A metric without an `alarm` is
+ * informational (always neutral). With an `alarm`, a regression past the `warn` band is flagged
+ * `warning` and past the `error` band `error`; improvements are `success`. Bands are relative
+ * fractions for `scalar` metrics and absolute count deltas for `discrete` metrics. Metrics
+ * without a definition (e.g. paint) keep the default `makeDiffValue` behavior and never reach here.
  */
 function makeMetricDiff(
   current: number | null,
@@ -198,24 +200,47 @@ function makeMetricDiff(
     };
   }
 
-  const withinNoise = isDiscrete
-    ? absoluteDiff === 0
-    : Math.abs(relativeDiff) <= (alarm.threshold ?? NOISE_THRESHOLD);
+  // Scalar bands are relative fractions; discrete bands are absolute count deltas, and meet their
+  // threshold inclusively (an `error: 2` discrete alarm fires on a delta of exactly 2).
+  const magnitude = isDiscrete ? Math.abs(absoluteDiff) : Math.abs(relativeDiff);
+  const meets = (band: number) => (isDiscrete ? magnitude >= band : magnitude > band);
+
+  // When no bands are given, `error` falls back to the global noise band (scalar) or any change
+  // (discrete). When only `warn` is given, there is no error band.
+  const defaultError = isDiscrete ? 0 : NOISE_THRESHOLD;
+  const errorBand = alarm.error ?? (alarm.warn === undefined ? defaultError : undefined);
+  const warnBand = alarm.warn;
+
+  let level: 'none' | 'warn' | 'error' = 'none';
+  if (absoluteDiff !== 0) {
+    if (errorBand !== undefined && meets(errorBand)) {
+      level = 'error';
+    } else if (warnBand !== undefined && meets(warnBand)) {
+      level = 'warn';
+    }
+  }
+
   const direction = alarm.direction ?? 'lowerIsBetter';
   const isRegression = direction === 'lowerIsBetter' ? absoluteDiff > 0 : absoluteDiff < 0;
 
   let severity: BenchmarkDiffSeverity = 'neutral';
-  if (!withinNoise && absoluteDiff !== 0) {
-    severity = isRegression ? 'error' : 'success';
+  if (level !== 'none') {
+    if (!isRegression) {
+      severity = 'success';
+    } else {
+      severity = level === 'error' ? 'error' : 'warning';
+    }
   }
 
   let hint: string;
   if (absoluteDiff === 0) {
     hint = 'No change';
-  } else if (withinNoise) {
+  } else if (level === 'none') {
     hint = `Within noise: ${diffStr}`;
+  } else if (!isRegression) {
+    hint = `Improvement: ${diffStr}`;
   } else {
-    hint = `${isRegression ? 'Regression' : 'Improvement'}: ${diffStr}`;
+    hint = `${level === 'error' ? 'Regression' : 'Warning'}: ${diffStr}`;
   }
 
   return { current, base, absoluteDiff, relativeDiff, severity, hint };
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/types.ts b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
index c1b9cf7ad..85d0e5209 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/types.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
@@ -35,7 +35,10 @@ export type MetricDirection = 'lowerIsBetter' | 'higherIsBetter';
 
 export interface MetricAlarm {
   direction?: MetricDirection;
-  threshold?: number;
+  /** Softer band (relative fraction for scalar, absolute count delta for discrete). */
+  warn?: number;
+  /** Harder band; defaults to the global noise band when omitted. */
+  error?: number;
 }
 
 /** Per-metric config for custom metrics, hoisted to the top level of the report (keyed by name). */
diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index 3cfedb79f..6742b2926 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -106,7 +106,7 @@ import { ScalarMetric, DiscreteMetric } from '@mui/internal-benchmark';
 const duration = new ScalarMetric({
   name: 'work_duration',
   format: { style: 'unit', unit: 'millisecond' }, // Intl.NumberFormatOptions
-  alarm: { direction: 'lowerIsBetter', threshold: 0.1 }, // flag regressions over 10%
+  alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 }, // warn >10%, error >25%
 });
 
 const clicks = new DiscreteMetric({ name: 'button_clicks' });
@@ -130,7 +130,9 @@ A metric is declared once (typically at module scope) and reused across tests an
 - `format` — an [`Intl.NumberFormatOptions`](https://developer.mozilla.org/en-US/docs/Web/API/Intl/NumberFormat/NumberFormat) object used to display the value.
 - `alarm` — opts the metric into regression flagging. Omit it and the metric is informational (its diff is shown but never flagged). Holds:
   - `direction` — `'lowerIsBetter'` (default) or `'higherIsBetter'`.
-  - `threshold` — the relative noise band (e.g. `0.1` = 10%) beyond which a change is a regression. Scalar metrics only; discrete metrics compare exactly.
+  - `warn` — softer band; a regression past it is flagged as a warning.
+  - `error` — harder band; a regression past it is flagged as an error. Defaults to the dashboard's global noise band when omitted.
+  - Bands are relative fractions for scalar metrics (`0.1` = 10%) and absolute count deltas for discrete metrics (`1`, `2`). Either band is optional.
 
 #### Sub-series
 
diff --git a/packages/benchmark/src/ciReport.ts b/packages/benchmark/src/ciReport.ts
index 6a59ea4cb..1add86b94 100644
--- a/packages/benchmark/src/ciReport.ts
+++ b/packages/benchmark/src/ciReport.ts
@@ -72,7 +72,8 @@ const metricDefinitionSchema = z.object({
   alarm: z
     .object({
       direction: z.enum(['lowerIsBetter', 'higherIsBetter']).optional(),
-      threshold: z.number().optional(),
+      warn: z.number().optional(),
+      error: z.number().optional(),
     })
     .optional(),
 });
diff --git a/packages/benchmark/src/reporter.test.ts b/packages/benchmark/src/reporter.test.ts
index ec86476b5..f79630677 100644
--- a/packages/benchmark/src/reporter.test.ts
+++ b/packages/benchmark/src/reporter.test.ts
@@ -342,7 +342,7 @@ describe('BenchmarkReporter', () => {
                 config: {
                   name: 'fib_duration',
                   format: { maximumFractionDigits: 3 },
-                  alarm: { direction: 'lowerIsBetter', threshold: 0.25 },
+                  alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 },
                 },
                 series: { '': { mean: 1.5, stdDev: 0.1, outliers: 0, count: 50 } },
               },
@@ -379,7 +379,10 @@ describe('BenchmarkReporter', () => {
 
       // Config is hoisted once, keyed by metric name.
       expect(written.metricDefinitions).toMatchObject({
-        fib_duration: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', threshold: 0.25 } },
+        fib_duration: {
+          kind: 'scalar',
+          alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 },
+        },
         fib_phase: { kind: 'scalar' },
       });
 
diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts
index 91a6b0856..7ce69458d 100644
--- a/packages/benchmark/src/types.ts
+++ b/packages/benchmark/src/types.ts
@@ -49,10 +49,16 @@ export interface MetricAlarm {
   /** Defaults to `lowerIsBetter`. */
   direction?: MetricDirection;
   /**
-   * Relative noise band (e.g. `0.1` = 10%) beyond which a change is flagged as a regression.
-   * Scalar metrics only — discrete metrics compare exactly and ignore this.
+   * Softer band: a regression past `warn` (but within `error`) is flagged as a warning.
+   * Scalar metrics: a relative fraction (`0.1` = 10%). Discrete metrics: an absolute count delta.
    */
-  threshold?: number;
+  warn?: number;
+  /**
+   * Harder band: a regression past `error` is flagged as an error (the alarm). When omitted it
+   * defaults to the dashboard's global noise band.
+   * Scalar metrics: a relative fraction (`0.25` = 25%). Discrete metrics: an absolute count delta.
+   */
+  error?: number;
 }
 
 export interface MetricConfig {
diff --git a/test/performance/tests/metric.bench.tsx b/test/performance/tests/metric.bench.tsx
index c1a4508ed..ad5ff0899 100644
--- a/test/performance/tests/metric.bench.tsx
+++ b/test/performance/tests/metric.bench.tsx
@@ -5,11 +5,11 @@ function fib(n: number): number {
   return n < 2 ? n : fib(n - 1) + fib(n - 2);
 }
 
-// A scalar timing metric with an alarm: regressions beyond 25% are flagged.
+// A scalar timing metric with an alarm: regressions past 10% warn, past 25% error.
 const duration = new ScalarMetric({
   name: 'fib_duration',
   format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
-  alarm: { direction: 'lowerIsBetter', threshold: 0.25 },
+  alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 },
 });
 
 // A discrete count metric (informational), compared as an exact integer.

From b0d79507a944445a15c9e88b7cd8c74eae1e4ce0 Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 14:36:22 +0200
Subject: [PATCH 05/20] [benchmark] Reject two metrics sharing a name; add
 primitive unit tests

---
 packages/benchmark/src/Metric.test.ts | 33 +++++++++++++++++++++++++++
 packages/benchmark/src/Metric.ts      |  8 ++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 packages/benchmark/src/Metric.test.ts

diff --git a/packages/benchmark/src/Metric.test.ts b/packages/benchmark/src/Metric.test.ts
new file mode 100644
index 000000000..64b4d67a9
--- /dev/null
+++ b/packages/benchmark/src/Metric.test.ts
@@ -0,0 +1,33 @@
+import { describe, it, expect } from 'vitest';
+import { ScalarMetric } from './ScalarMetric';
+import { DiscreteMetric } from './DiscreteMetric';
+
+describe('Metric.record', () => {
+  it('throws when two different metrics share a name in the same test', () => {
+    const timing = new ScalarMetric({ name: 'shared' });
+    const count = new DiscreteMetric({ name: 'shared' });
+    timing.record(1);
+    expect(() => count.record(2)).toThrow(/share the name "shared"/);
+  });
+
+  it('allows the same instance to record many values under one name', () => {
+    const metric = new ScalarMetric({ name: 'reused' });
+    expect(() => {
+      metric.record(1);
+      metric.record(2, { id: 'sub' });
+    }).not.toThrow();
+  });
+});
+
+describe('ScalarMetric.timeEnd', () => {
+  it('throws when called without a matching time()', () => {
+    const metric = new ScalarMetric({ name: 'timer' });
+    expect(() => metric.timeEnd()).toThrow(/without a matching time/);
+  });
+
+  it('throws when the label does not match an open timer', () => {
+    const metric = new ScalarMetric({ name: 'labelled-timer' });
+    metric.time('a');
+    expect(() => metric.timeEnd('b')).toThrow(/without a matching time/);
+  });
+});
diff --git a/packages/benchmark/src/Metric.ts b/packages/benchmark/src/Metric.ts
index 921c10ba3..117b97ba1 100644
--- a/packages/benchmark/src/Metric.ts
+++ b/packages/benchmark/src/Metric.ts
@@ -5,6 +5,8 @@ import { aggregateSamples } from './stats';
 import './taskMetaAugmentation';
 
 interface SeriesAccumulator {
+  /** The instance that registered this name, used to reject two metrics sharing a name. */
+  owner: Metric;
   kind: MetricKind;
   config: MetricConfig;
   /** Raw samples per sub-series id (`''` is the base series), kept in browser memory only. */
@@ -76,8 +78,12 @@ export abstract class Metric {
 
     let entry = accumulator.get(this.name);
     if (!entry) {
-      entry = { kind: this.kind, config: this.config, series: new Map() };
+      entry = { owner: this, kind: this.kind, config: this.config, series: new Map() };
       accumulator.set(this.name, entry);
+    } else if (entry.owner !== this) {
+      throw new Error(
+        `Two metrics share the name "${this.name}". Metric names must be unique; reuse a single instance instead.`,
+      );
     }
 
     const seriesId = options?.id ?? '';

From 5a0a107febc6452d25d8e153273d3563afdb7f1e Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 14:41:21 +0200
Subject: [PATCH 06/20] [code-infra-dashboard] Extract props interfaces for
 FormattedDiffMs and DiffCell

---
 .../BenchmarkComparisonReportView.tsx         | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
index 7b10306c0..88e86af90 100644
--- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
+++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
@@ -79,15 +79,13 @@ function computeEntryBar(
   return computeDiffBar(comparison.duration, range.min, range.max);
 }
 
-function FormattedDiffMs({
-  diff,
-  percent = false,
-  format,
-}: {
+interface FormattedDiffMsProps {
   diff: DiffValue;
   percent?: boolean;
   format?: Intl.NumberFormatOptions;
-}) {
+}
+
+function FormattedDiffMs({ diff, percent = false, format }: FormattedDiffMsProps) {
   if (diff.absoluteDiff === 0) {
     return '\u2014';
   }
@@ -106,15 +104,13 @@ function FormattedDiffMs({
   );
 }
 
-function DiffCell({
-  diff,
-  sx,
-  format,
-}: {
+interface DiffCellProps {
   diff: DiffValue;
   sx?: object;
   format?: Intl.NumberFormatOptions;
-}) {
+}
+
+function DiffCell({ diff, sx, format }: DiffCellProps) {
   const color = SEVERITY_COLOR[diff.severity];
   return (
     <Tooltip title={diff.hint} arrow>

From 303669c4fdfc0ffc7f00d6a22bc8d40cbad30de2 Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 16:30:18 +0200
Subject: [PATCH 07/20] [benchmark] Migrate paint to the metric primitive;
 surface alarms in PR comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Record Element-Timing values into a single 'paint' ScalarMetric with a sub-series
per identifier (paint#default, …) and a default alarm; drop IterationData.metrics,
BenchmarkMetric, and the reporter's aggregateMetrics. On the dashboard, drop the
paintDefault totals special-case, add an 'N metrics' accordion indicator, and surface
error-level metric alarms in the PR comment (warnings stay dashboard-only).
---
 .../BenchmarkComparisonReportView.tsx         | 31 +++++-----
 .../src/components/DailyBenchmarkChart.tsx    |  2 +-
 .../lib/benchmark/buildMarkdownReport.test.ts | 59 ++++++++++++++++++
 .../src/lib/benchmark/buildMarkdownReport.ts  | 36 ++++++++---
 .../lib/benchmark/compareBenchmarkReports.ts  | 19 ------
 packages/benchmark/README.md                  |  4 +-
 packages/benchmark/src/index.tsx              | 25 +++++---
 packages/benchmark/src/reporter.test.ts       | 60 +------------------
 packages/benchmark/src/reporter.ts            | 27 +--------
 packages/benchmark/src/types.ts               |  8 ---
 10 files changed, 129 insertions(+), 142 deletions(-)

diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
index 88e86af90..b0487c30c 100644
--- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
+++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
@@ -266,24 +266,20 @@ function TotalsSummary({ totals }: { totals: BenchmarkComparisonReport['totals']
           </Typography>
         </Typography>
       </Tooltip>
-      {totals.paintDefault && (
-        <Tooltip title={totals.paintDefault.hint} arrow>
-          <Typography variant="body2">
-            <strong>Paint:</strong>{' '}
-            <Typography
-              component="span"
-              variant="body2"
-              sx={{ color: SEVERITY_COLOR[totals.paintDefault.severity] }}
-            >
-              <FormattedDiffMs diff={totals.paintDefault} percent />
-            </Typography>
-          </Typography>
-        </Tooltip>
-      )}
     </Box>
   );
 }
 
+function worstMetricSeverity(metrics: ComparisonItem['metrics']): BenchmarkDiffSeverity {
+  if (metrics.some((row) => row.diff.severity === 'error')) {
+    return 'error';
+  }
+  if (metrics.some((row) => row.diff.severity === 'warning')) {
+    return 'warning';
+  }
+  return 'neutral';
+}
+
 function BenchmarkAccordion({
   comparison,
   hasBase,
@@ -298,6 +294,8 @@ function BenchmarkAccordion({
   entryDiffRange: { min: number; max: number };
 }) {
   const renderCount = comparison.renders.filter((row) => !row.removed).length;
+  const metricCount = comparison.metrics.filter((row) => !row.removed).length;
+  const metricSeverity = worstMetricSeverity(comparison.metrics);
 
   const entryBar = computeEntryBar(comparison, hasBase, entryDiffRange);
 
@@ -365,6 +363,11 @@ function BenchmarkAccordion({
                 </Tooltip>
               )}
             </Typography>
+            {metricCount > 0 && (
+              <Typography variant="body2" sx={{ color: SEVERITY_COLOR[metricSeverity] }}>
+                {metricCount} metrics
+              </Typography>
+            )}
           </Box>
         </Box>
       </AccordionSummary>
diff --git a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
index 40e9a5d21..53d562dff 100644
--- a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
+++ b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
@@ -156,7 +156,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
         return entry.totalDuration;
       }
       if (chartMode === 'paint') {
-        return entry.metrics['paint:default']?.mean ?? null;
+        return entry.metrics['paint#default']?.mean ?? null;
       }
       return entry.renders.length;
     },
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
index f861d5aae..30d6fa221 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
@@ -1,8 +1,30 @@
 import { describe, it, expect } from 'vitest';
 import { buildBenchmarkMarkdownReport } from './buildMarkdownReport';
 import { compareBenchmarkReports } from './compareBenchmarkReports';
+import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition } from './types';
 import { makeReport, makeReportFromConfig } from './test-fixtures';
 
+// A single benchmark whose duration/renders are neutral, so the only signal is one metric.
+function metricReport(name: string, mean: number): BenchmarkReport {
+  const entry: BenchmarkReportEntry = {
+    iterations: 10,
+    totalDuration: 100,
+    renders: [],
+    metrics: { [name]: { mean, stdDev: 0, outliers: 0 } },
+  };
+  return { Bench: entry };
+}
+
+const alarmDefinitions: Record<string, MetricDefinition> = {
+  clicks: {
+    kind: 'discrete',
+    format: { maximumFractionDigits: 0 },
+    alarm: { direction: 'lowerIsBetter', error: 1 },
+  },
+  tti: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 } },
+  fps: { kind: 'scalar' }, // informational (no alarm)
+};
+
 describe('buildBenchmarkMarkdownReport', () => {
   it('drops within-noise rows but keeps significant regressions', () => {
     const report = compareBenchmarkReports(
@@ -126,4 +148,41 @@ describe('buildBenchmarkMarkdownReport', () => {
     expect(markdown).not.toContain('No significant changes');
     expect(markdown).toContain('| Test |');
   });
+
+  describe('metric alarms', () => {
+    it('surfaces an error-level metric, making its otherwise-neutral test significant', () => {
+      const report = compareBenchmarkReports(
+        metricReport('clicks', 5), // +2 vs 3, discrete error band 1
+        metricReport('clicks', 3),
+        alarmDefinitions,
+      );
+      const markdown = buildBenchmarkMarkdownReport(report);
+      expect(markdown).not.toContain('No significant changes');
+      expect(markdown).toContain('**Metric alarms**');
+      expect(markdown).toContain('| Bench | clicks |');
+      expect(markdown).toContain('🔺');
+    });
+
+    it('does not surface warning-level metrics (kept on the dashboard only)', () => {
+      const report = compareBenchmarkReports(
+        metricReport('tti', 115), // +15%: past warn (10%), within error (25%) -> warning
+        metricReport('tti', 100),
+        alarmDefinitions,
+      );
+      const markdown = buildBenchmarkMarkdownReport(report);
+      expect(markdown).toContain('No significant changes');
+      expect(markdown).not.toContain('Metric alarms');
+    });
+
+    it('ignores informational metrics — no alarm section, test stays within noise', () => {
+      const report = compareBenchmarkReports(
+        metricReport('fps', 200), // big change but no alarm configured
+        metricReport('fps', 100),
+        alarmDefinitions,
+      );
+      const markdown = buildBenchmarkMarkdownReport(report);
+      expect(markdown).toContain('No significant changes');
+      expect(markdown).not.toContain('Metric alarms');
+    });
+  });
 });
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
index d96375004..d33a6c403 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
@@ -1,5 +1,12 @@
-import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters';
-import type { BenchmarkComparisonReport, DiffValue } from './compareBenchmarkReports';
+import { formatMs, formatDiffMs, formatMetricDiff, percentFormatter } from '@/utils/formatters';
+import type {
+  BenchmarkComparisonReport,
+  ComparisonEntry,
+  DiffValue,
+} from './compareBenchmarkReports';
+
+// Only error-level alarms surface in the PR comment; warnings stay on the dashboard.
+const isAlarmed = (metric: ComparisonEntry): boolean => metric.diff.severity === 'error';
 
 export interface BuildMarkdownReportOptions {
   maxRows?: number;
@@ -43,11 +50,6 @@ export function buildBenchmarkMarkdownReport(
       `**Total duration:** ${formatMs(report.totals.duration.current ?? 0)}${formatDiff(report.totals.duration, 'ms')}`,
       `**Renders:** ${report.totals.renderCount.current ?? 0}${formatDiff(report.totals.renderCount, 'count')}`,
     ];
-    if (report.totals.paintDefault) {
-      totalParts.push(
-        `**Paint:** ${formatMs(report.totals.paintDefault.current ?? 0)}${formatDiff(report.totals.paintDefault, 'ms')}`,
-      );
-    }
     lines.push(totalParts.join(' | '));
     lines.push('');
   }
@@ -57,7 +59,8 @@ export function buildBenchmarkMarkdownReport(
       entry.duration.severity !== 'neutral' ||
       (entry.renderCount?.severity ?? 'neutral') !== 'neutral' ||
       entry.duration.current === null ||
-      entry.duration.base === null,
+      entry.duration.base === null ||
+      entry.metrics.some(isAlarmed),
   );
 
   const detailsLink = reportUrl ? `[details](${reportUrl})` : '';
@@ -100,5 +103,22 @@ export function buildBenchmarkMarkdownReport(
     lines.push(detailsLink);
   }
 
+  // Error-level metric alarms across all tests (independent of the test-table row cap).
+  const alarms = report.entries.flatMap((entry) =>
+    entry.metrics.filter(isAlarmed).map((metric) => ({ test: entry.name, metric })),
+  );
+  if (alarms.length > 0) {
+    lines.push('');
+    lines.push('**Metric alarms**');
+    lines.push('');
+    lines.push('| Test | Metric | Change |');
+    lines.push('|:-----|:-------|-------:|');
+    for (const { test, metric } of alarms) {
+      const prefix = SEVERITY_PREFIX[metric.diff.severity] ?? '';
+      const change = `${prefix} ${formatMetricDiff(metric.diff.absoluteDiff, metric.format)}`;
+      lines.push(`| ${test} | ${metric.name} | ${change} |`);
+    }
+  }
+
   return lines.join('\n');
 }
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
index 201ac0ced..7dc88b99f 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
@@ -47,7 +47,6 @@ export interface BenchmarkComparisonReport {
   totals: {
     duration: DiffValue;
     renderCount: DiffValue;
-    paintDefault: DiffValue | null;
   };
 }
 
@@ -336,9 +335,6 @@ export function compareBenchmarkReports(
   let totalBaseDuration = 0;
   let totalCurrentRenders = 0;
   let totalBaseRenders = 0;
-  let totalCurrentPaint = 0;
-  let totalBasePaint = 0;
-  let hasPaint = false;
 
   // Process current entries
   for (const [name, entry] of Object.entries(current)) {
@@ -360,14 +356,6 @@ export function compareBenchmarkReports(
     totalBaseDuration += baseEntry?.totalDuration ?? 0;
     totalCurrentRenders += entry.renders.length;
     totalBaseRenders += baseEntry?.renders.length ?? 0;
-
-    const paintMetric = entry.metrics['paint:default'];
-    const basePaintMetric = baseEntry?.metrics['paint:default'];
-    if (paintMetric || basePaintMetric) {
-      hasPaint = true;
-      totalCurrentPaint += paintMetric?.mean ?? 0;
-      totalBasePaint += basePaintMetric?.mean ?? 0;
-    }
   }
 
   // Process removed entries (in base but not in current)
@@ -387,12 +375,6 @@ export function compareBenchmarkReports(
 
     totalBaseDuration += baseEntry.totalDuration;
     totalBaseRenders += baseEntry.renders.length;
-
-    const basePaintMetric = baseEntry.metrics['paint:default'];
-    if (basePaintMetric) {
-      hasPaint = true;
-      totalBasePaint += basePaintMetric.mean;
-    }
   }
 
   entries.sort(compareItems);
@@ -403,7 +385,6 @@ export function compareBenchmarkReports(
     totals: {
       duration: makeDiffValue(totalCurrentDuration, totalBaseDuration),
       renderCount: makeCountDiffValue(totalCurrentRenders, totalBaseRenders),
-      paintDefault: hasPaint ? makeDiffValue(totalCurrentPaint, totalBasePaint) : null,
     },
   };
 }
diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index 6742b2926..2f52682ca 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -63,7 +63,7 @@ benchmark(
 
 ### Paint metrics
 
-By default, every benchmark captures a `paint:default` metric — the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically.
+By default, every benchmark captures a `paint#default` metric — the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically. Paint timings are recorded as sub-series of a single `paint` metric.
 
 You can track additional paint metrics by placing `<ElementTiming>` markers and awaiting them in an interaction callback. The component renders an invisible `<span>` that fires in the same paint frame as its surrounding content.
 
@@ -88,7 +88,7 @@ benchmark(
 );
 ```
 
-This produces a `paint:my-component` metric alongside the automatic `paint:default`.
+This produces a `paint#my-component` sub-series alongside the automatic `paint#default`.
 
 `waitForElementTiming` accepts an optional `timeout` in milliseconds (default: 5000). Pass `0` or `Infinity` to rely on the test timeout instead.
 
diff --git a/packages/benchmark/src/index.tsx b/packages/benchmark/src/index.tsx
index 41c65cbd9..f4e00bbc7 100644
--- a/packages/benchmark/src/index.tsx
+++ b/packages/benchmark/src/index.tsx
@@ -2,8 +2,9 @@ import * as React from 'react';
 import { expect, it } from 'vitest';
 import * as ReactDOMClient from 'react-dom/client'; // aliased to react-dom/profiling by Vite
 import * as ReactDOM from 'react-dom';
-import type { RenderEvent, BenchmarkMetric, IterationData, InteractionContext } from './types';
+import type { RenderEvent, IterationData, InteractionContext } from './types';
 import { ElementTiming } from './ElementTiming';
+import { ScalarMetric } from './ScalarMetric';
 // Import for TaskMeta augmentation side effect
 import './taskMetaAugmentation';
 
@@ -13,7 +14,7 @@ interface PerformanceElementTiming extends PerformanceEntry {
   readonly identifier: string;
 }
 
-export type { RenderEvent, BenchmarkMetric, IterationData, InteractionContext } from './types';
+export type { RenderEvent, IterationData, InteractionContext } from './types';
 export type {
   MetricKind,
   MetricDirection,
@@ -23,7 +24,7 @@ export type {
 } from './types';
 export { ElementTiming } from './ElementTiming';
 export { Metric, type MetricRecordOptions } from './Metric';
-export { ScalarMetric } from './ScalarMetric';
+export { ScalarMetric };
 export { DiscreteMetric } from './DiscreteMetric';
 
 function BenchProfiler({
@@ -97,6 +98,15 @@ export function benchmark(
     const totalRuns = warmupRuns + runs;
     const iterations: IterationData[] = [];
 
+    // Paint timings are recorded as one scalar metric with a sub-series per `elementtiming`
+    // identifier (`paint#default`, `paint#grid-header`, …), so they share a single definition.
+    // A default alarm keeps a >20% paint regression flagged, matching the previous behavior.
+    const paint = new ScalarMetric({
+      name: 'paint',
+      format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 2 },
+      alarm: { error: 0.2 },
+    });
+
     const hasElementTiming = supportsElementTiming();
 
     if (typeof window.gc !== 'function') {
@@ -207,11 +217,10 @@ export function benchmark(
       container.remove();
 
       if (!isWarmup) {
-        const metrics: BenchmarkMetric[] = elementEntries.map((entry) => ({
-          name: `paint:${entry.identifier}`,
-          value: entry.renderTime - iterationStart,
-        }));
-        iterations.push({ renders: captures, metrics });
+        for (const entry of elementEntries) {
+          paint.record(entry.renderTime - iterationStart, { id: entry.identifier });
+        }
+        iterations.push({ renders: captures });
       }
 
       if (options?.afterEach) {
diff --git a/packages/benchmark/src/reporter.test.ts b/packages/benchmark/src/reporter.test.ts
index f79630677..b2e089d9c 100644
--- a/packages/benchmark/src/reporter.test.ts
+++ b/packages/benchmark/src/reporter.test.ts
@@ -16,8 +16,8 @@ function event(
   return { id, phase, startTime, actualDuration };
 }
 
-function iteration(renders: RenderEvent[], metrics: IterationData['metrics'] = []): IterationData {
-  return { renders, metrics };
+function iteration(renders: RenderEvent[]): IterationData {
+  return { renders };
 }
 
 describe('generateReportFromIterations', () => {
@@ -126,42 +126,7 @@ describe('generateReportFromIterations', () => {
     expect(report.renders[0].outliers).toBe(1);
   });
 
-  it('aggregates metrics across iterations', () => {
-    const iterations = [
-      iteration(
-        [event('App', 'mount', 0, 10)],
-        [
-          { name: 'paint:bench', value: 60 },
-          { name: 'paint:grid', value: 55 },
-        ],
-      ),
-      iteration(
-        [event('App', 'mount', 0, 10)],
-        [
-          { name: 'paint:bench', value: 64 },
-          { name: 'paint:grid', value: 57 },
-        ],
-      ),
-      iteration(
-        [event('App', 'mount', 0, 10)],
-        [
-          { name: 'paint:bench', value: 62 },
-          { name: 'paint:grid', value: 56 },
-        ],
-      ),
-    ];
-    const report = generateReportFromIterations(iterations);
-
-    expect(report.metrics).toHaveProperty('paint:bench');
-    expect(report.metrics).toHaveProperty('paint:grid');
-    expect(report.metrics['paint:bench'].mean).toBeCloseTo(62, 0);
-    expect(report.metrics['paint:grid'].mean).toBeCloseTo(56, 0);
-    expect(report.metrics['paint:bench'].stdDev).toBeGreaterThanOrEqual(0);
-    expect(report.metrics['paint:grid'].stdDev).toBeGreaterThanOrEqual(0);
-    expect(report.metrics['paint:bench'].outliers).toBe(0);
-  });
-
-  it('returns empty metrics when no metrics are present', () => {
+  it('does not aggregate metrics from iterations (paint flows through benchmarkMetrics)', () => {
     const iterations = [
       iteration([event('App', 'mount', 0, 10)]),
       iteration([event('App', 'mount', 0, 12)]),
@@ -170,25 +135,6 @@ describe('generateReportFromIterations', () => {
 
     expect(report.metrics).toEqual({});
   });
-
-  it('handles multiple metric identifiers with different counts', () => {
-    const iterations = [
-      iteration(
-        [event('App', 'mount', 0, 10)],
-        [
-          { name: 'paint:bench', value: 60 },
-          { name: 'paint:header', value: 50 },
-        ],
-      ),
-      iteration([event('App', 'mount', 0, 10)], [{ name: 'paint:bench', value: 64 }]),
-    ];
-    const report = generateReportFromIterations(iterations);
-
-    expect(report.metrics['paint:bench'].mean).toBeCloseTo(62, 0);
-    // paint:header only has 1 data point
-    expect(report.metrics['paint:header'].mean).toBe(50);
-    expect(report.metrics['paint:header'].stdDev).toBe(0);
-  });
 });
 
 function mockTestCase(options: {
diff --git a/packages/benchmark/src/reporter.ts b/packages/benchmark/src/reporter.ts
index f76160ad8..64b10c18c 100644
--- a/packages/benchmark/src/reporter.ts
+++ b/packages/benchmark/src/reporter.ts
@@ -15,29 +15,6 @@ function getEventKey(event: RenderEvent): string {
   return `${event.id}:${event.phase}`;
 }
 
-function aggregateMetrics(
-  iterations: IterationData[],
-): Record<string, { mean: number; stdDev: number; outliers: number }> {
-  // Collect all metric names across iterations
-  const metricValues = new Map<string, number[]>();
-  for (const iteration of iterations) {
-    for (const metric of iteration.metrics) {
-      let values = metricValues.get(metric.name);
-      if (!values) {
-        values = [];
-        metricValues.set(metric.name, values);
-      }
-      values.push(metric.value);
-    }
-  }
-
-  const result: Record<string, { mean: number; stdDev: number; outliers: number }> = {};
-  for (const [name, values] of metricValues) {
-    result[name] = aggregateSamples(values);
-  }
-  return result;
-}
-
 function generateReportFromIterations(iterations: IterationData[]): BenchmarkReportEntry {
   if (iterations.length === 0) {
     return { iterations: 0, totalDuration: 0, renders: [], metrics: {} };
@@ -112,8 +89,8 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep
     totalDuration += iqrMean;
   }
 
-  // Aggregate metrics
-  const metrics = aggregateMetrics(iterations);
+  // Custom + paint metrics are merged separately from `task.meta.benchmarkMetrics`.
+  const metrics = {};
 
   return {
     iterations: iterationCount,
diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts
index 7ce69458d..c7ff5576b 100644
--- a/packages/benchmark/src/types.ts
+++ b/packages/benchmark/src/types.ts
@@ -14,16 +14,8 @@ export interface RenderEvent {
   startTime: number;
 }
 
-export interface BenchmarkMetric {
-  /** Metric name, e.g. "paint:bench", "paint:grid-header" */
-  name: string;
-  /** Measured value in ms */
-  value: number;
-}
-
 export interface IterationData {
   renders: RenderEvent[];
-  metrics: BenchmarkMetric[];
 }
 
 export interface InteractionContext {

From a2daaa8294c4ca0ba2de97c661b5bfa541b46cab Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 16:57:49 +0200
Subject: [PATCH 08/20] [benchmark] Namespace harness paint as bench:paint with
 default base series

---
 .../src/components/DailyBenchmarkChart.tsx          |  2 +-
 packages/benchmark/README.md                        |  4 ++--
 packages/benchmark/src/index.tsx                    | 13 ++++++++-----
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
index 53d562dff..6ce870982 100644
--- a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
+++ b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
@@ -156,7 +156,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
         return entry.totalDuration;
       }
       if (chartMode === 'paint') {
-        return entry.metrics['paint#default']?.mean ?? null;
+        return entry.metrics['bench:paint']?.mean ?? null;
       }
       return entry.renders.length;
     },
diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index 2f52682ca..88e0efc20 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -63,7 +63,7 @@ benchmark(
 
 ### Paint metrics
 
-By default, every benchmark captures a `paint#default` metric — the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically. Paint timings are recorded as sub-series of a single `paint` metric.
+By default, every benchmark captures a `bench:paint` metric — the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically. The harness owns the `bench:` namespace, so avoid it for your own metric names.
 
 You can track additional paint metrics by placing `<ElementTiming>` markers and awaiting them in an interaction callback. The component renders an invisible `<span>` that fires in the same paint frame as its surrounding content.
 
@@ -88,7 +88,7 @@ benchmark(
 );
 ```
 
-This produces a `paint#my-component` sub-series alongside the automatic `paint#default`.
+This produces a `bench:paint#my-component` sub-series alongside the automatic `bench:paint`.
 
 `waitForElementTiming` accepts an optional `timeout` in milliseconds (default: 5000). Pass `0` or `Infinity` to rely on the test timeout instead.
 
diff --git a/packages/benchmark/src/index.tsx b/packages/benchmark/src/index.tsx
index f4e00bbc7..a9a91d589 100644
--- a/packages/benchmark/src/index.tsx
+++ b/packages/benchmark/src/index.tsx
@@ -98,11 +98,12 @@ export function benchmark(
     const totalRuns = warmupRuns + runs;
     const iterations: IterationData[] = [];
 
-    // Paint timings are recorded as one scalar metric with a sub-series per `elementtiming`
-    // identifier (`paint#default`, `paint#grid-header`, …), so they share a single definition.
-    // A default alarm keeps a >20% paint regression flagged, matching the previous behavior.
+    // Paint timings are recorded as one harness-owned `bench:paint` metric: the default sentinel
+    // is the base series (`bench:paint`) and named `elementtiming` markers are sub-series
+    // (`bench:paint#grid-header`, …), all sharing a single definition. A default alarm keeps a
+    // >20% paint regression flagged, matching the previous behavior.
     const paint = new ScalarMetric({
-      name: 'paint',
+      name: 'bench:paint',
       format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 2 },
       alarm: { error: 0.2 },
     });
@@ -218,7 +219,9 @@ export function benchmark(
 
       if (!isWarmup) {
         for (const entry of elementEntries) {
-          paint.record(entry.renderTime - iterationStart, { id: entry.identifier });
+          // The default sentinel is the base series; named markers become sub-series.
+          const id = entry.identifier === 'default' ? undefined : entry.identifier;
+          paint.record(entry.renderTime - iterationStart, id !== undefined ? { id } : undefined);
         }
         iterations.push({ renders: captures });
       }

From c22bb9ee244f840dcacb4b7b0baab3527fab113d Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 17:18:30 +0200
Subject: [PATCH 09/20] [code-infra-dashboard] Migrate legacy paint keys on
 fetch; fix metric diff formatting

Address PR review: centralize report migrations in a migrateBenchmarkReport()
applied at the fetch boundary (renames legacy paint:* keys to bench:paint), make
scalar diff hints respect the metric's format instead of hard-coding ms, and
enforce the sign in formatMetricDiff.
---
 .../benchmark/compareBenchmarkReports.test.ts |  3 ++
 .../lib/benchmark/compareBenchmarkReports.ts  |  4 +-
 .../benchmark/migrateBenchmarkReport.test.ts  | 40 +++++++++++++++++++
 .../lib/benchmark/migrateBenchmarkReport.ts   | 35 ++++++++++++++++
 .../src/utils/fetchCiReport.ts                | 20 ++++++----
 .../src/utils/formatters.ts                   |  3 +-
 6 files changed, 95 insertions(+), 10 deletions(-)
 create mode 100644 apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts
 create mode 100644 apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts

diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
index 72904e588..13bfbf58c 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
@@ -220,6 +220,9 @@ describe('compareBenchmarkReports', () => {
       );
       expect(metric.diff.severity).toBe('neutral');
       expect(metric.format).toEqual({ style: 'unit', unit: 'byte' });
+      // The hint respects the metric's format rather than hard-coding milliseconds.
+      expect(metric.diff.hint).toContain('byte');
+      expect(metric.diff.hint).not.toContain('ms');
     });
 
     it('flags a scalar alarm regression beyond its error band', () => {
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
index 7dc88b99f..88bf43783 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
@@ -1,4 +1,4 @@
-import { formatDiffMs, percentFormatter } from '@/utils/formatters';
+import { formatDiffMs, formatMetricDiff, percentFormatter } from '@/utils/formatters';
 import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition, RenderStats } from './types';
 
 const NOISE_THRESHOLD = 0.2;
@@ -185,7 +185,7 @@ function makeMetricDiff(
   const isDiscrete = kind === 'discrete';
   const diffStr = isDiscrete
     ? `${absoluteDiff > 0 ? '+' : ''}${absoluteDiff}`
-    : `${formatDiffMs(absoluteDiff)} (${percentFormatter.format(relativeDiff)})`;
+    : `${formatMetricDiff(absoluteDiff, definition.format)} (${percentFormatter.format(relativeDiff)})`;
 
   // Informational metric: show the change, never flag it.
   if (!alarm) {
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts
new file mode 100644
index 000000000..538b2537b
--- /dev/null
+++ b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts
@@ -0,0 +1,40 @@
+import { describe, it, expect } from 'vitest';
+import { migrateBenchmarkReport } from './migrateBenchmarkReport';
+import type { BenchmarkReport } from './types';
+
+function report(metrics: Record<string, number>): BenchmarkReport {
+  return {
+    Bench: {
+      iterations: 10,
+      totalDuration: 0,
+      renders: [],
+      metrics: Object.fromEntries(
+        Object.entries(metrics).map(([name, mean]) => [name, { mean, stdDev: 0, outliers: 0 }]),
+      ),
+    },
+  };
+}
+
+describe('migrateBenchmarkReport', () => {
+  it('renames legacy paint:default to bench:paint', () => {
+    const migrated = migrateBenchmarkReport(report({ 'paint:default': 12 }));
+    expect(Object.keys(migrated.Bench.metrics)).toEqual(['bench:paint']);
+    expect(migrated.Bench.metrics['bench:paint'].mean).toBe(12);
+  });
+
+  it('renames named legacy paint keys to bench:paint sub-series', () => {
+    const migrated = migrateBenchmarkReport(report({ 'paint:grid-header': 8 }));
+    expect(Object.keys(migrated.Bench.metrics)).toEqual(['bench:paint#grid-header']);
+  });
+
+  it('leaves current and custom metric keys untouched (idempotent)', () => {
+    const migrated = migrateBenchmarkReport(
+      report({ 'bench:paint': 5, 'bench:paint#header': 3, button_clicks: 2 }),
+    );
+    expect(Object.keys(migrated.Bench.metrics).sort()).toEqual([
+      'bench:paint',
+      'bench:paint#header',
+      'button_clicks',
+    ]);
+  });
+});
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts
new file mode 100644
index 000000000..733342348
--- /dev/null
+++ b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts
@@ -0,0 +1,35 @@
+import type { BenchmarkReport, BenchmarkReportEntry } from './types';
+
+const LEGACY_PAINT_PREFIX = 'paint:';
+
+/**
+ * Renames legacy harness paint keys (`paint:default`, `paint:grid-header`) to the current
+ * `bench:paint` scheme (`bench:paint`, `bench:paint#grid-header`). Idempotent: keys already in the
+ * new scheme are untouched.
+ */
+function migrateMetricKey(key: string): string {
+  if (!key.startsWith(LEGACY_PAINT_PREFIX)) {
+    return key;
+  }
+  const identifier = key.slice(LEGACY_PAINT_PREFIX.length);
+  return identifier === 'default' ? 'bench:paint' : `bench:paint#${identifier}`;
+}
+
+function migrateEntry(entry: BenchmarkReportEntry): BenchmarkReportEntry {
+  return {
+    ...entry,
+    metrics: Object.fromEntries(
+      Object.entries(entry.metrics).map(([key, stats]) => [migrateMetricKey(key), stats]),
+    ),
+  };
+}
+
+/**
+ * Applies forward migrations to a benchmark report fetched from S3 so older uploads read with the
+ * current shape. Centralizes legacy normalization — add future migrations here.
+ */
+export function migrateBenchmarkReport(report: BenchmarkReport): BenchmarkReport {
+  return Object.fromEntries(
+    Object.entries(report).map(([name, entry]) => [name, migrateEntry(entry)]),
+  );
+}
diff --git a/apps/code-infra-dashboard/src/utils/fetchCiReport.ts b/apps/code-infra-dashboard/src/utils/fetchCiReport.ts
index 67e96b349..2b2e7cfab 100644
--- a/apps/code-infra-dashboard/src/utils/fetchCiReport.ts
+++ b/apps/code-infra-dashboard/src/utils/fetchCiReport.ts
@@ -1,5 +1,6 @@
 import type { SizeSnapshotWithMetadata } from '@/lib/bundleSize/types';
 import type { BenchmarkReport, BenchmarkUpload } from '@/lib/benchmark/types';
+import { migrateBenchmarkReport } from '@/lib/benchmark/migrateBenchmarkReport';
 
 export interface CiReportTypes {
   'benchmark.json': BenchmarkUpload;
@@ -17,14 +18,19 @@ export type CiReportName = keyof CiReportTypes;
  * simply reasserts what the body already contains.
  */
 function normalizeBenchmarkArtifact(raw: unknown, repo: string, sha: string): BenchmarkUpload {
-  if (raw && typeof raw === 'object' && 'report' in raw) {
-    return { ...(raw as BenchmarkUpload), commitSha: sha, repo };
-  }
+  const upload: BenchmarkUpload =
+    raw && typeof raw === 'object' && 'report' in raw
+      ? { ...(raw as BenchmarkUpload), commitSha: sha, repo }
+      : ({ commitSha: sha, repo, report: raw as BenchmarkReport } as BenchmarkUpload);
+
+  // Apply forward migrations so older uploads read with the current metric naming.
   return {
-    commitSha: sha,
-    repo,
-    report: raw as BenchmarkReport,
-  } as BenchmarkUpload;
+    ...upload,
+    report: migrateBenchmarkReport(upload.report),
+    ...(upload.base
+      ? { base: { ...upload.base, report: migrateBenchmarkReport(upload.base.report) } }
+      : {}),
+  };
 }
 
 /**
diff --git a/apps/code-infra-dashboard/src/utils/formatters.ts b/apps/code-infra-dashboard/src/utils/formatters.ts
index 107514d2a..20266ac33 100644
--- a/apps/code-infra-dashboard/src/utils/formatters.ts
+++ b/apps/code-infra-dashboard/src/utils/formatters.ts
@@ -43,8 +43,9 @@ export function formatMetricNumber(value: number, format?: Intl.NumberFormatOpti
 
 /** Formats a signed metric diff with its format, falling back to a millisecond diff. */
 export function formatMetricDiff(value: number, format?: Intl.NumberFormatOptions): string {
+  // `signDisplay` is spread last so a diff always shows its sign, regardless of the metric's format.
   return format
-    ? getNumberFormatter({ signDisplay: 'exceptZero', ...format }).format(value)
+    ? getNumberFormatter({ ...format, signDisplay: 'exceptZero' }).format(value)
     : formatDiffMs(value);
 }
 

From 170450270d37ee0dcd9a541a1b6d17286cba491c Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 17:23:34 +0200
Subject: [PATCH 10/20] [benchmark] Reject conflicting metric definitions
 reused across benchmarks

---
 packages/benchmark/src/reporter.test.ts | 45 +++++++++++++++++++++++++
 packages/benchmark/src/reporter.ts      | 13 ++++++-
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/packages/benchmark/src/reporter.test.ts b/packages/benchmark/src/reporter.test.ts
index b2e089d9c..48e139ff8 100644
--- a/packages/benchmark/src/reporter.test.ts
+++ b/packages/benchmark/src/reporter.test.ts
@@ -370,5 +370,50 @@ describe('BenchmarkReporter', () => {
 
       consoleSpy.mockRestore();
     });
+
+    function metricCase(testName: string, metricName: string, alarm: unknown) {
+      return mockTestCase({
+        fullName: testName,
+        meta: {
+          benchmarkName: testName,
+          benchmarkMetrics: {
+            [metricName]: {
+              kind: 'scalar',
+              config: { name: metricName, alarm },
+              series: { '': { mean: 1, stdDev: 0, outliers: 0, count: 1 } },
+            },
+          },
+        },
+        state: 'passed',
+      });
+    }
+
+    it('allows reusing a metric name across benchmarks with identical config', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-reuse-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 }));
+      expect(() =>
+        reporter.onTestCaseResult(metricCase('b', 'shared', { error: 0.2 })),
+      ).not.toThrow();
+
+      consoleSpy.mockRestore();
+    });
+
+    it('throws when a metric name is reused with conflicting config across benchmarks', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-conflict-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 }));
+      expect(() => reporter.onTestCaseResult(metricCase('b', 'shared', { error: 0.9 }))).toThrow(
+        /conflicting/,
+      );
+
+      consoleSpy.mockRestore();
+    });
   });
 });
diff --git a/packages/benchmark/src/reporter.ts b/packages/benchmark/src/reporter.ts
index 64b10c18c..c0488529a 100644
--- a/packages/benchmark/src/reporter.ts
+++ b/packages/benchmark/src/reporter.ts
@@ -179,11 +179,22 @@ function mergeCustomMetrics(
       report.metrics[key] = { mean: stats.mean, stdDev: stats.stdDev, outliers: stats.outliers };
       maxCount = Math.max(maxCount, stats.count);
     }
-    definitions[metricName] = {
+    const definition: MetricDefinition = {
       kind: metric.kind,
       format: metric.config.format,
       alarm: metric.config.alarm,
     };
+    // A metric name maps to one definition. Reusing it across benchmarks is fine when the config
+    // matches (e.g. the harness `bench:paint`), but conflicting config would silently apply
+    // last-write-wins to every entry — reject it instead.
+    const existing = definitions[metricName];
+    if (existing && JSON.stringify(existing) !== JSON.stringify(definition)) {
+      throw new Error(
+        `Benchmark metric "${metricName}" is defined with conflicting configuration across ` +
+          `benchmarks. A metric name must map to a single kind, format, and alarm.`,
+      );
+    }
+    definitions[metricName] = definition;
   }
   if (report.iterations === 0) {
     report.iterations = maxCount;

From 6f49c006fc11dabc8428d37c8e5404853dfc30f9 Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Fri, 5 Jun 2026 18:40:19 +0200
Subject: [PATCH 11/20] [benchmark] Address review: reset reporter per run,
 robust conflict check, reject '#' names, merge base+head definitions

- Reporter resets accumulated state in onTestRunStart so watch re-runs start clean
  (an edited metric config no longer conflicts with its previous-run definition).
- Conflicting-definition check uses an order-insensitive comparison.
- Metric names containing '#' (the sub-series separator) are rejected at construction.
- Dashboard + PR markdown merge base and head metricDefinitions (head wins) so a
  base-only/removed metric keeps its formatting and alarm metadata.
- Clarify docs: error defaults to the global noise band only when both bands are omitted.
---
 .../src/components/DailyBenchmarkChart.tsx    |  6 +++-
 .../lib/benchmark/compareBenchmarkReports.ts  | 17 ++++++++++
 .../src/lib/benchmark/types.ts                |  2 +-
 .../src/lib/ciReports/benchmarkReport.ts      |  8 +++--
 .../src/views/BenchmarkDetails.tsx            |  6 +++-
 packages/benchmark/README.md                  |  2 +-
 packages/benchmark/src/Metric.test.ts         |  4 +++
 packages/benchmark/src/Metric.ts              |  5 +++
 packages/benchmark/src/reporter.test.ts       | 33 +++++++++++++++++++
 packages/benchmark/src/reporter.ts            | 26 ++++++++++++++-
 packages/benchmark/src/types.ts               |  5 +--
 11 files changed, 105 insertions(+), 9 deletions(-)

diff --git a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
index 6ce870982..7f92dd5dd 100644
--- a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
+++ b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
@@ -11,6 +11,7 @@ import Tab from '@mui/material/Tab';
 import { BarChartPro } from '@mui/x-charts-pro/BarChartPro';
 import { useXScale, useDrawingArea } from '@mui/x-charts-pro/hooks';
 import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types';
+import { mergeMetricDefinitions } from '@/lib/benchmark/compareBenchmarkReports';
 import { formatMs } from '@/utils/formatters';
 import { useMasterCommits, type GitHubCommit } from '../hooks/useMasterCommits';
 import { useCiReports } from '../hooks/useCiReports';
@@ -304,7 +305,10 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
     return {
       value: reportData.report,
       base: baselineData?.report ?? null,
-      definitions: reportData.metricDefinitions,
+      definitions: mergeMetricDefinitions(
+        baselineData?.metricDefinitions,
+        reportData.metricDefinitions,
+      ),
       valueCommit: reportData.commit,
       baseCommit: baselineData?.commit ?? null,
     };
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
index 88bf43783..b455e837d 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
@@ -323,6 +323,23 @@ function compareItems(a: ComparisonItem, b: ComparisonItem): number {
   return Math.abs(b.duration.absoluteDiff) - Math.abs(a.duration.absoluteDiff);
 }
 
+/**
+ * Merges base and head metric definitions (head wins) so a metric present only in the base
+ * report (e.g. one removed in the head) keeps its formatting/alarm metadata in the diff.
+ */
+export function mergeMetricDefinitions(
+  base: Record<string, MetricDefinition> | undefined,
+  head: Record<string, MetricDefinition> | undefined,
+): Record<string, MetricDefinition> | undefined {
+  if (!base) {
+    return head;
+  }
+  if (!head) {
+    return base;
+  }
+  return { ...base, ...head };
+}
+
 export function compareBenchmarkReports(
   current: BenchmarkReport,
   base: BenchmarkReport | null,
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/types.ts b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
index 85d0e5209..92a0f9d67 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/types.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
@@ -37,7 +37,7 @@ export interface MetricAlarm {
   direction?: MetricDirection;
   /** Softer band (relative fraction for scalar, absolute count delta for discrete). */
   warn?: number;
-  /** Harder band; defaults to the global noise band when omitted. */
+  /** Harder band; defaults to the global noise band only when both `warn` and `error` are omitted. */
   error?: number;
 }
 
diff --git a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
index 026049805..bcd972d31 100644
--- a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
+++ b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
@@ -1,6 +1,9 @@
 import { fetchCiReport } from '@/utils/fetchCiReport';
 import { fetchCiReportWithFallback } from '@/utils/fetchCiReportWithFallback';
-import { compareBenchmarkReports } from '@/lib/benchmark/compareBenchmarkReports';
+import {
+  compareBenchmarkReports,
+  mergeMetricDefinitions,
+} from '@/lib/benchmark/compareBenchmarkReports';
 import { buildBenchmarkMarkdownReport } from '@/lib/benchmark/buildMarkdownReport';
 import { DASHBOARD_ORIGIN } from '@/constants';
 
@@ -45,10 +48,11 @@ export async function generateBenchmarkReport(
     markdownContent += `_:information_source: Using benchmark from parent commit ${actualBaseCommit} (fallback from merge base ${mergeBaseCommit})._\n\n`;
   }
 
+  const baseUpload = useInlinedBase ? inlinedBase : fetchedBaseUpload;
   const comparison = compareBenchmarkReports(
     headReport.report,
     baseReport,
-    headReport.metricDefinitions,
+    mergeMetricDefinitions(baseUpload?.metricDefinitions, headReport.metricDefinitions),
   );
 
   const detailsUrl = new URL(`${DASHBOARD_ORIGIN}/benchmark-details/${repo}`);
diff --git a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
index 178bda254..6f6cf49ad 100644
--- a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
+++ b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
@@ -14,6 +14,7 @@ import Heading from '../components/Heading';
 import ReportHeader from '../components/ReportHeader';
 import ErrorDisplay from '../components/ErrorDisplay';
 import { BenchmarkComparisonReportView } from '../components/BenchmarkComparisonReportView';
+import { mergeMetricDefinitions } from '../lib/benchmark/compareBenchmarkReports';
 import { useBaseSha } from '../hooks/useBaseSha';
 
 interface InlinedBaseAlertProps {
@@ -143,7 +144,10 @@ export default function BenchmarkDetails() {
           <BenchmarkComparisonReportView
             value={report.report}
             base={effectiveBase?.report ?? null}
-            definitions={report.metricDefinitions}
+            definitions={mergeMetricDefinitions(
+              effectiveBase?.metricDefinitions,
+              report.metricDefinitions,
+            )}
           />
         )}
       </Paper>
diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index 88e0efc20..69ebe79a5 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -131,7 +131,7 @@ A metric is declared once (typically at module scope) and reused across tests an
 - `alarm` — opts the metric into regression flagging. Omit it and the metric is informational (its diff is shown but never flagged). Holds:
   - `direction` — `'lowerIsBetter'` (default) or `'higherIsBetter'`.
   - `warn` — softer band; a regression past it is flagged as a warning.
-  - `error` — harder band; a regression past it is flagged as an error. Defaults to the dashboard's global noise band when omitted.
+  - `error` — harder band; a regression past it is flagged as an error. Defaults to the dashboard's global noise band only when both `warn` and `error` are omitted; with only `warn` set there is no error band (warning-only).
   - Bands are relative fractions for scalar metrics (`0.1` = 10%) and absolute count deltas for discrete metrics (`1`, `2`). Either band is optional.
 
 #### Sub-series
diff --git a/packages/benchmark/src/Metric.test.ts b/packages/benchmark/src/Metric.test.ts
index 64b4d67a9..60a46227d 100644
--- a/packages/benchmark/src/Metric.test.ts
+++ b/packages/benchmark/src/Metric.test.ts
@@ -17,6 +17,10 @@ describe('Metric.record', () => {
       metric.record(2, { id: 'sub' });
     }).not.toThrow();
   });
+
+  it('rejects a metric name containing the "#" sub-series separator', () => {
+    expect(() => new ScalarMetric({ name: 'paint#default' })).toThrow(/must not contain "#"/);
+  });
 });
 
 describe('ScalarMetric.timeEnd', () => {
diff --git a/packages/benchmark/src/Metric.ts b/packages/benchmark/src/Metric.ts
index 117b97ba1..b54df9430 100644
--- a/packages/benchmark/src/Metric.ts
+++ b/packages/benchmark/src/Metric.ts
@@ -54,6 +54,11 @@ export abstract class Metric {
   constructor(config: MetricConfig | string) {
     this.config = typeof config === 'string' ? { name: config } : config;
     this.name = this.config.name;
+    if (this.name.includes('#')) {
+      throw new Error(
+        `Metric name "${this.name}" must not contain "#" — it is reserved as the sub-series separator.`,
+      );
+    }
   }
 
   /**
diff --git a/packages/benchmark/src/reporter.test.ts b/packages/benchmark/src/reporter.test.ts
index 48e139ff8..b60279df2 100644
--- a/packages/benchmark/src/reporter.test.ts
+++ b/packages/benchmark/src/reporter.test.ts
@@ -415,5 +415,38 @@ describe('BenchmarkReporter', () => {
 
       consoleSpy.mockRestore();
     });
+
+    it('does not flag the same config written with a different key order', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-order-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(
+        metricCase('a', 'shared', { direction: 'lowerIsBetter', error: 0.2 }),
+      );
+      expect(() =>
+        reporter.onTestCaseResult(
+          metricCase('b', 'shared', { error: 0.2, direction: 'lowerIsBetter' }),
+        ),
+      ).not.toThrow();
+
+      consoleSpy.mockRestore();
+    });
+
+    it('resets between runs so an edited config does not conflict on watch reload', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-reload-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 }));
+      reporter.onTestRunStart(); // simulate a watch re-run
+      expect(() =>
+        reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.9 })),
+      ).not.toThrow();
+
+      consoleSpy.mockRestore();
+    });
   });
 });
diff --git a/packages/benchmark/src/reporter.ts b/packages/benchmark/src/reporter.ts
index c0488529a..4abc383ca 100644
--- a/packages/benchmark/src/reporter.ts
+++ b/packages/benchmark/src/reporter.ts
@@ -15,6 +15,21 @@ function getEventKey(event: RenderEvent): string {
   return `${event.id}:${event.phase}`;
 }
 
+/** Order-insensitive JSON: sorts object keys and drops `undefined` so equal configs compare equal. */
+function stableStringify(value: unknown): string {
+  if (value === null || typeof value !== 'object') {
+    return JSON.stringify(value);
+  }
+  if (Array.isArray(value)) {
+    return `[${value.map(stableStringify).join(',')}]`;
+  }
+  const record = value as Record<string, unknown>;
+  const keys = Object.keys(record)
+    .filter((key) => record[key] !== undefined)
+    .sort();
+  return `{${keys.map((key) => `${JSON.stringify(key)}:${stableStringify(record[key])}`).join(',')}}`;
+}
+
 function generateReportFromIterations(iterations: IterationData[]): BenchmarkReportEntry {
   if (iterations.length === 0) {
     return { iterations: 0, totalDuration: 0, renders: [], metrics: {} };
@@ -188,7 +203,7 @@ function mergeCustomMetrics(
     // matches (e.g. the harness `bench:paint`), but conflicting config would silently apply
     // last-write-wins to every entry — reject it instead.
     const existing = definitions[metricName];
-    if (existing && JSON.stringify(existing) !== JSON.stringify(definition)) {
+    if (existing && stableStringify(existing) !== stableStringify(definition)) {
       throw new Error(
         `Benchmark metric "${metricName}" is defined with conflicting configuration across ` +
           `benchmarks. A metric name must map to a single kind, format, and alarm.`,
@@ -275,6 +290,15 @@ class BenchmarkReporter implements Reporter {
     this.baselinePath = options?.baselinePath ?? process.env.BENCHMARK_BASELINE_PATH;
   }
 
+  // Reset accumulated state at the start of every run so watch-mode re-runs start clean (the
+  // reporter instance is reused across runs). Otherwise stale benchmarks/definitions linger — and
+  // an edited metric config would conflict with its own previous-run definition.
+  onTestRunStart(): void {
+    this.benchmarks = {};
+    this.metricDefinitions = {};
+    this.hasFailures = false;
+  }
+
   onTestCaseResult(testCase: TestCase): void {
     if (testCase.result().state === 'failed') {
       this.hasFailures = true;
diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts
index c7ff5576b..f59359bcb 100644
--- a/packages/benchmark/src/types.ts
+++ b/packages/benchmark/src/types.ts
@@ -46,8 +46,9 @@ export interface MetricAlarm {
    */
   warn?: number;
   /**
-   * Harder band: a regression past `error` is flagged as an error (the alarm). When omitted it
-   * defaults to the dashboard's global noise band.
+   * Harder band: a regression past `error` is flagged as an error (the alarm). When **both**
+   * `warn` and `error` are omitted, `error` defaults to the dashboard's global noise band; with
+   * only `warn` set there is no error band (warning-only).
    * Scalar metrics: a relative fraction (`0.25` = 25%). Discrete metrics: an absolute count delta.
    */
   error?: number;

From 4b6f6cd08472e798e4a15469c9ca881f9ebc310d Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Sat, 6 Jun 2026 06:10:58 +0200
Subject: [PATCH 12/20] [code-infra-dashboard] Merge base+head
 metricDefinitions inside compareBenchmarkReports

compareBenchmarkReports now takes both sides' definitions and reconciles them internally
(head wins), so callers pass each side's raw definitions and can't forget to merge.
mergeMetricDefinitions is now private; the View gained a baseDefinitions prop.
---
 .../components/BenchmarkComparisonReportView.tsx  |  6 ++++--
 .../src/components/DailyBenchmarkChart.tsx        |  8 +++-----
 .../lib/benchmark/compareBenchmarkReports.test.ts | 15 +++++++++++++++
 .../src/lib/benchmark/compareBenchmarkReports.ts  | 10 +++++++---
 .../src/lib/ciReports/benchmarkReport.ts          |  8 +++-----
 .../src/views/BenchmarkDetails.tsx                |  7 ++-----
 6 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
index b0487c30c..d8202a5d8 100644
--- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
+++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
@@ -458,16 +458,18 @@ interface BenchmarkComparisonReportViewProps {
   value: BenchmarkReport;
   base: BenchmarkReport | null;
   definitions?: Record<string, MetricDefinition>;
+  baseDefinitions?: Record<string, MetricDefinition>;
 }
 
 export function BenchmarkComparisonReportView({
   value,
   base,
   definitions,
+  baseDefinitions,
 }: BenchmarkComparisonReportViewProps) {
   const comparisonReport = React.useMemo(
-    () => compareBenchmarkReports(value, base, definitions),
-    [value, base, definitions],
+    () => compareBenchmarkReports(value, base, definitions, baseDefinitions),
+    [value, base, definitions, baseDefinitions],
   );
 
   const globalMaxDuration = React.useMemo(() => {
diff --git a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
index 7f92dd5dd..3f34108d2 100644
--- a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
+++ b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
@@ -11,7 +11,6 @@ import Tab from '@mui/material/Tab';
 import { BarChartPro } from '@mui/x-charts-pro/BarChartPro';
 import { useXScale, useDrawingArea } from '@mui/x-charts-pro/hooks';
 import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types';
-import { mergeMetricDefinitions } from '@/lib/benchmark/compareBenchmarkReports';
 import { formatMs } from '@/utils/formatters';
 import { useMasterCommits, type GitHubCommit } from '../hooks/useMasterCommits';
 import { useCiReports } from '../hooks/useCiReports';
@@ -305,10 +304,8 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
     return {
       value: reportData.report,
       base: baselineData?.report ?? null,
-      definitions: mergeMetricDefinitions(
-        baselineData?.metricDefinitions,
-        reportData.metricDefinitions,
-      ),
+      definitions: reportData.metricDefinitions,
+      baseDefinitions: baselineData?.metricDefinitions,
       valueCommit: reportData.commit,
       baseCommit: baselineData?.commit ?? null,
     };
@@ -544,6 +541,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
                   value={inlinePair.value}
                   base={inlinePair.base}
                   definitions={inlinePair.definitions}
+                  baseDefinitions={inlinePair.baseDefinitions}
                 />
               </Box>
             )}
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
index 13bfbf58c..e1f8d9f92 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
@@ -324,5 +324,20 @@ describe('compareBenchmarkReports', () => {
       );
       expect(metric.diff.severity).toBe('error');
     });
+
+    it('keeps a base-only (removed) metric formatted using its base definition', () => {
+      // The metric exists only in the base; its definition is supplied via baseDefinitions.
+      const result = compareBenchmarkReports(
+        reportWithMetrics({}),
+        reportWithMetrics({ bytes: 100 }),
+        undefined,
+        {
+          bytes: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } },
+        },
+      );
+      const metric = result.entries[0].metrics.find((entry) => entry.name === 'bytes')!;
+      expect(metric.removed).toBe(true);
+      expect(metric.format).toEqual({ style: 'unit', unit: 'byte' });
+    });
   });
 });
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
index b455e837d..0d3015887 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
@@ -327,7 +327,7 @@ function compareItems(a: ComparisonItem, b: ComparisonItem): number {
  * Merges base and head metric definitions (head wins) so a metric present only in the base
  * report (e.g. one removed in the head) keeps its formatting/alarm metadata in the diff.
  */
-export function mergeMetricDefinitions(
+function mergeMetricDefinitions(
   base: Record<string, MetricDefinition> | undefined,
   head: Record<string, MetricDefinition> | undefined,
 ): Record<string, MetricDefinition> | undefined {
@@ -344,7 +344,11 @@ export function compareBenchmarkReports(
   current: BenchmarkReport,
   base: BenchmarkReport | null,
   definitions?: Record<string, MetricDefinition>,
+  baseDefinitions?: Record<string, MetricDefinition>,
 ): BenchmarkComparisonReport {
+  // Reconcile the two sides' definitions here (head wins) so callers just pass each side's raw
+  // definitions and a base-only/removed metric keeps its formatting/alarm metadata.
+  const mergedDefinitions = mergeMetricDefinitions(baseDefinitions, definitions);
   const effectiveBase = base ?? {};
   const entries: ComparisonItem[] = [];
 
@@ -365,7 +369,7 @@ export function compareBenchmarkReports(
         ? makeCountDiffValue(entry.renders.length, baseEntry.renders.length)
         : undefined,
       renders: compareRenders(entry.renders, baseEntry),
-      metrics: compareMetrics(entry.metrics, baseEntry, definitions),
+      metrics: compareMetrics(entry.metrics, baseEntry, mergedDefinitions),
       iterations: entry.iterations,
     });
 
@@ -386,7 +390,7 @@ export function compareBenchmarkReports(
       name,
       duration,
       renders: compareRenders([], baseEntry),
-      metrics: compareMetrics({}, baseEntry, definitions),
+      metrics: compareMetrics({}, baseEntry, mergedDefinitions),
       iterations: 0,
     });
 
diff --git a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
index bcd972d31..2fa980b60 100644
--- a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
+++ b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
@@ -1,9 +1,6 @@
 import { fetchCiReport } from '@/utils/fetchCiReport';
 import { fetchCiReportWithFallback } from '@/utils/fetchCiReportWithFallback';
-import {
-  compareBenchmarkReports,
-  mergeMetricDefinitions,
-} from '@/lib/benchmark/compareBenchmarkReports';
+import { compareBenchmarkReports } from '@/lib/benchmark/compareBenchmarkReports';
 import { buildBenchmarkMarkdownReport } from '@/lib/benchmark/buildMarkdownReport';
 import { DASHBOARD_ORIGIN } from '@/constants';
 
@@ -52,7 +49,8 @@ export async function generateBenchmarkReport(
   const comparison = compareBenchmarkReports(
     headReport.report,
     baseReport,
-    mergeMetricDefinitions(baseUpload?.metricDefinitions, headReport.metricDefinitions),
+    headReport.metricDefinitions,
+    baseUpload?.metricDefinitions,
   );
 
   const detailsUrl = new URL(`${DASHBOARD_ORIGIN}/benchmark-details/${repo}`);
diff --git a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
index 6f6cf49ad..57b3710cd 100644
--- a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
+++ b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
@@ -14,7 +14,6 @@ import Heading from '../components/Heading';
 import ReportHeader from '../components/ReportHeader';
 import ErrorDisplay from '../components/ErrorDisplay';
 import { BenchmarkComparisonReportView } from '../components/BenchmarkComparisonReportView';
-import { mergeMetricDefinitions } from '../lib/benchmark/compareBenchmarkReports';
 import { useBaseSha } from '../hooks/useBaseSha';
 
 interface InlinedBaseAlertProps {
@@ -144,10 +143,8 @@ export default function BenchmarkDetails() {
           <BenchmarkComparisonReportView
             value={report.report}
             base={effectiveBase?.report ?? null}
-            definitions={mergeMetricDefinitions(
-              effectiveBase?.metricDefinitions,
-              report.metricDefinitions,
-            )}
+            definitions={report.metricDefinitions}
+            baseDefinitions={effectiveBase?.metricDefinitions}
           />
         )}
       </Paper>

From d736635b17b66c1cc995673834535d346b52c1cd Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Sat, 6 Jun 2026 06:57:29 +0200
Subject: [PATCH 13/20] [code-infra-dashboard] Bundle metric definitions into
 the comparison input

compareBenchmarkReports now takes report-objects that carry their own
metricDefinitions (BenchmarkComparisonInput) instead of separate report +
definitions args. Each metric uses the definition from the side it appears on,
so the mergeMetricDefinitions helper is gone. Test fixtures build the input
bundle directly.
---
 .../BenchmarkComparisonReportView.tsx         | 20 ++-----
 .../src/components/DailyBenchmarkChart.tsx    | 15 ++---
 .../lib/benchmark/buildMarkdownReport.test.ts | 21 +++----
 .../benchmark/compareBenchmarkReports.test.ts | 16 +++--
 .../lib/benchmark/compareBenchmarkReports.ts  | 59 ++++++++-----------
 .../src/lib/benchmark/test-fixtures.ts        |  9 +--
 .../src/lib/ciReports/benchmarkReport.ts      |  7 +--
 .../src/views/BenchmarkDetails.tsx            |  9 +--
 8 files changed, 64 insertions(+), 92 deletions(-)

diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
index d8202a5d8..655e83e5b 100644
--- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
+++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
@@ -14,12 +14,12 @@ import ExpandMoreIcon from '@mui/icons-material/ExpandMore';
 import Tooltip from '@mui/material/Tooltip';
 import {
   compareBenchmarkReports,
+  type BenchmarkComparisonInput,
   type BenchmarkComparisonReport,
   type ComparisonItem,
   type DiffValue,
   type BenchmarkDiffSeverity,
 } from '@/lib/benchmark/compareBenchmarkReports';
-import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types';
 import {
   formatMs,
   formatMetricNumber,
@@ -455,22 +455,12 @@ function BenchmarkAccordion({
 }
 
 interface BenchmarkComparisonReportViewProps {
-  value: BenchmarkReport;
-  base: BenchmarkReport | null;
-  definitions?: Record<string, MetricDefinition>;
-  baseDefinitions?: Record<string, MetricDefinition>;
+  value: BenchmarkComparisonInput;
+  base: BenchmarkComparisonInput | null;
 }
 
-export function BenchmarkComparisonReportView({
-  value,
-  base,
-  definitions,
-  baseDefinitions,
-}: BenchmarkComparisonReportViewProps) {
-  const comparisonReport = React.useMemo(
-    () => compareBenchmarkReports(value, base, definitions, baseDefinitions),
-    [value, base, definitions, baseDefinitions],
-  );
+export function BenchmarkComparisonReportView({ value, base }: BenchmarkComparisonReportViewProps) {
+  const comparisonReport = React.useMemo(() => compareBenchmarkReports(value, base), [value, base]);
 
   const globalMaxDuration = React.useMemo(() => {
     let max = 0;
diff --git a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
index 3f34108d2..608131535 100644
--- a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
+++ b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
@@ -302,10 +302,10 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
       return null;
     }
     return {
-      value: reportData.report,
-      base: baselineData?.report ?? null,
-      definitions: reportData.metricDefinitions,
-      baseDefinitions: baselineData?.metricDefinitions,
+      value: { report: reportData.report, metricDefinitions: reportData.metricDefinitions },
+      base: baselineData?.report
+        ? { report: baselineData.report, metricDefinitions: baselineData.metricDefinitions }
+        : null,
       valueCommit: reportData.commit,
       baseCommit: baselineData?.commit ?? null,
     };
@@ -537,12 +537,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
                     ? `Comparing baseline ${inlinePair.baseCommit.sha.substring(0, 7)} → report ${inlinePair.valueCommit.sha.substring(0, 7)}`
                     : `Report ${inlinePair.valueCommit.sha.substring(0, 7)}`}
                 </Typography>
-                <BenchmarkComparisonReportView
-                  value={inlinePair.value}
-                  base={inlinePair.base}
-                  definitions={inlinePair.definitions}
-                  baseDefinitions={inlinePair.baseDefinitions}
-                />
+                <BenchmarkComparisonReportView value={inlinePair.value} base={inlinePair.base} />
               </Box>
             )}
             {activeTab === 'comparison' && !inlinePair && (
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
index 30d6fa221..6d07c7729 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
@@ -1,18 +1,22 @@
 import { describe, it, expect } from 'vitest';
 import { buildBenchmarkMarkdownReport } from './buildMarkdownReport';
-import { compareBenchmarkReports } from './compareBenchmarkReports';
-import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition } from './types';
+import { compareBenchmarkReports, type BenchmarkComparisonInput } from './compareBenchmarkReports';
+import type { BenchmarkReportEntry, MetricDefinition } from './types';
 import { makeReport, makeReportFromConfig } from './test-fixtures';
 
 // A single benchmark whose duration/renders are neutral, so the only signal is one metric.
-function metricReport(name: string, mean: number): BenchmarkReport {
+function metricReport(
+  name: string,
+  mean: number,
+  definitions?: Record<string, MetricDefinition>,
+): BenchmarkComparisonInput {
   const entry: BenchmarkReportEntry = {
     iterations: 10,
     totalDuration: 100,
     renders: [],
     metrics: { [name]: { mean, stdDev: 0, outliers: 0 } },
   };
-  return { Bench: entry };
+  return { report: { Bench: entry }, metricDefinitions: definitions };
 }
 
 const alarmDefinitions: Record<string, MetricDefinition> = {
@@ -152,9 +156,8 @@ describe('buildBenchmarkMarkdownReport', () => {
   describe('metric alarms', () => {
     it('surfaces an error-level metric, making its otherwise-neutral test significant', () => {
       const report = compareBenchmarkReports(
-        metricReport('clicks', 5), // +2 vs 3, discrete error band 1
+        metricReport('clicks', 5, alarmDefinitions), // +2 vs 3, discrete error band 1
         metricReport('clicks', 3),
-        alarmDefinitions,
       );
       const markdown = buildBenchmarkMarkdownReport(report);
       expect(markdown).not.toContain('No significant changes');
@@ -165,9 +168,8 @@ describe('buildBenchmarkMarkdownReport', () => {
 
     it('does not surface warning-level metrics (kept on the dashboard only)', () => {
       const report = compareBenchmarkReports(
-        metricReport('tti', 115), // +15%: past warn (10%), within error (25%) -> warning
+        metricReport('tti', 115, alarmDefinitions), // +15%: past warn (10%), within error (25%) -> warning
         metricReport('tti', 100),
-        alarmDefinitions,
       );
       const markdown = buildBenchmarkMarkdownReport(report);
       expect(markdown).toContain('No significant changes');
@@ -176,9 +178,8 @@ describe('buildBenchmarkMarkdownReport', () => {
 
     it('ignores informational metrics — no alarm section, test stays within noise', () => {
       const report = compareBenchmarkReports(
-        metricReport('fps', 200), // big change but no alarm configured
+        metricReport('fps', 200, alarmDefinitions), // big change but no alarm configured
         metricReport('fps', 100),
-        alarmDefinitions,
       );
       const markdown = buildBenchmarkMarkdownReport(report);
       expect(markdown).toContain('No significant changes');
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
index e1f8d9f92..e17ea9a26 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
@@ -25,7 +25,11 @@ const definitions: Record<string, MetricDefinition> = {
 };
 
 function metricEntry(current: BenchmarkReport, base: BenchmarkReport, metricName: string) {
-  const result = compareBenchmarkReports(current, base, definitions);
+  // The metric appears on the current side, so its definitions ride along there.
+  const result = compareBenchmarkReports(
+    { report: current, metricDefinitions: definitions },
+    { report: base },
+  );
   return result.entries[0].metrics.find((metric) => metric.name === metricName)!;
 }
 
@@ -326,13 +330,13 @@ describe('compareBenchmarkReports', () => {
     });
 
     it('keeps a base-only (removed) metric formatted using its base definition', () => {
-      // The metric exists only in the base; its definition is supplied via baseDefinitions.
+      // Definitions travel with their report: the metric exists only in the base, so its formatting
+      // comes from the base side's definitions even though the head has none.
       const result = compareBenchmarkReports(
-        reportWithMetrics({}),
-        reportWithMetrics({ bytes: 100 }),
-        undefined,
+        { report: reportWithMetrics({}) },
         {
-          bytes: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } },
+          report: reportWithMetrics({ bytes: 100 }),
+          metricDefinitions: { bytes: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } } },
         },
       );
       const metric = result.entries[0].metrics.find((entry) => entry.name === 'bytes')!;
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
index 0d3015887..513e741e4 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
@@ -1,5 +1,13 @@
 import { formatDiffMs, formatMetricDiff, percentFormatter } from '@/utils/formatters';
-import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition, RenderStats } from './types';
+import type {
+  BenchmarkBaseUpload,
+  BenchmarkReportEntry,
+  MetricDefinition,
+  RenderStats,
+} from './types';
+
+/** A report paired with its own metric definitions — the unit the comparison operates on. */
+export type BenchmarkComparisonInput = Pick<BenchmarkBaseUpload, 'report' | 'metricDefinitions'>;
 
 const NOISE_THRESHOLD = 0.2;
 
@@ -248,12 +256,13 @@ function makeMetricDiff(
 function compareMetrics(
   currentMetrics: Record<string, { mean: number; stdDev: number; outliers: number }>,
   baseEntry: BenchmarkReportEntry | undefined,
-  definitions: Record<string, MetricDefinition> | undefined,
+  currentDefinitions: Record<string, MetricDefinition> | undefined,
+  baseDefinitions: Record<string, MetricDefinition> | undefined,
 ): ComparisonEntry[] {
   const entries: ComparisonEntry[] = [];
 
   for (const [name, stats] of Object.entries(currentMetrics)) {
-    const definition = definitions?.[baseMetricName(name)];
+    const definition = currentDefinitions?.[baseMetricName(name)];
     const baseStats = baseEntry?.metrics[name];
     entries.push({
       name,
@@ -272,7 +281,7 @@ function compareMetrics(
   if (baseEntry) {
     for (const [name, baseStats] of Object.entries(baseEntry.metrics)) {
       if (!(name in currentMetrics)) {
-        const definition = definitions?.[baseMetricName(name)];
+        const definition = baseDefinitions?.[baseMetricName(name)];
         entries.push({
           name,
           value: 0,
@@ -323,33 +332,17 @@ function compareItems(a: ComparisonItem, b: ComparisonItem): number {
   return Math.abs(b.duration.absoluteDiff) - Math.abs(a.duration.absoluteDiff);
 }
 
-/**
- * Merges base and head metric definitions (head wins) so a metric present only in the base
- * report (e.g. one removed in the head) keeps its formatting/alarm metadata in the diff.
- */
-function mergeMetricDefinitions(
-  base: Record<string, MetricDefinition> | undefined,
-  head: Record<string, MetricDefinition> | undefined,
-): Record<string, MetricDefinition> | undefined {
-  if (!base) {
-    return head;
-  }
-  if (!head) {
-    return base;
-  }
-  return { ...base, ...head };
-}
-
 export function compareBenchmarkReports(
-  current: BenchmarkReport,
-  base: BenchmarkReport | null,
-  definitions?: Record<string, MetricDefinition>,
-  baseDefinitions?: Record<string, MetricDefinition>,
+  current: BenchmarkComparisonInput,
+  base: BenchmarkComparisonInput | null,
 ): BenchmarkComparisonReport {
-  // Reconcile the two sides' definitions here (head wins) so callers just pass each side's raw
-  // definitions and a base-only/removed metric keeps its formatting/alarm metadata.
-  const mergedDefinitions = mergeMetricDefinitions(baseDefinitions, definitions);
-  const effectiveBase = base ?? {};
+  // Definitions travel with their report: a current metric uses the current definitions, a
+  // base-only/removed metric uses the base definitions. No merge step — the side a metric appears
+  // on decides how it's formatted.
+  const currentDefinitions = current.metricDefinitions;
+  const baseDefinitions = base?.metricDefinitions;
+  const currentReport = current.report;
+  const effectiveBase = base?.report ?? {};
   const entries: ComparisonItem[] = [];
 
   let totalCurrentDuration = 0;
@@ -358,7 +351,7 @@ export function compareBenchmarkReports(
   let totalBaseRenders = 0;
 
   // Process current entries
-  for (const [name, entry] of Object.entries(current)) {
+  for (const [name, entry] of Object.entries(currentReport)) {
     const baseEntry = effectiveBase[name];
 
     const duration = makeDiffValue(entry.totalDuration, baseEntry?.totalDuration ?? null);
@@ -369,7 +362,7 @@ export function compareBenchmarkReports(
         ? makeCountDiffValue(entry.renders.length, baseEntry.renders.length)
         : undefined,
       renders: compareRenders(entry.renders, baseEntry),
-      metrics: compareMetrics(entry.metrics, baseEntry, mergedDefinitions),
+      metrics: compareMetrics(entry.metrics, baseEntry, currentDefinitions, baseDefinitions),
       iterations: entry.iterations,
     });
 
@@ -381,7 +374,7 @@ export function compareBenchmarkReports(
 
   // Process removed entries (in base but not in current)
   for (const [name, baseEntry] of Object.entries(effectiveBase)) {
-    if (name in current) {
+    if (name in currentReport) {
       continue;
     }
 
@@ -390,7 +383,7 @@ export function compareBenchmarkReports(
       name,
       duration,
       renders: compareRenders([], baseEntry),
-      metrics: compareMetrics({}, baseEntry, mergedDefinitions),
+      metrics: compareMetrics({}, baseEntry, currentDefinitions, baseDefinitions),
       iterations: 0,
     });
 
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts b/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts
index c786b8297..18e86d17a 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts
@@ -1,4 +1,5 @@
 import type { BenchmarkReport, BenchmarkReportEntry } from './types';
+import type { BenchmarkComparisonInput } from './compareBenchmarkReports';
 
 export function makeEntry(totalDuration: number, renderCount: number = 1): BenchmarkReportEntry {
   const perRender = renderCount > 0 ? totalDuration / renderCount : 0;
@@ -19,20 +20,20 @@ export function makeEntry(totalDuration: number, renderCount: number = 1): Bench
   };
 }
 
-export function makeReport(entries: Record<string, number>): BenchmarkReport {
+export function makeReport(entries: Record<string, number>): BenchmarkComparisonInput {
   const report: BenchmarkReport = {};
   for (const [name, totalDuration] of Object.entries(entries)) {
     report[name] = makeEntry(totalDuration);
   }
-  return report;
+  return { report };
 }
 
 export function makeReportFromConfig(
   entries: Record<string, { duration: number; renders: number }>,
-): BenchmarkReport {
+): BenchmarkComparisonInput {
   const report: BenchmarkReport = {};
   for (const [name, { duration, renders }] of Object.entries(entries)) {
     report[name] = makeEntry(duration, renders);
   }
-  return report;
+  return { report };
 }
diff --git a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
index 2fa980b60..e7ef61af5 100644
--- a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
+++ b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
@@ -46,12 +46,7 @@ export async function generateBenchmarkReport(
   }
 
   const baseUpload = useInlinedBase ? inlinedBase : fetchedBaseUpload;
-  const comparison = compareBenchmarkReports(
-    headReport.report,
-    baseReport,
-    headReport.metricDefinitions,
-    baseUpload?.metricDefinitions,
-  );
+  const comparison = compareBenchmarkReports(headReport, baseUpload ?? null);
 
   const detailsUrl = new URL(`${DASHBOARD_ORIGIN}/benchmark-details/${repo}`);
   detailsUrl.searchParams.set('sha', commitSha);
diff --git a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
index 57b3710cd..1b0e8a0e7 100644
--- a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
+++ b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
@@ -139,14 +139,7 @@ export default function BenchmarkDetails() {
           <InlinedBaseAlert fetchedBaseSha={baseSha} inlinedBaseSha={inlinedBase.commitSha} />
         )}
 
-        {report && (
-          <BenchmarkComparisonReportView
-            value={report.report}
-            base={effectiveBase?.report ?? null}
-            definitions={report.metricDefinitions}
-            baseDefinitions={effectiveBase?.metricDefinitions}
-          />
-        )}
+        {report && <BenchmarkComparisonReportView value={report} base={effectiveBase ?? null} />}
       </Paper>
     </React.Fragment>
   );

From 0bb6fabd5c7c5542c6b0e090bd6276792571fd0b Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Mon, 8 Jun 2026 16:47:08 +0200
Subject: [PATCH 14/20] [benchmark] Scope render recording and gate custom
 metrics during warmup

Two orthogonal, per-test mechanisms in the benchmark harness:

- Custom metrics recorded inside benchmark() now honor warmup exclusion via
  an internal per-test gate (metricsGate), matching how renders and bench:paint
  are already excluded. Standalone it() loops are unaffected.
- A React recording switch lets interactions scope which renders/paint are
  measured: pauseReactRecording()/resumeReactRecording() (strict pair, throw on
  wrong state) plus a reactRecordingPaused option to start paused and exclude
  the mount. Paint is attributed by renderTime so async observation can't
  misattribute a paused-frame paint.
---
 packages/benchmark/README.md                  | 38 +++++++++++++
 packages/benchmark/src/Metric.ts              |  8 +++
 packages/benchmark/src/index.tsx              | 43 ++++++++++++--
 packages/benchmark/src/metricsGate.test.ts    | 30 ++++++++++
 packages/benchmark/src/metricsGate.ts         | 24 ++++++++
 packages/benchmark/src/reactRecording.test.ts | 34 +++++++++++
 packages/benchmark/src/reactRecording.ts      | 56 +++++++++++++++++++
 packages/benchmark/src/types.ts               | 10 ++++
 .../tests/react-recording.bench.tsx           | 52 +++++++++++++++++
 9 files changed, 291 insertions(+), 4 deletions(-)
 create mode 100644 packages/benchmark/src/metricsGate.test.ts
 create mode 100644 packages/benchmark/src/metricsGate.ts
 create mode 100644 packages/benchmark/src/reactRecording.test.ts
 create mode 100644 packages/benchmark/src/reactRecording.ts
 create mode 100644 test/performance/tests/react-recording.bench.tsx

diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index 69ebe79a5..c67157b6a 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -61,6 +61,41 @@ benchmark(
 );
 ```
 
+### Scoping which renders are measured
+
+By default a benchmark records every React render and paint, from the mount through the whole interaction. To measure only part of an interaction — or to exclude the mount — pause and resume recording from the interaction callback:
+
+```tsx
+benchmark(
+  'Combobox type',
+  () => <Combobox />,
+  async ({ pauseReactRecording, resumeReactRecording, waitForElementTiming }) => {
+    pauseReactRecording(); // stop recording the settling re-renders
+    await openMenu();
+    resumeReactRecording(); // measure only what follows
+    await type('hello');
+    await waitForElementTiming('results');
+  },
+);
+```
+
+`pauseReactRecording()` / `resumeReactRecording()` toggle only the harness's React render and `bench:paint` recording — your own custom metrics keep recording. They are a strict pair: pausing while already paused, or resuming while already active, throws (this catches unbalanced calls early).
+
+To exclude the mount itself, start paused with the `reactRecordingPaused` option and resume at the point you care about — the mount is captured before the interaction callback runs, so pausing inside the callback can't drop it:
+
+```tsx
+benchmark(
+  'Combobox type',
+  () => <Combobox />,
+  async ({ resumeReactRecording }) => {
+    await openMenu(); // mount + open: not recorded
+    resumeReactRecording();
+    await type('hello'); // only these renders/paint recorded
+  },
+  { reactRecordingPaused: true },
+);
+```
+
 ### Paint metrics
 
 By default, every benchmark captures a `bench:paint` metric — the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically. The harness owns the `bench:` namespace, so avoid it for your own metric names.
@@ -124,6 +159,8 @@ it('measures work', () => {
 
 A metric is declared once (typically at module scope) and reused across tests and iterations. `record()` attaches the value to whichever test is running, so the same instance works in any `it()`.
 
+You can also `record()` from inside a `benchmark()` render function or interaction callback. Values recorded during warmup iterations are excluded automatically, just like renders and `bench:paint`, so a metric recorded once per iteration yields exactly `runs` samples.
+
 #### Metric configuration
 
 - `name` — the metric's report key (**required**).
@@ -154,6 +191,7 @@ Custom metrics are aggregated in the browser and only the resulting stats cross
 benchmark('name', renderFn, interaction, {
   runs: 20, // measurement iterations (default: 20)
   warmupRuns: 10, // warmup iterations before measuring (default: 10)
+  reactRecordingPaused: false, // start with React render/paint recording paused (default: false)
   afterEach: () => {
     /* cleanup between iterations */
   },
diff --git a/packages/benchmark/src/Metric.ts b/packages/benchmark/src/Metric.ts
index b54df9430..d60beb944 100644
--- a/packages/benchmark/src/Metric.ts
+++ b/packages/benchmark/src/Metric.ts
@@ -1,6 +1,7 @@
 import { onTestFinished, TestRunner, type RunnerTestCase } from 'vitest';
 import type { MetricConfig, MetricKind, MetricReport, MetricSampleStats } from './types';
 import { aggregateSamples } from './stats';
+import { metricsGate } from './metricsGate';
 // Import for TaskMeta augmentation side effect
 import './taskMetaAugmentation';
 
@@ -73,6 +74,13 @@ export abstract class Metric {
       );
     }
 
+    // The harness disables the gate during warmup iterations so custom metrics recorded inside a
+    // benchmark honor the same warmup exclusion as renders and `bench:paint`. Standalone `it()`
+    // loops never touch the gate, so they keep recording every value.
+    if (!metricsGate.isRecordingEnabled(test)) {
+      return;
+    }
+
     let accumulator = accumulators.get(test);
     if (!accumulator) {
       const created: TestAccumulator = new Map();
diff --git a/packages/benchmark/src/index.tsx b/packages/benchmark/src/index.tsx
index a9a91d589..ec6346027 100644
--- a/packages/benchmark/src/index.tsx
+++ b/packages/benchmark/src/index.tsx
@@ -5,6 +5,8 @@ import * as ReactDOM from 'react-dom';
 import type { RenderEvent, IterationData, InteractionContext } from './types';
 import { ElementTiming } from './ElementTiming';
 import { ScalarMetric } from './ScalarMetric';
+import { metricsGate } from './metricsGate';
+import { createReactRecordingControls, type ReactRecordingControls } from './reactRecording';
 // Import for TaskMeta augmentation side effect
 import './taskMetaAugmentation';
 
@@ -29,16 +31,22 @@ export { DiscreteMetric } from './DiscreteMetric';
 
 function BenchProfiler({
   captures,
+  recording,
   children,
 }: {
   captures: RenderEvent[];
+  recording: ReactRecordingControls;
   children: React.ReactNode;
 }) {
   const onRender = React.useCallback<React.ProfilerOnRenderCallback>(
     (id, phase, actualDuration, _baseDuration, startTime) => {
-      captures.push({ id, phase, actualDuration, startTime });
+      // Skip renders captured while React recording is paused (e.g. the mount when the benchmark
+      // starts paused, or a span the interaction explicitly excludes).
+      if (recording.active) {
+        captures.push({ id, phase, actualDuration, startTime });
+      }
     },
-    [captures],
+    [captures, recording],
   );
 
   return (
@@ -80,6 +88,12 @@ interface BenchmarkOptions {
   runs?: number;
   warmupRuns?: number;
   afterEach?: () => Promise<void> | void;
+  /**
+   * Start each iteration with React render/paint recording paused. The interaction callback then
+   * calls `resumeReactRecording()` at the point it cares about — useful to exclude the mount and
+   * measure only the renders/paint of a later interaction. Defaults to `false` (mount recorded).
+   */
+  reactRecordingPaused?: boolean;
 }
 
 export function benchmark(
@@ -121,6 +135,14 @@ export function benchmark(
     for (let i = 0; i < totalRuns; i += 1) {
       const isWarmup = i < warmupRuns;
 
+      // Custom metrics recorded inside the benchmark honor warmup exclusion through the gate, the
+      // same way renders and `bench:paint` are excluded during warmup.
+      metricsGate.setRecordingEnabled(task, !isWarmup);
+
+      // Per-iteration switch for the harness's React render/paint recording. Starts paused when
+      // `reactRecordingPaused` is set; the interaction callback drives it from there.
+      const recording = createReactRecordingControls(!(options?.reactRecordingPaused ?? false));
+
       // Drain event loop from previous unmount, then double GC for thorough cleanup
       // eslint-disable-next-line no-await-in-loop
       await settle();
@@ -193,7 +215,11 @@ export function benchmark(
       });
 
       ReactDOM.flushSync(() => {
-        root.render(<BenchProfiler captures={captures}>{renderFn()}</BenchProfiler>);
+        root.render(
+          <BenchProfiler captures={captures} recording={recording}>
+            {renderFn()}
+          </BenchProfiler>,
+        );
       });
 
       if (renderError) {
@@ -205,7 +231,11 @@ export function benchmark(
 
       if (interaction) {
         // eslint-disable-next-line no-await-in-loop
-        await interaction({ waitForElementTiming });
+        await interaction({
+          waitForElementTiming,
+          pauseReactRecording: recording.pauseReactRecording,
+          resumeReactRecording: recording.resumeReactRecording,
+        });
       }
 
       // Wait for the bench sentinel paint entry (relies on test timeout)
@@ -219,6 +249,11 @@ export function benchmark(
 
       if (!isWarmup) {
         for (const entry of elementEntries) {
+          // Skip paints that happened while recording was paused. Attribute by the paint's
+          // `renderTime`, not by when the observer callback fired (which can lag the paint).
+          if (!recording.activeAt(entry.renderTime)) {
+            continue;
+          }
           // The default sentinel is the base series; named markers become sub-series.
           const id = entry.identifier === 'default' ? undefined : entry.identifier;
           paint.record(entry.renderTime - iterationStart, id !== undefined ? { id } : undefined);
diff --git a/packages/benchmark/src/metricsGate.test.ts b/packages/benchmark/src/metricsGate.test.ts
new file mode 100644
index 000000000..0bd43f8b3
--- /dev/null
+++ b/packages/benchmark/src/metricsGate.test.ts
@@ -0,0 +1,30 @@
+import { describe, it, expect } from 'vitest';
+import type { RunnerTestCase } from 'vitest';
+import { metricsGate } from './metricsGate';
+
+// The gate keys on test identity via a WeakSet, so any object stands in for a RunnerTestCase.
+function fakeTest(): RunnerTestCase {
+  return {} as RunnerTestCase;
+}
+
+describe('metricsGate', () => {
+  it('is enabled by default for an unseen test', () => {
+    expect(metricsGate.isRecordingEnabled(fakeTest())).toBe(true);
+  });
+
+  it('disables and re-enables recording for a test', () => {
+    const test = fakeTest();
+    metricsGate.setRecordingEnabled(test, false);
+    expect(metricsGate.isRecordingEnabled(test)).toBe(false);
+    metricsGate.setRecordingEnabled(test, true);
+    expect(metricsGate.isRecordingEnabled(test)).toBe(true);
+  });
+
+  it('keeps the state independent per test', () => {
+    const suppressed = fakeTest();
+    const recording = fakeTest();
+    metricsGate.setRecordingEnabled(suppressed, false);
+    expect(metricsGate.isRecordingEnabled(suppressed)).toBe(false);
+    expect(metricsGate.isRecordingEnabled(recording)).toBe(true);
+  });
+});
diff --git a/packages/benchmark/src/metricsGate.ts b/packages/benchmark/src/metricsGate.ts
new file mode 100644
index 000000000..dc9759d8e
--- /dev/null
+++ b/packages/benchmark/src/metricsGate.ts
@@ -0,0 +1,24 @@
+import type { RunnerTestCase } from 'vitest';
+
+// Internal — not exported from the package, not user-facing. The `benchmark()` harness toggles
+// recording per test (off during warmup iterations) and `Metric.record()` consults it, so custom
+// metrics recorded inside a benchmark honor the same warmup exclusion as renders and `bench:paint`.
+//
+// Storage tracks the *disabled* tests so absence means enabled: a test with no entry — e.g. a
+// standalone `it()` loop that never goes through the harness — records normally by default.
+const disabled = new WeakSet<RunnerTestCase>();
+
+export const metricsGate = {
+  /** Whether custom-metric recording is currently enabled for `test`. Defaults to `true`. */
+  isRecordingEnabled(test: RunnerTestCase): boolean {
+    return !disabled.has(test);
+  },
+  /** Enable or disable custom-metric recording for `test`. */
+  setRecordingEnabled(test: RunnerTestCase, enabled: boolean): void {
+    if (enabled) {
+      disabled.delete(test);
+    } else {
+      disabled.add(test);
+    }
+  },
+};
diff --git a/packages/benchmark/src/reactRecording.test.ts b/packages/benchmark/src/reactRecording.test.ts
new file mode 100644
index 000000000..bc009d123
--- /dev/null
+++ b/packages/benchmark/src/reactRecording.test.ts
@@ -0,0 +1,34 @@
+import { describe, it, expect } from 'vitest';
+import { createReactRecordingControls } from './reactRecording';
+
+describe('createReactRecordingControls', () => {
+  it('starts in the requested state', () => {
+    expect(createReactRecordingControls(true).active).toBe(true);
+    expect(createReactRecordingControls(false).active).toBe(false);
+  });
+
+  it('pauses and resumes', () => {
+    const controls = createReactRecordingControls(true);
+    controls.pauseReactRecording();
+    expect(controls.active).toBe(false);
+    controls.resumeReactRecording();
+    expect(controls.active).toBe(true);
+  });
+
+  it('throws when pausing while already paused', () => {
+    const controls = createReactRecordingControls(false);
+    expect(() => controls.pauseReactRecording()).toThrow(/already paused/);
+  });
+
+  it('throws when resuming while already active', () => {
+    const controls = createReactRecordingControls(true);
+    expect(() => controls.resumeReactRecording()).toThrow(/already active/);
+  });
+
+  it('attributes times before all toggles to the initial state and after to the current', () => {
+    const controls = createReactRecordingControls(false);
+    controls.resumeReactRecording();
+    expect(controls.activeAt(-Infinity)).toBe(false);
+    expect(controls.activeAt(Infinity)).toBe(true);
+  });
+});
diff --git a/packages/benchmark/src/reactRecording.ts b/packages/benchmark/src/reactRecording.ts
new file mode 100644
index 000000000..0496f0985
--- /dev/null
+++ b/packages/benchmark/src/reactRecording.ts
@@ -0,0 +1,56 @@
+export interface ReactRecordingControls {
+  /** Whether React render/paint recording is active right now — the synchronous gate for renders. */
+  readonly active: boolean;
+  /**
+   * Whether recording was active at `time` (a `performance.now()` timestamp). Paint entries are
+   * observed asynchronously, so they are attributed by their `renderTime` rather than by the
+   * recording state at the moment the observer callback happens to fire.
+   */
+  activeAt(time: number): boolean;
+  /** Pause React render/paint recording. Throws if recording is already paused. */
+  pauseReactRecording(): void;
+  /** Resume React render/paint recording. Throws if recording is already active. */
+  resumeReactRecording(): void;
+}
+
+/**
+ * Creates the per-iteration switch that turns the harness's React render/paint recording on and
+ * off. The interaction callback drives it via `pauseReactRecording`/`resumeReactRecording`; the
+ * strict state machine (each throws when called in the wrong state) catches unbalanced pairs early.
+ */
+export function createReactRecordingControls(initiallyActive: boolean): ReactRecordingControls {
+  let active = initiallyActive;
+  // Transitions in chronological order. The implicit state before the first toggle is
+  // `initiallyActive`; `activeAt` replays this to attribute a paint to its render time.
+  const transitions: { time: number; active: boolean }[] = [];
+
+  return {
+    get active() {
+      return active;
+    },
+    activeAt(time) {
+      let result = initiallyActive;
+      for (const transition of transitions) {
+        if (transition.time > time) {
+          break;
+        }
+        result = transition.active;
+      }
+      return result;
+    },
+    pauseReactRecording() {
+      if (!active) {
+        throw new Error('pauseReactRecording() called but React recording is already paused.');
+      }
+      active = false;
+      transitions.push({ time: performance.now(), active: false });
+    },
+    resumeReactRecording() {
+      if (active) {
+        throw new Error('resumeReactRecording() called but React recording is already active.');
+      }
+      active = true;
+      transitions.push({ time: performance.now(), active: true });
+    },
+  };
+}
diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts
index f59359bcb..5294752db 100644
--- a/packages/benchmark/src/types.ts
+++ b/packages/benchmark/src/types.ts
@@ -25,6 +25,16 @@ export interface InteractionContext {
    * @param timeout - Timeout in ms. Default: 5000. Pass 0 or Infinity to rely on the test timeout.
    */
   waitForElementTiming: (identifier: string, timeout?: number) => Promise<void>;
+  /**
+   * Pause recording of the harness's React render/paint measurements. Custom metrics keep
+   * recording. Throws if recording is already paused.
+   */
+  pauseReactRecording: () => void;
+  /**
+   * Resume recording of the harness's React render/paint measurements. Throws if recording is
+   * already active.
+   */
+  resumeReactRecording: () => void;
 }
 
 /**
diff --git a/test/performance/tests/react-recording.bench.tsx b/test/performance/tests/react-recording.bench.tsx
new file mode 100644
index 000000000..ed2d30bb3
--- /dev/null
+++ b/test/performance/tests/react-recording.bench.tsx
@@ -0,0 +1,52 @@
+import * as React from 'react';
+import * as ReactDOM from 'react-dom';
+import { benchmark, ElementTiming, ScalarMetric } from '@mui/internal-benchmark';
+
+function simulateSlowdown(ms: number) {
+  const end = performance.now() + ms;
+  while (performance.now() < end) {
+    // burn CPU
+  }
+}
+
+function Counter() {
+  const [count, setCount] = React.useState(0);
+  simulateSlowdown(2);
+  return (
+    <button type="button" onClick={() => setCount((value) => value + 1)}>
+      {/* `key` forces a fresh element per state, so each paints its own element-timing entry. */}
+      <ElementTiming key={count} name={count === 0 ? 'ready' : 'clicked'} />
+      {count}
+    </button>
+  );
+}
+
+// A custom metric recorded inside the benchmark. Warmup iterations are excluded automatically, so
+// this yields exactly `runs` samples without the interaction needing to know about warmup.
+const clickWork = new ScalarMetric({
+  name: 'click_work',
+  format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
+});
+
+// Start with React recording paused so the mount is excluded. Await the mount paint (which lands in
+// its own frame while paused), resume, then measure only the click's re-render and the paint it
+// produces — so the report shows a single `bench:update` render and a single `bench:paint#clicked`.
+benchmark(
+  'Counter click (interaction only)',
+  () => <Counter />,
+  async ({ resumeReactRecording, waitForElementTiming }) => {
+    await waitForElementTiming('ready'); // mount paint — recorded while paused, so excluded
+    resumeReactRecording();
+    clickWork.time();
+    ReactDOM.flushSync(() => {
+      document.querySelector('button')?.click();
+    });
+    clickWork.timeEnd(); // wraps the synchronous re-render
+    await waitForElementTiming('clicked'); // interaction paint — recorded
+  },
+  {
+    runs: 10,
+    warmupRuns: 5,
+    reactRecordingPaused: true,
+  },
+);

From 2ab3216034c53093a54cb01e174524def3b86f4d Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Mon, 8 Jun 2026 19:09:01 +0200
Subject: [PATCH 15/20] [benchmark] Scope the render sanity check to active
 recording windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The blanket "renders > 0" assertion false-failed benchmarks that pause
recording and measure imperatively (no React renders) or via custom metrics
only. Replace it with a per-window check: every active recording window must
capture at least one render, but windows where recording was never running are
not checked — so a fully-paused, metric-only benchmark passes.

createReactRecordingControls tracks per-window render presence (markRendered /
finalizeWindow / hadEmptyActiveWindow); the harness aggregates across iterations
and fails only on an active window that measured nothing.
---
 packages/benchmark/src/index.tsx              | 19 ++++++--
 packages/benchmark/src/reactRecording.test.ts | 43 +++++++++++++++++++
 packages/benchmark/src/reactRecording.ts      | 30 +++++++++++++
 .../tests/imperative-metric.bench.tsx         | 38 ++++++++++++++++
 4 files changed, 126 insertions(+), 4 deletions(-)
 create mode 100644 test/performance/tests/imperative-metric.bench.tsx

diff --git a/packages/benchmark/src/index.tsx b/packages/benchmark/src/index.tsx
index ec6346027..6db367fe9 100644
--- a/packages/benchmark/src/index.tsx
+++ b/packages/benchmark/src/index.tsx
@@ -44,6 +44,7 @@ function BenchProfiler({
       // starts paused, or a span the interaction explicitly excludes).
       if (recording.active) {
         captures.push({ id, phase, actualDuration, startTime });
+        recording.markRendered();
       }
     },
     [captures, recording],
@@ -131,6 +132,8 @@ export function benchmark(
     }
 
     let renderError: unknown = null;
+    // Set if any iteration had a recording window that was active yet captured no renders.
+    let sawEmptyActiveWindow = false;
 
     for (let i = 0; i < totalRuns; i += 1) {
       const isWarmup = i < warmupRuns;
@@ -242,6 +245,12 @@ export function benchmark(
       // eslint-disable-next-line no-await-in-loop
       await waitForElementTiming('default', 0);
 
+      // Close the final window and remember if any active window measured no renders.
+      recording.finalizeWindow();
+      if (recording.hadEmptyActiveWindow) {
+        sawEmptyActiveWindow = true;
+      }
+
       elementObserver?.disconnect();
 
       root.unmount();
@@ -274,11 +283,13 @@ export function benchmark(
       throw renderError;
     }
 
-    // Validate that at least one render was recorded
+    // Every active recording window must capture at least one render. Windows where recording was
+    // never running (e.g. a fully-paused, metric-only benchmark) are not checked.
     expect(
-      iterations[0].renders.length,
-      'No renders were recorded during benchmark',
-    ).toBeGreaterThan(0);
+      sawEmptyActiveWindow,
+      'React recording was active but captured no renders. If you only measure imperative DOM ' +
+        'updates or custom metrics, keep recording paused (reactRecordingPaused) instead of resuming.',
+    ).toBe(false);
 
     // Validate all iterations produced the same render events (count + order).
     // This runs after meta is set so the reporter can still display results on failure.
diff --git a/packages/benchmark/src/reactRecording.test.ts b/packages/benchmark/src/reactRecording.test.ts
index bc009d123..043d14fb8 100644
--- a/packages/benchmark/src/reactRecording.test.ts
+++ b/packages/benchmark/src/reactRecording.test.ts
@@ -32,3 +32,46 @@ describe('createReactRecordingControls', () => {
     expect(controls.activeAt(Infinity)).toBe(true);
   });
 });
+
+describe('createReactRecordingControls active-window render check', () => {
+  it('does not flag a window that captured a render before pause', () => {
+    const controls = createReactRecordingControls(true);
+    controls.markRendered();
+    controls.pauseReactRecording();
+    expect(controls.hadEmptyActiveWindow).toBe(false);
+  });
+
+  it('flags an active window paused without any render', () => {
+    const controls = createReactRecordingControls(true);
+    controls.pauseReactRecording();
+    expect(controls.hadEmptyActiveWindow).toBe(true);
+  });
+
+  it('flags the final window via finalizeWindow when active and empty', () => {
+    const controls = createReactRecordingControls(true);
+    controls.finalizeWindow();
+    expect(controls.hadEmptyActiveWindow).toBe(true);
+  });
+
+  it('does not flag the final window when it captured a render', () => {
+    const controls = createReactRecordingControls(true);
+    controls.markRendered();
+    controls.finalizeWindow();
+    expect(controls.hadEmptyActiveWindow).toBe(false);
+  });
+
+  it('does not flag anything when recording was never active', () => {
+    const controls = createReactRecordingControls(false);
+    controls.finalizeWindow();
+    expect(controls.hadEmptyActiveWindow).toBe(false);
+  });
+
+  it('resets render presence per window', () => {
+    const controls = createReactRecordingControls(true);
+    controls.markRendered();
+    controls.pauseReactRecording(); // first window had a render
+    controls.resumeReactRecording(); // second window starts empty
+    controls.finalizeWindow();
+    expect(controls.hadEmptyActiveWindow).toBe(true);
+  });
+});
diff --git a/packages/benchmark/src/reactRecording.ts b/packages/benchmark/src/reactRecording.ts
index 0496f0985..0720d188e 100644
--- a/packages/benchmark/src/reactRecording.ts
+++ b/packages/benchmark/src/reactRecording.ts
@@ -1,12 +1,18 @@
 export interface ReactRecordingControls {
   /** Whether React render/paint recording is active right now — the synchronous gate for renders. */
   readonly active: boolean;
+  /** Whether any active recording window closed without capturing a render. */
+  readonly hadEmptyActiveWindow: boolean;
   /**
    * Whether recording was active at `time` (a `performance.now()` timestamp). Paint entries are
    * observed asynchronously, so they are attributed by their `renderTime` rather than by the
    * recording state at the moment the observer callback happens to fire.
    */
   activeAt(time: number): boolean;
+  /** Note that a render was captured in the current window. Called by the harness from `onRender`. */
+  markRendered(): void;
+  /** Close the final window at the end of the iteration (validates it if recording is still active). */
+  finalizeWindow(): void;
   /** Pause React render/paint recording. Throws if recording is already paused. */
   pauseReactRecording(): void;
   /** Resume React render/paint recording. Throws if recording is already active. */
@@ -17,17 +23,33 @@ export interface ReactRecordingControls {
  * Creates the per-iteration switch that turns the harness's React render/paint recording on and
  * off. The interaction callback drives it via `pauseReactRecording`/`resumeReactRecording`; the
  * strict state machine (each throws when called in the wrong state) catches unbalanced pairs early.
+ *
+ * It also tracks whether each *active* window captured at least one render, so the harness can flag
+ * a window that was recording but measured nothing — while leaving fully-paused (metric-only)
+ * benchmarks alone.
  */
 export function createReactRecordingControls(initiallyActive: boolean): ReactRecordingControls {
   let active = initiallyActive;
+  let currentWindowHasRender = false;
+  let emptyActiveWindow = false;
   // Transitions in chronological order. The implicit state before the first toggle is
   // `initiallyActive`; `activeAt` replays this to attribute a paint to its render time.
   const transitions: { time: number; active: boolean }[] = [];
 
+  // Flag the window being closed if it was recording yet captured nothing.
+  function closeWindowIfActive() {
+    if (active && !currentWindowHasRender) {
+      emptyActiveWindow = true;
+    }
+  }
+
   return {
     get active() {
       return active;
     },
+    get hadEmptyActiveWindow() {
+      return emptyActiveWindow;
+    },
     activeAt(time) {
       let result = initiallyActive;
       for (const transition of transitions) {
@@ -38,10 +60,17 @@ export function createReactRecordingControls(initiallyActive: boolean): ReactRec
       }
       return result;
     },
+    markRendered() {
+      currentWindowHasRender = true;
+    },
+    finalizeWindow() {
+      closeWindowIfActive();
+    },
     pauseReactRecording() {
       if (!active) {
         throw new Error('pauseReactRecording() called but React recording is already paused.');
       }
+      closeWindowIfActive();
       active = false;
       transitions.push({ time: performance.now(), active: false });
     },
@@ -50,6 +79,7 @@ export function createReactRecordingControls(initiallyActive: boolean): ReactRec
         throw new Error('resumeReactRecording() called but React recording is already active.');
       }
       active = true;
+      currentWindowHasRender = false;
       transitions.push({ time: performance.now(), active: true });
     },
   };
diff --git a/test/performance/tests/imperative-metric.bench.tsx b/test/performance/tests/imperative-metric.bench.tsx
new file mode 100644
index 000000000..1fc2c833a
--- /dev/null
+++ b/test/performance/tests/imperative-metric.bench.tsx
@@ -0,0 +1,38 @@
+import * as React from 'react';
+import { benchmark, ScalarMetric } from '@mui/internal-benchmark';
+
+function simulateSlowdown(ms: number) {
+  const end = performance.now() + ms;
+  while (performance.now() < end) {
+    // burn CPU
+  }
+}
+
+function Widget() {
+  return <div data-state="idle">Widget</div>;
+}
+
+const updateWork = new ScalarMetric({
+  name: 'imperative_update',
+  format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
+});
+
+// Recording stays paused for the whole benchmark: the interaction mutates the DOM imperatively (no
+// React re-render) and measures only a custom metric. The harness must not require a React render
+// here — there is no active recording window to validate.
+benchmark(
+  'Widget imperative update (metric only)',
+  () => <Widget />,
+  async () => {
+    const node = document.querySelector('[data-state]');
+    updateWork.time();
+    node?.setAttribute('data-state', 'active');
+    simulateSlowdown(1);
+    updateWork.timeEnd();
+  },
+  {
+    runs: 10,
+    warmupRuns: 5,
+    reactRecordingPaused: true,
+  },
+);

From 71ef1935069b9fbafd9a23fada85ee7fd204860c Mon Sep 17 00:00:00 2001
From: Jan Potoms <2109932+Janpot@users.noreply.github.com>
Date: Tue, 9 Jun 2026 09:59:53 +0200
Subject: [PATCH 16/20] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Jan Potoms <2109932+Janpot@users.noreply.github.com>
---
 packages/benchmark/src/Metric.ts | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/packages/benchmark/src/Metric.ts b/packages/benchmark/src/Metric.ts
index d60beb944..4e80e9f2c 100644
--- a/packages/benchmark/src/Metric.ts
+++ b/packages/benchmark/src/Metric.ts
@@ -67,11 +67,9 @@ export abstract class Metric {
    * once when the test finishes. Pass `options.id` to split into a labeled sub-series.
    */
   record(value: number, options?: MetricRecordOptions): void {
-    const test = TestRunner.getCurrentTest<RunnerTestCase | undefined>();
     if (!test) {
-      throw new Error(
-        `${this.constructor.name}.record() must be called inside a running benchmark test.`,
-      );
+      throw new Error(`${this.constructor.name}.record() must be called inside a running Vitest test.`);
+    }
     }
 
     // The harness disables the gate during warmup iterations so custom metrics recorded inside a

From 8d8761f431b0822fb75434abe13ddff2d8b829be Mon Sep 17 00:00:00 2001
From: Jan Potoms <2109932+Janpot@users.noreply.github.com>
Date: Tue, 9 Jun 2026 10:00:05 +0200
Subject: [PATCH 17/20] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Jan Potoms <2109932+Janpot@users.noreply.github.com>
---
 packages/benchmark/src/ScalarMetric.ts | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/packages/benchmark/src/ScalarMetric.ts b/packages/benchmark/src/ScalarMetric.ts
index 9b1c02ac8..c83a00600 100644
--- a/packages/benchmark/src/ScalarMetric.ts
+++ b/packages/benchmark/src/ScalarMetric.ts
@@ -21,7 +21,13 @@ export class ScalarMetric extends Metric {
 
   /** Starts a timer. Pass a `label` to time a sub-series; it maps to `record`'s `id`. */
   time(label?: string): void {
-    this.pending.set(label ?? '', performance.now());
+    const key = label ?? '';
+    if (this.pending.has(key)) {
+      throw new Error(
+        `${this.name}.time(${label ? `\"${label}\"` : ''}) was called while a timer is already running for that label.`,
+      );
+    }
+    this.pending.set(key, performance.now());
   }
 
   /** Stops the timer started by `time(label)` and records the elapsed milliseconds. */

From 7ea769b3debbbf8b86c625fcbb241c9eec01562d Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Tue, 9 Jun 2026 10:09:24 +0200
Subject: [PATCH 18/20] [benchmark] Fix broken Copilot autofixes for
 Metric/ScalarMetric
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Metric.ts autofix dropped the `const test = getCurrentTest()` declaration
(leaving every use of `test` undefined) and a stray brace — restore it, keeping
the improved "running Vitest test" wording. The ScalarMetric.ts autofix escaped
quotes inside a template literal (no-useless-escape); unescape them. Add tests
for the new time()-already-running guard.
---
 packages/benchmark/src/Metric.test.ts  | 14 ++++++++++++++
 packages/benchmark/src/Metric.ts       |  6 ++++--
 packages/benchmark/src/ScalarMetric.ts |  2 +-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/packages/benchmark/src/Metric.test.ts b/packages/benchmark/src/Metric.test.ts
index 60a46227d..fc061eb07 100644
--- a/packages/benchmark/src/Metric.test.ts
+++ b/packages/benchmark/src/Metric.test.ts
@@ -35,3 +35,17 @@ describe('ScalarMetric.timeEnd', () => {
     expect(() => metric.timeEnd('b')).toThrow(/without a matching time/);
   });
 });
+
+describe('ScalarMetric.time', () => {
+  it('throws when a timer is already running for the same label', () => {
+    const metric = new ScalarMetric({ name: 'double-timer' });
+    metric.time('a');
+    expect(() => metric.time('a')).toThrow(/already running/);
+  });
+
+  it('allows concurrent timers under different labels', () => {
+    const metric = new ScalarMetric({ name: 'multi-timer' });
+    metric.time('a');
+    expect(() => metric.time('b')).not.toThrow();
+  });
+});
diff --git a/packages/benchmark/src/Metric.ts b/packages/benchmark/src/Metric.ts
index 4e80e9f2c..eea291aa2 100644
--- a/packages/benchmark/src/Metric.ts
+++ b/packages/benchmark/src/Metric.ts
@@ -67,9 +67,11 @@ export abstract class Metric {
    * once when the test finishes. Pass `options.id` to split into a labeled sub-series.
    */
   record(value: number, options?: MetricRecordOptions): void {
+    const test = TestRunner.getCurrentTest<RunnerTestCase | undefined>();
     if (!test) {
-      throw new Error(`${this.constructor.name}.record() must be called inside a running Vitest test.`);
-    }
+      throw new Error(
+        `${this.constructor.name}.record() must be called inside a running Vitest test.`,
+      );
     }
 
     // The harness disables the gate during warmup iterations so custom metrics recorded inside a
diff --git a/packages/benchmark/src/ScalarMetric.ts b/packages/benchmark/src/ScalarMetric.ts
index c83a00600..269d0cbad 100644
--- a/packages/benchmark/src/ScalarMetric.ts
+++ b/packages/benchmark/src/ScalarMetric.ts
@@ -24,7 +24,7 @@ export class ScalarMetric extends Metric {
     const key = label ?? '';
     if (this.pending.has(key)) {
       throw new Error(
-        `${this.name}.time(${label ? `\"${label}\"` : ''}) was called while a timer is already running for that label.`,
+        `${this.name}.time(${label ? `"${label}"` : ''}) was called while a timer is already running for that label.`,
       );
     }
     this.pending.set(key, performance.now());

From b74459360f3f7e75c179e2925532d50de347d344 Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Sat, 13 Jun 2026 04:58:26 +0200
Subject: [PATCH 19/20] [benchmark] Address review comments: timing accuracy,
 deepEqual, toSorted

- stats: use Array.prototype.toSorted instead of [...].sort()
- ScalarMetric.timeEnd / reactRecording pause-resume: capture performance.now()
  before bookkeeping so the surrounding work isn't part of the measurement
- ScalarMetric: drop the superclass-method reference from the class JSDoc
- reporter: replace stableStringify-for-equality with a direct deepEqual
- ciReport: alarm warn/error thresholds must be >= 0
- README: record() or time() inside benchmark(); clarify alarms are evaluated
  for the PR comment, not the local run
---
 packages/benchmark/README.md             |  6 +++--
 packages/benchmark/src/ScalarMetric.ts   |  6 +++--
 packages/benchmark/src/ciReport.ts       |  4 +--
 packages/benchmark/src/reactRecording.ts |  8 ++++--
 packages/benchmark/src/reporter.ts       | 33 +++++++++++++++---------
 packages/benchmark/src/stats.ts          |  2 +-
 6 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index c67157b6a..1c9731dc8 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -129,7 +129,7 @@ This produces a `bench:paint#my-component` sub-series alongside the automatic `b
 
 ### Custom metrics
 
-Record your own measurements — a timing, a count, anything measured inside or outside React — from a plain `it()` loop. There are two primitives:
+Record your own measurements — a timing, a count, anything measured inside or outside React — from a plain `it()` loop or from inside a `benchmark()`. There are two primitives:
 
 - `ScalarMetric` — a continuous value (timings, sizes). Aggregated as mean ± standard deviation with IQR outlier removal, and compared against a baseline with a relative noise band. It also offers a `console.time`-style timing helper.
 - `DiscreteMetric` — a count of events. Compared as an exact integer (any change is significant) and formatted as a whole number.
@@ -159,7 +159,7 @@ it('measures work', () => {
 
 A metric is declared once (typically at module scope) and reused across tests and iterations. `record()` attaches the value to whichever test is running, so the same instance works in any `it()`.
 
-You can also `record()` from inside a `benchmark()` render function or interaction callback. Values recorded during warmup iterations are excluded automatically, just like renders and `bench:paint`, so a metric recorded once per iteration yields exactly `runs` samples.
+You can also `record()` or `time()` from inside a `benchmark()` render function or interaction callback. Values recorded during warmup iterations are excluded automatically, just like renders and `bench:paint`, so a metric recorded once per iteration yields exactly `runs` samples.
 
 #### Metric configuration
 
@@ -171,6 +171,8 @@ You can also `record()` from inside a `benchmark()` render function or interacti
   - `error` — harder band; a regression past it is flagged as an error. Defaults to the dashboard's global noise band only when both `warn` and `error` are omitted; with only `warn` set there is no error band (warning-only).
   - Bands are relative fractions for scalar metrics (`0.1` = 10%) and absolute count deltas for discrete metrics (`1`, `2`). Either band is optional.
 
+Alarms are evaluated against the baseline when the PR comment is generated, not during the local `vitest run` — a regression never fails the test suite locally. In the PR comment, `error`-band regressions surface as failures and `warn`-band regressions as warnings.
+
 #### Sub-series
 
 Pass `record(value, { id })` to split one metric into labeled sub-series, reported as `name#id`. For `ScalarMetric.time()`/`timeEnd()`, pass a label that maps to the same `id`:
diff --git a/packages/benchmark/src/ScalarMetric.ts b/packages/benchmark/src/ScalarMetric.ts
index 269d0cbad..ce730327a 100644
--- a/packages/benchmark/src/ScalarMetric.ts
+++ b/packages/benchmark/src/ScalarMetric.ts
@@ -5,7 +5,7 @@ import type { MetricKind } from './types';
  * A continuous measurement (timings, sizes, …). Samples are aggregated with mean ± standard
  * deviation and IQR outlier removal, and compared against a baseline with a relative noise band.
  *
- * Besides `record()`, it offers a `console.time`-style timing helper:
+ * It offers a `console.time`-style timing helper:
  *
  * ```ts
  * const metric = new ScalarMetric({ name: 'render', format: { style: 'unit', unit: 'millisecond' } });
@@ -32,6 +32,8 @@ export class ScalarMetric extends Metric {
 
   /** Stops the timer started by `time(label)` and records the elapsed milliseconds. */
   timeEnd(label?: string): void {
+    // Capture the end time first so the map lookup/delete below isn't part of the measurement.
+    const end = performance.now();
     const key = label ?? '';
     const start = this.pending.get(key);
     if (start === undefined) {
@@ -40,6 +42,6 @@ export class ScalarMetric extends Metric {
       );
     }
     this.pending.delete(key);
-    this.record(performance.now() - start, label !== undefined ? { id: label } : undefined);
+    this.record(end - start, label !== undefined ? { id: label } : undefined);
   }
 }
diff --git a/packages/benchmark/src/ciReport.ts b/packages/benchmark/src/ciReport.ts
index 1add86b94..8e2d8cd1f 100644
--- a/packages/benchmark/src/ciReport.ts
+++ b/packages/benchmark/src/ciReport.ts
@@ -72,8 +72,8 @@ const metricDefinitionSchema = z.object({
   alarm: z
     .object({
       direction: z.enum(['lowerIsBetter', 'higherIsBetter']).optional(),
-      warn: z.number().optional(),
-      error: z.number().optional(),
+      warn: z.number().min(0).optional(),
+      error: z.number().min(0).optional(),
     })
     .optional(),
 });
diff --git a/packages/benchmark/src/reactRecording.ts b/packages/benchmark/src/reactRecording.ts
index 0720d188e..d4b364b2e 100644
--- a/packages/benchmark/src/reactRecording.ts
+++ b/packages/benchmark/src/reactRecording.ts
@@ -70,17 +70,21 @@ export function createReactRecordingControls(initiallyActive: boolean): ReactRec
       if (!active) {
         throw new Error('pauseReactRecording() called but React recording is already paused.');
       }
+      // Stamp the call site before the bookkeeping so paint attribution uses an accurate time.
+      const now = performance.now();
       closeWindowIfActive();
       active = false;
-      transitions.push({ time: performance.now(), active: false });
+      transitions.push({ time: now, active: false });
     },
     resumeReactRecording() {
       if (active) {
         throw new Error('resumeReactRecording() called but React recording is already active.');
       }
+      // Stamp the call site before the bookkeeping so paint attribution uses an accurate time.
+      const now = performance.now();
       active = true;
       currentWindowHasRender = false;
-      transitions.push({ time: performance.now(), active: true });
+      transitions.push({ time: now, active: true });
     },
   };
 }
diff --git a/packages/benchmark/src/reporter.ts b/packages/benchmark/src/reporter.ts
index 4abc383ca..4060b1cab 100644
--- a/packages/benchmark/src/reporter.ts
+++ b/packages/benchmark/src/reporter.ts
@@ -15,19 +15,28 @@ function getEventKey(event: RenderEvent): string {
   return `${event.id}:${event.phase}`;
 }
 
-/** Order-insensitive JSON: sorts object keys and drops `undefined` so equal configs compare equal. */
-function stableStringify(value: unknown): string {
-  if (value === null || typeof value !== 'object') {
-    return JSON.stringify(value);
+/** Order-insensitive deep equality, treating a missing key and an `undefined` value as equal. */
+function deepEqual(first: unknown, second: unknown): boolean {
+  if (first === second) {
+    return true;
   }
-  if (Array.isArray(value)) {
-    return `[${value.map(stableStringify).join(',')}]`;
+  if (
+    typeof first !== 'object' ||
+    first === null ||
+    typeof second !== 'object' ||
+    second === null
+  ) {
+    return false;
   }
-  const record = value as Record<string, unknown>;
-  const keys = Object.keys(record)
-    .filter((key) => record[key] !== undefined)
-    .sort();
-  return `{${keys.map((key) => `${JSON.stringify(key)}:${stableStringify(record[key])}`).join(',')}}`;
+  const firstRecord = first as Record<string, unknown>;
+  const secondRecord = second as Record<string, unknown>;
+  const keys = new Set([...Object.keys(firstRecord), ...Object.keys(secondRecord)]);
+  for (const key of keys) {
+    if (!deepEqual(firstRecord[key], secondRecord[key])) {
+      return false;
+    }
+  }
+  return true;
 }
 
 function generateReportFromIterations(iterations: IterationData[]): BenchmarkReportEntry {
@@ -203,7 +212,7 @@ function mergeCustomMetrics(
     // matches (e.g. the harness `bench:paint`), but conflicting config would silently apply
     // last-write-wins to every entry — reject it instead.
     const existing = definitions[metricName];
-    if (existing && stableStringify(existing) !== stableStringify(definition)) {
+    if (existing && !deepEqual(existing, definition)) {
       throw new Error(
         `Benchmark metric "${metricName}" is defined with conflicting configuration across ` +
           `benchmarks. A metric name must map to a single kind, format, and alarm.`,
diff --git a/packages/benchmark/src/stats.ts b/packages/benchmark/src/stats.ts
index 3d1ffca7f..70756a8d6 100644
--- a/packages/benchmark/src/stats.ts
+++ b/packages/benchmark/src/stats.ts
@@ -38,7 +38,7 @@ export function aggregateSamples(values: number[]): {
   stdDev: number;
   outliers: number;
 } {
-  const sorted = [...values].sort((first, second) => first - second);
+  const sorted = values.toSorted((first, second) => first - second);
   const q1 = quantile(sorted, 0.25);
   const q3 = quantile(sorted, 0.75);
   const filtered = values.filter((value) => !isOutlier(value, q1, q3));

From 97c8710c176fc6bb8ef0e2a3f8ebb9fbf7592765 Mon Sep 17 00:00:00 2001
From: Janpot <2109932+Janpot@users.noreply.github.com>
Date: Mon, 15 Jun 2026 12:57:50 +0200
Subject: [PATCH 20/20] [benchmark] Stamp resume timestamp last, clarify
 record() in README

Move performance.now() above the guard in pauseReactRecording (earliest, so the closing window excludes pause overhead) and below the bookkeeping in resumeReactRecording (latest, so the new window excludes resume overhead). Also clarify that both metric primitives accept record(value).
---
 packages/benchmark/README.md             |  4 +++-
 packages/benchmark/src/reactRecording.ts | 10 ++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index 1c9731dc8..ba001c90b 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -131,9 +131,11 @@ This produces a `bench:paint#my-component` sub-series alongside the automatic `b
 
 Record your own measurements — a timing, a count, anything measured inside or outside React — from a plain `it()` loop or from inside a `benchmark()`. There are two primitives:
 
-- `ScalarMetric` — a continuous value (timings, sizes). Aggregated as mean ± standard deviation with IQR outlier removal, and compared against a baseline with a relative noise band. It also offers a `console.time`-style timing helper.
+- `ScalarMetric` — a continuous value (timings, sizes). Aggregated as mean ± standard deviation with IQR outlier removal, and compared against a baseline with a relative noise band.
 - `DiscreteMetric` — a count of events. Compared as an exact integer (any change is significant) and formatted as a whole number.
 
+Both record values with `record(value)`. `ScalarMetric` additionally offers `time()`/`timeEnd()` — a `console.time`-style shortcut that records the elapsed milliseconds for you.
+
 ```tsx
 import { it } from 'vitest';
 import { ScalarMetric, DiscreteMetric } from '@mui/internal-benchmark';
diff --git a/packages/benchmark/src/reactRecording.ts b/packages/benchmark/src/reactRecording.ts
index d4b364b2e..2d04c2fa3 100644
--- a/packages/benchmark/src/reactRecording.ts
+++ b/packages/benchmark/src/reactRecording.ts
@@ -67,11 +67,12 @@ export function createReactRecordingControls(initiallyActive: boolean): ReactRec
       closeWindowIfActive();
     },
     pauseReactRecording() {
+      // Stamp first — before the guard and bookkeeping — so the closing window ends as early as
+      // possible and excludes pause's own overhead. In the throw path it is simply discarded.
+      const now = performance.now();
       if (!active) {
         throw new Error('pauseReactRecording() called but React recording is already paused.');
       }
-      // Stamp the call site before the bookkeeping so paint attribution uses an accurate time.
-      const now = performance.now();
       closeWindowIfActive();
       active = false;
       transitions.push({ time: now, active: false });
@@ -80,10 +81,11 @@ export function createReactRecordingControls(initiallyActive: boolean): ReactRec
       if (active) {
         throw new Error('resumeReactRecording() called but React recording is already active.');
       }
-      // Stamp the call site before the bookkeeping so paint attribution uses an accurate time.
-      const now = performance.now();
       active = true;
       currentWindowHasRender = false;
+      // Stamp last — after the bookkeeping — so the new window starts as late as possible and
+      // excludes resume's own overhead.
+      const now = performance.now();
       transitions.push({ time: now, active: true });
     },
   };