diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
index afb076129..655e83e5b 100644
--- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
+++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx
@@ -14,16 +14,22 @@ import ExpandMoreIcon from '@mui/icons-material/ExpandMore';
 import Tooltip from '@mui/material/Tooltip';
 import {
   compareBenchmarkReports,
+  type BenchmarkComparisonInput,
   type BenchmarkComparisonReport,
   type ComparisonItem,
   type DiffValue,
   type BenchmarkDiffSeverity,
 } from '@/lib/benchmark/compareBenchmarkReports';
-import type { BenchmarkReport } from '@/lib/benchmark/types';
-import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters';
+import {
+  formatMs,
+  formatMetricNumber,
+  formatMetricDiff,
+  percentFormatter,
+} from '@/utils/formatters';
 
 const SEVERITY_COLOR: Record<BenchmarkDiffSeverity, string> = {
   error: 'error.main',
+  warning: 'warning.main',
   success: 'success.main',
   neutral: 'text.secondary',
 };
@@ -32,6 +38,7 @@ const MIN_BAR_WIDTH_PX = 4;
 
 const DIFF_BAR_COLORS: Record<BenchmarkDiffSeverity, string> = {
   error: 'var(--mui-palette-error-main)',
+  warning: 'var(--mui-palette-warning-main)',
   success: 'var(--mui-palette-success-main)',
   neutral: 'var(--mui-palette-action-disabled)',
 };
@@ -72,13 +79,19 @@ function computeEntryBar(
   return computeDiffBar(comparison.duration, range.min, range.max);
 }
 
-function FormattedDiffMs({ diff, percent = false }: { diff: DiffValue; percent?: boolean }) {
+interface FormattedDiffMsProps {
+  diff: DiffValue;
+  percent?: boolean;
+  format?: Intl.NumberFormatOptions;
+}
+
+function FormattedDiffMs({ diff, percent = false, format }: FormattedDiffMsProps) {
   if (diff.absoluteDiff === 0) {
     return '\u2014';
   }
   return (
     <React.Fragment>
-      {formatDiffMs(diff.absoluteDiff)}
+      {formatMetricDiff(diff.absoluteDiff, format)}
       {percent && (
         <React.Fragment>
           {' '}
@@ -91,12 +104,18 @@ function FormattedDiffMs({ diff, percent = false }: { diff: DiffValue; percent?:
   );
 }
 
-function DiffCell({ diff, sx }: { diff: DiffValue; sx?: object }) {
+interface DiffCellProps {
+  diff: DiffValue;
+  sx?: object;
+  format?: Intl.NumberFormatOptions;
+}
+
+function DiffCell({ diff, sx, format }: DiffCellProps) {
   const color = SEVERITY_COLOR[diff.severity];
   return (
     <Tooltip title={diff.hint} arrow>
       <TableCell align="right" sx={{ color, ...sx }}>
-        <FormattedDiffMs diff={diff} percent />
+        <FormattedDiffMs diff={diff} percent format={format} />
       </TableCell>
     </Tooltip>
   );
@@ -114,6 +133,7 @@ interface ComparisonTableRow {
   valueFill?: number;
   valueColor?: string;
   diffBar?: { left: number; width: number; color: string };
+  format?: Intl.NumberFormatOptions;
 }
 
 const NAME_CELL_SX = {
@@ -181,7 +201,7 @@ function ComparisonTable({
               <TableCell align="right">{row.outliers}</TableCell>
               {hasBase && (
                 <TableCell align="right">
-                  {row.base != null ? formatMs(row.base) : '\u2014'}
+                  {row.base != null ? formatMetricNumber(row.base, row.format) : '\u2014'}
                 </TableCell>
               )}
               {(() => {
@@ -196,10 +216,10 @@ function ComparisonTable({
                     : undefined;
                   return row.removed ? (
                     <TableCell align="right" sx={{ color: 'success.main', ...diffBarSx }}>
-                      <FormattedDiffMs diff={row.comparison} percent />
+                      <FormattedDiffMs diff={row.comparison} percent format={row.format} />
                     </TableCell>
                   ) : (
-                    <DiffCell diff={row.comparison} sx={diffBarSx} />
+                    <DiffCell diff={row.comparison} sx={diffBarSx} format={row.format} />
                   );
                 }
                 return <TableCell align="right">{'\u2014'}</TableCell>;
@@ -246,24 +266,20 @@ function TotalsSummary({ totals }: { totals: BenchmarkComparisonReport['totals']
           </Typography>
         </Typography>
       </Tooltip>
-      {totals.paintDefault && (
-        <Tooltip title={totals.paintDefault.hint} arrow>
-          <Typography variant="body2">
-            <strong>Paint:</strong>{' '}
-            <Typography
-              component="span"
-              variant="body2"
-              sx={{ color: SEVERITY_COLOR[totals.paintDefault.severity] }}
-            >
-              <FormattedDiffMs diff={totals.paintDefault} percent />
-            </Typography>
-          </Typography>
-        </Tooltip>
-      )}
     </Box>
   );
 }
 
+function worstMetricSeverity(metrics: ComparisonItem['metrics']): BenchmarkDiffSeverity {
+  if (metrics.some((row) => row.diff.severity === 'error')) {
+    return 'error';
+  }
+  if (metrics.some((row) => row.diff.severity === 'warning')) {
+    return 'warning';
+  }
+  return 'neutral';
+}
+
 function BenchmarkAccordion({
   comparison,
   hasBase,
@@ -278,6 +294,8 @@ function BenchmarkAccordion({
   entryDiffRange: { min: number; max: number };
 }) {
   const renderCount = comparison.renders.filter((row) => !row.removed).length;
+  const metricCount = comparison.metrics.filter((row) => !row.removed).length;
+  const metricSeverity = worstMetricSeverity(comparison.metrics);
 
   const entryBar = computeEntryBar(comparison, hasBase, entryDiffRange);
 
@@ -345,6 +363,11 @@ function BenchmarkAccordion({
                 </Tooltip>
               )}
             </Typography>
+            {metricCount > 0 && (
+              <Typography variant="body2" sx={{ color: SEVERITY_COLOR[metricSeverity] }}>
+                {metricCount} metrics
+              </Typography>
+            )}
           </Box>
         </Box>
       </AccordionSummary>
@@ -406,9 +429,9 @@ function BenchmarkAccordion({
                     '\u2014'
                   ) : (
                     <React.Fragment>
-                      {formatMs(row.value)}{' '}
+                      {formatMetricNumber(row.value, row.format)}{' '}
                       <Typography component="span" variant="caption" color="text.secondary">
-                        ±{formatMs(row.stdDev)}
+                        ±{formatMetricNumber(row.stdDev, row.format)}
                       </Typography>
                     </React.Fragment>
                   ),
@@ -416,6 +439,7 @@ function BenchmarkAccordion({
                   comparison: row.diff,
                   base: row.diff?.base,
                   removed: row.removed,
+                  format: row.format,
                 };
               })}
             />
@@ -431,8 +455,8 @@ function BenchmarkAccordion({
 }
 
 interface BenchmarkComparisonReportViewProps {
-  value: BenchmarkReport;
-  base: BenchmarkReport | null;
+  value: BenchmarkComparisonInput;
+  base: BenchmarkComparisonInput | null;
 }
 
 export function BenchmarkComparisonReportView({ value, base }: BenchmarkComparisonReportViewProps) {
diff --git a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
index 27e09ec1f..608131535 100644
--- a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
+++ b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
@@ -10,7 +10,7 @@ import Tabs from '@mui/material/Tabs';
 import Tab from '@mui/material/Tab';
 import { BarChartPro } from '@mui/x-charts-pro/BarChartPro';
 import { useXScale, useDrawingArea } from '@mui/x-charts-pro/hooks';
-import type { BenchmarkReport } from '@/lib/benchmark/types';
+import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types';
 import { formatMs } from '@/utils/formatters';
 import { useMasterCommits, type GitHubCommit } from '../hooks/useMasterCommits';
 import { useCiReports } from '../hooks/useCiReports';
@@ -81,6 +81,7 @@ interface CommitReportData {
   timestamp: number;
   commit: GitHubCommit;
   report: BenchmarkReport | null;
+  metricDefinitions?: Record<string, MetricDefinition>;
 }
 
 function collectBenchmarkNames(chartData: CommitReportData[]): string[] {
@@ -126,6 +127,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
         timestamp,
         commit,
         report: reports[commit.sha]?.report ?? null,
+        metricDefinitions: reports[commit.sha]?.metricDefinitions,
       })),
     [commits, reports],
   );
@@ -154,7 +156,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
         return entry.totalDuration;
       }
       if (chartMode === 'paint') {
-        return entry.metrics['paint:default']?.mean ?? null;
+        return entry.metrics['bench:paint']?.mean ?? null;
       }
       return entry.renders.length;
     },
@@ -300,8 +302,10 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
       return null;
     }
     return {
-      value: reportData.report,
-      base: baselineData?.report ?? null,
+      value: { report: reportData.report, metricDefinitions: reportData.metricDefinitions },
+      base: baselineData?.report
+        ? { report: baselineData.report, metricDefinitions: baselineData.metricDefinitions }
+        : null,
       valueCommit: reportData.commit,
       baseCommit: baselineData?.commit ?? null,
     };
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
index f861d5aae..6d07c7729 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts
@@ -1,8 +1,34 @@
 import { describe, it, expect } from 'vitest';
 import { buildBenchmarkMarkdownReport } from './buildMarkdownReport';
-import { compareBenchmarkReports } from './compareBenchmarkReports';
+import { compareBenchmarkReports, type BenchmarkComparisonInput } from './compareBenchmarkReports';
+import type { BenchmarkReportEntry, MetricDefinition } from './types';
 import { makeReport, makeReportFromConfig } from './test-fixtures';
 
+// A single benchmark whose duration/renders are neutral, so the only signal is one metric.
+function metricReport(
+  name: string,
+  mean: number,
+  definitions?: Record<string, MetricDefinition>,
+): BenchmarkComparisonInput {
+  const entry: BenchmarkReportEntry = {
+    iterations: 10,
+    totalDuration: 100,
+    renders: [],
+    metrics: { [name]: { mean, stdDev: 0, outliers: 0 } },
+  };
+  return { report: { Bench: entry }, metricDefinitions: definitions };
+}
+
+const alarmDefinitions: Record<string, MetricDefinition> = {
+  clicks: {
+    kind: 'discrete',
+    format: { maximumFractionDigits: 0 },
+    alarm: { direction: 'lowerIsBetter', error: 1 },
+  },
+  tti: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 } },
+  fps: { kind: 'scalar' }, // informational (no alarm)
+};
+
 describe('buildBenchmarkMarkdownReport', () => {
   it('drops within-noise rows but keeps significant regressions', () => {
     const report = compareBenchmarkReports(
@@ -126,4 +152,38 @@ describe('buildBenchmarkMarkdownReport', () => {
     expect(markdown).not.toContain('No significant changes');
     expect(markdown).toContain('| Test |');
   });
+
+  describe('metric alarms', () => {
+    it('surfaces an error-level metric, making its otherwise-neutral test significant', () => {
+      const report = compareBenchmarkReports(
+        metricReport('clicks', 5, alarmDefinitions), // +2 vs 3, discrete error band 1
+        metricReport('clicks', 3),
+      );
+      const markdown = buildBenchmarkMarkdownReport(report);
+      expect(markdown).not.toContain('No significant changes');
+      expect(markdown).toContain('**Metric alarms**');
+      expect(markdown).toContain('| Bench | clicks |');
+      expect(markdown).toContain('🔺');
+    });
+
+    it('does not surface warning-level metrics (kept on the dashboard only)', () => {
+      const report = compareBenchmarkReports(
+        metricReport('tti', 115, alarmDefinitions), // +15%: past warn (10%), within error (25%) -> warning
+        metricReport('tti', 100),
+      );
+      const markdown = buildBenchmarkMarkdownReport(report);
+      expect(markdown).toContain('No significant changes');
+      expect(markdown).not.toContain('Metric alarms');
+    });
+
+    it('ignores informational metrics — no alarm section, test stays within noise', () => {
+      const report = compareBenchmarkReports(
+        metricReport('fps', 200, alarmDefinitions), // big change but no alarm configured
+        metricReport('fps', 100),
+      );
+      const markdown = buildBenchmarkMarkdownReport(report);
+      expect(markdown).toContain('No significant changes');
+      expect(markdown).not.toContain('Metric alarms');
+    });
+  });
 });
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
index c21135cc2..d33a6c403 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
@@ -1,5 +1,12 @@
-import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters';
-import type { BenchmarkComparisonReport, DiffValue } from './compareBenchmarkReports';
+import { formatMs, formatDiffMs, formatMetricDiff, percentFormatter } from '@/utils/formatters';
+import type {
+  BenchmarkComparisonReport,
+  ComparisonEntry,
+  DiffValue,
+} from './compareBenchmarkReports';
+
+// Only error-level alarms surface in the PR comment; warnings stay on the dashboard.
+const isAlarmed = (metric: ComparisonEntry): boolean => metric.diff.severity === 'error';
 
 export interface BuildMarkdownReportOptions {
   maxRows?: number;
@@ -8,6 +15,7 @@ export interface BuildMarkdownReportOptions {
 
 const SEVERITY_PREFIX: Record<string, string> = {
   error: '🔺',
+  warning: '⚠️',
   success: '▼',
 };
 
@@ -42,11 +50,6 @@ export function buildBenchmarkMarkdownReport(
       `**Total duration:** ${formatMs(report.totals.duration.current ?? 0)}${formatDiff(report.totals.duration, 'ms')}`,
       `**Renders:** ${report.totals.renderCount.current ?? 0}${formatDiff(report.totals.renderCount, 'count')}`,
     ];
-    if (report.totals.paintDefault) {
-      totalParts.push(
-        `**Paint:** ${formatMs(report.totals.paintDefault.current ?? 0)}${formatDiff(report.totals.paintDefault, 'ms')}`,
-      );
-    }
     lines.push(totalParts.join(' | '));
     lines.push('');
   }
@@ -56,7 +59,8 @@ export function buildBenchmarkMarkdownReport(
       entry.duration.severity !== 'neutral' ||
       (entry.renderCount?.severity ?? 'neutral') !== 'neutral' ||
       entry.duration.current === null ||
-      entry.duration.base === null,
+      entry.duration.base === null ||
+      entry.metrics.some(isAlarmed),
   );
 
   const detailsLink = reportUrl ? `[details](${reportUrl})` : '';
@@ -99,5 +103,22 @@ export function buildBenchmarkMarkdownReport(
     lines.push(detailsLink);
   }
 
+  // Error-level metric alarms across all tests (independent of the test-table row cap).
+  const alarms = report.entries.flatMap((entry) =>
+    entry.metrics.filter(isAlarmed).map((metric) => ({ test: entry.name, metric })),
+  );
+  if (alarms.length > 0) {
+    lines.push('');
+    lines.push('**Metric alarms**');
+    lines.push('');
+    lines.push('| Test | Metric | Change |');
+    lines.push('|:-----|:-------|-------:|');
+    for (const { test, metric } of alarms) {
+      const prefix = SEVERITY_PREFIX[metric.diff.severity] ?? '';
+      const change = `${prefix} ${formatMetricDiff(metric.diff.absoluteDiff, metric.format)}`;
+      lines.push(`| ${test} | ${metric.name} | ${change} |`);
+    }
+  }
+
   return lines.join('\n');
 }
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
index 87370da5b..e17ea9a26 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts
@@ -1,7 +1,38 @@
 import { describe, it, expect } from 'vitest';
 import { compareBenchmarkReports } from './compareBenchmarkReports';
+import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition } from './types';
 import { makeReport, makeReportFromConfig } from './test-fixtures';
 
+function reportWithMetrics(metrics: Record<string, number>): BenchmarkReport {
+  const entry: BenchmarkReportEntry = {
+    iterations: 10,
+    totalDuration: 0,
+    renders: [],
+    metrics: Object.fromEntries(
+      Object.entries(metrics).map(([name, mean]) => [name, { mean, stdDev: 0, outliers: 0 }]),
+    ),
+  };
+  return { Bench: entry };
+}
+
+const definitions: Record<string, MetricDefinition> = {
+  scalar_alarm: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', error: 0.1 } },
+  scalar_tiered: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 } },
+  scalar_higher: { kind: 'scalar', alarm: { direction: 'higherIsBetter', error: 0.1 } },
+  scalar_info: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } },
+  discrete_alarm: { kind: 'discrete', alarm: { direction: 'lowerIsBetter' } },
+  discrete_tiered: { kind: 'discrete', alarm: { direction: 'lowerIsBetter', warn: 1, error: 3 } },
+};
+
+function metricEntry(current: BenchmarkReport, base: BenchmarkReport, metricName: string) {
+  // The metric appears on the current side, so its definitions ride along there.
+  const result = compareBenchmarkReports(
+    { report: current, metricDefinitions: definitions },
+    { report: base },
+  );
+  return result.entries[0].metrics.find((metric) => metric.name === metricName)!;
+}
+
 describe('compareBenchmarkReports', () => {
   it('marks diffs within ±20% as neutral noise', () => {
     const result = compareBenchmarkReports(
@@ -183,4 +214,134 @@ describe('compareBenchmarkReports', () => {
       expect(order).toEqual(['DurationRegression', 'FewerRenders']);
     });
   });
+
+  describe('custom metrics', () => {
+    it('keeps an informational metric (no alarm) neutral even on a large change', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_info: 200 }),
+        reportWithMetrics({ scalar_info: 100 }),
+        'scalar_info',
+      );
+      expect(metric.diff.severity).toBe('neutral');
+      expect(metric.format).toEqual({ style: 'unit', unit: 'byte' });
+      // The hint respects the metric's format rather than hard-coding milliseconds.
+      expect(metric.diff.hint).toContain('byte');
+      expect(metric.diff.hint).not.toContain('ms');
+    });
+
+    it('flags a scalar alarm regression beyond its error band', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_alarm: 120 }), // +20%, error band is 10%
+        reportWithMetrics({ scalar_alarm: 100 }),
+        'scalar_alarm',
+      );
+      expect(metric.diff.severity).toBe('error');
+      expect(metric.diff.hint).toContain('Regression');
+    });
+
+    it('flags a scalar regression between warn and error bands as a warning', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_tiered: 115 }), // +15%: past warn (10%), within error (25%)
+        reportWithMetrics({ scalar_tiered: 100 }),
+        'scalar_tiered',
+      );
+      expect(metric.diff.severity).toBe('warning');
+      expect(metric.diff.hint).toContain('Warning');
+    });
+
+    it('escalates a scalar regression past the error band to error', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_tiered: 130 }), // +30%: past error (25%)
+        reportWithMetrics({ scalar_tiered: 100 }),
+        'scalar_tiered',
+      );
+      expect(metric.diff.severity).toBe('error');
+    });
+
+    it('keeps a scalar change within the warn band neutral', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_tiered: 105 }), // +5%: within warn
+        reportWithMetrics({ scalar_tiered: 100 }),
+        'scalar_tiered',
+      );
+      expect(metric.diff.severity).toBe('neutral');
+    });
+
+    it('tiers discrete metrics by absolute count delta, inclusive of the band', () => {
+      const warn = metricEntry(
+        reportWithMetrics({ discrete_tiered: 4 }), // +1: meets warn (1), below error (3)
+        reportWithMetrics({ discrete_tiered: 3 }),
+        'discrete_tiered',
+      );
+      expect(warn.diff.severity).toBe('warning');
+
+      const error = metricEntry(
+        reportWithMetrics({ discrete_tiered: 6 }), // +3: meets error (3)
+        reportWithMetrics({ discrete_tiered: 3 }),
+        'discrete_tiered',
+      );
+      expect(error.diff.severity).toBe('error');
+    });
+
+    it('honors higherIsBetter so an increase is an improvement', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ scalar_higher: 120 }),
+        reportWithMetrics({ scalar_higher: 100 }),
+        'scalar_higher',
+      );
+      expect(metric.diff.severity).toBe('success');
+      expect(metric.diff.hint).toContain('Improvement');
+    });
+
+    it('compares discrete alarms exactly — any change is flagged', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ discrete_alarm: 4 }),
+        reportWithMetrics({ discrete_alarm: 3 }),
+        'discrete_alarm',
+      );
+      expect(metric.diff.severity).toBe('error');
+      expect(metric.diff.hint).toBe('Regression: +1');
+    });
+
+    it('keeps metrics without a definition on the default noise-band behavior', () => {
+      // No definition for `paint:default`, so it uses the global ±20% noise band.
+      const withinNoise = metricEntry(
+        reportWithMetrics({ 'paint:default': 110 }),
+        reportWithMetrics({ 'paint:default': 100 }),
+        'paint:default',
+      );
+      expect(withinNoise.diff.severity).toBe('neutral');
+
+      const regression = metricEntry(
+        reportWithMetrics({ 'paint:default': 130 }),
+        reportWithMetrics({ 'paint:default': 100 }),
+        'paint:default',
+      );
+      expect(regression.diff.severity).toBe('error');
+    });
+
+    it('resolves a sub-series definition by its base metric name', () => {
+      const metric = metricEntry(
+        reportWithMetrics({ 'scalar_alarm#large': 120 }),
+        reportWithMetrics({ 'scalar_alarm#large': 100 }),
+        'scalar_alarm#large',
+      );
+      expect(metric.diff.severity).toBe('error');
+    });
+
+    it('keeps a base-only (removed) metric formatted using its base definition', () => {
+      // Definitions travel with their report: the metric exists only in the base, so its formatting
+      // comes from the base side's definitions even though the head has none.
+      const result = compareBenchmarkReports(
+        { report: reportWithMetrics({}) },
+        {
+          report: reportWithMetrics({ bytes: 100 }),
+          metricDefinitions: { bytes: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } } },
+        },
+      );
+      const metric = result.entries[0].metrics.find((entry) => entry.name === 'bytes')!;
+      expect(metric.removed).toBe(true);
+      expect(metric.format).toEqual({ style: 'unit', unit: 'byte' });
+    });
+  });
 });
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
index af0aa2ca4..513e741e4 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts
@@ -1,9 +1,17 @@
-import { formatDiffMs, percentFormatter } from '@/utils/formatters';
-import type { BenchmarkReport, BenchmarkReportEntry, RenderStats } from './types';
+import { formatDiffMs, formatMetricDiff, percentFormatter } from '@/utils/formatters';
+import type {
+  BenchmarkBaseUpload,
+  BenchmarkReportEntry,
+  MetricDefinition,
+  RenderStats,
+} from './types';
+
+/** A report paired with its own metric definitions — the unit the comparison operates on. */
+export type BenchmarkComparisonInput = Pick<BenchmarkBaseUpload, 'report' | 'metricDefinitions'>;
 
 const NOISE_THRESHOLD = 0.2;
 
-export type BenchmarkDiffSeverity = 'error' | 'success' | 'neutral';
+export type BenchmarkDiffSeverity = 'error' | 'warning' | 'success' | 'neutral';
 
 export interface DiffValue {
   current: number | null;
@@ -21,6 +29,8 @@ export interface ComparisonEntry {
   outliers: number;
   diff: DiffValue;
   removed: boolean;
+  /** Display formatting for custom metrics; absent for renders and built-in (paint) metrics. */
+  format?: Intl.NumberFormatOptions;
 }
 
 export interface ComparisonItem {
@@ -34,8 +44,9 @@ export interface ComparisonItem {
 
 const SEVERITY_RANK: Record<BenchmarkDiffSeverity, number> = {
   error: 0,
-  success: 1,
-  neutral: 2,
+  warning: 1,
+  success: 2,
+  neutral: 3,
 };
 
 export interface BenchmarkComparisonReport {
@@ -44,7 +55,6 @@ export interface BenchmarkComparisonReport {
   totals: {
     duration: DiffValue;
     renderCount: DiffValue;
-    paintDefault: DiffValue | null;
   };
 }
 
@@ -153,21 +163,117 @@ function compareRenders(
   return entries;
 }
 
+/** Strips a `#sub-series` suffix to recover the metric name used to look up its definition. */
+function baseMetricName(key: string): string {
+  const hashIndex = key.indexOf('#');
+  return hashIndex === -1 ? key : key.slice(0, hashIndex);
+}
+
+/**
+ * Diff for a custom metric, honoring its definition. A metric without an `alarm` is
+ * informational (always neutral). With an `alarm`, a regression past the `warn` band is flagged
+ * `warning` and past the `error` band `error`; improvements are `success`. Bands are relative
+ * fractions for `scalar` metrics and absolute count deltas for `discrete` metrics. Metrics
+ * without a definition (e.g. paint) keep the default `makeDiffValue` behavior and never reach here.
+ */
+function makeMetricDiff(
+  current: number | null,
+  base: number | null,
+  definition: MetricDefinition,
+): DiffValue {
+  if (base === null) {
+    return { current, base, absoluteDiff: 0, relativeDiff: 0, severity: 'neutral', hint: 'New' };
+  }
+
+  const currentVal = current ?? 0;
+  const absoluteDiff = currentVal - base;
+  const relativeDiff = base !== 0 ? absoluteDiff / base : 0;
+
+  const { alarm, kind } = definition;
+  const isDiscrete = kind === 'discrete';
+  const diffStr = isDiscrete
+    ? `${absoluteDiff > 0 ? '+' : ''}${absoluteDiff}`
+    : `${formatMetricDiff(absoluteDiff, definition.format)} (${percentFormatter.format(relativeDiff)})`;
+
+  // Informational metric: show the change, never flag it.
+  if (!alarm) {
+    return {
+      current,
+      base,
+      absoluteDiff,
+      relativeDiff,
+      severity: 'neutral',
+      hint: absoluteDiff === 0 ? 'No change' : diffStr,
+    };
+  }
+
+  // Scalar bands are relative fractions; discrete bands are absolute count deltas, and meet their
+  // threshold inclusively (an `error: 2` discrete alarm fires on a delta of exactly 2).
+  const magnitude = isDiscrete ? Math.abs(absoluteDiff) : Math.abs(relativeDiff);
+  const meets = (band: number) => (isDiscrete ? magnitude >= band : magnitude > band);
+
+  // When no bands are given, `error` falls back to the global noise band (scalar) or any change
+  // (discrete). When only `warn` is given, there is no error band.
+  const defaultError = isDiscrete ? 0 : NOISE_THRESHOLD;
+  const errorBand = alarm.error ?? (alarm.warn === undefined ? defaultError : undefined);
+  const warnBand = alarm.warn;
+
+  let level: 'none' | 'warn' | 'error' = 'none';
+  if (absoluteDiff !== 0) {
+    if (errorBand !== undefined && meets(errorBand)) {
+      level = 'error';
+    } else if (warnBand !== undefined && meets(warnBand)) {
+      level = 'warn';
+    }
+  }
+
+  const direction = alarm.direction ?? 'lowerIsBetter';
+  const isRegression = direction === 'lowerIsBetter' ? absoluteDiff > 0 : absoluteDiff < 0;
+
+  let severity: BenchmarkDiffSeverity = 'neutral';
+  if (level !== 'none') {
+    if (!isRegression) {
+      severity = 'success';
+    } else {
+      severity = level === 'error' ? 'error' : 'warning';
+    }
+  }
+
+  let hint: string;
+  if (absoluteDiff === 0) {
+    hint = 'No change';
+  } else if (level === 'none') {
+    hint = `Within noise: ${diffStr}`;
+  } else if (!isRegression) {
+    hint = `Improvement: ${diffStr}`;
+  } else {
+    hint = `${level === 'error' ? 'Regression' : 'Warning'}: ${diffStr}`;
+  }
+
+  return { current, base, absoluteDiff, relativeDiff, severity, hint };
+}
+
 function compareMetrics(
   currentMetrics: Record<string, { mean: number; stdDev: number; outliers: number }>,
   baseEntry: BenchmarkReportEntry | undefined,
+  currentDefinitions: Record<string, MetricDefinition> | undefined,
+  baseDefinitions: Record<string, MetricDefinition> | undefined,
 ): ComparisonEntry[] {
   const entries: ComparisonEntry[] = [];
 
   for (const [name, stats] of Object.entries(currentMetrics)) {
+    const definition = currentDefinitions?.[baseMetricName(name)];
     const baseStats = baseEntry?.metrics[name];
     entries.push({
       name,
       value: stats.mean,
       stdDev: stats.stdDev,
       outliers: stats.outliers,
-      diff: makeDiffValue(stats.mean, baseStats?.mean ?? null),
+      diff: definition
+        ? makeMetricDiff(stats.mean, baseStats?.mean ?? null, definition)
+        : makeDiffValue(stats.mean, baseStats?.mean ?? null),
       removed: false,
+      format: definition?.format,
     });
   }
 
@@ -175,6 +281,7 @@ function compareMetrics(
   if (baseEntry) {
     for (const [name, baseStats] of Object.entries(baseEntry.metrics)) {
       if (!(name in currentMetrics)) {
+        const definition = baseDefinitions?.[baseMetricName(name)];
         entries.push({
           name,
           value: 0,
@@ -182,6 +289,7 @@ function compareMetrics(
           outliers: 0,
           diff: makeDiffValue(null, baseStats.mean),
           removed: true,
+          format: definition?.format,
         });
       }
     }
@@ -225,22 +333,25 @@ function compareItems(a: ComparisonItem, b: ComparisonItem): number {
 }
 
 export function compareBenchmarkReports(
-  current: BenchmarkReport,
-  base: BenchmarkReport | null,
+  current: BenchmarkComparisonInput,
+  base: BenchmarkComparisonInput | null,
 ): BenchmarkComparisonReport {
-  const effectiveBase = base ?? {};
+  // Definitions travel with their report: a current metric uses the current definitions, a
+  // base-only/removed metric uses the base definitions. No merge step — the side a metric appears
+  // on decides how it's formatted.
+  const currentDefinitions = current.metricDefinitions;
+  const baseDefinitions = base?.metricDefinitions;
+  const currentReport = current.report;
+  const effectiveBase = base?.report ?? {};
   const entries: ComparisonItem[] = [];
 
   let totalCurrentDuration = 0;
   let totalBaseDuration = 0;
   let totalCurrentRenders = 0;
   let totalBaseRenders = 0;
-  let totalCurrentPaint = 0;
-  let totalBasePaint = 0;
-  let hasPaint = false;
 
   // Process current entries
-  for (const [name, entry] of Object.entries(current)) {
+  for (const [name, entry] of Object.entries(currentReport)) {
     const baseEntry = effectiveBase[name];
 
     const duration = makeDiffValue(entry.totalDuration, baseEntry?.totalDuration ?? null);
@@ -251,7 +362,7 @@ export function compareBenchmarkReports(
         ? makeCountDiffValue(entry.renders.length, baseEntry.renders.length)
         : undefined,
       renders: compareRenders(entry.renders, baseEntry),
-      metrics: compareMetrics(entry.metrics, baseEntry),
+      metrics: compareMetrics(entry.metrics, baseEntry, currentDefinitions, baseDefinitions),
       iterations: entry.iterations,
     });
 
@@ -259,19 +370,11 @@ export function compareBenchmarkReports(
     totalBaseDuration += baseEntry?.totalDuration ?? 0;
     totalCurrentRenders += entry.renders.length;
     totalBaseRenders += baseEntry?.renders.length ?? 0;
-
-    const paintMetric = entry.metrics['paint:default'];
-    const basePaintMetric = baseEntry?.metrics['paint:default'];
-    if (paintMetric || basePaintMetric) {
-      hasPaint = true;
-      totalCurrentPaint += paintMetric?.mean ?? 0;
-      totalBasePaint += basePaintMetric?.mean ?? 0;
-    }
   }
 
   // Process removed entries (in base but not in current)
   for (const [name, baseEntry] of Object.entries(effectiveBase)) {
-    if (name in current) {
+    if (name in currentReport) {
       continue;
     }
 
@@ -280,18 +383,12 @@ export function compareBenchmarkReports(
       name,
       duration,
       renders: compareRenders([], baseEntry),
-      metrics: compareMetrics({}, baseEntry),
+      metrics: compareMetrics({}, baseEntry, currentDefinitions, baseDefinitions),
       iterations: 0,
     });
 
     totalBaseDuration += baseEntry.totalDuration;
     totalBaseRenders += baseEntry.renders.length;
-
-    const basePaintMetric = baseEntry.metrics['paint:default'];
-    if (basePaintMetric) {
-      hasPaint = true;
-      totalBasePaint += basePaintMetric.mean;
-    }
   }
 
   entries.sort(compareItems);
@@ -302,7 +399,6 @@ export function compareBenchmarkReports(
     totals: {
       duration: makeDiffValue(totalCurrentDuration, totalBaseDuration),
       renderCount: makeCountDiffValue(totalCurrentRenders, totalBaseRenders),
-      paintDefault: hasPaint ? makeDiffValue(totalCurrentPaint, totalBasePaint) : null,
     },
   };
 }
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts
new file mode 100644
index 000000000..538b2537b
--- /dev/null
+++ b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts
@@ -0,0 +1,40 @@
+import { describe, it, expect } from 'vitest';
+import { migrateBenchmarkReport } from './migrateBenchmarkReport';
+import type { BenchmarkReport } from './types';
+
+function report(metrics: Record<string, number>): BenchmarkReport {
+  return {
+    Bench: {
+      iterations: 10,
+      totalDuration: 0,
+      renders: [],
+      metrics: Object.fromEntries(
+        Object.entries(metrics).map(([name, mean]) => [name, { mean, stdDev: 0, outliers: 0 }]),
+      ),
+    },
+  };
+}
+
+describe('migrateBenchmarkReport', () => {
+  it('renames legacy paint:default to bench:paint', () => {
+    const migrated = migrateBenchmarkReport(report({ 'paint:default': 12 }));
+    expect(Object.keys(migrated.Bench.metrics)).toEqual(['bench:paint']);
+    expect(migrated.Bench.metrics['bench:paint'].mean).toBe(12);
+  });
+
+  it('renames named legacy paint keys to bench:paint sub-series', () => {
+    const migrated = migrateBenchmarkReport(report({ 'paint:grid-header': 8 }));
+    expect(Object.keys(migrated.Bench.metrics)).toEqual(['bench:paint#grid-header']);
+  });
+
+  it('leaves current and custom metric keys untouched (idempotent)', () => {
+    const migrated = migrateBenchmarkReport(
+      report({ 'bench:paint': 5, 'bench:paint#header': 3, button_clicks: 2 }),
+    );
+    expect(Object.keys(migrated.Bench.metrics).sort()).toEqual([
+      'bench:paint',
+      'bench:paint#header',
+      'button_clicks',
+    ]);
+  });
+});
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts
new file mode 100644
index 000000000..733342348
--- /dev/null
+++ b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts
@@ -0,0 +1,35 @@
+import type { BenchmarkReport, BenchmarkReportEntry } from './types';
+
+const LEGACY_PAINT_PREFIX = 'paint:';
+
+/**
+ * Renames legacy harness paint keys (`paint:default`, `paint:grid-header`) to the current
+ * `bench:paint` scheme (`bench:paint`, `bench:paint#grid-header`). Idempotent: keys already in the
+ * new scheme are untouched.
+ */
+function migrateMetricKey(key: string): string {
+  if (!key.startsWith(LEGACY_PAINT_PREFIX)) {
+    return key;
+  }
+  const identifier = key.slice(LEGACY_PAINT_PREFIX.length);
+  return identifier === 'default' ? 'bench:paint' : `bench:paint#${identifier}`;
+}
+
+function migrateEntry(entry: BenchmarkReportEntry): BenchmarkReportEntry {
+  return {
+    ...entry,
+    metrics: Object.fromEntries(
+      Object.entries(entry.metrics).map(([key, stats]) => [migrateMetricKey(key), stats]),
+    ),
+  };
+}
+
+/**
+ * Applies forward migrations to a benchmark report fetched from S3 so older uploads read with the
+ * current shape. Centralizes legacy normalization — add future migrations here.
+ */
+export function migrateBenchmarkReport(report: BenchmarkReport): BenchmarkReport {
+  return Object.fromEntries(
+    Object.entries(report).map(([name, entry]) => [name, migrateEntry(entry)]),
+  );
+}
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts b/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts
index c786b8297..18e86d17a 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts
@@ -1,4 +1,5 @@
 import type { BenchmarkReport, BenchmarkReportEntry } from './types';
+import type { BenchmarkComparisonInput } from './compareBenchmarkReports';
 
 export function makeEntry(totalDuration: number, renderCount: number = 1): BenchmarkReportEntry {
   const perRender = renderCount > 0 ? totalDuration / renderCount : 0;
@@ -19,20 +20,20 @@ export function makeEntry(totalDuration: number, renderCount: number = 1): Bench
   };
 }
 
-export function makeReport(entries: Record<string, number>): BenchmarkReport {
+export function makeReport(entries: Record<string, number>): BenchmarkComparisonInput {
   const report: BenchmarkReport = {};
   for (const [name, totalDuration] of Object.entries(entries)) {
     report[name] = makeEntry(totalDuration);
   }
-  return report;
+  return { report };
 }
 
 export function makeReportFromConfig(
   entries: Record<string, { duration: number; renders: number }>,
-): BenchmarkReport {
+): BenchmarkComparisonInput {
   const report: BenchmarkReport = {};
   for (const [name, { duration, renders }] of Object.entries(entries)) {
     report[name] = makeEntry(duration, renders);
   }
-  return report;
+  return { report };
 }
diff --git a/apps/code-infra-dashboard/src/lib/benchmark/types.ts b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
index ec29672c9..92a0f9d67 100644
--- a/apps/code-infra-dashboard/src/lib/benchmark/types.ts
+++ b/apps/code-infra-dashboard/src/lib/benchmark/types.ts
@@ -31,6 +31,23 @@ export interface BenchmarkReportEntry {
 
 export type BenchmarkReport = Record<string, BenchmarkReportEntry>;
 
+export type MetricDirection = 'lowerIsBetter' | 'higherIsBetter';
+
+export interface MetricAlarm {
+  direction?: MetricDirection;
+  /** Softer band (relative fraction for scalar, absolute count delta for discrete). */
+  warn?: number;
+  /** Harder band; defaults to the global noise band only when both `warn` and `error` are omitted. */
+  error?: number;
+}
+
+/** Per-metric config for custom metrics, hoisted to the top level of the report (keyed by name). */
+export interface MetricDefinition {
+  kind: 'scalar' | 'discrete';
+  format?: Intl.NumberFormatOptions;
+  alarm?: MetricAlarm;
+}
+
 export interface BenchmarkBaseUpload {
   version: 1;
   timestamp: number;
@@ -40,6 +57,7 @@ export interface BenchmarkBaseUpload {
   prNumber?: number;
   branch: string;
   report: BenchmarkReport;
+  metricDefinitions?: Record<string, MetricDefinition>;
 }
 
 export interface BenchmarkUpload extends BenchmarkBaseUpload {
diff --git a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
index b1eb97f7a..e7ef61af5 100644
--- a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
+++ b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts
@@ -45,7 +45,8 @@ export async function generateBenchmarkReport(
     markdownContent += `_:information_source: Using benchmark from parent commit ${actualBaseCommit} (fallback from merge base ${mergeBaseCommit})._\n\n`;
   }
 
-  const comparison = compareBenchmarkReports(headReport.report, baseReport);
+  const baseUpload = useInlinedBase ? inlinedBase : fetchedBaseUpload;
+  const comparison = compareBenchmarkReports(headReport, baseUpload ?? null);
 
   const detailsUrl = new URL(`${DASHBOARD_ORIGIN}/benchmark-details/${repo}`);
   detailsUrl.searchParams.set('sha', commitSha);
diff --git a/apps/code-infra-dashboard/src/utils/fetchCiReport.ts b/apps/code-infra-dashboard/src/utils/fetchCiReport.ts
index 67e96b349..2b2e7cfab 100644
--- a/apps/code-infra-dashboard/src/utils/fetchCiReport.ts
+++ b/apps/code-infra-dashboard/src/utils/fetchCiReport.ts
@@ -1,5 +1,6 @@
 import type { SizeSnapshotWithMetadata } from '@/lib/bundleSize/types';
 import type { BenchmarkReport, BenchmarkUpload } from '@/lib/benchmark/types';
+import { migrateBenchmarkReport } from '@/lib/benchmark/migrateBenchmarkReport';
 
 export interface CiReportTypes {
   'benchmark.json': BenchmarkUpload;
@@ -17,14 +18,19 @@ export type CiReportName = keyof CiReportTypes;
  * simply reasserts what the body already contains.
  */
 function normalizeBenchmarkArtifact(raw: unknown, repo: string, sha: string): BenchmarkUpload {
-  if (raw && typeof raw === 'object' && 'report' in raw) {
-    return { ...(raw as BenchmarkUpload), commitSha: sha, repo };
-  }
+  const upload: BenchmarkUpload =
+    raw && typeof raw === 'object' && 'report' in raw
+      ? { ...(raw as BenchmarkUpload), commitSha: sha, repo }
+      : ({ commitSha: sha, repo, report: raw as BenchmarkReport } as BenchmarkUpload);
+
+  // Apply forward migrations so older uploads read with the current metric naming.
   return {
-    commitSha: sha,
-    repo,
-    report: raw as BenchmarkReport,
-  } as BenchmarkUpload;
+    ...upload,
+    report: migrateBenchmarkReport(upload.report),
+    ...(upload.base
+      ? { base: { ...upload.base, report: migrateBenchmarkReport(upload.base.report) } }
+      : {}),
+  };
 }
 
 /**
diff --git a/apps/code-infra-dashboard/src/utils/formatters.ts b/apps/code-infra-dashboard/src/utils/formatters.ts
index 27dd11347..20266ac33 100644
--- a/apps/code-infra-dashboard/src/utils/formatters.ts
+++ b/apps/code-infra-dashboard/src/utils/formatters.ts
@@ -22,6 +22,33 @@ export function formatDiffMs(value: number): string {
   return `${sign}${durationFormatter.format(value)} ms`;
 }
 
+// `Intl.NumberFormat` construction is comparatively expensive, so cache one instance per unique
+// format spec — these formatters run for every metric cell on every render.
+const numberFormatterCache = new Map<string, Intl.NumberFormat>();
+
+function getNumberFormatter(format: Intl.NumberFormatOptions): Intl.NumberFormat {
+  const key = JSON.stringify(format);
+  let formatter = numberFormatterCache.get(key);
+  if (!formatter) {
+    formatter = new Intl.NumberFormat(undefined, format);
+    numberFormatterCache.set(key, formatter);
+  }
+  return formatter;
+}
+
+/** Formats a metric value with its `Intl.NumberFormatOptions`, falling back to milliseconds. */
+export function formatMetricNumber(value: number, format?: Intl.NumberFormatOptions): string {
+  return format ? getNumberFormatter(format).format(value) : formatMs(value);
+}
+
+/** Formats a signed metric diff with its format, falling back to a millisecond diff. */
+export function formatMetricDiff(value: number, format?: Intl.NumberFormatOptions): string {
+  // `signDisplay` is spread last so a diff always shows its sign, regardless of the metric's format.
+  return format
+    ? getNumberFormatter({ ...format, signDisplay: 'exceptZero' }).format(value)
+    : formatDiffMs(value);
+}
+
 interface ColumnDefinition {
   field: string;
   header?: string;
diff --git a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
index a558ffe01..1b0e8a0e7 100644
--- a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
+++ b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx
@@ -139,12 +139,7 @@ export default function BenchmarkDetails() {
           <InlinedBaseAlert fetchedBaseSha={baseSha} inlinedBaseSha={inlinedBase.commitSha} />
         )}
 
-        {report && (
-          <BenchmarkComparisonReportView
-            value={report.report}
-            base={effectiveBase?.report ?? null}
-          />
-        )}
+        {report && <BenchmarkComparisonReportView value={report} base={effectiveBase ?? null} />}
       </Paper>
     </React.Fragment>
   );
diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index f4c980bcd..ba001c90b 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -61,9 +61,44 @@ benchmark(
 );
 ```
 
+### Scoping which renders are measured
+
+By default a benchmark records every React render and paint, from the mount through the whole interaction. To measure only part of an interaction — or to exclude the mount — pause and resume recording from the interaction callback:
+
+```tsx
+benchmark(
+  'Combobox type',
+  () => <Combobox />,
+  async ({ pauseReactRecording, resumeReactRecording, waitForElementTiming }) => {
+    pauseReactRecording(); // stop recording the settling re-renders
+    await openMenu();
+    resumeReactRecording(); // measure only what follows
+    await type('hello');
+    await waitForElementTiming('results');
+  },
+);
+```
+
+`pauseReactRecording()` / `resumeReactRecording()` toggle only the harness's React render and `bench:paint` recording — your own custom metrics keep recording. They are a strict pair: pausing while already paused, or resuming while already active, throws (this catches unbalanced calls early).
+
+To exclude the mount itself, start paused with the `reactRecordingPaused` option and resume at the point you care about — the mount is captured before the interaction callback runs, so pausing inside the callback can't drop it:
+
+```tsx
+benchmark(
+  'Combobox type',
+  () => <Combobox />,
+  async ({ resumeReactRecording }) => {
+    await openMenu(); // mount + open: not recorded
+    resumeReactRecording();
+    await type('hello'); // only these renders/paint recorded
+  },
+  { reactRecordingPaused: true },
+);
+```
+
 ### Paint metrics
 
-By default, every benchmark captures a `paint:default` metric — the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically.
+By default, every benchmark captures a `bench:paint` metric — the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically. The harness owns the `bench:` namespace, so avoid it for your own metric names.
 
 You can track additional paint metrics by placing `<ElementTiming>` markers and awaiting them in an interaction callback. The component renders an invisible `<span>` that fires in the same paint frame as its surrounding content.
 
@@ -88,16 +123,79 @@ benchmark(
 );
 ```
 
-This produces a `paint:my-component` metric alongside the automatic `paint:default`.
+This produces a `bench:paint#my-component` sub-series alongside the automatic `bench:paint`.
 
 `waitForElementTiming` accepts an optional `timeout` in milliseconds (default: 5000). Pass `0` or `Infinity` to rely on the test timeout instead.
 
+### Custom metrics
+
+Record your own measurements — a timing, a count, anything measured inside or outside React — from a plain `it()` loop or from inside a `benchmark()`. There are two primitives:
+
+- `ScalarMetric` — a continuous value (timings, sizes). Aggregated as mean ± standard deviation with IQR outlier removal, and compared against a baseline with a relative noise band.
+- `DiscreteMetric` — a count of events. Compared as an exact integer (any change is significant) and formatted as a whole number.
+
+Both record values with `record(value)`. `ScalarMetric` additionally offers `time()`/`timeEnd()` — a `console.time`-style shortcut that records the elapsed milliseconds for you.
+
+```tsx
+import { it } from 'vitest';
+import { ScalarMetric, DiscreteMetric } from '@mui/internal-benchmark';
+
+const duration = new ScalarMetric({
+  name: 'work_duration',
+  format: { style: 'unit', unit: 'millisecond' }, // Intl.NumberFormatOptions
+  alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 }, // warn >10%, error >25%
+});
+
+const clicks = new DiscreteMetric({ name: 'button_clicks' });
+
+it('measures work', () => {
+  for (let i = 0; i < 100; i += 1) {
+    duration.time();
+    runWork();
+    duration.timeEnd(); // records the elapsed milliseconds
+
+    clicks.record(countClicks()); // a discrete count per run
+  }
+});
+```
+
+A metric is declared once (typically at module scope) and reused across tests and iterations. `record()` attaches the value to whichever test is running, so the same instance works in any `it()`.
+
+You can also `record()` or `time()` from inside a `benchmark()` render function or interaction callback. Values recorded during warmup iterations are excluded automatically, just like renders and `bench:paint`, so a metric recorded once per iteration yields exactly `runs` samples.
+
+#### Metric configuration
+
+- `name` — the metric's report key (**required**).
+- `format` — an [`Intl.NumberFormatOptions`](https://developer.mozilla.org/en-US/docs/Web/API/Intl/NumberFormat/NumberFormat) object used to display the value.
+- `alarm` — opts the metric into regression flagging. Omit it and the metric is informational (its diff is shown but never flagged). Holds:
+  - `direction` — `'lowerIsBetter'` (default) or `'higherIsBetter'`.
+  - `warn` — softer band; a regression past it is flagged as a warning.
+  - `error` — harder band; a regression past it is flagged as an error. Defaults to the dashboard's global noise band only when both `warn` and `error` are omitted; with only `warn` set there is no error band (warning-only).
+  - Bands are relative fractions for scalar metrics (`0.1` = 10%) and absolute count deltas for discrete metrics (`1`, `2`). Either band is optional.
+
+Alarms are evaluated against the baseline when the PR comment is generated, not during the local `vitest run` — a regression never fails the test suite locally. In the PR comment, `error`-band regressions surface as failures and `warn`-band regressions as warnings.
+
+#### Sub-series
+
+Pass `record(value, { id })` to split one metric into labeled sub-series, reported as `name#id`. For `ScalarMetric.time()`/`timeEnd()`, pass a label that maps to the same `id`:
+
+```tsx
+const phase = new ScalarMetric({ name: 'render_phase' });
+
+phase.time('header');
+renderHeader();
+phase.timeEnd('header'); // -> render_phase#header
+```
+
+Custom metrics are aggregated in the browser and only the resulting stats cross to the runner, so the amount of data is independent of how many values you record.
+
 ### Options
 
 ```tsx
 benchmark('name', renderFn, interaction, {
   runs: 20, // measurement iterations (default: 20)
   warmupRuns: 10, // warmup iterations before measuring (default: 10)
+  reactRecordingPaused: false, // start with React render/paint recording paused (default: false)
   afterEach: () => {
     /* cleanup between iterations */
   },
@@ -148,5 +246,7 @@ The feature is opt-in — without `BENCHMARK_BASELINE_PATH` (or the `baselinePat
 
 - `benchmark` — define a benchmark test case
 - `ElementTiming` — invisible marker component for paint timing (renders a `<span>` tracked by the Element Timing API)
+- `ScalarMetric` — record a continuous custom measurement (with a `console.time`-style timing helper)
+- `DiscreteMetric` — record a discrete custom count
 - `createBenchmarkVitestConfig` — create a Vitest config with browser benchmarking defaults
 - `BenchmarkReporter` — Vitest reporter that collects and outputs benchmark results
diff --git a/packages/benchmark/src/DiscreteMetric.ts b/packages/benchmark/src/DiscreteMetric.ts
new file mode 100644
index 000000000..90cec67c5
--- /dev/null
+++ b/packages/benchmark/src/DiscreteMetric.ts
@@ -0,0 +1,20 @@
+import { Metric } from './Metric';
+import type { MetricConfig, MetricKind } from './types';
+
+/**
+ * A discrete count of events or occurrences. Compared against a baseline as an exact integer
+ * (any change is significant — there is no noise band), and formatted as a whole number by default.
+ *
+ * ```ts
+ * const clicks = new DiscreteMetric({ name: 'button_clicks' });
+ * clicks.record(countClicks());
+ * ```
+ */
+export class DiscreteMetric extends Metric {
+  readonly kind: MetricKind = 'discrete';
+
+  constructor(config: MetricConfig | string) {
+    const resolved = typeof config === 'string' ? { name: config } : config;
+    super({ ...resolved, format: resolved.format ?? { maximumFractionDigits: 0 } });
+  }
+}
diff --git a/packages/benchmark/src/Metric.test.ts b/packages/benchmark/src/Metric.test.ts
new file mode 100644
index 000000000..fc061eb07
--- /dev/null
+++ b/packages/benchmark/src/Metric.test.ts
@@ -0,0 +1,51 @@
+import { describe, it, expect } from 'vitest';
+import { ScalarMetric } from './ScalarMetric';
+import { DiscreteMetric } from './DiscreteMetric';
+
+describe('Metric.record', () => {
+  it('throws when two different metrics share a name in the same test', () => {
+    const timing = new ScalarMetric({ name: 'shared' });
+    const count = new DiscreteMetric({ name: 'shared' });
+    timing.record(1);
+    expect(() => count.record(2)).toThrow(/share the name "shared"/);
+  });
+
+  it('allows the same instance to record many values under one name', () => {
+    const metric = new ScalarMetric({ name: 'reused' });
+    expect(() => {
+      metric.record(1);
+      metric.record(2, { id: 'sub' });
+    }).not.toThrow();
+  });
+
+  it('rejects a metric name containing the "#" sub-series separator', () => {
+    expect(() => new ScalarMetric({ name: 'paint#default' })).toThrow(/must not contain "#"/);
+  });
+});
+
+describe('ScalarMetric.timeEnd', () => {
+  it('throws when called without a matching time()', () => {
+    const metric = new ScalarMetric({ name: 'timer' });
+    expect(() => metric.timeEnd()).toThrow(/without a matching time/);
+  });
+
+  it('throws when the label does not match an open timer', () => {
+    const metric = new ScalarMetric({ name: 'labelled-timer' });
+    metric.time('a');
+    expect(() => metric.timeEnd('b')).toThrow(/without a matching time/);
+  });
+});
+
+describe('ScalarMetric.time', () => {
+  it('throws when a timer is already running for the same label', () => {
+    const metric = new ScalarMetric({ name: 'double-timer' });
+    metric.time('a');
+    expect(() => metric.time('a')).toThrow(/already running/);
+  });
+
+  it('allows concurrent timers under different labels', () => {
+    const metric = new ScalarMetric({ name: 'multi-timer' });
+    metric.time('a');
+    expect(() => metric.time('b')).not.toThrow();
+  });
+});
diff --git a/packages/benchmark/src/Metric.ts b/packages/benchmark/src/Metric.ts
new file mode 100644
index 000000000..eea291aa2
--- /dev/null
+++ b/packages/benchmark/src/Metric.ts
@@ -0,0 +1,110 @@
+import { onTestFinished, TestRunner, type RunnerTestCase } from 'vitest';
+import type { MetricConfig, MetricKind, MetricReport, MetricSampleStats } from './types';
+import { aggregateSamples } from './stats';
+import { metricsGate } from './metricsGate';
+// Import for TaskMeta augmentation side effect
+import './taskMetaAugmentation';
+
+interface SeriesAccumulator {
+  /** The instance that registered this name, used to reject two metrics sharing a name. */
+  owner: Metric;
+  kind: MetricKind;
+  config: MetricConfig;
+  /** Raw samples per sub-series id (`''` is the base series), kept in browser memory only. */
+  series: Map<string, number[]>;
+}
+
+type TestAccumulator = Map<string, SeriesAccumulator>;
+
+// Raw samples never cross the browser→runner boundary. They accumulate here per test (keyed by
+// the running task so a module-scoped metric shared across tests never mixes data) and are
+// aggregated into compact stats by an `onTestFinished` hook before being written to `task.meta`.
+const accumulators = new WeakMap<RunnerTestCase, TestAccumulator>();
+
+function flush(test: RunnerTestCase, accumulator: TestAccumulator): void {
+  const store: Record<string, MetricReport> = {};
+  for (const [name, entry] of accumulator) {
+    const series: Record<string, MetricSampleStats> = {};
+    for (const [seriesId, samples] of entry.series) {
+      series[seriesId] = { ...aggregateSamples(samples), count: samples.length };
+    }
+    store[name] = { kind: entry.kind, config: entry.config, series };
+  }
+  test.meta.benchmarkMetrics = store;
+}
+
+export interface MetricRecordOptions {
+  /** Sub-series label. Recorded under `${name}#${id}` in the report; omit for the base series. */
+  id?: string;
+}
+
+/**
+ * Base class for custom benchmark metrics. Use `ScalarMetric` or `DiscreteMetric`.
+ *
+ * Records are tied to the test that is running when `record()` is called (resolved via
+ * `getCurrentTest()`), so a single instance can be declared at module scope and reused across
+ * tests and loop iterations, inside or outside React.
+ */
+export abstract class Metric {
+  abstract readonly kind: MetricKind;
+
+  readonly name: string;
+
+  protected readonly config: MetricConfig;
+
+  constructor(config: MetricConfig | string) {
+    this.config = typeof config === 'string' ? { name: config } : config;
+    this.name = this.config.name;
+    if (this.name.includes('#')) {
+      throw new Error(
+        `Metric name "${this.name}" must not contain "#" — it is reserved as the sub-series separator.`,
+      );
+    }
+  }
+
+  /**
+   * Records a single measured value. Samples accumulate in browser memory and are aggregated
+   * once when the test finishes. Pass `options.id` to split into a labeled sub-series.
+   */
+  record(value: number, options?: MetricRecordOptions): void {
+    const test = TestRunner.getCurrentTest<RunnerTestCase | undefined>();
+    if (!test) {
+      throw new Error(
+        `${this.constructor.name}.record() must be called inside a running Vitest test.`,
+      );
+    }
+
+    // The harness disables the gate during warmup iterations so custom metrics recorded inside a
+    // benchmark honor the same warmup exclusion as renders and `bench:paint`. Standalone `it()`
+    // loops never touch the gate, so they keep recording every value.
+    if (!metricsGate.isRecordingEnabled(test)) {
+      return;
+    }
+
+    let accumulator = accumulators.get(test);
+    if (!accumulator) {
+      const created: TestAccumulator = new Map();
+      accumulator = created;
+      accumulators.set(test, created);
+      onTestFinished(() => flush(test, created));
+    }
+
+    let entry = accumulator.get(this.name);
+    if (!entry) {
+      entry = { owner: this, kind: this.kind, config: this.config, series: new Map() };
+      accumulator.set(this.name, entry);
+    } else if (entry.owner !== this) {
+      throw new Error(
+        `Two metrics share the name "${this.name}". Metric names must be unique; reuse a single instance instead.`,
+      );
+    }
+
+    const seriesId = options?.id ?? '';
+    let samples = entry.series.get(seriesId);
+    if (!samples) {
+      samples = [];
+      entry.series.set(seriesId, samples);
+    }
+    samples.push(value);
+  }
+}
diff --git a/packages/benchmark/src/ScalarMetric.ts b/packages/benchmark/src/ScalarMetric.ts
new file mode 100644
index 000000000..ce730327a
--- /dev/null
+++ b/packages/benchmark/src/ScalarMetric.ts
@@ -0,0 +1,47 @@
+import { Metric } from './Metric';
+import type { MetricKind } from './types';
+
+/**
+ * A continuous measurement (timings, sizes, …). Samples are aggregated with mean ± standard
+ * deviation and IQR outlier removal, and compared against a baseline with a relative noise band.
+ *
+ * It offers a `console.time`-style timing helper:
+ *
+ * ```ts
+ * const metric = new ScalarMetric({ name: 'render', format: { style: 'unit', unit: 'millisecond' } });
+ * metric.time();
+ * doWork();
+ * metric.timeEnd(); // records the elapsed milliseconds
+ * ```
+ */
+export class ScalarMetric extends Metric {
+  readonly kind: MetricKind = 'scalar';
+
+  private readonly pending = new Map<string, number>();
+
+  /** Starts a timer. Pass a `label` to time a sub-series; it maps to `record`'s `id`. */
+  time(label?: string): void {
+    const key = label ?? '';
+    if (this.pending.has(key)) {
+      throw new Error(
+        `${this.name}.time(${label ? `"${label}"` : ''}) was called while a timer is already running for that label.`,
+      );
+    }
+    this.pending.set(key, performance.now());
+  }
+
+  /** Stops the timer started by `time(label)` and records the elapsed milliseconds. */
+  timeEnd(label?: string): void {
+    // Capture the end time first so the map lookup/delete below isn't part of the measurement.
+    const end = performance.now();
+    const key = label ?? '';
+    const start = this.pending.get(key);
+    if (start === undefined) {
+      throw new Error(
+        `${this.name}.timeEnd(${label ? `"${label}"` : ''}) was called without a matching time().`,
+      );
+    }
+    this.pending.delete(key);
+    this.record(end - start, label !== undefined ? { id: label } : undefined);
+  }
+}
diff --git a/packages/benchmark/src/ciReport.ts b/packages/benchmark/src/ciReport.ts
index 682739639..8e2d8cd1f 100644
--- a/packages/benchmark/src/ciReport.ts
+++ b/packages/benchmark/src/ciReport.ts
@@ -66,7 +66,27 @@ const benchmarkReportEntrySchema = z.object({
 
 const benchmarkReportSchema = z.record(z.string(), benchmarkReportEntrySchema);
 
-const benchmarkBaseUploadSchema = ciReportUploadSchema('benchmark', 1, benchmarkReportSchema);
+const metricDefinitionSchema = z.object({
+  kind: z.enum(['scalar', 'discrete']),
+  format: z.custom<Intl.NumberFormatOptions>().optional(),
+  alarm: z
+    .object({
+      direction: z.enum(['lowerIsBetter', 'higherIsBetter']).optional(),
+      warn: z.number().min(0).optional(),
+      error: z.number().min(0).optional(),
+    })
+    .optional(),
+});
+
+const benchmarkBaseUploadSchema = ciReportUploadSchema(
+  'benchmark',
+  1,
+  benchmarkReportSchema,
+).extend({
+  // Per-metric config for custom metrics, hoisted to the top level (keyed by metric name) so
+  // it is stored once rather than duplicated in every report entry. Optional and additive.
+  metricDefinitions: z.record(z.string(), metricDefinitionSchema).optional(),
+});
 
 export const benchmarkUploadSchema = benchmarkBaseUploadSchema.extend({
   base: benchmarkBaseUploadSchema.optional(),
@@ -74,6 +94,7 @@ export const benchmarkUploadSchema = benchmarkBaseUploadSchema.extend({
 
 export type RenderStats = z.infer<typeof renderStatsSchema>;
 export type MetricStats = z.infer<typeof metricStatsSchema>;
+export type BenchmarkMetricDefinition = z.infer<typeof metricDefinitionSchema>;
 export type BenchmarkReportEntry = z.infer<typeof benchmarkReportEntrySchema>;
 export type BenchmarkReport = z.infer<typeof benchmarkReportSchema>;
 export type BenchmarkBaseUpload = z.infer<typeof benchmarkBaseUploadSchema>;
diff --git a/packages/benchmark/src/index.tsx b/packages/benchmark/src/index.tsx
index b9e77223e..6db367fe9 100644
--- a/packages/benchmark/src/index.tsx
+++ b/packages/benchmark/src/index.tsx
@@ -2,8 +2,11 @@ import * as React from 'react';
 import { expect, it } from 'vitest';
 import * as ReactDOMClient from 'react-dom/client'; // aliased to react-dom/profiling by Vite
 import * as ReactDOM from 'react-dom';
-import type { RenderEvent, BenchmarkMetric, IterationData, InteractionContext } from './types';
+import type { RenderEvent, IterationData, InteractionContext } from './types';
 import { ElementTiming } from './ElementTiming';
+import { ScalarMetric } from './ScalarMetric';
+import { metricsGate } from './metricsGate';
+import { createReactRecordingControls, type ReactRecordingControls } from './reactRecording';
 // Import for TaskMeta augmentation side effect
 import './taskMetaAugmentation';
 
@@ -13,21 +16,38 @@ interface PerformanceElementTiming extends PerformanceEntry {
   readonly identifier: string;
 }
 
-export type { RenderEvent, BenchmarkMetric, IterationData, InteractionContext } from './types';
+export type { RenderEvent, IterationData, InteractionContext } from './types';
+export type {
+  MetricKind,
+  MetricDirection,
+  MetricAlarm,
+  MetricConfig,
+  MetricDefinition,
+} from './types';
 export { ElementTiming } from './ElementTiming';
+export { Metric, type MetricRecordOptions } from './Metric';
+export { ScalarMetric };
+export { DiscreteMetric } from './DiscreteMetric';
 
 function BenchProfiler({
   captures,
+  recording,
   children,
 }: {
   captures: RenderEvent[];
+  recording: ReactRecordingControls;
   children: React.ReactNode;
 }) {
   const onRender = React.useCallback<React.ProfilerOnRenderCallback>(
     (id, phase, actualDuration, _baseDuration, startTime) => {
-      captures.push({ id, phase, actualDuration, startTime });
+      // Skip renders captured while React recording is paused (e.g. the mount when the benchmark
+      // starts paused, or a span the interaction explicitly excludes).
+      if (recording.active) {
+        captures.push({ id, phase, actualDuration, startTime });
+        recording.markRendered();
+      }
     },
-    [captures],
+    [captures, recording],
   );
 
   return (
@@ -69,6 +89,12 @@ interface BenchmarkOptions {
   runs?: number;
   warmupRuns?: number;
   afterEach?: () => Promise<void> | void;
+  /**
+   * Start each iteration with React render/paint recording paused. The interaction callback then
+   * calls `resumeReactRecording()` at the point it cares about — useful to exclude the mount and
+   * measure only the renders/paint of a later interaction. Defaults to `false` (mount recorded).
+   */
+  reactRecordingPaused?: boolean;
 }
 
 export function benchmark(
@@ -87,6 +113,16 @@ export function benchmark(
     const totalRuns = warmupRuns + runs;
     const iterations: IterationData[] = [];
 
+    // Paint timings are recorded as one harness-owned `bench:paint` metric: the default sentinel
+    // is the base series (`bench:paint`) and named `elementtiming` markers are sub-series
+    // (`bench:paint#grid-header`, …), all sharing a single definition. A default alarm keeps a
+    // >20% paint regression flagged, matching the previous behavior.
+    const paint = new ScalarMetric({
+      name: 'bench:paint',
+      format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 2 },
+      alarm: { error: 0.2 },
+    });
+
     const hasElementTiming = supportsElementTiming();
 
     if (typeof window.gc !== 'function') {
@@ -96,10 +132,20 @@ export function benchmark(
     }
 
     let renderError: unknown = null;
+    // Set if any iteration had a recording window that was active yet captured no renders.
+    let sawEmptyActiveWindow = false;
 
     for (let i = 0; i < totalRuns; i += 1) {
       const isWarmup = i < warmupRuns;
 
+      // Custom metrics recorded inside the benchmark honor warmup exclusion through the gate, the
+      // same way renders and `bench:paint` are excluded during warmup.
+      metricsGate.setRecordingEnabled(task, !isWarmup);
+
+      // Per-iteration switch for the harness's React render/paint recording. Starts paused when
+      // `reactRecordingPaused` is set; the interaction callback drives it from there.
+      const recording = createReactRecordingControls(!(options?.reactRecordingPaused ?? false));
+
       // Drain event loop from previous unmount, then double GC for thorough cleanup
       // eslint-disable-next-line no-await-in-loop
       await settle();
@@ -172,7 +218,11 @@ export function benchmark(
       });
 
       ReactDOM.flushSync(() => {
-        root.render(<BenchProfiler captures={captures}>{renderFn()}</BenchProfiler>);
+        root.render(
+          <BenchProfiler captures={captures} recording={recording}>
+            {renderFn()}
+          </BenchProfiler>,
+        );
       });
 
       if (renderError) {
@@ -184,24 +234,40 @@ export function benchmark(
 
       if (interaction) {
         // eslint-disable-next-line no-await-in-loop
-        await interaction({ waitForElementTiming });
+        await interaction({
+          waitForElementTiming,
+          pauseReactRecording: recording.pauseReactRecording,
+          resumeReactRecording: recording.resumeReactRecording,
+        });
       }
 
       // Wait for the bench sentinel paint entry (relies on test timeout)
       // eslint-disable-next-line no-await-in-loop
       await waitForElementTiming('default', 0);
 
+      // Close the final window and remember if any active window measured no renders.
+      recording.finalizeWindow();
+      if (recording.hadEmptyActiveWindow) {
+        sawEmptyActiveWindow = true;
+      }
+
       elementObserver?.disconnect();
 
       root.unmount();
       container.remove();
 
       if (!isWarmup) {
-        const metrics: BenchmarkMetric[] = elementEntries.map((entry) => ({
-          name: `paint:${entry.identifier}`,
-          value: entry.renderTime - iterationStart,
-        }));
-        iterations.push({ renders: captures, metrics });
+        for (const entry of elementEntries) {
+          // Skip paints that happened while recording was paused. Attribute by the paint's
+          // `renderTime`, not by when the observer callback fired (which can lag the paint).
+          if (!recording.activeAt(entry.renderTime)) {
+            continue;
+          }
+          // The default sentinel is the base series; named markers become sub-series.
+          const id = entry.identifier === 'default' ? undefined : entry.identifier;
+          paint.record(entry.renderTime - iterationStart, id !== undefined ? { id } : undefined);
+        }
+        iterations.push({ renders: captures });
       }
 
       if (options?.afterEach) {
@@ -217,11 +283,13 @@ export function benchmark(
       throw renderError;
     }
 
-    // Validate that at least one render was recorded
+    // Every active recording window must capture at least one render. Windows where recording was
+    // never running (e.g. a fully-paused, metric-only benchmark) are not checked.
     expect(
-      iterations[0].renders.length,
-      'No renders were recorded during benchmark',
-    ).toBeGreaterThan(0);
+      sawEmptyActiveWindow,
+      'React recording was active but captured no renders. If you only measure imperative DOM ' +
+        'updates or custom metrics, keep recording paused (reactRecordingPaused) instead of resuming.',
+    ).toBe(false);
 
     // Validate all iterations produced the same render events (count + order).
     // This runs after meta is set so the reporter can still display results on failure.
diff --git a/packages/benchmark/src/metricsGate.test.ts b/packages/benchmark/src/metricsGate.test.ts
new file mode 100644
index 000000000..0bd43f8b3
--- /dev/null
+++ b/packages/benchmark/src/metricsGate.test.ts
@@ -0,0 +1,30 @@
+import { describe, it, expect } from 'vitest';
+import type { RunnerTestCase } from 'vitest';
+import { metricsGate } from './metricsGate';
+
+// The gate keys on test identity via a WeakSet, so any object stands in for a RunnerTestCase.
+function fakeTest(): RunnerTestCase {
+  return {} as RunnerTestCase;
+}
+
+describe('metricsGate', () => {
+  it('is enabled by default for an unseen test', () => {
+    expect(metricsGate.isRecordingEnabled(fakeTest())).toBe(true);
+  });
+
+  it('disables and re-enables recording for a test', () => {
+    const test = fakeTest();
+    metricsGate.setRecordingEnabled(test, false);
+    expect(metricsGate.isRecordingEnabled(test)).toBe(false);
+    metricsGate.setRecordingEnabled(test, true);
+    expect(metricsGate.isRecordingEnabled(test)).toBe(true);
+  });
+
+  it('keeps the state independent per test', () => {
+    const suppressed = fakeTest();
+    const recording = fakeTest();
+    metricsGate.setRecordingEnabled(suppressed, false);
+    expect(metricsGate.isRecordingEnabled(suppressed)).toBe(false);
+    expect(metricsGate.isRecordingEnabled(recording)).toBe(true);
+  });
+});
diff --git a/packages/benchmark/src/metricsGate.ts b/packages/benchmark/src/metricsGate.ts
new file mode 100644
index 000000000..dc9759d8e
--- /dev/null
+++ b/packages/benchmark/src/metricsGate.ts
@@ -0,0 +1,24 @@
+import type { RunnerTestCase } from 'vitest';
+
+// Internal — not exported from the package, not user-facing. The `benchmark()` harness toggles
+// recording per test (off during warmup iterations) and `Metric.record()` consults it, so custom
+// metrics recorded inside a benchmark honor the same warmup exclusion as renders and `bench:paint`.
+//
+// Storage tracks the *disabled* tests so absence means enabled: a test with no entry — e.g. a
+// standalone `it()` loop that never goes through the harness — records normally by default.
+const disabled = new WeakSet<RunnerTestCase>();
+
+export const metricsGate = {
+  /** Whether custom-metric recording is currently enabled for `test`. Defaults to `true`. */
+  isRecordingEnabled(test: RunnerTestCase): boolean {
+    return !disabled.has(test);
+  },
+  /** Enable or disable custom-metric recording for `test`. */
+  setRecordingEnabled(test: RunnerTestCase, enabled: boolean): void {
+    if (enabled) {
+      disabled.delete(test);
+    } else {
+      disabled.add(test);
+    }
+  },
+};
diff --git a/packages/benchmark/src/reactRecording.test.ts b/packages/benchmark/src/reactRecording.test.ts
new file mode 100644
index 000000000..043d14fb8
--- /dev/null
+++ b/packages/benchmark/src/reactRecording.test.ts
@@ -0,0 +1,77 @@
+import { describe, it, expect } from 'vitest';
+import { createReactRecordingControls } from './reactRecording';
+
+describe('createReactRecordingControls', () => {
+  it('starts in the requested state', () => {
+    expect(createReactRecordingControls(true).active).toBe(true);
+    expect(createReactRecordingControls(false).active).toBe(false);
+  });
+
+  it('pauses and resumes', () => {
+    const controls = createReactRecordingControls(true);
+    controls.pauseReactRecording();
+    expect(controls.active).toBe(false);
+    controls.resumeReactRecording();
+    expect(controls.active).toBe(true);
+  });
+
+  it('throws when pausing while already paused', () => {
+    const controls = createReactRecordingControls(false);
+    expect(() => controls.pauseReactRecording()).toThrow(/already paused/);
+  });
+
+  it('throws when resuming while already active', () => {
+    const controls = createReactRecordingControls(true);
+    expect(() => controls.resumeReactRecording()).toThrow(/already active/);
+  });
+
+  it('attributes times before all toggles to the initial state and after to the current', () => {
+    const controls = createReactRecordingControls(false);
+    controls.resumeReactRecording();
+    expect(controls.activeAt(-Infinity)).toBe(false);
+    expect(controls.activeAt(Infinity)).toBe(true);
+  });
+});
+
+describe('createReactRecordingControls active-window render check', () => {
+  it('does not flag a window that captured a render before pause', () => {
+    const controls = createReactRecordingControls(true);
+    controls.markRendered();
+    controls.pauseReactRecording();
+    expect(controls.hadEmptyActiveWindow).toBe(false);
+  });
+
+  it('flags an active window paused without any render', () => {
+    const controls = createReactRecordingControls(true);
+    controls.pauseReactRecording();
+    expect(controls.hadEmptyActiveWindow).toBe(true);
+  });
+
+  it('flags the final window via finalizeWindow when active and empty', () => {
+    const controls = createReactRecordingControls(true);
+    controls.finalizeWindow();
+    expect(controls.hadEmptyActiveWindow).toBe(true);
+  });
+
+  it('does not flag the final window when it captured a render', () => {
+    const controls = createReactRecordingControls(true);
+    controls.markRendered();
+    controls.finalizeWindow();
+    expect(controls.hadEmptyActiveWindow).toBe(false);
+  });
+
+  it('does not flag anything when recording was never active', () => {
+    const controls = createReactRecordingControls(false);
+    controls.finalizeWindow();
+    expect(controls.hadEmptyActiveWindow).toBe(false);
+  });
+
+  it('resets render presence per window', () => {
+    const controls = createReactRecordingControls(true);
+    controls.markRendered();
+    controls.pauseReactRecording(); // first window had a render
+    controls.resumeReactRecording(); // second window starts empty
+    controls.finalizeWindow();
+    expect(controls.hadEmptyActiveWindow).toBe(true);
+  });
+});
diff --git a/packages/benchmark/src/reactRecording.ts b/packages/benchmark/src/reactRecording.ts
new file mode 100644
index 000000000..2d04c2fa3
--- /dev/null
+++ b/packages/benchmark/src/reactRecording.ts
@@ -0,0 +1,92 @@
+export interface ReactRecordingControls {
+  /** Whether React render/paint recording is active right now — the synchronous gate for renders. */
+  readonly active: boolean;
+  /** Whether any active recording window closed without capturing a render. */
+  readonly hadEmptyActiveWindow: boolean;
+  /**
+   * Whether recording was active at `time` (a `performance.now()` timestamp). Paint entries are
+   * observed asynchronously, so they are attributed by their `renderTime` rather than by the
+   * recording state at the moment the observer callback happens to fire.
+   */
+  activeAt(time: number): boolean;
+  /** Note that a render was captured in the current window. Called by the harness from `onRender`. */
+  markRendered(): void;
+  /** Close the final window at the end of the iteration (validates it if recording is still active). */
+  finalizeWindow(): void;
+  /** Pause React render/paint recording. Throws if recording is already paused. */
+  pauseReactRecording(): void;
+  /** Resume React render/paint recording. Throws if recording is already active. */
+  resumeReactRecording(): void;
+}
+
+/**
+ * Creates the per-iteration switch that turns the harness's React render/paint recording on and
+ * off. The interaction callback drives it via `pauseReactRecording`/`resumeReactRecording`; the
+ * strict state machine (each throws when called in the wrong state) catches unbalanced pairs early.
+ *
+ * It also tracks whether each *active* window captured at least one render, so the harness can flag
+ * a window that was recording but measured nothing — while leaving fully-paused (metric-only)
+ * benchmarks alone.
+ */
+export function createReactRecordingControls(initiallyActive: boolean): ReactRecordingControls {
+  let active = initiallyActive;
+  let currentWindowHasRender = false;
+  let emptyActiveWindow = false;
+  // Transitions in chronological order. The implicit state before the first toggle is
+  // `initiallyActive`; `activeAt` replays this to attribute a paint to its render time.
+  const transitions: { time: number; active: boolean }[] = [];
+
+  // Flag the window being closed if it was recording yet captured nothing.
+  function closeWindowIfActive() {
+    if (active && !currentWindowHasRender) {
+      emptyActiveWindow = true;
+    }
+  }
+
+  return {
+    get active() {
+      return active;
+    },
+    get hadEmptyActiveWindow() {
+      return emptyActiveWindow;
+    },
+    activeAt(time) {
+      let result = initiallyActive;
+      for (const transition of transitions) {
+        if (transition.time > time) {
+          break;
+        }
+        result = transition.active;
+      }
+      return result;
+    },
+    markRendered() {
+      currentWindowHasRender = true;
+    },
+    finalizeWindow() {
+      closeWindowIfActive();
+    },
+    pauseReactRecording() {
+      // Stamp first — before the guard and bookkeeping — so the closing window ends as early as
+      // possible and excludes pause's own overhead. In the throw path it is simply discarded.
+      const now = performance.now();
+      if (!active) {
+        throw new Error('pauseReactRecording() called but React recording is already paused.');
+      }
+      closeWindowIfActive();
+      active = false;
+      transitions.push({ time: now, active: false });
+    },
+    resumeReactRecording() {
+      if (active) {
+        throw new Error('resumeReactRecording() called but React recording is already active.');
+      }
+      active = true;
+      currentWindowHasRender = false;
+      // Stamp last — after the bookkeeping — so the new window starts as late as possible and
+      // excludes resume's own overhead.
+      const now = performance.now();
+      transitions.push({ time: now, active: true });
+    },
+  };
+}
diff --git a/packages/benchmark/src/reporter.test.ts b/packages/benchmark/src/reporter.test.ts
index 88f7f96b8..b60279df2 100644
--- a/packages/benchmark/src/reporter.test.ts
+++ b/packages/benchmark/src/reporter.test.ts
@@ -1,5 +1,6 @@
 import * as os from 'node:os';
 import * as path from 'node:path';
+import * as fs from 'node:fs/promises';
 import { describe, it, expect, vi } from 'vitest';
 import type { TestCase } from 'vitest/node';
 import type { RenderEvent, IterationData } from './types';
@@ -15,8 +16,8 @@ function event(
   return { id, phase, startTime, actualDuration };
 }
 
-function iteration(renders: RenderEvent[], metrics: IterationData['metrics'] = []): IterationData {
-  return { renders, metrics };
+function iteration(renders: RenderEvent[]): IterationData {
+  return { renders };
 }
 
 describe('generateReportFromIterations', () => {
@@ -125,42 +126,7 @@ describe('generateReportFromIterations', () => {
     expect(report.renders[0].outliers).toBe(1);
   });
 
-  it('aggregates metrics across iterations', () => {
-    const iterations = [
-      iteration(
-        [event('App', 'mount', 0, 10)],
-        [
-          { name: 'paint:bench', value: 60 },
-          { name: 'paint:grid', value: 55 },
-        ],
-      ),
-      iteration(
-        [event('App', 'mount', 0, 10)],
-        [
-          { name: 'paint:bench', value: 64 },
-          { name: 'paint:grid', value: 57 },
-        ],
-      ),
-      iteration(
-        [event('App', 'mount', 0, 10)],
-        [
-          { name: 'paint:bench', value: 62 },
-          { name: 'paint:grid', value: 56 },
-        ],
-      ),
-    ];
-    const report = generateReportFromIterations(iterations);
-
-    expect(report.metrics).toHaveProperty('paint:bench');
-    expect(report.metrics).toHaveProperty('paint:grid');
-    expect(report.metrics['paint:bench'].mean).toBeCloseTo(62, 0);
-    expect(report.metrics['paint:grid'].mean).toBeCloseTo(56, 0);
-    expect(report.metrics['paint:bench'].stdDev).toBeGreaterThanOrEqual(0);
-    expect(report.metrics['paint:grid'].stdDev).toBeGreaterThanOrEqual(0);
-    expect(report.metrics['paint:bench'].outliers).toBe(0);
-  });
-
-  it('returns empty metrics when no metrics are present', () => {
+  it('does not aggregate metrics from iterations (paint flows through benchmarkMetrics)', () => {
     const iterations = [
       iteration([event('App', 'mount', 0, 10)]),
       iteration([event('App', 'mount', 0, 12)]),
@@ -169,25 +135,6 @@ describe('generateReportFromIterations', () => {
 
     expect(report.metrics).toEqual({});
   });
-
-  it('handles multiple metric identifiers with different counts', () => {
-    const iterations = [
-      iteration(
-        [event('App', 'mount', 0, 10)],
-        [
-          { name: 'paint:bench', value: 60 },
-          { name: 'paint:header', value: 50 },
-        ],
-      ),
-      iteration([event('App', 'mount', 0, 10)], [{ name: 'paint:bench', value: 64 }]),
-    ];
-    const report = generateReportFromIterations(iterations);
-
-    expect(report.metrics['paint:bench'].mean).toBeCloseTo(62, 0);
-    // paint:header only has 1 data point
-    expect(report.metrics['paint:header'].mean).toBe(50);
-    expect(report.metrics['paint:header'].stdDev).toBe(0);
-  });
 });
 
 function mockTestCase(options: {
@@ -323,4 +270,183 @@ describe('BenchmarkReporter', () => {
       consoleSpy.mockRestore();
     });
   });
+
+  describe('custom metrics', () => {
+    it('merges aggregated custom metrics into the report and hoists definitions', async () => {
+      const outputPath = path.join(os.tmpdir(), `benchmark-custom-metrics-${process.pid}.json`);
+      const reporter = new BenchmarkReporter({ outputPath });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(
+        mockTestCase({
+          fullName: 'custom only',
+          meta: {
+            benchmarkName: 'custom only',
+            benchmarkMetrics: {
+              fib_duration: {
+                kind: 'scalar',
+                config: {
+                  name: 'fib_duration',
+                  format: { maximumFractionDigits: 3 },
+                  alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 },
+                },
+                series: { '': { mean: 1.5, stdDev: 0.1, outliers: 0, count: 50 } },
+              },
+              fib_phase: {
+                kind: 'scalar',
+                config: { name: 'fib_phase' },
+                series: {
+                  small: { mean: 0.2, stdDev: 0.01, outliers: 0, count: 50 },
+                  large: { mean: 3.4, stdDev: 0.2, outliers: 1, count: 50 },
+                },
+              },
+            },
+          },
+          state: 'passed',
+        }),
+      );
+
+      await reporter.onTestRunEnd();
+      const written = JSON.parse(await fs.readFile(outputPath, 'utf8'));
+
+      // Metric-only test still produces an entry; iteration count derived from samples.
+      expect(written.report['custom only'].iterations).toBe(50);
+      expect(written.report['custom only'].renders).toEqual([]);
+
+      // Base series keyed by name; sub-series keyed `name#id`.
+      expect(written.report['custom only'].metrics).toMatchObject({
+        fib_duration: { mean: 1.5, stdDev: 0.1, outliers: 0 },
+        'fib_phase#small': { mean: 0.2 },
+        'fib_phase#large': { mean: 3.4, outliers: 1 },
+      });
+
+      // The sample count must not leak into the per-entry metric stats.
+      expect(written.report['custom only'].metrics.fib_duration).not.toHaveProperty('count');
+
+      // Config is hoisted once, keyed by metric name.
+      expect(written.metricDefinitions).toMatchObject({
+        fib_duration: {
+          kind: 'scalar',
+          alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 },
+        },
+        fib_phase: { kind: 'scalar' },
+      });
+
+      await fs.rm(outputPath, { force: true });
+      consoleSpy.mockRestore();
+    });
+
+    it('merges custom metrics alongside React render iterations', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-combined-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(
+        mockTestCase({
+          fullName: 'combined',
+          meta: {
+            benchmarkName: 'combined',
+            benchmarkIterations: [
+              iteration([event('App', 'mount', 0, 10)]),
+              iteration([event('App', 'mount', 0, 12)]),
+            ],
+            benchmarkMetrics: {
+              clicks: {
+                kind: 'discrete',
+                config: { name: 'clicks', format: { maximumFractionDigits: 0 } },
+                series: { '': { mean: 3, stdDev: 0, outliers: 0, count: 2 } },
+              },
+            },
+          },
+          state: 'passed',
+        }),
+      );
+
+      const output = consoleSpy.mock.calls.map((call) => call[0]).join('\n');
+      // Both the React render table and the custom metric are printed.
+      expect(output).toContain('combined');
+      expect(output).toContain('clicks');
+
+      consoleSpy.mockRestore();
+    });
+
+    function metricCase(testName: string, metricName: string, alarm: unknown) {
+      return mockTestCase({
+        fullName: testName,
+        meta: {
+          benchmarkName: testName,
+          benchmarkMetrics: {
+            [metricName]: {
+              kind: 'scalar',
+              config: { name: metricName, alarm },
+              series: { '': { mean: 1, stdDev: 0, outliers: 0, count: 1 } },
+            },
+          },
+        },
+        state: 'passed',
+      });
+    }
+
+    it('allows reusing a metric name across benchmarks with identical config', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-reuse-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 }));
+      expect(() =>
+        reporter.onTestCaseResult(metricCase('b', 'shared', { error: 0.2 })),
+      ).not.toThrow();
+
+      consoleSpy.mockRestore();
+    });
+
+    it('throws when a metric name is reused with conflicting config across benchmarks', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-conflict-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 }));
+      expect(() => reporter.onTestCaseResult(metricCase('b', 'shared', { error: 0.9 }))).toThrow(
+        /conflicting/,
+      );
+
+      consoleSpy.mockRestore();
+    });
+
+    it('does not flag the same config written with a different key order', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-order-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(
+        metricCase('a', 'shared', { direction: 'lowerIsBetter', error: 0.2 }),
+      );
+      expect(() =>
+        reporter.onTestCaseResult(
+          metricCase('b', 'shared', { error: 0.2, direction: 'lowerIsBetter' }),
+        ),
+      ).not.toThrow();
+
+      consoleSpy.mockRestore();
+    });
+
+    it('resets between runs so an edited config does not conflict on watch reload', () => {
+      const reporter = new BenchmarkReporter({
+        outputPath: path.join(os.tmpdir(), `benchmark-reload-${process.pid}.json`),
+      });
+      const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+      reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 }));
+      reporter.onTestRunStart(); // simulate a watch re-run
+      expect(() =>
+        reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.9 })),
+      ).not.toThrow();
+
+      consoleSpy.mockRestore();
+    });
+  });
 });
diff --git a/packages/benchmark/src/reporter.ts b/packages/benchmark/src/reporter.ts
index 8e2d03d9c..4060b1cab 100644
--- a/packages/benchmark/src/reporter.ts
+++ b/packages/benchmark/src/reporter.ts
@@ -1,52 +1,42 @@
 import * as path from 'node:path';
 import * as fs from 'node:fs/promises';
 import type { Reporter, TestCase } from 'vitest/node';
-import type { RenderEvent, IterationData } from './types';
+import type { RenderEvent, IterationData, MetricReport, MetricDefinition } from './types';
 import type { BenchmarkBaseUpload, BenchmarkReportEntry } from './ciReport';
 import { benchmarkUploadSchema, getCiMetadata } from './ciReport';
-import { calculateMean, calculateStdDev, quantile, isOutlier } from './stats';
+import { calculateMean, aggregateSamples } from './stats';
 import { dim, red, green, yellow, cyan, printTable, fileUrl } from './format';
 import { uploadCiReport } from './upload';
 import { syncPrComment } from './syncPrComment';
 // Import for TaskMeta augmentation side effect
 import './taskMetaAugmentation';
 
-const byNumeric = (a: number, b: number) => a - b;
-
 function getEventKey(event: RenderEvent): string {
   return `${event.id}:${event.phase}`;
 }
 
-function aggregateMetrics(
-  iterations: IterationData[],
-): Record<string, { mean: number; stdDev: number; outliers: number }> {
-  // Collect all metric names across iterations
-  const metricValues = new Map<string, number[]>();
-  for (const iteration of iterations) {
-    for (const metric of iteration.metrics) {
-      let values = metricValues.get(metric.name);
-      if (!values) {
-        values = [];
-        metricValues.set(metric.name, values);
-      }
-      values.push(metric.value);
-    }
+/** Order-insensitive deep equality, treating a missing key and an `undefined` value as equal. */
+function deepEqual(first: unknown, second: unknown): boolean {
+  if (first === second) {
+    return true;
   }
-
-  const result: Record<string, { mean: number; stdDev: number; outliers: number }> = {};
-  for (const [name, values] of metricValues) {
-    // Apply IQR filtering
-    const sorted = [...values].sort(byNumeric);
-    const q1 = quantile(sorted, 0.25);
-    const q3 = quantile(sorted, 0.75);
-    const filtered = values.filter((d) => !isOutlier(d, q1, q3));
-    const used = filtered.length > 0 ? filtered : values;
-
-    const mean = calculateMean(used);
-    const stdDev = calculateStdDev(used, mean);
-    result[name] = { mean, stdDev, outliers: values.length - used.length };
+  if (
+    typeof first !== 'object' ||
+    first === null ||
+    typeof second !== 'object' ||
+    second === null
+  ) {
+    return false;
   }
-  return result;
+  const firstRecord = first as Record<string, unknown>;
+  const secondRecord = second as Record<string, unknown>;
+  const keys = new Set([...Object.keys(firstRecord), ...Object.keys(secondRecord)]);
+  for (const key of keys) {
+    if (!deepEqual(firstRecord[key], secondRecord[key])) {
+      return false;
+    }
+  }
+  return true;
 }
 
 function generateReportFromIterations(iterations: IterationData[]): BenchmarkReportEntry {
@@ -74,14 +64,7 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep
   for (let index = 0; index < expectedLength; index += 1) {
     const durations = iterations.map((iteration) => iteration.renders[index].actualDuration);
 
-    const sorted = [...durations].sort(byNumeric);
-    const q1 = quantile(sorted, 0.25);
-    const q3 = quantile(sorted, 0.75);
-    const filtered = durations.filter((d) => !isOutlier(d, q1, q3));
-    const used = filtered.length > 0 ? filtered : durations;
-
-    const iqrMean = calculateMean(used);
-    const iqrStdDev = calculateStdDev(used, iqrMean);
+    const { mean: iqrMean, stdDev: iqrStdDev, outliers } = aggregateSamples(durations);
     const coefficientOfVariation = iqrMean > 0 ? iqrStdDev / iqrMean : 0;
 
     if (iqrMean > 1 && coefficientOfVariation > 0.1) {
@@ -96,7 +79,7 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep
       event: firstIteration.renders[index],
       iqrMean,
       iqrStdDev,
-      outliers: durations.length - used.length,
+      outliers,
     });
   }
 
@@ -130,8 +113,8 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep
     totalDuration += iqrMean;
   }
 
-  // Aggregate metrics
-  const metrics = aggregateMetrics(iterations);
+  // Custom + paint metrics are merged separately from `task.meta.benchmarkMetrics`.
+  const metrics = {};
 
   return {
     iterations: iterationCount,
@@ -190,10 +173,63 @@ function printDurationMatrix(name: string, report: BenchmarkReportEntry, footer:
   );
 }
 
+/** Strips a `#sub-series` suffix to recover the metric name used to look up its definition. */
+function baseMetricName(key: string): string {
+  const hashIndex = key.indexOf('#');
+  return hashIndex === -1 ? key : key.slice(0, hashIndex);
+}
+
+function formatMetricValue(value: number, definition?: MetricDefinition): string {
+  if (definition?.format) {
+    return new Intl.NumberFormat(undefined, definition.format).format(value);
+  }
+  return value.toFixed(2);
+}
+
+/**
+ * Merges aggregated custom metrics (already stats, not raw samples) into a report entry, keyed
+ * `name` or `name#id` for sub-series, and collects each metric's config into the shared
+ * top-level definitions. For a metric-only test, derives the iteration count from the samples.
+ */
+function mergeCustomMetrics(
+  report: BenchmarkReportEntry,
+  customMetrics: Record<string, MetricReport>,
+  definitions: Record<string, MetricDefinition>,
+): void {
+  let maxCount = 0;
+  for (const [metricName, metric] of Object.entries(customMetrics)) {
+    for (const [seriesId, stats] of Object.entries(metric.series)) {
+      const key = seriesId === '' ? metricName : `${metricName}#${seriesId}`;
+      report.metrics[key] = { mean: stats.mean, stdDev: stats.stdDev, outliers: stats.outliers };
+      maxCount = Math.max(maxCount, stats.count);
+    }
+    const definition: MetricDefinition = {
+      kind: metric.kind,
+      format: metric.config.format,
+      alarm: metric.config.alarm,
+    };
+    // A metric name maps to one definition. Reusing it across benchmarks is fine when the config
+    // matches (e.g. the harness `bench:paint`), but conflicting config would silently apply
+    // last-write-wins to every entry — reject it instead.
+    const existing = definitions[metricName];
+    if (existing && !deepEqual(existing, definition)) {
+      throw new Error(
+        `Benchmark metric "${metricName}" is defined with conflicting configuration across ` +
+          `benchmarks. A metric name must map to a single kind, format, and alarm.`,
+      );
+    }
+    definitions[metricName] = definition;
+  }
+  if (report.iterations === 0) {
+    report.iterations = maxCount;
+  }
+}
+
 function printMetricsTable(
   name: string,
   metrics: Record<string, { mean: number; stdDev: number; outliers: number }>,
   iterationCount: number,
+  definitions: Record<string, MetricDefinition>,
 ): void {
   const entries = Object.entries(metrics);
   if (entries.length === 0) {
@@ -201,10 +237,12 @@ function printMetricsTable(
   }
 
   const rows: string[][] = entries.map(([metricName, stats]) => {
-    const iqrStr = `${stats.mean.toFixed(2)}±${stats.stdDev.toFixed(2)}`;
+    const definition = definitions[baseMetricName(metricName)];
+    const iqrStr = `${formatMetricValue(stats.mean, definition)}±${formatMetricValue(stats.stdDev, definition)}`;
     const cv = stats.mean > 0 ? (stats.stdDev / stats.mean) * 100 : 0;
+    const label = definition?.alarm ? `${metricName} ⚠` : metricName;
     return [
-      metricName.slice(0, LABEL_WIDTH).padStart(LABEL_WIDTH),
+      label.slice(0, LABEL_WIDTH).padStart(LABEL_WIDTH),
       cyan(iqrStr.padStart(STAT_WIDTH)),
       colorCV(cv),
       stats.outliers > 0 ? yellow(String(stats.outliers).padStart(4)) : dim('0'.padStart(4)),
@@ -214,7 +252,7 @@ function printMetricsTable(
   printTable(
     [
       { header: 'Metric', width: LABEL_WIDTH },
-      { header: 'Mean±σ (ms)', width: STAT_WIDTH },
+      { header: 'Mean±σ', width: STAT_WIDTH },
       { header: 'Var%', width: CV_WIDTH },
       { header: 'Out', width: 4 },
     ],
@@ -242,6 +280,8 @@ async function loadBaselineReport(baselinePath: string): Promise<BenchmarkBaseUp
 class BenchmarkReporter implements Reporter {
   private benchmarks: Record<string, BenchmarkReportEntry> = {};
 
+  private metricDefinitions: Record<string, MetricDefinition> = {};
+
   private outputPath: string;
 
   private upload: boolean;
@@ -259,6 +299,15 @@ class BenchmarkReporter implements Reporter {
     this.baselinePath = options?.baselinePath ?? process.env.BENCHMARK_BASELINE_PATH;
   }
 
+  // Reset accumulated state at the start of every run so watch-mode re-runs start clean (the
+  // reporter instance is reused across runs). Otherwise stale benchmarks/definitions linger — and
+  // an edited metric config would conflict with its own previous-run definition.
+  onTestRunStart(): void {
+    this.benchmarks = {};
+    this.metricDefinitions = {};
+    this.hasFailures = false;
+  }
+
   onTestCaseResult(testCase: TestCase): void {
     if (testCase.result().state === 'failed') {
       this.hasFailures = true;
@@ -266,14 +315,21 @@ class BenchmarkReporter implements Reporter {
 
     const meta = testCase.meta();
     const iterations = meta.benchmarkIterations;
+    const customMetrics = meta.benchmarkMetrics;
 
-    if (!iterations) {
+    if (!iterations && !customMetrics) {
       console.warn(yellow(`  No iterations recorded for: ${testCase.fullName}`));
       return;
     }
 
     const name = meta.benchmarkName ?? testCase.fullName;
-    const report = generateReportFromIterations(iterations);
+    const report = iterations
+      ? generateReportFromIterations(iterations)
+      : { iterations: 0, totalDuration: 0, renders: [], metrics: {} };
+
+    if (customMetrics) {
+      mergeCustomMetrics(report, customMetrics, this.metricDefinitions);
+    }
 
     this.benchmarks[name] = report;
 
@@ -284,7 +340,7 @@ class BenchmarkReporter implements Reporter {
 
     printDurationMatrix(`${name} — React`, report, summary);
 
-    printMetricsTable(name, report.metrics, report.iterations);
+    printMetricsTable(name, report.metrics, report.iterations, this.metricDefinitions);
   }
 
   async onTestRunEnd(): Promise<void> {
@@ -303,11 +359,14 @@ class BenchmarkReporter implements Reporter {
 
     const baseline = this.baselinePath ? await loadBaselineReport(this.baselinePath) : undefined;
 
+    const hasMetricDefinitions = Object.keys(this.metricDefinitions).length > 0;
+
     const results = {
       version: 1 as const,
       reportType: 'benchmark' as const,
       ...(await getCiMetadata()),
       report: this.benchmarks,
+      ...(hasMetricDefinitions ? { metricDefinitions: this.metricDefinitions } : {}),
       ...(baseline ? { base: baseline } : {}),
     };
 
diff --git a/packages/benchmark/src/stats.test.ts b/packages/benchmark/src/stats.test.ts
index 13c387122..cd6dd8b94 100644
--- a/packages/benchmark/src/stats.test.ts
+++ b/packages/benchmark/src/stats.test.ts
@@ -1,5 +1,5 @@
 import { describe, it, expect } from 'vitest';
-import { calculateMean, calculateStdDev, quantile, isOutlier } from './stats';
+import { calculateMean, calculateStdDev, quantile, isOutlier, aggregateSamples } from './stats';
 
 describe('calculateMean', () => {
   it('returns the mean of values', () => {
@@ -80,3 +80,24 @@ describe('isOutlier', () => {
     expect(isOutlier(35, 10, 20)).toBe(false);
   });
 });
+
+describe('aggregateSamples', () => {
+  it('computes the mean, standard deviation, and zero outliers for a clean series', () => {
+    const result = aggregateSamples([2, 4, 6]);
+    expect(result.mean).toBe(4);
+    expect(result.outliers).toBe(0);
+    expect(result.stdDev).toBeCloseTo(Math.sqrt(8 / 3));
+  });
+
+  it('removes IQR outliers before computing the mean', () => {
+    const result = aggregateSamples([10, 10, 10, 10, 10, 1000]);
+    expect(result.mean).toBe(10);
+    expect(result.outliers).toBe(1);
+  });
+
+  it('falls back to all values when filtering would remove everything', () => {
+    const result = aggregateSamples([5]);
+    expect(result.mean).toBe(5);
+    expect(result.outliers).toBe(0);
+  });
+});
diff --git a/packages/benchmark/src/stats.ts b/packages/benchmark/src/stats.ts
index d8d3e2a27..70756a8d6 100644
--- a/packages/benchmark/src/stats.ts
+++ b/packages/benchmark/src/stats.ts
@@ -27,3 +27,24 @@ export function isOutlier(value: number, q1: number, q3: number): boolean {
   const iqr = q3 - q1;
   return value < q1 - 1.5 * iqr || value > q3 + 1.5 * iqr;
 }
+
+/**
+ * Aggregates a series of samples into a mean, standard deviation, and outlier count using
+ * IQR-based outlier removal. Falls back to the raw values when filtering would remove
+ * everything. This is the shared aggregation core for custom metrics.
+ */
+export function aggregateSamples(values: number[]): {
+  mean: number;
+  stdDev: number;
+  outliers: number;
+} {
+  const sorted = values.toSorted((first, second) => first - second);
+  const q1 = quantile(sorted, 0.25);
+  const q3 = quantile(sorted, 0.75);
+  const filtered = values.filter((value) => !isOutlier(value, q1, q3));
+  const used = filtered.length > 0 ? filtered : values;
+
+  const mean = calculateMean(used);
+  const stdDev = calculateStdDev(used, mean);
+  return { mean, stdDev, outliers: values.length - used.length };
+}
diff --git a/packages/benchmark/src/taskMetaAugmentation.ts b/packages/benchmark/src/taskMetaAugmentation.ts
index 58d922d51..eee8aa740 100644
--- a/packages/benchmark/src/taskMetaAugmentation.ts
+++ b/packages/benchmark/src/taskMetaAugmentation.ts
@@ -1,10 +1,12 @@
 /// <reference types="vitest" />
 
-import type { IterationData } from './types';
+import type { IterationData, MetricReport } from './types';
 
 declare module 'vitest' {
   interface TaskMeta {
     benchmarkName?: string;
     benchmarkIterations?: IterationData[];
+    /** Custom metrics recorded via `ScalarMetric`/`DiscreteMetric`, keyed by metric name. */
+    benchmarkMetrics?: Record<string, MetricReport>;
   }
 }
diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts
index 3156cc275..5294752db 100644
--- a/packages/benchmark/src/types.ts
+++ b/packages/benchmark/src/types.ts
@@ -14,16 +14,8 @@ export interface RenderEvent {
   startTime: number;
 }
 
-export interface BenchmarkMetric {
-  /** Metric name, e.g. "paint:bench", "paint:grid-header" */
-  name: string;
-  /** Measured value in ms */
-  value: number;
-}
-
 export interface IterationData {
   renders: RenderEvent[];
-  metrics: BenchmarkMetric[];
 }
 
 export interface InteractionContext {
@@ -33,4 +25,76 @@ export interface InteractionContext {
    * @param timeout - Timeout in ms. Default: 5000. Pass 0 or Infinity to rely on the test timeout.
    */
   waitForElementTiming: (identifier: string, timeout?: number) => Promise<void>;
+  /**
+   * Pause recording of the harness's React render/paint measurements. Custom metrics keep
+   * recording. Throws if recording is already paused.
+   */
+  pauseReactRecording: () => void;
+  /**
+   * Resume recording of the harness's React render/paint measurements. Throws if recording is
+   * already active.
+   */
+  resumeReactRecording: () => void;
+}
+
+/**
+ * Whether a custom metric measures a continuous value or a discrete count.
+ * - `scalar` — continuous measurements (timings, sizes); compared with a relative noise band.
+ * - `discrete` — counts/events; compared as exact integers.
+ */
+export type MetricKind = 'scalar' | 'discrete';
+
+/** Which direction of change counts as a regression for a metric in alarm mode. */
+export type MetricDirection = 'lowerIsBetter' | 'higherIsBetter';
+
+export interface MetricAlarm {
+  /** Defaults to `lowerIsBetter`. */
+  direction?: MetricDirection;
+  /**
+   * Softer band: a regression past `warn` (but within `error`) is flagged as a warning.
+   * Scalar metrics: a relative fraction (`0.1` = 10%). Discrete metrics: an absolute count delta.
+   */
+  warn?: number;
+  /**
+   * Harder band: a regression past `error` is flagged as an error (the alarm). When **both**
+   * `warn` and `error` are omitted, `error` defaults to the dashboard's global noise band; with
+   * only `warn` set there is no error band (warning-only).
+   * Scalar metrics: a relative fraction (`0.25` = 25%). Discrete metrics: an absolute count delta.
+   */
+  error?: number;
+}
+
+export interface MetricConfig {
+  name: string;
+  /** Display formatting applied by the reporter and dashboard via `Intl.NumberFormat`. */
+  format?: Intl.NumberFormatOptions;
+  /**
+   * Regression judgment. Its presence opts the metric into alarming; when omitted the metric
+   * is informational (the diff is shown but never flagged).
+   */
+  alarm?: MetricAlarm;
+}
+
+/** Aggregated stats for a single metric sub-series, as they cross the browser→runner boundary. */
+export interface MetricSampleStats {
+  mean: number;
+  stdDev: number;
+  outliers: number;
+  /** Number of recorded samples (used to derive iteration counts; stripped from the report). */
+  count: number;
+}
+
+/** A custom metric's aggregated data attached to `task.meta`, keyed by metric name. */
+export interface MetricReport {
+  kind: MetricKind;
+  config: MetricConfig;
+  /** Aggregated stats keyed by sub-series id (`''` is the base series). */
+  series: Record<string, MetricSampleStats>;
+}
+
+/** Per-metric configuration hoisted to the top level of the report (keyed by metric name). */
+export interface MetricDefinition {
+  kind: MetricKind;
+  format?: Intl.NumberFormatOptions;
+  alarm?: MetricAlarm;
 }
diff --git a/test/performance/tests/imperative-metric.bench.tsx b/test/performance/tests/imperative-metric.bench.tsx
new file mode 100644
index 000000000..1fc2c833a
--- /dev/null
+++ b/test/performance/tests/imperative-metric.bench.tsx
@@ -0,0 +1,38 @@
+import * as React from 'react';
+import { benchmark, ScalarMetric } from '@mui/internal-benchmark';
+
+function simulateSlowdown(ms: number) {
+  const end = performance.now() + ms;
+  while (performance.now() < end) {
+    // burn CPU
+  }
+}
+
+function Widget() {
+  return <div data-state="idle">Widget</div>;
+}
+
+const updateWork = new ScalarMetric({
+  name: 'imperative_update',
+  format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
+});
+
+// Recording stays paused for the whole benchmark: the interaction mutates the DOM imperatively (no
+// React re-render) and measures only a custom metric. The harness must not require a React render
+// here — there is no active recording window to validate.
+benchmark(
+  'Widget imperative update (metric only)',
+  () => <Widget />,
+  async () => {
+    const node = document.querySelector('[data-state]');
+    updateWork.time();
+    node?.setAttribute('data-state', 'active');
+    simulateSlowdown(1);
+    updateWork.timeEnd();
+  },
+  {
+    runs: 10,
+    warmupRuns: 5,
+    reactRecordingPaused: true,
+  },
+);
diff --git a/test/performance/tests/metric.bench.tsx b/test/performance/tests/metric.bench.tsx
new file mode 100644
index 000000000..ad5ff0899
--- /dev/null
+++ b/test/performance/tests/metric.bench.tsx
@@ -0,0 +1,44 @@
+import { it } from 'vitest';
+import { ScalarMetric, DiscreteMetric } from '@mui/internal-benchmark';
+
+function fib(n: number): number {
+  return n < 2 ? n : fib(n - 1) + fib(n - 2);
+}
+
+// A scalar timing metric with an alarm: regressions past 10% warn, past 25% error.
+const duration = new ScalarMetric({
+  name: 'fib_duration',
+  format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
+  alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 },
+});
+
+// A discrete count metric (informational), compared as an exact integer.
+const evenResults = new DiscreteMetric({ name: 'fib_even_results' });
+
+it('custom scalar + discrete metrics', () => {
+  for (let run = 0; run < 50; run += 1) {
+    duration.time();
+    const result = fib(22);
+    duration.timeEnd();
+
+    evenResults.record(result % 2 === 0 ? 1 : 0);
+  }
+});
+
+// A second scalar metric demonstrating labeled sub-series via `time`/`timeEnd` labels.
+const phases = new ScalarMetric({
+  name: 'fib_phase',
+  format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
+});
+
+it('sub-series via labels', () => {
+  for (let run = 0; run < 50; run += 1) {
+    phases.time('small');
+    fib(18);
+    phases.timeEnd('small'); // -> fib_phase#small
+
+    phases.time('large');
+    fib(24);
+    phases.timeEnd('large'); // -> fib_phase#large
+  }
+});
diff --git a/test/performance/tests/react-recording.bench.tsx b/test/performance/tests/react-recording.bench.tsx
new file mode 100644
index 000000000..ed2d30bb3
--- /dev/null
+++ b/test/performance/tests/react-recording.bench.tsx
@@ -0,0 +1,52 @@
+import * as React from 'react';
+import * as ReactDOM from 'react-dom';
+import { benchmark, ElementTiming, ScalarMetric } from '@mui/internal-benchmark';
+
+function simulateSlowdown(ms: number) {
+  const end = performance.now() + ms;
+  while (performance.now() < end) {
+    // burn CPU
+  }
+}
+
+function Counter() {
+  const [count, setCount] = React.useState(0);
+  simulateSlowdown(2);
+  return (
+    <button type="button" onClick={() => setCount((value) => value + 1)}>
+      {/* `key` forces a fresh element per state, so each paints its own element-timing entry. */}
+      <ElementTiming key={count} name={count === 0 ? 'ready' : 'clicked'} />
+      {count}
+    </button>
+  );
+}
+
+// A custom metric recorded inside the benchmark. Warmup iterations are excluded automatically, so
+// this yields exactly `runs` samples without the interaction needing to know about warmup.
+const clickWork = new ScalarMetric({
+  name: 'click_work',
+  format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 },
+});
+
+// Start with React recording paused so the mount is excluded. Await the mount paint (which lands in
+// its own frame while paused), resume, then measure only the click's re-render and the paint it
+// produces — so the report shows a single `bench:update` render and a single `bench:paint#clicked`.
+benchmark(
+  'Counter click (interaction only)',
+  () => <Counter />,
+  async ({ resumeReactRecording, waitForElementTiming }) => {
+    await waitForElementTiming('ready'); // mount paint — recorded while paused, so excluded
+    resumeReactRecording();
+    clickWork.time();
+    ReactDOM.flushSync(() => {
+      document.querySelector('button')?.click();
+    });
+    clickWork.timeEnd(); // wraps the synchronous re-render
+    await waitForElementTiming('clicked'); // interaction paint — recorded
+  },
+  {
+    runs: 10,
+    warmupRuns: 5,
+    reactRecordingPaused: true,
+  },
+);