diff --git a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx index afb076129..655e83e5b 100644 --- a/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx +++ b/apps/code-infra-dashboard/src/components/BenchmarkComparisonReportView.tsx @@ -14,16 +14,22 @@ import ExpandMoreIcon from '@mui/icons-material/ExpandMore'; import Tooltip from '@mui/material/Tooltip'; import { compareBenchmarkReports, + type BenchmarkComparisonInput, type BenchmarkComparisonReport, type ComparisonItem, type DiffValue, type BenchmarkDiffSeverity, } from '@/lib/benchmark/compareBenchmarkReports'; -import type { BenchmarkReport } from '@/lib/benchmark/types'; -import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters'; +import { + formatMs, + formatMetricNumber, + formatMetricDiff, + percentFormatter, +} from '@/utils/formatters'; const SEVERITY_COLOR: Record = { error: 'error.main', + warning: 'warning.main', success: 'success.main', neutral: 'text.secondary', }; @@ -32,6 +38,7 @@ const MIN_BAR_WIDTH_PX = 4; const DIFF_BAR_COLORS: Record = { error: 'var(--mui-palette-error-main)', + warning: 'var(--mui-palette-warning-main)', success: 'var(--mui-palette-success-main)', neutral: 'var(--mui-palette-action-disabled)', }; @@ -72,13 +79,19 @@ function computeEntryBar( return computeDiffBar(comparison.duration, range.min, range.max); } -function FormattedDiffMs({ diff, percent = false }: { diff: DiffValue; percent?: boolean }) { +interface FormattedDiffMsProps { + diff: DiffValue; + percent?: boolean; + format?: Intl.NumberFormatOptions; +} + +function FormattedDiffMs({ diff, percent = false, format }: FormattedDiffMsProps) { if (diff.absoluteDiff === 0) { return '\u2014'; } return ( - {formatDiffMs(diff.absoluteDiff)} + {formatMetricDiff(diff.absoluteDiff, format)} {percent && ( {' '} @@ -91,12 +104,18 @@ function FormattedDiffMs({ diff, percent = false }: { diff: DiffValue; percent?: ); } -function DiffCell({ diff, sx }: { diff: DiffValue; sx?: object }) { +interface DiffCellProps { + diff: DiffValue; + sx?: object; + format?: Intl.NumberFormatOptions; +} + +function DiffCell({ diff, sx, format }: DiffCellProps) { const color = SEVERITY_COLOR[diff.severity]; return ( - + ); @@ -114,6 +133,7 @@ interface ComparisonTableRow { valueFill?: number; valueColor?: string; diffBar?: { left: number; width: number; color: string }; + format?: Intl.NumberFormatOptions; } const NAME_CELL_SX = { @@ -181,7 +201,7 @@ function ComparisonTable({ {row.outliers} {hasBase && ( - {row.base != null ? formatMs(row.base) : '\u2014'} + {row.base != null ? formatMetricNumber(row.base, row.format) : '\u2014'} )} {(() => { @@ -196,10 +216,10 @@ function ComparisonTable({ : undefined; return row.removed ? ( - + ) : ( - + ); } return {'\u2014'}; @@ -246,24 +266,20 @@ function TotalsSummary({ totals }: { totals: BenchmarkComparisonReport['totals'] - {totals.paintDefault && ( - - - Paint:{' '} - - - - - - )} ); } +function worstMetricSeverity(metrics: ComparisonItem['metrics']): BenchmarkDiffSeverity { + if (metrics.some((row) => row.diff.severity === 'error')) { + return 'error'; + } + if (metrics.some((row) => row.diff.severity === 'warning')) { + return 'warning'; + } + return 'neutral'; +} + function BenchmarkAccordion({ comparison, hasBase, @@ -278,6 +294,8 @@ function BenchmarkAccordion({ entryDiffRange: { min: number; max: number }; }) { const renderCount = comparison.renders.filter((row) => !row.removed).length; + const metricCount = comparison.metrics.filter((row) => !row.removed).length; + const metricSeverity = worstMetricSeverity(comparison.metrics); const entryBar = computeEntryBar(comparison, hasBase, entryDiffRange); @@ -345,6 +363,11 @@ function BenchmarkAccordion({ )} + {metricCount > 0 && ( + + {metricCount} metrics + + )} @@ -406,9 +429,9 @@ function BenchmarkAccordion({ '\u2014' ) : ( - {formatMs(row.value)}{' '} + {formatMetricNumber(row.value, row.format)}{' '} - Β±{formatMs(row.stdDev)} + Β±{formatMetricNumber(row.stdDev, row.format)} ), @@ -416,6 +439,7 @@ function BenchmarkAccordion({ comparison: row.diff, base: row.diff?.base, removed: row.removed, + format: row.format, }; })} /> @@ -431,8 +455,8 @@ function BenchmarkAccordion({ } interface BenchmarkComparisonReportViewProps { - value: BenchmarkReport; - base: BenchmarkReport | null; + value: BenchmarkComparisonInput; + base: BenchmarkComparisonInput | null; } export function BenchmarkComparisonReportView({ value, base }: BenchmarkComparisonReportViewProps) { diff --git a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx index 27e09ec1f..608131535 100644 --- a/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx +++ b/apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx @@ -10,7 +10,7 @@ import Tabs from '@mui/material/Tabs'; import Tab from '@mui/material/Tab'; import { BarChartPro } from '@mui/x-charts-pro/BarChartPro'; import { useXScale, useDrawingArea } from '@mui/x-charts-pro/hooks'; -import type { BenchmarkReport } from '@/lib/benchmark/types'; +import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types'; import { formatMs } from '@/utils/formatters'; import { useMasterCommits, type GitHubCommit } from '../hooks/useMasterCommits'; import { useCiReports } from '../hooks/useCiReports'; @@ -81,6 +81,7 @@ interface CommitReportData { timestamp: number; commit: GitHubCommit; report: BenchmarkReport | null; + metricDefinitions?: Record; } function collectBenchmarkNames(chartData: CommitReportData[]): string[] { @@ -126,6 +127,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps) timestamp, commit, report: reports[commit.sha]?.report ?? null, + metricDefinitions: reports[commit.sha]?.metricDefinitions, })), [commits, reports], ); @@ -154,7 +156,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps) return entry.totalDuration; } if (chartMode === 'paint') { - return entry.metrics['paint:default']?.mean ?? null; + return entry.metrics['bench:paint']?.mean ?? null; } return entry.renders.length; }, @@ -300,8 +302,10 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps) return null; } return { - value: reportData.report, - base: baselineData?.report ?? null, + value: { report: reportData.report, metricDefinitions: reportData.metricDefinitions }, + base: baselineData?.report + ? { report: baselineData.report, metricDefinitions: baselineData.metricDefinitions } + : null, valueCommit: reportData.commit, baseCommit: baselineData?.commit ?? null, }; diff --git a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts index f861d5aae..6d07c7729 100644 --- a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts +++ b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.test.ts @@ -1,8 +1,34 @@ import { describe, it, expect } from 'vitest'; import { buildBenchmarkMarkdownReport } from './buildMarkdownReport'; -import { compareBenchmarkReports } from './compareBenchmarkReports'; +import { compareBenchmarkReports, type BenchmarkComparisonInput } from './compareBenchmarkReports'; +import type { BenchmarkReportEntry, MetricDefinition } from './types'; import { makeReport, makeReportFromConfig } from './test-fixtures'; +// A single benchmark whose duration/renders are neutral, so the only signal is one metric. +function metricReport( + name: string, + mean: number, + definitions?: Record, +): BenchmarkComparisonInput { + const entry: BenchmarkReportEntry = { + iterations: 10, + totalDuration: 100, + renders: [], + metrics: { [name]: { mean, stdDev: 0, outliers: 0 } }, + }; + return { report: { Bench: entry }, metricDefinitions: definitions }; +} + +const alarmDefinitions: Record = { + clicks: { + kind: 'discrete', + format: { maximumFractionDigits: 0 }, + alarm: { direction: 'lowerIsBetter', error: 1 }, + }, + tti: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 } }, + fps: { kind: 'scalar' }, // informational (no alarm) +}; + describe('buildBenchmarkMarkdownReport', () => { it('drops within-noise rows but keeps significant regressions', () => { const report = compareBenchmarkReports( @@ -126,4 +152,38 @@ describe('buildBenchmarkMarkdownReport', () => { expect(markdown).not.toContain('No significant changes'); expect(markdown).toContain('| Test |'); }); + + describe('metric alarms', () => { + it('surfaces an error-level metric, making its otherwise-neutral test significant', () => { + const report = compareBenchmarkReports( + metricReport('clicks', 5, alarmDefinitions), // +2 vs 3, discrete error band 1 + metricReport('clicks', 3), + ); + const markdown = buildBenchmarkMarkdownReport(report); + expect(markdown).not.toContain('No significant changes'); + expect(markdown).toContain('**Metric alarms**'); + expect(markdown).toContain('| Bench | clicks |'); + expect(markdown).toContain('πŸ”Ί'); + }); + + it('does not surface warning-level metrics (kept on the dashboard only)', () => { + const report = compareBenchmarkReports( + metricReport('tti', 115, alarmDefinitions), // +15%: past warn (10%), within error (25%) -> warning + metricReport('tti', 100), + ); + const markdown = buildBenchmarkMarkdownReport(report); + expect(markdown).toContain('No significant changes'); + expect(markdown).not.toContain('Metric alarms'); + }); + + it('ignores informational metrics β€” no alarm section, test stays within noise', () => { + const report = compareBenchmarkReports( + metricReport('fps', 200, alarmDefinitions), // big change but no alarm configured + metricReport('fps', 100), + ); + const markdown = buildBenchmarkMarkdownReport(report); + expect(markdown).toContain('No significant changes'); + expect(markdown).not.toContain('Metric alarms'); + }); + }); }); diff --git a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts index c21135cc2..d33a6c403 100644 --- a/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts +++ b/apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts @@ -1,5 +1,12 @@ -import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters'; -import type { BenchmarkComparisonReport, DiffValue } from './compareBenchmarkReports'; +import { formatMs, formatDiffMs, formatMetricDiff, percentFormatter } from '@/utils/formatters'; +import type { + BenchmarkComparisonReport, + ComparisonEntry, + DiffValue, +} from './compareBenchmarkReports'; + +// Only error-level alarms surface in the PR comment; warnings stay on the dashboard. +const isAlarmed = (metric: ComparisonEntry): boolean => metric.diff.severity === 'error'; export interface BuildMarkdownReportOptions { maxRows?: number; @@ -8,6 +15,7 @@ export interface BuildMarkdownReportOptions { const SEVERITY_PREFIX: Record = { error: 'πŸ”Ί', + warning: '⚠️', success: 'β–Ό', }; @@ -42,11 +50,6 @@ export function buildBenchmarkMarkdownReport( `**Total duration:** ${formatMs(report.totals.duration.current ?? 0)}${formatDiff(report.totals.duration, 'ms')}`, `**Renders:** ${report.totals.renderCount.current ?? 0}${formatDiff(report.totals.renderCount, 'count')}`, ]; - if (report.totals.paintDefault) { - totalParts.push( - `**Paint:** ${formatMs(report.totals.paintDefault.current ?? 0)}${formatDiff(report.totals.paintDefault, 'ms')}`, - ); - } lines.push(totalParts.join(' | ')); lines.push(''); } @@ -56,7 +59,8 @@ export function buildBenchmarkMarkdownReport( entry.duration.severity !== 'neutral' || (entry.renderCount?.severity ?? 'neutral') !== 'neutral' || entry.duration.current === null || - entry.duration.base === null, + entry.duration.base === null || + entry.metrics.some(isAlarmed), ); const detailsLink = reportUrl ? `[details](${reportUrl})` : ''; @@ -99,5 +103,22 @@ export function buildBenchmarkMarkdownReport( lines.push(detailsLink); } + // Error-level metric alarms across all tests (independent of the test-table row cap). + const alarms = report.entries.flatMap((entry) => + entry.metrics.filter(isAlarmed).map((metric) => ({ test: entry.name, metric })), + ); + if (alarms.length > 0) { + lines.push(''); + lines.push('**Metric alarms**'); + lines.push(''); + lines.push('| Test | Metric | Change |'); + lines.push('|:-----|:-------|-------:|'); + for (const { test, metric } of alarms) { + const prefix = SEVERITY_PREFIX[metric.diff.severity] ?? ''; + const change = `${prefix} ${formatMetricDiff(metric.diff.absoluteDiff, metric.format)}`; + lines.push(`| ${test} | ${metric.name} | ${change} |`); + } + } + return lines.join('\n'); } diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts index 87370da5b..e17ea9a26 100644 --- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts +++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.test.ts @@ -1,7 +1,38 @@ import { describe, it, expect } from 'vitest'; import { compareBenchmarkReports } from './compareBenchmarkReports'; +import type { BenchmarkReport, BenchmarkReportEntry, MetricDefinition } from './types'; import { makeReport, makeReportFromConfig } from './test-fixtures'; +function reportWithMetrics(metrics: Record): BenchmarkReport { + const entry: BenchmarkReportEntry = { + iterations: 10, + totalDuration: 0, + renders: [], + metrics: Object.fromEntries( + Object.entries(metrics).map(([name, mean]) => [name, { mean, stdDev: 0, outliers: 0 }]), + ), + }; + return { Bench: entry }; +} + +const definitions: Record = { + scalar_alarm: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', error: 0.1 } }, + scalar_tiered: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 } }, + scalar_higher: { kind: 'scalar', alarm: { direction: 'higherIsBetter', error: 0.1 } }, + scalar_info: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } }, + discrete_alarm: { kind: 'discrete', alarm: { direction: 'lowerIsBetter' } }, + discrete_tiered: { kind: 'discrete', alarm: { direction: 'lowerIsBetter', warn: 1, error: 3 } }, +}; + +function metricEntry(current: BenchmarkReport, base: BenchmarkReport, metricName: string) { + // The metric appears on the current side, so its definitions ride along there. + const result = compareBenchmarkReports( + { report: current, metricDefinitions: definitions }, + { report: base }, + ); + return result.entries[0].metrics.find((metric) => metric.name === metricName)!; +} + describe('compareBenchmarkReports', () => { it('marks diffs within Β±20% as neutral noise', () => { const result = compareBenchmarkReports( @@ -183,4 +214,134 @@ describe('compareBenchmarkReports', () => { expect(order).toEqual(['DurationRegression', 'FewerRenders']); }); }); + + describe('custom metrics', () => { + it('keeps an informational metric (no alarm) neutral even on a large change', () => { + const metric = metricEntry( + reportWithMetrics({ scalar_info: 200 }), + reportWithMetrics({ scalar_info: 100 }), + 'scalar_info', + ); + expect(metric.diff.severity).toBe('neutral'); + expect(metric.format).toEqual({ style: 'unit', unit: 'byte' }); + // The hint respects the metric's format rather than hard-coding milliseconds. + expect(metric.diff.hint).toContain('byte'); + expect(metric.diff.hint).not.toContain('ms'); + }); + + it('flags a scalar alarm regression beyond its error band', () => { + const metric = metricEntry( + reportWithMetrics({ scalar_alarm: 120 }), // +20%, error band is 10% + reportWithMetrics({ scalar_alarm: 100 }), + 'scalar_alarm', + ); + expect(metric.diff.severity).toBe('error'); + expect(metric.diff.hint).toContain('Regression'); + }); + + it('flags a scalar regression between warn and error bands as a warning', () => { + const metric = metricEntry( + reportWithMetrics({ scalar_tiered: 115 }), // +15%: past warn (10%), within error (25%) + reportWithMetrics({ scalar_tiered: 100 }), + 'scalar_tiered', + ); + expect(metric.diff.severity).toBe('warning'); + expect(metric.diff.hint).toContain('Warning'); + }); + + it('escalates a scalar regression past the error band to error', () => { + const metric = metricEntry( + reportWithMetrics({ scalar_tiered: 130 }), // +30%: past error (25%) + reportWithMetrics({ scalar_tiered: 100 }), + 'scalar_tiered', + ); + expect(metric.diff.severity).toBe('error'); + }); + + it('keeps a scalar change within the warn band neutral', () => { + const metric = metricEntry( + reportWithMetrics({ scalar_tiered: 105 }), // +5%: within warn + reportWithMetrics({ scalar_tiered: 100 }), + 'scalar_tiered', + ); + expect(metric.diff.severity).toBe('neutral'); + }); + + it('tiers discrete metrics by absolute count delta, inclusive of the band', () => { + const warn = metricEntry( + reportWithMetrics({ discrete_tiered: 4 }), // +1: meets warn (1), below error (3) + reportWithMetrics({ discrete_tiered: 3 }), + 'discrete_tiered', + ); + expect(warn.diff.severity).toBe('warning'); + + const error = metricEntry( + reportWithMetrics({ discrete_tiered: 6 }), // +3: meets error (3) + reportWithMetrics({ discrete_tiered: 3 }), + 'discrete_tiered', + ); + expect(error.diff.severity).toBe('error'); + }); + + it('honors higherIsBetter so an increase is an improvement', () => { + const metric = metricEntry( + reportWithMetrics({ scalar_higher: 120 }), + reportWithMetrics({ scalar_higher: 100 }), + 'scalar_higher', + ); + expect(metric.diff.severity).toBe('success'); + expect(metric.diff.hint).toContain('Improvement'); + }); + + it('compares discrete alarms exactly β€” any change is flagged', () => { + const metric = metricEntry( + reportWithMetrics({ discrete_alarm: 4 }), + reportWithMetrics({ discrete_alarm: 3 }), + 'discrete_alarm', + ); + expect(metric.diff.severity).toBe('error'); + expect(metric.diff.hint).toBe('Regression: +1'); + }); + + it('keeps metrics without a definition on the default noise-band behavior', () => { + // No definition for `paint:default`, so it uses the global Β±20% noise band. + const withinNoise = metricEntry( + reportWithMetrics({ 'paint:default': 110 }), + reportWithMetrics({ 'paint:default': 100 }), + 'paint:default', + ); + expect(withinNoise.diff.severity).toBe('neutral'); + + const regression = metricEntry( + reportWithMetrics({ 'paint:default': 130 }), + reportWithMetrics({ 'paint:default': 100 }), + 'paint:default', + ); + expect(regression.diff.severity).toBe('error'); + }); + + it('resolves a sub-series definition by its base metric name', () => { + const metric = metricEntry( + reportWithMetrics({ 'scalar_alarm#large': 120 }), + reportWithMetrics({ 'scalar_alarm#large': 100 }), + 'scalar_alarm#large', + ); + expect(metric.diff.severity).toBe('error'); + }); + + it('keeps a base-only (removed) metric formatted using its base definition', () => { + // Definitions travel with their report: the metric exists only in the base, so its formatting + // comes from the base side's definitions even though the head has none. + const result = compareBenchmarkReports( + { report: reportWithMetrics({}) }, + { + report: reportWithMetrics({ bytes: 100 }), + metricDefinitions: { bytes: { kind: 'scalar', format: { style: 'unit', unit: 'byte' } } }, + }, + ); + const metric = result.entries[0].metrics.find((entry) => entry.name === 'bytes')!; + expect(metric.removed).toBe(true); + expect(metric.format).toEqual({ style: 'unit', unit: 'byte' }); + }); + }); }); diff --git a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts index af0aa2ca4..513e741e4 100644 --- a/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts +++ b/apps/code-infra-dashboard/src/lib/benchmark/compareBenchmarkReports.ts @@ -1,9 +1,17 @@ -import { formatDiffMs, percentFormatter } from '@/utils/formatters'; -import type { BenchmarkReport, BenchmarkReportEntry, RenderStats } from './types'; +import { formatDiffMs, formatMetricDiff, percentFormatter } from '@/utils/formatters'; +import type { + BenchmarkBaseUpload, + BenchmarkReportEntry, + MetricDefinition, + RenderStats, +} from './types'; + +/** A report paired with its own metric definitions β€” the unit the comparison operates on. */ +export type BenchmarkComparisonInput = Pick; const NOISE_THRESHOLD = 0.2; -export type BenchmarkDiffSeverity = 'error' | 'success' | 'neutral'; +export type BenchmarkDiffSeverity = 'error' | 'warning' | 'success' | 'neutral'; export interface DiffValue { current: number | null; @@ -21,6 +29,8 @@ export interface ComparisonEntry { outliers: number; diff: DiffValue; removed: boolean; + /** Display formatting for custom metrics; absent for renders and built-in (paint) metrics. */ + format?: Intl.NumberFormatOptions; } export interface ComparisonItem { @@ -34,8 +44,9 @@ export interface ComparisonItem { const SEVERITY_RANK: Record = { error: 0, - success: 1, - neutral: 2, + warning: 1, + success: 2, + neutral: 3, }; export interface BenchmarkComparisonReport { @@ -44,7 +55,6 @@ export interface BenchmarkComparisonReport { totals: { duration: DiffValue; renderCount: DiffValue; - paintDefault: DiffValue | null; }; } @@ -153,21 +163,117 @@ function compareRenders( return entries; } +/** Strips a `#sub-series` suffix to recover the metric name used to look up its definition. */ +function baseMetricName(key: string): string { + const hashIndex = key.indexOf('#'); + return hashIndex === -1 ? key : key.slice(0, hashIndex); +} + +/** + * Diff for a custom metric, honoring its definition. A metric without an `alarm` is + * informational (always neutral). With an `alarm`, a regression past the `warn` band is flagged + * `warning` and past the `error` band `error`; improvements are `success`. Bands are relative + * fractions for `scalar` metrics and absolute count deltas for `discrete` metrics. Metrics + * without a definition (e.g. paint) keep the default `makeDiffValue` behavior and never reach here. + */ +function makeMetricDiff( + current: number | null, + base: number | null, + definition: MetricDefinition, +): DiffValue { + if (base === null) { + return { current, base, absoluteDiff: 0, relativeDiff: 0, severity: 'neutral', hint: 'New' }; + } + + const currentVal = current ?? 0; + const absoluteDiff = currentVal - base; + const relativeDiff = base !== 0 ? absoluteDiff / base : 0; + + const { alarm, kind } = definition; + const isDiscrete = kind === 'discrete'; + const diffStr = isDiscrete + ? `${absoluteDiff > 0 ? '+' : ''}${absoluteDiff}` + : `${formatMetricDiff(absoluteDiff, definition.format)} (${percentFormatter.format(relativeDiff)})`; + + // Informational metric: show the change, never flag it. + if (!alarm) { + return { + current, + base, + absoluteDiff, + relativeDiff, + severity: 'neutral', + hint: absoluteDiff === 0 ? 'No change' : diffStr, + }; + } + + // Scalar bands are relative fractions; discrete bands are absolute count deltas, and meet their + // threshold inclusively (an `error: 2` discrete alarm fires on a delta of exactly 2). + const magnitude = isDiscrete ? Math.abs(absoluteDiff) : Math.abs(relativeDiff); + const meets = (band: number) => (isDiscrete ? magnitude >= band : magnitude > band); + + // When no bands are given, `error` falls back to the global noise band (scalar) or any change + // (discrete). When only `warn` is given, there is no error band. + const defaultError = isDiscrete ? 0 : NOISE_THRESHOLD; + const errorBand = alarm.error ?? (alarm.warn === undefined ? defaultError : undefined); + const warnBand = alarm.warn; + + let level: 'none' | 'warn' | 'error' = 'none'; + if (absoluteDiff !== 0) { + if (errorBand !== undefined && meets(errorBand)) { + level = 'error'; + } else if (warnBand !== undefined && meets(warnBand)) { + level = 'warn'; + } + } + + const direction = alarm.direction ?? 'lowerIsBetter'; + const isRegression = direction === 'lowerIsBetter' ? absoluteDiff > 0 : absoluteDiff < 0; + + let severity: BenchmarkDiffSeverity = 'neutral'; + if (level !== 'none') { + if (!isRegression) { + severity = 'success'; + } else { + severity = level === 'error' ? 'error' : 'warning'; + } + } + + let hint: string; + if (absoluteDiff === 0) { + hint = 'No change'; + } else if (level === 'none') { + hint = `Within noise: ${diffStr}`; + } else if (!isRegression) { + hint = `Improvement: ${diffStr}`; + } else { + hint = `${level === 'error' ? 'Regression' : 'Warning'}: ${diffStr}`; + } + + return { current, base, absoluteDiff, relativeDiff, severity, hint }; +} + function compareMetrics( currentMetrics: Record, baseEntry: BenchmarkReportEntry | undefined, + currentDefinitions: Record | undefined, + baseDefinitions: Record | undefined, ): ComparisonEntry[] { const entries: ComparisonEntry[] = []; for (const [name, stats] of Object.entries(currentMetrics)) { + const definition = currentDefinitions?.[baseMetricName(name)]; const baseStats = baseEntry?.metrics[name]; entries.push({ name, value: stats.mean, stdDev: stats.stdDev, outliers: stats.outliers, - diff: makeDiffValue(stats.mean, baseStats?.mean ?? null), + diff: definition + ? makeMetricDiff(stats.mean, baseStats?.mean ?? null, definition) + : makeDiffValue(stats.mean, baseStats?.mean ?? null), removed: false, + format: definition?.format, }); } @@ -175,6 +281,7 @@ function compareMetrics( if (baseEntry) { for (const [name, baseStats] of Object.entries(baseEntry.metrics)) { if (!(name in currentMetrics)) { + const definition = baseDefinitions?.[baseMetricName(name)]; entries.push({ name, value: 0, @@ -182,6 +289,7 @@ function compareMetrics( outliers: 0, diff: makeDiffValue(null, baseStats.mean), removed: true, + format: definition?.format, }); } } @@ -225,22 +333,25 @@ function compareItems(a: ComparisonItem, b: ComparisonItem): number { } export function compareBenchmarkReports( - current: BenchmarkReport, - base: BenchmarkReport | null, + current: BenchmarkComparisonInput, + base: BenchmarkComparisonInput | null, ): BenchmarkComparisonReport { - const effectiveBase = base ?? {}; + // Definitions travel with their report: a current metric uses the current definitions, a + // base-only/removed metric uses the base definitions. No merge step β€” the side a metric appears + // on decides how it's formatted. + const currentDefinitions = current.metricDefinitions; + const baseDefinitions = base?.metricDefinitions; + const currentReport = current.report; + const effectiveBase = base?.report ?? {}; const entries: ComparisonItem[] = []; let totalCurrentDuration = 0; let totalBaseDuration = 0; let totalCurrentRenders = 0; let totalBaseRenders = 0; - let totalCurrentPaint = 0; - let totalBasePaint = 0; - let hasPaint = false; // Process current entries - for (const [name, entry] of Object.entries(current)) { + for (const [name, entry] of Object.entries(currentReport)) { const baseEntry = effectiveBase[name]; const duration = makeDiffValue(entry.totalDuration, baseEntry?.totalDuration ?? null); @@ -251,7 +362,7 @@ export function compareBenchmarkReports( ? makeCountDiffValue(entry.renders.length, baseEntry.renders.length) : undefined, renders: compareRenders(entry.renders, baseEntry), - metrics: compareMetrics(entry.metrics, baseEntry), + metrics: compareMetrics(entry.metrics, baseEntry, currentDefinitions, baseDefinitions), iterations: entry.iterations, }); @@ -259,19 +370,11 @@ export function compareBenchmarkReports( totalBaseDuration += baseEntry?.totalDuration ?? 0; totalCurrentRenders += entry.renders.length; totalBaseRenders += baseEntry?.renders.length ?? 0; - - const paintMetric = entry.metrics['paint:default']; - const basePaintMetric = baseEntry?.metrics['paint:default']; - if (paintMetric || basePaintMetric) { - hasPaint = true; - totalCurrentPaint += paintMetric?.mean ?? 0; - totalBasePaint += basePaintMetric?.mean ?? 0; - } } // Process removed entries (in base but not in current) for (const [name, baseEntry] of Object.entries(effectiveBase)) { - if (name in current) { + if (name in currentReport) { continue; } @@ -280,18 +383,12 @@ export function compareBenchmarkReports( name, duration, renders: compareRenders([], baseEntry), - metrics: compareMetrics({}, baseEntry), + metrics: compareMetrics({}, baseEntry, currentDefinitions, baseDefinitions), iterations: 0, }); totalBaseDuration += baseEntry.totalDuration; totalBaseRenders += baseEntry.renders.length; - - const basePaintMetric = baseEntry.metrics['paint:default']; - if (basePaintMetric) { - hasPaint = true; - totalBasePaint += basePaintMetric.mean; - } } entries.sort(compareItems); @@ -302,7 +399,6 @@ export function compareBenchmarkReports( totals: { duration: makeDiffValue(totalCurrentDuration, totalBaseDuration), renderCount: makeCountDiffValue(totalCurrentRenders, totalBaseRenders), - paintDefault: hasPaint ? makeDiffValue(totalCurrentPaint, totalBasePaint) : null, }, }; } diff --git a/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts new file mode 100644 index 000000000..538b2537b --- /dev/null +++ b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.test.ts @@ -0,0 +1,40 @@ +import { describe, it, expect } from 'vitest'; +import { migrateBenchmarkReport } from './migrateBenchmarkReport'; +import type { BenchmarkReport } from './types'; + +function report(metrics: Record): BenchmarkReport { + return { + Bench: { + iterations: 10, + totalDuration: 0, + renders: [], + metrics: Object.fromEntries( + Object.entries(metrics).map(([name, mean]) => [name, { mean, stdDev: 0, outliers: 0 }]), + ), + }, + }; +} + +describe('migrateBenchmarkReport', () => { + it('renames legacy paint:default to bench:paint', () => { + const migrated = migrateBenchmarkReport(report({ 'paint:default': 12 })); + expect(Object.keys(migrated.Bench.metrics)).toEqual(['bench:paint']); + expect(migrated.Bench.metrics['bench:paint'].mean).toBe(12); + }); + + it('renames named legacy paint keys to bench:paint sub-series', () => { + const migrated = migrateBenchmarkReport(report({ 'paint:grid-header': 8 })); + expect(Object.keys(migrated.Bench.metrics)).toEqual(['bench:paint#grid-header']); + }); + + it('leaves current and custom metric keys untouched (idempotent)', () => { + const migrated = migrateBenchmarkReport( + report({ 'bench:paint': 5, 'bench:paint#header': 3, button_clicks: 2 }), + ); + expect(Object.keys(migrated.Bench.metrics).sort()).toEqual([ + 'bench:paint', + 'bench:paint#header', + 'button_clicks', + ]); + }); +}); diff --git a/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts new file mode 100644 index 000000000..733342348 --- /dev/null +++ b/apps/code-infra-dashboard/src/lib/benchmark/migrateBenchmarkReport.ts @@ -0,0 +1,35 @@ +import type { BenchmarkReport, BenchmarkReportEntry } from './types'; + +const LEGACY_PAINT_PREFIX = 'paint:'; + +/** + * Renames legacy harness paint keys (`paint:default`, `paint:grid-header`) to the current + * `bench:paint` scheme (`bench:paint`, `bench:paint#grid-header`). Idempotent: keys already in the + * new scheme are untouched. + */ +function migrateMetricKey(key: string): string { + if (!key.startsWith(LEGACY_PAINT_PREFIX)) { + return key; + } + const identifier = key.slice(LEGACY_PAINT_PREFIX.length); + return identifier === 'default' ? 'bench:paint' : `bench:paint#${identifier}`; +} + +function migrateEntry(entry: BenchmarkReportEntry): BenchmarkReportEntry { + return { + ...entry, + metrics: Object.fromEntries( + Object.entries(entry.metrics).map(([key, stats]) => [migrateMetricKey(key), stats]), + ), + }; +} + +/** + * Applies forward migrations to a benchmark report fetched from S3 so older uploads read with the + * current shape. Centralizes legacy normalization β€” add future migrations here. + */ +export function migrateBenchmarkReport(report: BenchmarkReport): BenchmarkReport { + return Object.fromEntries( + Object.entries(report).map(([name, entry]) => [name, migrateEntry(entry)]), + ); +} diff --git a/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts b/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts index c786b8297..18e86d17a 100644 --- a/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts +++ b/apps/code-infra-dashboard/src/lib/benchmark/test-fixtures.ts @@ -1,4 +1,5 @@ import type { BenchmarkReport, BenchmarkReportEntry } from './types'; +import type { BenchmarkComparisonInput } from './compareBenchmarkReports'; export function makeEntry(totalDuration: number, renderCount: number = 1): BenchmarkReportEntry { const perRender = renderCount > 0 ? totalDuration / renderCount : 0; @@ -19,20 +20,20 @@ export function makeEntry(totalDuration: number, renderCount: number = 1): Bench }; } -export function makeReport(entries: Record): BenchmarkReport { +export function makeReport(entries: Record): BenchmarkComparisonInput { const report: BenchmarkReport = {}; for (const [name, totalDuration] of Object.entries(entries)) { report[name] = makeEntry(totalDuration); } - return report; + return { report }; } export function makeReportFromConfig( entries: Record, -): BenchmarkReport { +): BenchmarkComparisonInput { const report: BenchmarkReport = {}; for (const [name, { duration, renders }] of Object.entries(entries)) { report[name] = makeEntry(duration, renders); } - return report; + return { report }; } diff --git a/apps/code-infra-dashboard/src/lib/benchmark/types.ts b/apps/code-infra-dashboard/src/lib/benchmark/types.ts index ec29672c9..92a0f9d67 100644 --- a/apps/code-infra-dashboard/src/lib/benchmark/types.ts +++ b/apps/code-infra-dashboard/src/lib/benchmark/types.ts @@ -31,6 +31,23 @@ export interface BenchmarkReportEntry { export type BenchmarkReport = Record; +export type MetricDirection = 'lowerIsBetter' | 'higherIsBetter'; + +export interface MetricAlarm { + direction?: MetricDirection; + /** Softer band (relative fraction for scalar, absolute count delta for discrete). */ + warn?: number; + /** Harder band; defaults to the global noise band only when both `warn` and `error` are omitted. */ + error?: number; +} + +/** Per-metric config for custom metrics, hoisted to the top level of the report (keyed by name). */ +export interface MetricDefinition { + kind: 'scalar' | 'discrete'; + format?: Intl.NumberFormatOptions; + alarm?: MetricAlarm; +} + export interface BenchmarkBaseUpload { version: 1; timestamp: number; @@ -40,6 +57,7 @@ export interface BenchmarkBaseUpload { prNumber?: number; branch: string; report: BenchmarkReport; + metricDefinitions?: Record; } export interface BenchmarkUpload extends BenchmarkBaseUpload { diff --git a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts index b1eb97f7a..e7ef61af5 100644 --- a/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts +++ b/apps/code-infra-dashboard/src/lib/ciReports/benchmarkReport.ts @@ -45,7 +45,8 @@ export async function generateBenchmarkReport( markdownContent += `_:information_source: Using benchmark from parent commit ${actualBaseCommit} (fallback from merge base ${mergeBaseCommit})._\n\n`; } - const comparison = compareBenchmarkReports(headReport.report, baseReport); + const baseUpload = useInlinedBase ? inlinedBase : fetchedBaseUpload; + const comparison = compareBenchmarkReports(headReport, baseUpload ?? null); const detailsUrl = new URL(`${DASHBOARD_ORIGIN}/benchmark-details/${repo}`); detailsUrl.searchParams.set('sha', commitSha); diff --git a/apps/code-infra-dashboard/src/utils/fetchCiReport.ts b/apps/code-infra-dashboard/src/utils/fetchCiReport.ts index 67e96b349..2b2e7cfab 100644 --- a/apps/code-infra-dashboard/src/utils/fetchCiReport.ts +++ b/apps/code-infra-dashboard/src/utils/fetchCiReport.ts @@ -1,5 +1,6 @@ import type { SizeSnapshotWithMetadata } from '@/lib/bundleSize/types'; import type { BenchmarkReport, BenchmarkUpload } from '@/lib/benchmark/types'; +import { migrateBenchmarkReport } from '@/lib/benchmark/migrateBenchmarkReport'; export interface CiReportTypes { 'benchmark.json': BenchmarkUpload; @@ -17,14 +18,19 @@ export type CiReportName = keyof CiReportTypes; * simply reasserts what the body already contains. */ function normalizeBenchmarkArtifact(raw: unknown, repo: string, sha: string): BenchmarkUpload { - if (raw && typeof raw === 'object' && 'report' in raw) { - return { ...(raw as BenchmarkUpload), commitSha: sha, repo }; - } + const upload: BenchmarkUpload = + raw && typeof raw === 'object' && 'report' in raw + ? { ...(raw as BenchmarkUpload), commitSha: sha, repo } + : ({ commitSha: sha, repo, report: raw as BenchmarkReport } as BenchmarkUpload); + + // Apply forward migrations so older uploads read with the current metric naming. return { - commitSha: sha, - repo, - report: raw as BenchmarkReport, - } as BenchmarkUpload; + ...upload, + report: migrateBenchmarkReport(upload.report), + ...(upload.base + ? { base: { ...upload.base, report: migrateBenchmarkReport(upload.base.report) } } + : {}), + }; } /** diff --git a/apps/code-infra-dashboard/src/utils/formatters.ts b/apps/code-infra-dashboard/src/utils/formatters.ts index 27dd11347..20266ac33 100644 --- a/apps/code-infra-dashboard/src/utils/formatters.ts +++ b/apps/code-infra-dashboard/src/utils/formatters.ts @@ -22,6 +22,33 @@ export function formatDiffMs(value: number): string { return `${sign}${durationFormatter.format(value)} ms`; } +// `Intl.NumberFormat` construction is comparatively expensive, so cache one instance per unique +// format spec β€” these formatters run for every metric cell on every render. +const numberFormatterCache = new Map(); + +function getNumberFormatter(format: Intl.NumberFormatOptions): Intl.NumberFormat { + const key = JSON.stringify(format); + let formatter = numberFormatterCache.get(key); + if (!formatter) { + formatter = new Intl.NumberFormat(undefined, format); + numberFormatterCache.set(key, formatter); + } + return formatter; +} + +/** Formats a metric value with its `Intl.NumberFormatOptions`, falling back to milliseconds. */ +export function formatMetricNumber(value: number, format?: Intl.NumberFormatOptions): string { + return format ? getNumberFormatter(format).format(value) : formatMs(value); +} + +/** Formats a signed metric diff with its format, falling back to a millisecond diff. */ +export function formatMetricDiff(value: number, format?: Intl.NumberFormatOptions): string { + // `signDisplay` is spread last so a diff always shows its sign, regardless of the metric's format. + return format + ? getNumberFormatter({ ...format, signDisplay: 'exceptZero' }).format(value) + : formatDiffMs(value); +} + interface ColumnDefinition { field: string; header?: string; diff --git a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx index a558ffe01..1b0e8a0e7 100644 --- a/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx +++ b/apps/code-infra-dashboard/src/views/BenchmarkDetails.tsx @@ -139,12 +139,7 @@ export default function BenchmarkDetails() { )} - {report && ( - - )} + {report && } ); diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md index f4c980bcd..ba001c90b 100644 --- a/packages/benchmark/README.md +++ b/packages/benchmark/README.md @@ -61,9 +61,44 @@ benchmark( ); ``` +### Scoping which renders are measured + +By default a benchmark records every React render and paint, from the mount through the whole interaction. To measure only part of an interaction β€” or to exclude the mount β€” pause and resume recording from the interaction callback: + +```tsx +benchmark( + 'Combobox type', + () => , + async ({ pauseReactRecording, resumeReactRecording, waitForElementTiming }) => { + pauseReactRecording(); // stop recording the settling re-renders + await openMenu(); + resumeReactRecording(); // measure only what follows + await type('hello'); + await waitForElementTiming('results'); + }, +); +``` + +`pauseReactRecording()` / `resumeReactRecording()` toggle only the harness's React render and `bench:paint` recording β€” your own custom metrics keep recording. They are a strict pair: pausing while already paused, or resuming while already active, throws (this catches unbalanced calls early). + +To exclude the mount itself, start paused with the `reactRecordingPaused` option and resume at the point you care about β€” the mount is captured before the interaction callback runs, so pausing inside the callback can't drop it: + +```tsx +benchmark( + 'Combobox type', + () => , + async ({ resumeReactRecording }) => { + await openMenu(); // mount + open: not recorded + resumeReactRecording(); + await type('hello'); // only these renders/paint recorded + }, + { reactRecordingPaused: true }, +); +``` + ### Paint metrics -By default, every benchmark captures a `paint:default` metric β€” the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically. +By default, every benchmark captures a `bench:paint` metric β€” the time from iteration start until the browser actually paints the rendered output. This uses the [Element Timing API](https://developer.mozilla.org/en-US/docs/Web/API/PerformanceElementTiming) via an invisible sentinel element that the benchmark harness renders automatically. The harness owns the `bench:` namespace, so avoid it for your own metric names. You can track additional paint metrics by placing `` markers and awaiting them in an interaction callback. The component renders an invisible `` that fires in the same paint frame as its surrounding content. @@ -88,16 +123,79 @@ benchmark( ); ``` -This produces a `paint:my-component` metric alongside the automatic `paint:default`. +This produces a `bench:paint#my-component` sub-series alongside the automatic `bench:paint`. `waitForElementTiming` accepts an optional `timeout` in milliseconds (default: 5000). Pass `0` or `Infinity` to rely on the test timeout instead. +### Custom metrics + +Record your own measurements β€” a timing, a count, anything measured inside or outside React β€” from a plain `it()` loop or from inside a `benchmark()`. There are two primitives: + +- `ScalarMetric` β€” a continuous value (timings, sizes). Aggregated as mean Β± standard deviation with IQR outlier removal, and compared against a baseline with a relative noise band. +- `DiscreteMetric` β€” a count of events. Compared as an exact integer (any change is significant) and formatted as a whole number. + +Both record values with `record(value)`. `ScalarMetric` additionally offers `time()`/`timeEnd()` β€” a `console.time`-style shortcut that records the elapsed milliseconds for you. + +```tsx +import { it } from 'vitest'; +import { ScalarMetric, DiscreteMetric } from '@mui/internal-benchmark'; + +const duration = new ScalarMetric({ + name: 'work_duration', + format: { style: 'unit', unit: 'millisecond' }, // Intl.NumberFormatOptions + alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 }, // warn >10%, error >25% +}); + +const clicks = new DiscreteMetric({ name: 'button_clicks' }); + +it('measures work', () => { + for (let i = 0; i < 100; i += 1) { + duration.time(); + runWork(); + duration.timeEnd(); // records the elapsed milliseconds + + clicks.record(countClicks()); // a discrete count per run + } +}); +``` + +A metric is declared once (typically at module scope) and reused across tests and iterations. `record()` attaches the value to whichever test is running, so the same instance works in any `it()`. + +You can also `record()` or `time()` from inside a `benchmark()` render function or interaction callback. Values recorded during warmup iterations are excluded automatically, just like renders and `bench:paint`, so a metric recorded once per iteration yields exactly `runs` samples. + +#### Metric configuration + +- `name` β€” the metric's report key (**required**). +- `format` β€” an [`Intl.NumberFormatOptions`](https://developer.mozilla.org/en-US/docs/Web/API/Intl/NumberFormat/NumberFormat) object used to display the value. +- `alarm` β€” opts the metric into regression flagging. Omit it and the metric is informational (its diff is shown but never flagged). Holds: + - `direction` β€” `'lowerIsBetter'` (default) or `'higherIsBetter'`. + - `warn` β€” softer band; a regression past it is flagged as a warning. + - `error` β€” harder band; a regression past it is flagged as an error. Defaults to the dashboard's global noise band only when both `warn` and `error` are omitted; with only `warn` set there is no error band (warning-only). + - Bands are relative fractions for scalar metrics (`0.1` = 10%) and absolute count deltas for discrete metrics (`1`, `2`). Either band is optional. + +Alarms are evaluated against the baseline when the PR comment is generated, not during the local `vitest run` β€” a regression never fails the test suite locally. In the PR comment, `error`-band regressions surface as failures and `warn`-band regressions as warnings. + +#### Sub-series + +Pass `record(value, { id })` to split one metric into labeled sub-series, reported as `name#id`. For `ScalarMetric.time()`/`timeEnd()`, pass a label that maps to the same `id`: + +```tsx +const phase = new ScalarMetric({ name: 'render_phase' }); + +phase.time('header'); +renderHeader(); +phase.timeEnd('header'); // -> render_phase#header +``` + +Custom metrics are aggregated in the browser and only the resulting stats cross to the runner, so the amount of data is independent of how many values you record. + ### Options ```tsx benchmark('name', renderFn, interaction, { runs: 20, // measurement iterations (default: 20) warmupRuns: 10, // warmup iterations before measuring (default: 10) + reactRecordingPaused: false, // start with React render/paint recording paused (default: false) afterEach: () => { /* cleanup between iterations */ }, @@ -148,5 +246,7 @@ The feature is opt-in β€” without `BENCHMARK_BASELINE_PATH` (or the `baselinePat - `benchmark` β€” define a benchmark test case - `ElementTiming` β€” invisible marker component for paint timing (renders a `` tracked by the Element Timing API) +- `ScalarMetric` β€” record a continuous custom measurement (with a `console.time`-style timing helper) +- `DiscreteMetric` β€” record a discrete custom count - `createBenchmarkVitestConfig` β€” create a Vitest config with browser benchmarking defaults - `BenchmarkReporter` β€” Vitest reporter that collects and outputs benchmark results diff --git a/packages/benchmark/src/DiscreteMetric.ts b/packages/benchmark/src/DiscreteMetric.ts new file mode 100644 index 000000000..90cec67c5 --- /dev/null +++ b/packages/benchmark/src/DiscreteMetric.ts @@ -0,0 +1,20 @@ +import { Metric } from './Metric'; +import type { MetricConfig, MetricKind } from './types'; + +/** + * A discrete count of events or occurrences. Compared against a baseline as an exact integer + * (any change is significant β€” there is no noise band), and formatted as a whole number by default. + * + * ```ts + * const clicks = new DiscreteMetric({ name: 'button_clicks' }); + * clicks.record(countClicks()); + * ``` + */ +export class DiscreteMetric extends Metric { + readonly kind: MetricKind = 'discrete'; + + constructor(config: MetricConfig | string) { + const resolved = typeof config === 'string' ? { name: config } : config; + super({ ...resolved, format: resolved.format ?? { maximumFractionDigits: 0 } }); + } +} diff --git a/packages/benchmark/src/Metric.test.ts b/packages/benchmark/src/Metric.test.ts new file mode 100644 index 000000000..fc061eb07 --- /dev/null +++ b/packages/benchmark/src/Metric.test.ts @@ -0,0 +1,51 @@ +import { describe, it, expect } from 'vitest'; +import { ScalarMetric } from './ScalarMetric'; +import { DiscreteMetric } from './DiscreteMetric'; + +describe('Metric.record', () => { + it('throws when two different metrics share a name in the same test', () => { + const timing = new ScalarMetric({ name: 'shared' }); + const count = new DiscreteMetric({ name: 'shared' }); + timing.record(1); + expect(() => count.record(2)).toThrow(/share the name "shared"/); + }); + + it('allows the same instance to record many values under one name', () => { + const metric = new ScalarMetric({ name: 'reused' }); + expect(() => { + metric.record(1); + metric.record(2, { id: 'sub' }); + }).not.toThrow(); + }); + + it('rejects a metric name containing the "#" sub-series separator', () => { + expect(() => new ScalarMetric({ name: 'paint#default' })).toThrow(/must not contain "#"/); + }); +}); + +describe('ScalarMetric.timeEnd', () => { + it('throws when called without a matching time()', () => { + const metric = new ScalarMetric({ name: 'timer' }); + expect(() => metric.timeEnd()).toThrow(/without a matching time/); + }); + + it('throws when the label does not match an open timer', () => { + const metric = new ScalarMetric({ name: 'labelled-timer' }); + metric.time('a'); + expect(() => metric.timeEnd('b')).toThrow(/without a matching time/); + }); +}); + +describe('ScalarMetric.time', () => { + it('throws when a timer is already running for the same label', () => { + const metric = new ScalarMetric({ name: 'double-timer' }); + metric.time('a'); + expect(() => metric.time('a')).toThrow(/already running/); + }); + + it('allows concurrent timers under different labels', () => { + const metric = new ScalarMetric({ name: 'multi-timer' }); + metric.time('a'); + expect(() => metric.time('b')).not.toThrow(); + }); +}); diff --git a/packages/benchmark/src/Metric.ts b/packages/benchmark/src/Metric.ts new file mode 100644 index 000000000..eea291aa2 --- /dev/null +++ b/packages/benchmark/src/Metric.ts @@ -0,0 +1,110 @@ +import { onTestFinished, TestRunner, type RunnerTestCase } from 'vitest'; +import type { MetricConfig, MetricKind, MetricReport, MetricSampleStats } from './types'; +import { aggregateSamples } from './stats'; +import { metricsGate } from './metricsGate'; +// Import for TaskMeta augmentation side effect +import './taskMetaAugmentation'; + +interface SeriesAccumulator { + /** The instance that registered this name, used to reject two metrics sharing a name. */ + owner: Metric; + kind: MetricKind; + config: MetricConfig; + /** Raw samples per sub-series id (`''` is the base series), kept in browser memory only. */ + series: Map; +} + +type TestAccumulator = Map; + +// Raw samples never cross the browserβ†’runner boundary. They accumulate here per test (keyed by +// the running task so a module-scoped metric shared across tests never mixes data) and are +// aggregated into compact stats by an `onTestFinished` hook before being written to `task.meta`. +const accumulators = new WeakMap(); + +function flush(test: RunnerTestCase, accumulator: TestAccumulator): void { + const store: Record = {}; + for (const [name, entry] of accumulator) { + const series: Record = {}; + for (const [seriesId, samples] of entry.series) { + series[seriesId] = { ...aggregateSamples(samples), count: samples.length }; + } + store[name] = { kind: entry.kind, config: entry.config, series }; + } + test.meta.benchmarkMetrics = store; +} + +export interface MetricRecordOptions { + /** Sub-series label. Recorded under `${name}#${id}` in the report; omit for the base series. */ + id?: string; +} + +/** + * Base class for custom benchmark metrics. Use `ScalarMetric` or `DiscreteMetric`. + * + * Records are tied to the test that is running when `record()` is called (resolved via + * `getCurrentTest()`), so a single instance can be declared at module scope and reused across + * tests and loop iterations, inside or outside React. + */ +export abstract class Metric { + abstract readonly kind: MetricKind; + + readonly name: string; + + protected readonly config: MetricConfig; + + constructor(config: MetricConfig | string) { + this.config = typeof config === 'string' ? { name: config } : config; + this.name = this.config.name; + if (this.name.includes('#')) { + throw new Error( + `Metric name "${this.name}" must not contain "#" β€” it is reserved as the sub-series separator.`, + ); + } + } + + /** + * Records a single measured value. Samples accumulate in browser memory and are aggregated + * once when the test finishes. Pass `options.id` to split into a labeled sub-series. + */ + record(value: number, options?: MetricRecordOptions): void { + const test = TestRunner.getCurrentTest(); + if (!test) { + throw new Error( + `${this.constructor.name}.record() must be called inside a running Vitest test.`, + ); + } + + // The harness disables the gate during warmup iterations so custom metrics recorded inside a + // benchmark honor the same warmup exclusion as renders and `bench:paint`. Standalone `it()` + // loops never touch the gate, so they keep recording every value. + if (!metricsGate.isRecordingEnabled(test)) { + return; + } + + let accumulator = accumulators.get(test); + if (!accumulator) { + const created: TestAccumulator = new Map(); + accumulator = created; + accumulators.set(test, created); + onTestFinished(() => flush(test, created)); + } + + let entry = accumulator.get(this.name); + if (!entry) { + entry = { owner: this, kind: this.kind, config: this.config, series: new Map() }; + accumulator.set(this.name, entry); + } else if (entry.owner !== this) { + throw new Error( + `Two metrics share the name "${this.name}". Metric names must be unique; reuse a single instance instead.`, + ); + } + + const seriesId = options?.id ?? ''; + let samples = entry.series.get(seriesId); + if (!samples) { + samples = []; + entry.series.set(seriesId, samples); + } + samples.push(value); + } +} diff --git a/packages/benchmark/src/ScalarMetric.ts b/packages/benchmark/src/ScalarMetric.ts new file mode 100644 index 000000000..ce730327a --- /dev/null +++ b/packages/benchmark/src/ScalarMetric.ts @@ -0,0 +1,47 @@ +import { Metric } from './Metric'; +import type { MetricKind } from './types'; + +/** + * A continuous measurement (timings, sizes, …). Samples are aggregated with mean Β± standard + * deviation and IQR outlier removal, and compared against a baseline with a relative noise band. + * + * It offers a `console.time`-style timing helper: + * + * ```ts + * const metric = new ScalarMetric({ name: 'render', format: { style: 'unit', unit: 'millisecond' } }); + * metric.time(); + * doWork(); + * metric.timeEnd(); // records the elapsed milliseconds + * ``` + */ +export class ScalarMetric extends Metric { + readonly kind: MetricKind = 'scalar'; + + private readonly pending = new Map(); + + /** Starts a timer. Pass a `label` to time a sub-series; it maps to `record`'s `id`. */ + time(label?: string): void { + const key = label ?? ''; + if (this.pending.has(key)) { + throw new Error( + `${this.name}.time(${label ? `"${label}"` : ''}) was called while a timer is already running for that label.`, + ); + } + this.pending.set(key, performance.now()); + } + + /** Stops the timer started by `time(label)` and records the elapsed milliseconds. */ + timeEnd(label?: string): void { + // Capture the end time first so the map lookup/delete below isn't part of the measurement. + const end = performance.now(); + const key = label ?? ''; + const start = this.pending.get(key); + if (start === undefined) { + throw new Error( + `${this.name}.timeEnd(${label ? `"${label}"` : ''}) was called without a matching time().`, + ); + } + this.pending.delete(key); + this.record(end - start, label !== undefined ? { id: label } : undefined); + } +} diff --git a/packages/benchmark/src/ciReport.ts b/packages/benchmark/src/ciReport.ts index 682739639..8e2d8cd1f 100644 --- a/packages/benchmark/src/ciReport.ts +++ b/packages/benchmark/src/ciReport.ts @@ -66,7 +66,27 @@ const benchmarkReportEntrySchema = z.object({ const benchmarkReportSchema = z.record(z.string(), benchmarkReportEntrySchema); -const benchmarkBaseUploadSchema = ciReportUploadSchema('benchmark', 1, benchmarkReportSchema); +const metricDefinitionSchema = z.object({ + kind: z.enum(['scalar', 'discrete']), + format: z.custom().optional(), + alarm: z + .object({ + direction: z.enum(['lowerIsBetter', 'higherIsBetter']).optional(), + warn: z.number().min(0).optional(), + error: z.number().min(0).optional(), + }) + .optional(), +}); + +const benchmarkBaseUploadSchema = ciReportUploadSchema( + 'benchmark', + 1, + benchmarkReportSchema, +).extend({ + // Per-metric config for custom metrics, hoisted to the top level (keyed by metric name) so + // it is stored once rather than duplicated in every report entry. Optional and additive. + metricDefinitions: z.record(z.string(), metricDefinitionSchema).optional(), +}); export const benchmarkUploadSchema = benchmarkBaseUploadSchema.extend({ base: benchmarkBaseUploadSchema.optional(), @@ -74,6 +94,7 @@ export const benchmarkUploadSchema = benchmarkBaseUploadSchema.extend({ export type RenderStats = z.infer; export type MetricStats = z.infer; +export type BenchmarkMetricDefinition = z.infer; export type BenchmarkReportEntry = z.infer; export type BenchmarkReport = z.infer; export type BenchmarkBaseUpload = z.infer; diff --git a/packages/benchmark/src/index.tsx b/packages/benchmark/src/index.tsx index b9e77223e..6db367fe9 100644 --- a/packages/benchmark/src/index.tsx +++ b/packages/benchmark/src/index.tsx @@ -2,8 +2,11 @@ import * as React from 'react'; import { expect, it } from 'vitest'; import * as ReactDOMClient from 'react-dom/client'; // aliased to react-dom/profiling by Vite import * as ReactDOM from 'react-dom'; -import type { RenderEvent, BenchmarkMetric, IterationData, InteractionContext } from './types'; +import type { RenderEvent, IterationData, InteractionContext } from './types'; import { ElementTiming } from './ElementTiming'; +import { ScalarMetric } from './ScalarMetric'; +import { metricsGate } from './metricsGate'; +import { createReactRecordingControls, type ReactRecordingControls } from './reactRecording'; // Import for TaskMeta augmentation side effect import './taskMetaAugmentation'; @@ -13,21 +16,38 @@ interface PerformanceElementTiming extends PerformanceEntry { readonly identifier: string; } -export type { RenderEvent, BenchmarkMetric, IterationData, InteractionContext } from './types'; +export type { RenderEvent, IterationData, InteractionContext } from './types'; +export type { + MetricKind, + MetricDirection, + MetricAlarm, + MetricConfig, + MetricDefinition, +} from './types'; export { ElementTiming } from './ElementTiming'; +export { Metric, type MetricRecordOptions } from './Metric'; +export { ScalarMetric }; +export { DiscreteMetric } from './DiscreteMetric'; function BenchProfiler({ captures, + recording, children, }: { captures: RenderEvent[]; + recording: ReactRecordingControls; children: React.ReactNode; }) { const onRender = React.useCallback( (id, phase, actualDuration, _baseDuration, startTime) => { - captures.push({ id, phase, actualDuration, startTime }); + // Skip renders captured while React recording is paused (e.g. the mount when the benchmark + // starts paused, or a span the interaction explicitly excludes). + if (recording.active) { + captures.push({ id, phase, actualDuration, startTime }); + recording.markRendered(); + } }, - [captures], + [captures, recording], ); return ( @@ -69,6 +89,12 @@ interface BenchmarkOptions { runs?: number; warmupRuns?: number; afterEach?: () => Promise | void; + /** + * Start each iteration with React render/paint recording paused. The interaction callback then + * calls `resumeReactRecording()` at the point it cares about β€” useful to exclude the mount and + * measure only the renders/paint of a later interaction. Defaults to `false` (mount recorded). + */ + reactRecordingPaused?: boolean; } export function benchmark( @@ -87,6 +113,16 @@ export function benchmark( const totalRuns = warmupRuns + runs; const iterations: IterationData[] = []; + // Paint timings are recorded as one harness-owned `bench:paint` metric: the default sentinel + // is the base series (`bench:paint`) and named `elementtiming` markers are sub-series + // (`bench:paint#grid-header`, …), all sharing a single definition. A default alarm keeps a + // >20% paint regression flagged, matching the previous behavior. + const paint = new ScalarMetric({ + name: 'bench:paint', + format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 2 }, + alarm: { error: 0.2 }, + }); + const hasElementTiming = supportsElementTiming(); if (typeof window.gc !== 'function') { @@ -96,10 +132,20 @@ export function benchmark( } let renderError: unknown = null; + // Set if any iteration had a recording window that was active yet captured no renders. + let sawEmptyActiveWindow = false; for (let i = 0; i < totalRuns; i += 1) { const isWarmup = i < warmupRuns; + // Custom metrics recorded inside the benchmark honor warmup exclusion through the gate, the + // same way renders and `bench:paint` are excluded during warmup. + metricsGate.setRecordingEnabled(task, !isWarmup); + + // Per-iteration switch for the harness's React render/paint recording. Starts paused when + // `reactRecordingPaused` is set; the interaction callback drives it from there. + const recording = createReactRecordingControls(!(options?.reactRecordingPaused ?? false)); + // Drain event loop from previous unmount, then double GC for thorough cleanup // eslint-disable-next-line no-await-in-loop await settle(); @@ -172,7 +218,11 @@ export function benchmark( }); ReactDOM.flushSync(() => { - root.render({renderFn()}); + root.render( + + {renderFn()} + , + ); }); if (renderError) { @@ -184,24 +234,40 @@ export function benchmark( if (interaction) { // eslint-disable-next-line no-await-in-loop - await interaction({ waitForElementTiming }); + await interaction({ + waitForElementTiming, + pauseReactRecording: recording.pauseReactRecording, + resumeReactRecording: recording.resumeReactRecording, + }); } // Wait for the bench sentinel paint entry (relies on test timeout) // eslint-disable-next-line no-await-in-loop await waitForElementTiming('default', 0); + // Close the final window and remember if any active window measured no renders. + recording.finalizeWindow(); + if (recording.hadEmptyActiveWindow) { + sawEmptyActiveWindow = true; + } + elementObserver?.disconnect(); root.unmount(); container.remove(); if (!isWarmup) { - const metrics: BenchmarkMetric[] = elementEntries.map((entry) => ({ - name: `paint:${entry.identifier}`, - value: entry.renderTime - iterationStart, - })); - iterations.push({ renders: captures, metrics }); + for (const entry of elementEntries) { + // Skip paints that happened while recording was paused. Attribute by the paint's + // `renderTime`, not by when the observer callback fired (which can lag the paint). + if (!recording.activeAt(entry.renderTime)) { + continue; + } + // The default sentinel is the base series; named markers become sub-series. + const id = entry.identifier === 'default' ? undefined : entry.identifier; + paint.record(entry.renderTime - iterationStart, id !== undefined ? { id } : undefined); + } + iterations.push({ renders: captures }); } if (options?.afterEach) { @@ -217,11 +283,13 @@ export function benchmark( throw renderError; } - // Validate that at least one render was recorded + // Every active recording window must capture at least one render. Windows where recording was + // never running (e.g. a fully-paused, metric-only benchmark) are not checked. expect( - iterations[0].renders.length, - 'No renders were recorded during benchmark', - ).toBeGreaterThan(0); + sawEmptyActiveWindow, + 'React recording was active but captured no renders. If you only measure imperative DOM ' + + 'updates or custom metrics, keep recording paused (reactRecordingPaused) instead of resuming.', + ).toBe(false); // Validate all iterations produced the same render events (count + order). // This runs after meta is set so the reporter can still display results on failure. diff --git a/packages/benchmark/src/metricsGate.test.ts b/packages/benchmark/src/metricsGate.test.ts new file mode 100644 index 000000000..0bd43f8b3 --- /dev/null +++ b/packages/benchmark/src/metricsGate.test.ts @@ -0,0 +1,30 @@ +import { describe, it, expect } from 'vitest'; +import type { RunnerTestCase } from 'vitest'; +import { metricsGate } from './metricsGate'; + +// The gate keys on test identity via a WeakSet, so any object stands in for a RunnerTestCase. +function fakeTest(): RunnerTestCase { + return {} as RunnerTestCase; +} + +describe('metricsGate', () => { + it('is enabled by default for an unseen test', () => { + expect(metricsGate.isRecordingEnabled(fakeTest())).toBe(true); + }); + + it('disables and re-enables recording for a test', () => { + const test = fakeTest(); + metricsGate.setRecordingEnabled(test, false); + expect(metricsGate.isRecordingEnabled(test)).toBe(false); + metricsGate.setRecordingEnabled(test, true); + expect(metricsGate.isRecordingEnabled(test)).toBe(true); + }); + + it('keeps the state independent per test', () => { + const suppressed = fakeTest(); + const recording = fakeTest(); + metricsGate.setRecordingEnabled(suppressed, false); + expect(metricsGate.isRecordingEnabled(suppressed)).toBe(false); + expect(metricsGate.isRecordingEnabled(recording)).toBe(true); + }); +}); diff --git a/packages/benchmark/src/metricsGate.ts b/packages/benchmark/src/metricsGate.ts new file mode 100644 index 000000000..dc9759d8e --- /dev/null +++ b/packages/benchmark/src/metricsGate.ts @@ -0,0 +1,24 @@ +import type { RunnerTestCase } from 'vitest'; + +// Internal β€” not exported from the package, not user-facing. The `benchmark()` harness toggles +// recording per test (off during warmup iterations) and `Metric.record()` consults it, so custom +// metrics recorded inside a benchmark honor the same warmup exclusion as renders and `bench:paint`. +// +// Storage tracks the *disabled* tests so absence means enabled: a test with no entry β€” e.g. a +// standalone `it()` loop that never goes through the harness β€” records normally by default. +const disabled = new WeakSet(); + +export const metricsGate = { + /** Whether custom-metric recording is currently enabled for `test`. Defaults to `true`. */ + isRecordingEnabled(test: RunnerTestCase): boolean { + return !disabled.has(test); + }, + /** Enable or disable custom-metric recording for `test`. */ + setRecordingEnabled(test: RunnerTestCase, enabled: boolean): void { + if (enabled) { + disabled.delete(test); + } else { + disabled.add(test); + } + }, +}; diff --git a/packages/benchmark/src/reactRecording.test.ts b/packages/benchmark/src/reactRecording.test.ts new file mode 100644 index 000000000..043d14fb8 --- /dev/null +++ b/packages/benchmark/src/reactRecording.test.ts @@ -0,0 +1,77 @@ +import { describe, it, expect } from 'vitest'; +import { createReactRecordingControls } from './reactRecording'; + +describe('createReactRecordingControls', () => { + it('starts in the requested state', () => { + expect(createReactRecordingControls(true).active).toBe(true); + expect(createReactRecordingControls(false).active).toBe(false); + }); + + it('pauses and resumes', () => { + const controls = createReactRecordingControls(true); + controls.pauseReactRecording(); + expect(controls.active).toBe(false); + controls.resumeReactRecording(); + expect(controls.active).toBe(true); + }); + + it('throws when pausing while already paused', () => { + const controls = createReactRecordingControls(false); + expect(() => controls.pauseReactRecording()).toThrow(/already paused/); + }); + + it('throws when resuming while already active', () => { + const controls = createReactRecordingControls(true); + expect(() => controls.resumeReactRecording()).toThrow(/already active/); + }); + + it('attributes times before all toggles to the initial state and after to the current', () => { + const controls = createReactRecordingControls(false); + controls.resumeReactRecording(); + expect(controls.activeAt(-Infinity)).toBe(false); + expect(controls.activeAt(Infinity)).toBe(true); + }); +}); + +describe('createReactRecordingControls active-window render check', () => { + it('does not flag a window that captured a render before pause', () => { + const controls = createReactRecordingControls(true); + controls.markRendered(); + controls.pauseReactRecording(); + expect(controls.hadEmptyActiveWindow).toBe(false); + }); + + it('flags an active window paused without any render', () => { + const controls = createReactRecordingControls(true); + controls.pauseReactRecording(); + expect(controls.hadEmptyActiveWindow).toBe(true); + }); + + it('flags the final window via finalizeWindow when active and empty', () => { + const controls = createReactRecordingControls(true); + controls.finalizeWindow(); + expect(controls.hadEmptyActiveWindow).toBe(true); + }); + + it('does not flag the final window when it captured a render', () => { + const controls = createReactRecordingControls(true); + controls.markRendered(); + controls.finalizeWindow(); + expect(controls.hadEmptyActiveWindow).toBe(false); + }); + + it('does not flag anything when recording was never active', () => { + const controls = createReactRecordingControls(false); + controls.finalizeWindow(); + expect(controls.hadEmptyActiveWindow).toBe(false); + }); + + it('resets render presence per window', () => { + const controls = createReactRecordingControls(true); + controls.markRendered(); + controls.pauseReactRecording(); // first window had a render + controls.resumeReactRecording(); // second window starts empty + controls.finalizeWindow(); + expect(controls.hadEmptyActiveWindow).toBe(true); + }); +}); diff --git a/packages/benchmark/src/reactRecording.ts b/packages/benchmark/src/reactRecording.ts new file mode 100644 index 000000000..2d04c2fa3 --- /dev/null +++ b/packages/benchmark/src/reactRecording.ts @@ -0,0 +1,92 @@ +export interface ReactRecordingControls { + /** Whether React render/paint recording is active right now β€” the synchronous gate for renders. */ + readonly active: boolean; + /** Whether any active recording window closed without capturing a render. */ + readonly hadEmptyActiveWindow: boolean; + /** + * Whether recording was active at `time` (a `performance.now()` timestamp). Paint entries are + * observed asynchronously, so they are attributed by their `renderTime` rather than by the + * recording state at the moment the observer callback happens to fire. + */ + activeAt(time: number): boolean; + /** Note that a render was captured in the current window. Called by the harness from `onRender`. */ + markRendered(): void; + /** Close the final window at the end of the iteration (validates it if recording is still active). */ + finalizeWindow(): void; + /** Pause React render/paint recording. Throws if recording is already paused. */ + pauseReactRecording(): void; + /** Resume React render/paint recording. Throws if recording is already active. */ + resumeReactRecording(): void; +} + +/** + * Creates the per-iteration switch that turns the harness's React render/paint recording on and + * off. The interaction callback drives it via `pauseReactRecording`/`resumeReactRecording`; the + * strict state machine (each throws when called in the wrong state) catches unbalanced pairs early. + * + * It also tracks whether each *active* window captured at least one render, so the harness can flag + * a window that was recording but measured nothing β€” while leaving fully-paused (metric-only) + * benchmarks alone. + */ +export function createReactRecordingControls(initiallyActive: boolean): ReactRecordingControls { + let active = initiallyActive; + let currentWindowHasRender = false; + let emptyActiveWindow = false; + // Transitions in chronological order. The implicit state before the first toggle is + // `initiallyActive`; `activeAt` replays this to attribute a paint to its render time. + const transitions: { time: number; active: boolean }[] = []; + + // Flag the window being closed if it was recording yet captured nothing. + function closeWindowIfActive() { + if (active && !currentWindowHasRender) { + emptyActiveWindow = true; + } + } + + return { + get active() { + return active; + }, + get hadEmptyActiveWindow() { + return emptyActiveWindow; + }, + activeAt(time) { + let result = initiallyActive; + for (const transition of transitions) { + if (transition.time > time) { + break; + } + result = transition.active; + } + return result; + }, + markRendered() { + currentWindowHasRender = true; + }, + finalizeWindow() { + closeWindowIfActive(); + }, + pauseReactRecording() { + // Stamp first β€” before the guard and bookkeeping β€” so the closing window ends as early as + // possible and excludes pause's own overhead. In the throw path it is simply discarded. + const now = performance.now(); + if (!active) { + throw new Error('pauseReactRecording() called but React recording is already paused.'); + } + closeWindowIfActive(); + active = false; + transitions.push({ time: now, active: false }); + }, + resumeReactRecording() { + if (active) { + throw new Error('resumeReactRecording() called but React recording is already active.'); + } + active = true; + currentWindowHasRender = false; + // Stamp last β€” after the bookkeeping β€” so the new window starts as late as possible and + // excludes resume's own overhead. + const now = performance.now(); + transitions.push({ time: now, active: true }); + }, + }; +} diff --git a/packages/benchmark/src/reporter.test.ts b/packages/benchmark/src/reporter.test.ts index 88f7f96b8..b60279df2 100644 --- a/packages/benchmark/src/reporter.test.ts +++ b/packages/benchmark/src/reporter.test.ts @@ -1,5 +1,6 @@ import * as os from 'node:os'; import * as path from 'node:path'; +import * as fs from 'node:fs/promises'; import { describe, it, expect, vi } from 'vitest'; import type { TestCase } from 'vitest/node'; import type { RenderEvent, IterationData } from './types'; @@ -15,8 +16,8 @@ function event( return { id, phase, startTime, actualDuration }; } -function iteration(renders: RenderEvent[], metrics: IterationData['metrics'] = []): IterationData { - return { renders, metrics }; +function iteration(renders: RenderEvent[]): IterationData { + return { renders }; } describe('generateReportFromIterations', () => { @@ -125,42 +126,7 @@ describe('generateReportFromIterations', () => { expect(report.renders[0].outliers).toBe(1); }); - it('aggregates metrics across iterations', () => { - const iterations = [ - iteration( - [event('App', 'mount', 0, 10)], - [ - { name: 'paint:bench', value: 60 }, - { name: 'paint:grid', value: 55 }, - ], - ), - iteration( - [event('App', 'mount', 0, 10)], - [ - { name: 'paint:bench', value: 64 }, - { name: 'paint:grid', value: 57 }, - ], - ), - iteration( - [event('App', 'mount', 0, 10)], - [ - { name: 'paint:bench', value: 62 }, - { name: 'paint:grid', value: 56 }, - ], - ), - ]; - const report = generateReportFromIterations(iterations); - - expect(report.metrics).toHaveProperty('paint:bench'); - expect(report.metrics).toHaveProperty('paint:grid'); - expect(report.metrics['paint:bench'].mean).toBeCloseTo(62, 0); - expect(report.metrics['paint:grid'].mean).toBeCloseTo(56, 0); - expect(report.metrics['paint:bench'].stdDev).toBeGreaterThanOrEqual(0); - expect(report.metrics['paint:grid'].stdDev).toBeGreaterThanOrEqual(0); - expect(report.metrics['paint:bench'].outliers).toBe(0); - }); - - it('returns empty metrics when no metrics are present', () => { + it('does not aggregate metrics from iterations (paint flows through benchmarkMetrics)', () => { const iterations = [ iteration([event('App', 'mount', 0, 10)]), iteration([event('App', 'mount', 0, 12)]), @@ -169,25 +135,6 @@ describe('generateReportFromIterations', () => { expect(report.metrics).toEqual({}); }); - - it('handles multiple metric identifiers with different counts', () => { - const iterations = [ - iteration( - [event('App', 'mount', 0, 10)], - [ - { name: 'paint:bench', value: 60 }, - { name: 'paint:header', value: 50 }, - ], - ), - iteration([event('App', 'mount', 0, 10)], [{ name: 'paint:bench', value: 64 }]), - ]; - const report = generateReportFromIterations(iterations); - - expect(report.metrics['paint:bench'].mean).toBeCloseTo(62, 0); - // paint:header only has 1 data point - expect(report.metrics['paint:header'].mean).toBe(50); - expect(report.metrics['paint:header'].stdDev).toBe(0); - }); }); function mockTestCase(options: { @@ -323,4 +270,183 @@ describe('BenchmarkReporter', () => { consoleSpy.mockRestore(); }); }); + + describe('custom metrics', () => { + it('merges aggregated custom metrics into the report and hoists definitions', async () => { + const outputPath = path.join(os.tmpdir(), `benchmark-custom-metrics-${process.pid}.json`); + const reporter = new BenchmarkReporter({ outputPath }); + const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + reporter.onTestCaseResult( + mockTestCase({ + fullName: 'custom only', + meta: { + benchmarkName: 'custom only', + benchmarkMetrics: { + fib_duration: { + kind: 'scalar', + config: { + name: 'fib_duration', + format: { maximumFractionDigits: 3 }, + alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 }, + }, + series: { '': { mean: 1.5, stdDev: 0.1, outliers: 0, count: 50 } }, + }, + fib_phase: { + kind: 'scalar', + config: { name: 'fib_phase' }, + series: { + small: { mean: 0.2, stdDev: 0.01, outliers: 0, count: 50 }, + large: { mean: 3.4, stdDev: 0.2, outliers: 1, count: 50 }, + }, + }, + }, + }, + state: 'passed', + }), + ); + + await reporter.onTestRunEnd(); + const written = JSON.parse(await fs.readFile(outputPath, 'utf8')); + + // Metric-only test still produces an entry; iteration count derived from samples. + expect(written.report['custom only'].iterations).toBe(50); + expect(written.report['custom only'].renders).toEqual([]); + + // Base series keyed by name; sub-series keyed `name#id`. + expect(written.report['custom only'].metrics).toMatchObject({ + fib_duration: { mean: 1.5, stdDev: 0.1, outliers: 0 }, + 'fib_phase#small': { mean: 0.2 }, + 'fib_phase#large': { mean: 3.4, outliers: 1 }, + }); + + // The sample count must not leak into the per-entry metric stats. + expect(written.report['custom only'].metrics.fib_duration).not.toHaveProperty('count'); + + // Config is hoisted once, keyed by metric name. + expect(written.metricDefinitions).toMatchObject({ + fib_duration: { + kind: 'scalar', + alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 }, + }, + fib_phase: { kind: 'scalar' }, + }); + + await fs.rm(outputPath, { force: true }); + consoleSpy.mockRestore(); + }); + + it('merges custom metrics alongside React render iterations', () => { + const reporter = new BenchmarkReporter({ + outputPath: path.join(os.tmpdir(), `benchmark-combined-${process.pid}.json`), + }); + const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + reporter.onTestCaseResult( + mockTestCase({ + fullName: 'combined', + meta: { + benchmarkName: 'combined', + benchmarkIterations: [ + iteration([event('App', 'mount', 0, 10)]), + iteration([event('App', 'mount', 0, 12)]), + ], + benchmarkMetrics: { + clicks: { + kind: 'discrete', + config: { name: 'clicks', format: { maximumFractionDigits: 0 } }, + series: { '': { mean: 3, stdDev: 0, outliers: 0, count: 2 } }, + }, + }, + }, + state: 'passed', + }), + ); + + const output = consoleSpy.mock.calls.map((call) => call[0]).join('\n'); + // Both the React render table and the custom metric are printed. + expect(output).toContain('combined'); + expect(output).toContain('clicks'); + + consoleSpy.mockRestore(); + }); + + function metricCase(testName: string, metricName: string, alarm: unknown) { + return mockTestCase({ + fullName: testName, + meta: { + benchmarkName: testName, + benchmarkMetrics: { + [metricName]: { + kind: 'scalar', + config: { name: metricName, alarm }, + series: { '': { mean: 1, stdDev: 0, outliers: 0, count: 1 } }, + }, + }, + }, + state: 'passed', + }); + } + + it('allows reusing a metric name across benchmarks with identical config', () => { + const reporter = new BenchmarkReporter({ + outputPath: path.join(os.tmpdir(), `benchmark-reuse-${process.pid}.json`), + }); + const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 })); + expect(() => + reporter.onTestCaseResult(metricCase('b', 'shared', { error: 0.2 })), + ).not.toThrow(); + + consoleSpy.mockRestore(); + }); + + it('throws when a metric name is reused with conflicting config across benchmarks', () => { + const reporter = new BenchmarkReporter({ + outputPath: path.join(os.tmpdir(), `benchmark-conflict-${process.pid}.json`), + }); + const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 })); + expect(() => reporter.onTestCaseResult(metricCase('b', 'shared', { error: 0.9 }))).toThrow( + /conflicting/, + ); + + consoleSpy.mockRestore(); + }); + + it('does not flag the same config written with a different key order', () => { + const reporter = new BenchmarkReporter({ + outputPath: path.join(os.tmpdir(), `benchmark-order-${process.pid}.json`), + }); + const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + reporter.onTestCaseResult( + metricCase('a', 'shared', { direction: 'lowerIsBetter', error: 0.2 }), + ); + expect(() => + reporter.onTestCaseResult( + metricCase('b', 'shared', { error: 0.2, direction: 'lowerIsBetter' }), + ), + ).not.toThrow(); + + consoleSpy.mockRestore(); + }); + + it('resets between runs so an edited config does not conflict on watch reload', () => { + const reporter = new BenchmarkReporter({ + outputPath: path.join(os.tmpdir(), `benchmark-reload-${process.pid}.json`), + }); + const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.2 })); + reporter.onTestRunStart(); // simulate a watch re-run + expect(() => + reporter.onTestCaseResult(metricCase('a', 'shared', { error: 0.9 })), + ).not.toThrow(); + + consoleSpy.mockRestore(); + }); + }); }); diff --git a/packages/benchmark/src/reporter.ts b/packages/benchmark/src/reporter.ts index 8e2d03d9c..4060b1cab 100644 --- a/packages/benchmark/src/reporter.ts +++ b/packages/benchmark/src/reporter.ts @@ -1,52 +1,42 @@ import * as path from 'node:path'; import * as fs from 'node:fs/promises'; import type { Reporter, TestCase } from 'vitest/node'; -import type { RenderEvent, IterationData } from './types'; +import type { RenderEvent, IterationData, MetricReport, MetricDefinition } from './types'; import type { BenchmarkBaseUpload, BenchmarkReportEntry } from './ciReport'; import { benchmarkUploadSchema, getCiMetadata } from './ciReport'; -import { calculateMean, calculateStdDev, quantile, isOutlier } from './stats'; +import { calculateMean, aggregateSamples } from './stats'; import { dim, red, green, yellow, cyan, printTable, fileUrl } from './format'; import { uploadCiReport } from './upload'; import { syncPrComment } from './syncPrComment'; // Import for TaskMeta augmentation side effect import './taskMetaAugmentation'; -const byNumeric = (a: number, b: number) => a - b; - function getEventKey(event: RenderEvent): string { return `${event.id}:${event.phase}`; } -function aggregateMetrics( - iterations: IterationData[], -): Record { - // Collect all metric names across iterations - const metricValues = new Map(); - for (const iteration of iterations) { - for (const metric of iteration.metrics) { - let values = metricValues.get(metric.name); - if (!values) { - values = []; - metricValues.set(metric.name, values); - } - values.push(metric.value); - } +/** Order-insensitive deep equality, treating a missing key and an `undefined` value as equal. */ +function deepEqual(first: unknown, second: unknown): boolean { + if (first === second) { + return true; } - - const result: Record = {}; - for (const [name, values] of metricValues) { - // Apply IQR filtering - const sorted = [...values].sort(byNumeric); - const q1 = quantile(sorted, 0.25); - const q3 = quantile(sorted, 0.75); - const filtered = values.filter((d) => !isOutlier(d, q1, q3)); - const used = filtered.length > 0 ? filtered : values; - - const mean = calculateMean(used); - const stdDev = calculateStdDev(used, mean); - result[name] = { mean, stdDev, outliers: values.length - used.length }; + if ( + typeof first !== 'object' || + first === null || + typeof second !== 'object' || + second === null + ) { + return false; } - return result; + const firstRecord = first as Record; + const secondRecord = second as Record; + const keys = new Set([...Object.keys(firstRecord), ...Object.keys(secondRecord)]); + for (const key of keys) { + if (!deepEqual(firstRecord[key], secondRecord[key])) { + return false; + } + } + return true; } function generateReportFromIterations(iterations: IterationData[]): BenchmarkReportEntry { @@ -74,14 +64,7 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep for (let index = 0; index < expectedLength; index += 1) { const durations = iterations.map((iteration) => iteration.renders[index].actualDuration); - const sorted = [...durations].sort(byNumeric); - const q1 = quantile(sorted, 0.25); - const q3 = quantile(sorted, 0.75); - const filtered = durations.filter((d) => !isOutlier(d, q1, q3)); - const used = filtered.length > 0 ? filtered : durations; - - const iqrMean = calculateMean(used); - const iqrStdDev = calculateStdDev(used, iqrMean); + const { mean: iqrMean, stdDev: iqrStdDev, outliers } = aggregateSamples(durations); const coefficientOfVariation = iqrMean > 0 ? iqrStdDev / iqrMean : 0; if (iqrMean > 1 && coefficientOfVariation > 0.1) { @@ -96,7 +79,7 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep event: firstIteration.renders[index], iqrMean, iqrStdDev, - outliers: durations.length - used.length, + outliers, }); } @@ -130,8 +113,8 @@ function generateReportFromIterations(iterations: IterationData[]): BenchmarkRep totalDuration += iqrMean; } - // Aggregate metrics - const metrics = aggregateMetrics(iterations); + // Custom + paint metrics are merged separately from `task.meta.benchmarkMetrics`. + const metrics = {}; return { iterations: iterationCount, @@ -190,10 +173,63 @@ function printDurationMatrix(name: string, report: BenchmarkReportEntry, footer: ); } +/** Strips a `#sub-series` suffix to recover the metric name used to look up its definition. */ +function baseMetricName(key: string): string { + const hashIndex = key.indexOf('#'); + return hashIndex === -1 ? key : key.slice(0, hashIndex); +} + +function formatMetricValue(value: number, definition?: MetricDefinition): string { + if (definition?.format) { + return new Intl.NumberFormat(undefined, definition.format).format(value); + } + return value.toFixed(2); +} + +/** + * Merges aggregated custom metrics (already stats, not raw samples) into a report entry, keyed + * `name` or `name#id` for sub-series, and collects each metric's config into the shared + * top-level definitions. For a metric-only test, derives the iteration count from the samples. + */ +function mergeCustomMetrics( + report: BenchmarkReportEntry, + customMetrics: Record, + definitions: Record, +): void { + let maxCount = 0; + for (const [metricName, metric] of Object.entries(customMetrics)) { + for (const [seriesId, stats] of Object.entries(metric.series)) { + const key = seriesId === '' ? metricName : `${metricName}#${seriesId}`; + report.metrics[key] = { mean: stats.mean, stdDev: stats.stdDev, outliers: stats.outliers }; + maxCount = Math.max(maxCount, stats.count); + } + const definition: MetricDefinition = { + kind: metric.kind, + format: metric.config.format, + alarm: metric.config.alarm, + }; + // A metric name maps to one definition. Reusing it across benchmarks is fine when the config + // matches (e.g. the harness `bench:paint`), but conflicting config would silently apply + // last-write-wins to every entry β€” reject it instead. + const existing = definitions[metricName]; + if (existing && !deepEqual(existing, definition)) { + throw new Error( + `Benchmark metric "${metricName}" is defined with conflicting configuration across ` + + `benchmarks. A metric name must map to a single kind, format, and alarm.`, + ); + } + definitions[metricName] = definition; + } + if (report.iterations === 0) { + report.iterations = maxCount; + } +} + function printMetricsTable( name: string, metrics: Record, iterationCount: number, + definitions: Record, ): void { const entries = Object.entries(metrics); if (entries.length === 0) { @@ -201,10 +237,12 @@ function printMetricsTable( } const rows: string[][] = entries.map(([metricName, stats]) => { - const iqrStr = `${stats.mean.toFixed(2)}Β±${stats.stdDev.toFixed(2)}`; + const definition = definitions[baseMetricName(metricName)]; + const iqrStr = `${formatMetricValue(stats.mean, definition)}Β±${formatMetricValue(stats.stdDev, definition)}`; const cv = stats.mean > 0 ? (stats.stdDev / stats.mean) * 100 : 0; + const label = definition?.alarm ? `${metricName} ⚠` : metricName; return [ - metricName.slice(0, LABEL_WIDTH).padStart(LABEL_WIDTH), + label.slice(0, LABEL_WIDTH).padStart(LABEL_WIDTH), cyan(iqrStr.padStart(STAT_WIDTH)), colorCV(cv), stats.outliers > 0 ? yellow(String(stats.outliers).padStart(4)) : dim('0'.padStart(4)), @@ -214,7 +252,7 @@ function printMetricsTable( printTable( [ { header: 'Metric', width: LABEL_WIDTH }, - { header: 'MeanΒ±Οƒ (ms)', width: STAT_WIDTH }, + { header: 'MeanΒ±Οƒ', width: STAT_WIDTH }, { header: 'Var%', width: CV_WIDTH }, { header: 'Out', width: 4 }, ], @@ -242,6 +280,8 @@ async function loadBaselineReport(baselinePath: string): Promise = {}; + private metricDefinitions: Record = {}; + private outputPath: string; private upload: boolean; @@ -259,6 +299,15 @@ class BenchmarkReporter implements Reporter { this.baselinePath = options?.baselinePath ?? process.env.BENCHMARK_BASELINE_PATH; } + // Reset accumulated state at the start of every run so watch-mode re-runs start clean (the + // reporter instance is reused across runs). Otherwise stale benchmarks/definitions linger β€” and + // an edited metric config would conflict with its own previous-run definition. + onTestRunStart(): void { + this.benchmarks = {}; + this.metricDefinitions = {}; + this.hasFailures = false; + } + onTestCaseResult(testCase: TestCase): void { if (testCase.result().state === 'failed') { this.hasFailures = true; @@ -266,14 +315,21 @@ class BenchmarkReporter implements Reporter { const meta = testCase.meta(); const iterations = meta.benchmarkIterations; + const customMetrics = meta.benchmarkMetrics; - if (!iterations) { + if (!iterations && !customMetrics) { console.warn(yellow(` No iterations recorded for: ${testCase.fullName}`)); return; } const name = meta.benchmarkName ?? testCase.fullName; - const report = generateReportFromIterations(iterations); + const report = iterations + ? generateReportFromIterations(iterations) + : { iterations: 0, totalDuration: 0, renders: [], metrics: {} }; + + if (customMetrics) { + mergeCustomMetrics(report, customMetrics, this.metricDefinitions); + } this.benchmarks[name] = report; @@ -284,7 +340,7 @@ class BenchmarkReporter implements Reporter { printDurationMatrix(`${name} β€” React`, report, summary); - printMetricsTable(name, report.metrics, report.iterations); + printMetricsTable(name, report.metrics, report.iterations, this.metricDefinitions); } async onTestRunEnd(): Promise { @@ -303,11 +359,14 @@ class BenchmarkReporter implements Reporter { const baseline = this.baselinePath ? await loadBaselineReport(this.baselinePath) : undefined; + const hasMetricDefinitions = Object.keys(this.metricDefinitions).length > 0; + const results = { version: 1 as const, reportType: 'benchmark' as const, ...(await getCiMetadata()), report: this.benchmarks, + ...(hasMetricDefinitions ? { metricDefinitions: this.metricDefinitions } : {}), ...(baseline ? { base: baseline } : {}), }; diff --git a/packages/benchmark/src/stats.test.ts b/packages/benchmark/src/stats.test.ts index 13c387122..cd6dd8b94 100644 --- a/packages/benchmark/src/stats.test.ts +++ b/packages/benchmark/src/stats.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { calculateMean, calculateStdDev, quantile, isOutlier } from './stats'; +import { calculateMean, calculateStdDev, quantile, isOutlier, aggregateSamples } from './stats'; describe('calculateMean', () => { it('returns the mean of values', () => { @@ -80,3 +80,24 @@ describe('isOutlier', () => { expect(isOutlier(35, 10, 20)).toBe(false); }); }); + +describe('aggregateSamples', () => { + it('computes the mean, standard deviation, and zero outliers for a clean series', () => { + const result = aggregateSamples([2, 4, 6]); + expect(result.mean).toBe(4); + expect(result.outliers).toBe(0); + expect(result.stdDev).toBeCloseTo(Math.sqrt(8 / 3)); + }); + + it('removes IQR outliers before computing the mean', () => { + const result = aggregateSamples([10, 10, 10, 10, 10, 1000]); + expect(result.mean).toBe(10); + expect(result.outliers).toBe(1); + }); + + it('falls back to all values when filtering would remove everything', () => { + const result = aggregateSamples([5]); + expect(result.mean).toBe(5); + expect(result.outliers).toBe(0); + }); +}); diff --git a/packages/benchmark/src/stats.ts b/packages/benchmark/src/stats.ts index d8d3e2a27..70756a8d6 100644 --- a/packages/benchmark/src/stats.ts +++ b/packages/benchmark/src/stats.ts @@ -27,3 +27,24 @@ export function isOutlier(value: number, q1: number, q3: number): boolean { const iqr = q3 - q1; return value < q1 - 1.5 * iqr || value > q3 + 1.5 * iqr; } + +/** + * Aggregates a series of samples into a mean, standard deviation, and outlier count using + * IQR-based outlier removal. Falls back to the raw values when filtering would remove + * everything. This is the shared aggregation core for custom metrics. + */ +export function aggregateSamples(values: number[]): { + mean: number; + stdDev: number; + outliers: number; +} { + const sorted = values.toSorted((first, second) => first - second); + const q1 = quantile(sorted, 0.25); + const q3 = quantile(sorted, 0.75); + const filtered = values.filter((value) => !isOutlier(value, q1, q3)); + const used = filtered.length > 0 ? filtered : values; + + const mean = calculateMean(used); + const stdDev = calculateStdDev(used, mean); + return { mean, stdDev, outliers: values.length - used.length }; +} diff --git a/packages/benchmark/src/taskMetaAugmentation.ts b/packages/benchmark/src/taskMetaAugmentation.ts index 58d922d51..eee8aa740 100644 --- a/packages/benchmark/src/taskMetaAugmentation.ts +++ b/packages/benchmark/src/taskMetaAugmentation.ts @@ -1,10 +1,12 @@ /// -import type { IterationData } from './types'; +import type { IterationData, MetricReport } from './types'; declare module 'vitest' { interface TaskMeta { benchmarkName?: string; benchmarkIterations?: IterationData[]; + /** Custom metrics recorded via `ScalarMetric`/`DiscreteMetric`, keyed by metric name. */ + benchmarkMetrics?: Record; } } diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts index 3156cc275..5294752db 100644 --- a/packages/benchmark/src/types.ts +++ b/packages/benchmark/src/types.ts @@ -14,16 +14,8 @@ export interface RenderEvent { startTime: number; } -export interface BenchmarkMetric { - /** Metric name, e.g. "paint:bench", "paint:grid-header" */ - name: string; - /** Measured value in ms */ - value: number; -} - export interface IterationData { renders: RenderEvent[]; - metrics: BenchmarkMetric[]; } export interface InteractionContext { @@ -33,4 +25,76 @@ export interface InteractionContext { * @param timeout - Timeout in ms. Default: 5000. Pass 0 or Infinity to rely on the test timeout. */ waitForElementTiming: (identifier: string, timeout?: number) => Promise; + /** + * Pause recording of the harness's React render/paint measurements. Custom metrics keep + * recording. Throws if recording is already paused. + */ + pauseReactRecording: () => void; + /** + * Resume recording of the harness's React render/paint measurements. Throws if recording is + * already active. + */ + resumeReactRecording: () => void; +} + +/** + * Whether a custom metric measures a continuous value or a discrete count. + * - `scalar` β€” continuous measurements (timings, sizes); compared with a relative noise band. + * - `discrete` β€” counts/events; compared as exact integers. + */ +export type MetricKind = 'scalar' | 'discrete'; + +/** Which direction of change counts as a regression for a metric in alarm mode. */ +export type MetricDirection = 'lowerIsBetter' | 'higherIsBetter'; + +export interface MetricAlarm { + /** Defaults to `lowerIsBetter`. */ + direction?: MetricDirection; + /** + * Softer band: a regression past `warn` (but within `error`) is flagged as a warning. + * Scalar metrics: a relative fraction (`0.1` = 10%). Discrete metrics: an absolute count delta. + */ + warn?: number; + /** + * Harder band: a regression past `error` is flagged as an error (the alarm). When **both** + * `warn` and `error` are omitted, `error` defaults to the dashboard's global noise band; with + * only `warn` set there is no error band (warning-only). + * Scalar metrics: a relative fraction (`0.25` = 25%). Discrete metrics: an absolute count delta. + */ + error?: number; +} + +export interface MetricConfig { + name: string; + /** Display formatting applied by the reporter and dashboard via `Intl.NumberFormat`. */ + format?: Intl.NumberFormatOptions; + /** + * Regression judgment. Its presence opts the metric into alarming; when omitted the metric + * is informational (the diff is shown but never flagged). + */ + alarm?: MetricAlarm; +} + +/** Aggregated stats for a single metric sub-series, as they cross the browserβ†’runner boundary. */ +export interface MetricSampleStats { + mean: number; + stdDev: number; + outliers: number; + /** Number of recorded samples (used to derive iteration counts; stripped from the report). */ + count: number; +} + +/** A custom metric's aggregated data attached to `task.meta`, keyed by metric name. */ +export interface MetricReport { + kind: MetricKind; + config: MetricConfig; + /** Aggregated stats keyed by sub-series id (`''` is the base series). */ + series: Record; +} + +/** Per-metric configuration hoisted to the top level of the report (keyed by metric name). */ +export interface MetricDefinition { + kind: MetricKind; + format?: Intl.NumberFormatOptions; + alarm?: MetricAlarm; } diff --git a/test/performance/tests/imperative-metric.bench.tsx b/test/performance/tests/imperative-metric.bench.tsx new file mode 100644 index 000000000..1fc2c833a --- /dev/null +++ b/test/performance/tests/imperative-metric.bench.tsx @@ -0,0 +1,38 @@ +import * as React from 'react'; +import { benchmark, ScalarMetric } from '@mui/internal-benchmark'; + +function simulateSlowdown(ms: number) { + const end = performance.now() + ms; + while (performance.now() < end) { + // burn CPU + } +} + +function Widget() { + return
Widget
; +} + +const updateWork = new ScalarMetric({ + name: 'imperative_update', + format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 }, +}); + +// Recording stays paused for the whole benchmark: the interaction mutates the DOM imperatively (no +// React re-render) and measures only a custom metric. The harness must not require a React render +// here β€” there is no active recording window to validate. +benchmark( + 'Widget imperative update (metric only)', + () => , + async () => { + const node = document.querySelector('[data-state]'); + updateWork.time(); + node?.setAttribute('data-state', 'active'); + simulateSlowdown(1); + updateWork.timeEnd(); + }, + { + runs: 10, + warmupRuns: 5, + reactRecordingPaused: true, + }, +); diff --git a/test/performance/tests/metric.bench.tsx b/test/performance/tests/metric.bench.tsx new file mode 100644 index 000000000..ad5ff0899 --- /dev/null +++ b/test/performance/tests/metric.bench.tsx @@ -0,0 +1,44 @@ +import { it } from 'vitest'; +import { ScalarMetric, DiscreteMetric } from '@mui/internal-benchmark'; + +function fib(n: number): number { + return n < 2 ? n : fib(n - 1) + fib(n - 2); +} + +// A scalar timing metric with an alarm: regressions past 10% warn, past 25% error. +const duration = new ScalarMetric({ + name: 'fib_duration', + format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 }, + alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 }, +}); + +// A discrete count metric (informational), compared as an exact integer. +const evenResults = new DiscreteMetric({ name: 'fib_even_results' }); + +it('custom scalar + discrete metrics', () => { + for (let run = 0; run < 50; run += 1) { + duration.time(); + const result = fib(22); + duration.timeEnd(); + + evenResults.record(result % 2 === 0 ? 1 : 0); + } +}); + +// A second scalar metric demonstrating labeled sub-series via `time`/`timeEnd` labels. +const phases = new ScalarMetric({ + name: 'fib_phase', + format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 }, +}); + +it('sub-series via labels', () => { + for (let run = 0; run < 50; run += 1) { + phases.time('small'); + fib(18); + phases.timeEnd('small'); // -> fib_phase#small + + phases.time('large'); + fib(24); + phases.timeEnd('large'); // -> fib_phase#large + } +}); diff --git a/test/performance/tests/react-recording.bench.tsx b/test/performance/tests/react-recording.bench.tsx new file mode 100644 index 000000000..ed2d30bb3 --- /dev/null +++ b/test/performance/tests/react-recording.bench.tsx @@ -0,0 +1,52 @@ +import * as React from 'react'; +import * as ReactDOM from 'react-dom'; +import { benchmark, ElementTiming, ScalarMetric } from '@mui/internal-benchmark'; + +function simulateSlowdown(ms: number) { + const end = performance.now() + ms; + while (performance.now() < end) { + // burn CPU + } +} + +function Counter() { + const [count, setCount] = React.useState(0); + simulateSlowdown(2); + return ( + + ); +} + +// A custom metric recorded inside the benchmark. Warmup iterations are excluded automatically, so +// this yields exactly `runs` samples without the interaction needing to know about warmup. +const clickWork = new ScalarMetric({ + name: 'click_work', + format: { style: 'unit', unit: 'millisecond', maximumFractionDigits: 3 }, +}); + +// Start with React recording paused so the mount is excluded. Await the mount paint (which lands in +// its own frame while paused), resume, then measure only the click's re-render and the paint it +// produces β€” so the report shows a single `bench:update` render and a single `bench:paint#clicked`. +benchmark( + 'Counter click (interaction only)', + () => , + async ({ resumeReactRecording, waitForElementTiming }) => { + await waitForElementTiming('ready'); // mount paint β€” recorded while paused, so excluded + resumeReactRecording(); + clickWork.time(); + ReactDOM.flushSync(() => { + document.querySelector('button')?.click(); + }); + clickWork.timeEnd(); // wraps the synchronous re-render + await waitForElementTiming('clicked'); // interaction paint β€” recorded + }, + { + runs: 10, + warmupRuns: 5, + reactRecordingPaused: true, + }, +);