Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
6fd2dd3
[benchmark] Add custom ScalarMetric and DiscreteMetric primitives
Janpot Jun 5, 2026
5fe6d87
[benchmark] Simplify: reuse aggregateSamples, cache metric formatters
Janpot Jun 5, 2026
f51afe2
[benchmark] Drop unused reserved 'group' field from MetricDefinition
Janpot Jun 5, 2026
0eeb991
[benchmark] Add warning severity tier to custom-metric alarms
Janpot Jun 5, 2026
b0d7950
[benchmark] Reject two metrics sharing a name; add primitive unit tests
Janpot Jun 5, 2026
5a0a107
[code-infra-dashboard] Extract props interfaces for FormattedDiffMs a…
Janpot Jun 5, 2026
303669c
[benchmark] Migrate paint to the metric primitive; surface alarms in …
Janpot Jun 5, 2026
a2daaa8
[benchmark] Namespace harness paint as bench:paint with default base …
Janpot Jun 5, 2026
c22bb9e
[code-infra-dashboard] Migrate legacy paint keys on fetch; fix metric…
Janpot Jun 5, 2026
1704502
[benchmark] Reject conflicting metric definitions reused across bench…
Janpot Jun 5, 2026
6f49c00
[benchmark] Address review: reset reporter per run, robust conflict c…
Janpot Jun 5, 2026
4b6f6cd
[code-infra-dashboard] Merge base+head metricDefinitions inside compa…
Janpot Jun 6, 2026
d736635
[code-infra-dashboard] Bundle metric definitions into the comparison …
Janpot Jun 6, 2026
0bb6fab
[benchmark] Scope render recording and gate custom metrics during warmup
Janpot Jun 8, 2026
2ab3216
[benchmark] Scope the render sanity check to active recording windows
Janpot Jun 8, 2026
71ef193
Potential fix for pull request finding
Janpot Jun 9, 2026
8d8761f
Potential fix for pull request finding
Janpot Jun 9, 2026
7ea769b
[benchmark] Fix broken Copilot autofixes for Metric/ScalarMetric
Janpot Jun 9, 2026
b744593
[benchmark] Address review comments: timing accuracy, deepEqual, toSo…
Janpot Jun 13, 2026
97c8710
[benchmark] Stamp resume timestamp last, clarify record() in README
Janpot Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,22 @@ import ExpandMoreIcon from '@mui/icons-material/ExpandMore';
import Tooltip from '@mui/material/Tooltip';
import {
compareBenchmarkReports,
type BenchmarkComparisonInput,
type BenchmarkComparisonReport,
type ComparisonItem,
type DiffValue,
type BenchmarkDiffSeverity,
} from '@/lib/benchmark/compareBenchmarkReports';
import type { BenchmarkReport } from '@/lib/benchmark/types';
import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters';
import {
formatMs,
formatMetricNumber,
formatMetricDiff,
percentFormatter,
} from '@/utils/formatters';

const SEVERITY_COLOR: Record<BenchmarkDiffSeverity, string> = {
error: 'error.main',
warning: 'warning.main',
success: 'success.main',
neutral: 'text.secondary',
};
Expand All @@ -32,6 +38,7 @@ const MIN_BAR_WIDTH_PX = 4;

const DIFF_BAR_COLORS: Record<BenchmarkDiffSeverity, string> = {
error: 'var(--mui-palette-error-main)',
warning: 'var(--mui-palette-warning-main)',
success: 'var(--mui-palette-success-main)',
neutral: 'var(--mui-palette-action-disabled)',
};
Expand Down Expand Up @@ -72,13 +79,19 @@ function computeEntryBar(
return computeDiffBar(comparison.duration, range.min, range.max);
}

function FormattedDiffMs({ diff, percent = false }: { diff: DiffValue; percent?: boolean }) {
interface FormattedDiffMsProps {
diff: DiffValue;
percent?: boolean;
format?: Intl.NumberFormatOptions;
}

function FormattedDiffMs({ diff, percent = false, format }: FormattedDiffMsProps) {
if (diff.absoluteDiff === 0) {
return '\u2014';
}
return (
<React.Fragment>
{formatDiffMs(diff.absoluteDiff)}
{formatMetricDiff(diff.absoluteDiff, format)}
{percent && (
<React.Fragment>
{' '}
Expand All @@ -91,12 +104,18 @@ function FormattedDiffMs({ diff, percent = false }: { diff: DiffValue; percent?:
);
}

function DiffCell({ diff, sx }: { diff: DiffValue; sx?: object }) {
interface DiffCellProps {
diff: DiffValue;
sx?: object;
format?: Intl.NumberFormatOptions;
}

function DiffCell({ diff, sx, format }: DiffCellProps) {
const color = SEVERITY_COLOR[diff.severity];
return (
<Tooltip title={diff.hint} arrow>
<TableCell align="right" sx={{ color, ...sx }}>
<FormattedDiffMs diff={diff} percent />
<FormattedDiffMs diff={diff} percent format={format} />
</TableCell>
</Tooltip>
);
Expand All @@ -114,6 +133,7 @@ interface ComparisonTableRow {
valueFill?: number;
valueColor?: string;
diffBar?: { left: number; width: number; color: string };
format?: Intl.NumberFormatOptions;
}

const NAME_CELL_SX = {
Expand Down Expand Up @@ -181,7 +201,7 @@ function ComparisonTable({
<TableCell align="right">{row.outliers}</TableCell>
{hasBase && (
<TableCell align="right">
{row.base != null ? formatMs(row.base) : '\u2014'}
{row.base != null ? formatMetricNumber(row.base, row.format) : '\u2014'}
</TableCell>
)}
{(() => {
Expand All @@ -196,10 +216,10 @@ function ComparisonTable({
: undefined;
return row.removed ? (
<TableCell align="right" sx={{ color: 'success.main', ...diffBarSx }}>
<FormattedDiffMs diff={row.comparison} percent />
<FormattedDiffMs diff={row.comparison} percent format={row.format} />
</TableCell>
) : (
<DiffCell diff={row.comparison} sx={diffBarSx} />
<DiffCell diff={row.comparison} sx={diffBarSx} format={row.format} />
);
}
return <TableCell align="right">{'\u2014'}</TableCell>;
Expand Down Expand Up @@ -246,24 +266,20 @@ function TotalsSummary({ totals }: { totals: BenchmarkComparisonReport['totals']
</Typography>
</Typography>
</Tooltip>
{totals.paintDefault && (
<Tooltip title={totals.paintDefault.hint} arrow>
<Typography variant="body2">
<strong>Paint:</strong>{' '}
<Typography
component="span"
variant="body2"
sx={{ color: SEVERITY_COLOR[totals.paintDefault.severity] }}
>
<FormattedDiffMs diff={totals.paintDefault} percent />
</Typography>
</Typography>
</Tooltip>
)}
</Box>
);
}

function worstMetricSeverity(metrics: ComparisonItem['metrics']): BenchmarkDiffSeverity {
if (metrics.some((row) => row.diff.severity === 'error')) {
return 'error';
}
if (metrics.some((row) => row.diff.severity === 'warning')) {
return 'warning';
}
return 'neutral';
}

function BenchmarkAccordion({
comparison,
hasBase,
Expand All @@ -278,6 +294,8 @@ function BenchmarkAccordion({
entryDiffRange: { min: number; max: number };
}) {
const renderCount = comparison.renders.filter((row) => !row.removed).length;
const metricCount = comparison.metrics.filter((row) => !row.removed).length;
const metricSeverity = worstMetricSeverity(comparison.metrics);

const entryBar = computeEntryBar(comparison, hasBase, entryDiffRange);

Expand Down Expand Up @@ -345,6 +363,11 @@ function BenchmarkAccordion({
</Tooltip>
)}
</Typography>
{metricCount > 0 && (
<Typography variant="body2" sx={{ color: SEVERITY_COLOR[metricSeverity] }}>
{metricCount} metrics
</Typography>
)}
</Box>
</Box>
</AccordionSummary>
Expand Down Expand Up @@ -406,16 +429,17 @@ function BenchmarkAccordion({
'\u2014'
) : (
<React.Fragment>
{formatMs(row.value)}{' '}
{formatMetricNumber(row.value, row.format)}{' '}
<Typography component="span" variant="caption" color="text.secondary">
±{formatMs(row.stdDev)}
±{formatMetricNumber(row.stdDev, row.format)}
</Typography>
</React.Fragment>
),
outliers: row.removed ? '\u2014' : row.outliers,
comparison: row.diff,
base: row.diff?.base,
removed: row.removed,
format: row.format,
};
})}
/>
Expand All @@ -431,8 +455,8 @@ function BenchmarkAccordion({
}

interface BenchmarkComparisonReportViewProps {
value: BenchmarkReport;
base: BenchmarkReport | null;
value: BenchmarkComparisonInput;
base: BenchmarkComparisonInput | null;
}

export function BenchmarkComparisonReportView({ value, base }: BenchmarkComparisonReportViewProps) {
Expand Down
12 changes: 8 additions & 4 deletions apps/code-infra-dashboard/src/components/DailyBenchmarkChart.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import Tabs from '@mui/material/Tabs';
import Tab from '@mui/material/Tab';
import { BarChartPro } from '@mui/x-charts-pro/BarChartPro';
import { useXScale, useDrawingArea } from '@mui/x-charts-pro/hooks';
import type { BenchmarkReport } from '@/lib/benchmark/types';
import type { BenchmarkReport, MetricDefinition } from '@/lib/benchmark/types';
import { formatMs } from '@/utils/formatters';
import { useMasterCommits, type GitHubCommit } from '../hooks/useMasterCommits';
import { useCiReports } from '../hooks/useCiReports';
Expand Down Expand Up @@ -81,6 +81,7 @@ interface CommitReportData {
timestamp: number;
commit: GitHubCommit;
report: BenchmarkReport | null;
metricDefinitions?: Record<string, MetricDefinition>;
}

function collectBenchmarkNames(chartData: CommitReportData[]): string[] {
Expand Down Expand Up @@ -126,6 +127,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
timestamp,
commit,
report: reports[commit.sha]?.report ?? null,
metricDefinitions: reports[commit.sha]?.metricDefinitions,
})),
[commits, reports],
);
Expand Down Expand Up @@ -154,7 +156,7 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
return entry.totalDuration;
}
if (chartMode === 'paint') {
return entry.metrics['paint:default']?.mean ?? null;
return entry.metrics['bench:paint']?.mean ?? null;
}
Comment thread
Janpot marked this conversation as resolved.
return entry.renders.length;
},
Expand Down Expand Up @@ -300,8 +302,10 @@ export default function DailyBenchmarkChart({ repo }: DailyBenchmarkChartProps)
return null;
}
return {
value: reportData.report,
base: baselineData?.report ?? null,
value: { report: reportData.report, metricDefinitions: reportData.metricDefinitions },
base: baselineData?.report
? { report: baselineData.report, metricDefinitions: baselineData.metricDefinitions }
: null,
valueCommit: reportData.commit,
baseCommit: baselineData?.commit ?? null,
};
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,34 @@
import { describe, it, expect } from 'vitest';
import { buildBenchmarkMarkdownReport } from './buildMarkdownReport';
import { compareBenchmarkReports } from './compareBenchmarkReports';
import { compareBenchmarkReports, type BenchmarkComparisonInput } from './compareBenchmarkReports';
import type { BenchmarkReportEntry, MetricDefinition } from './types';
import { makeReport, makeReportFromConfig } from './test-fixtures';

// A single benchmark whose duration/renders are neutral, so the only signal is one metric.
function metricReport(
name: string,
mean: number,
definitions?: Record<string, MetricDefinition>,
): BenchmarkComparisonInput {
const entry: BenchmarkReportEntry = {
iterations: 10,
totalDuration: 100,
renders: [],
metrics: { [name]: { mean, stdDev: 0, outliers: 0 } },
};
return { report: { Bench: entry }, metricDefinitions: definitions };
}

const alarmDefinitions: Record<string, MetricDefinition> = {
clicks: {
kind: 'discrete',
format: { maximumFractionDigits: 0 },
alarm: { direction: 'lowerIsBetter', error: 1 },
},
tti: { kind: 'scalar', alarm: { direction: 'lowerIsBetter', warn: 0.1, error: 0.25 } },
fps: { kind: 'scalar' }, // informational (no alarm)
};

describe('buildBenchmarkMarkdownReport', () => {
it('drops within-noise rows but keeps significant regressions', () => {
const report = compareBenchmarkReports(
Expand Down Expand Up @@ -126,4 +152,38 @@ describe('buildBenchmarkMarkdownReport', () => {
expect(markdown).not.toContain('No significant changes');
expect(markdown).toContain('| Test |');
});

describe('metric alarms', () => {
it('surfaces an error-level metric, making its otherwise-neutral test significant', () => {
const report = compareBenchmarkReports(
metricReport('clicks', 5, alarmDefinitions), // +2 vs 3, discrete error band 1
metricReport('clicks', 3),
);
const markdown = buildBenchmarkMarkdownReport(report);
expect(markdown).not.toContain('No significant changes');
expect(markdown).toContain('**Metric alarms**');
expect(markdown).toContain('| Bench | clicks |');
expect(markdown).toContain('🔺');
});

it('does not surface warning-level metrics (kept on the dashboard only)', () => {
const report = compareBenchmarkReports(
metricReport('tti', 115, alarmDefinitions), // +15%: past warn (10%), within error (25%) -> warning
metricReport('tti', 100),
);
const markdown = buildBenchmarkMarkdownReport(report);
expect(markdown).toContain('No significant changes');
expect(markdown).not.toContain('Metric alarms');
});

it('ignores informational metrics — no alarm section, test stays within noise', () => {
const report = compareBenchmarkReports(
metricReport('fps', 200, alarmDefinitions), // big change but no alarm configured
metricReport('fps', 100),
);
const markdown = buildBenchmarkMarkdownReport(report);
expect(markdown).toContain('No significant changes');
expect(markdown).not.toContain('Metric alarms');
});
});
});
37 changes: 29 additions & 8 deletions apps/code-infra-dashboard/src/lib/benchmark/buildMarkdownReport.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import { formatMs, formatDiffMs, percentFormatter } from '@/utils/formatters';
import type { BenchmarkComparisonReport, DiffValue } from './compareBenchmarkReports';
import { formatMs, formatDiffMs, formatMetricDiff, percentFormatter } from '@/utils/formatters';
import type {
BenchmarkComparisonReport,
ComparisonEntry,
DiffValue,
} from './compareBenchmarkReports';

// Only error-level alarms surface in the PR comment; warnings stay on the dashboard.
const isAlarmed = (metric: ComparisonEntry): boolean => metric.diff.severity === 'error';

export interface BuildMarkdownReportOptions {
maxRows?: number;
Expand All @@ -8,6 +15,7 @@ export interface BuildMarkdownReportOptions {

const SEVERITY_PREFIX: Record<string, string> = {
error: '🔺',
warning: '⚠️',
success: '▼',
};

Expand Down Expand Up @@ -42,11 +50,6 @@ export function buildBenchmarkMarkdownReport(
`**Total duration:** ${formatMs(report.totals.duration.current ?? 0)}${formatDiff(report.totals.duration, 'ms')}`,
`**Renders:** ${report.totals.renderCount.current ?? 0}${formatDiff(report.totals.renderCount, 'count')}`,
];
if (report.totals.paintDefault) {
totalParts.push(
`**Paint:** ${formatMs(report.totals.paintDefault.current ?? 0)}${formatDiff(report.totals.paintDefault, 'ms')}`,
);
}
lines.push(totalParts.join(' | '));
lines.push('');
}
Expand All @@ -56,7 +59,8 @@ export function buildBenchmarkMarkdownReport(
entry.duration.severity !== 'neutral' ||
(entry.renderCount?.severity ?? 'neutral') !== 'neutral' ||
entry.duration.current === null ||
entry.duration.base === null,
entry.duration.base === null ||
entry.metrics.some(isAlarmed),
);

const detailsLink = reportUrl ? `[details](${reportUrl})` : '';
Expand Down Expand Up @@ -99,5 +103,22 @@ export function buildBenchmarkMarkdownReport(
lines.push(detailsLink);
}

// Error-level metric alarms across all tests (independent of the test-table row cap).
const alarms = report.entries.flatMap((entry) =>
entry.metrics.filter(isAlarmed).map((metric) => ({ test: entry.name, metric })),
);
if (alarms.length > 0) {
lines.push('');
lines.push('**Metric alarms**');
lines.push('');
lines.push('| Test | Metric | Change |');
lines.push('|:-----|:-------|-------:|');
for (const { test, metric } of alarms) {
const prefix = SEVERITY_PREFIX[metric.diff.severity] ?? '';
const change = `${prefix} ${formatMetricDiff(metric.diff.absoluteDiff, metric.format)}`;
lines.push(`| ${test} | ${metric.name} | ${change} |`);
}
}

return lines.join('\n');
}
Loading
Loading