Skip to content

Commit 70bf6b5

Browse files
authored
fix: small improvements for scale test dashboard (#1545)
Signed-off-by: SiorMeir <msior@nvidia.com>
1 parent f59fa11 commit 70bf6b5

4 files changed

Lines changed: 82 additions & 105 deletions

File tree

docs/scale-tests/favicon.ico

174 KB
Binary file not shown.

docs/scale-tests/index.html

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
<meta charset="UTF-8">
2626
<meta name="viewport" content="width=device-width, initial-scale=1.0">
2727
<title>KAI Scheduler — Scale Tests</title>
28+
<link rel="icon" type="image/x-icon" href="favicon.ico">
2829
<link rel="stylesheet" href="styles.css">
2930
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
3031
<script src="https://cdn.jsdelivr.net/npm/chartjs-adapter-date-fns@3.0.0/dist/chartjs-adapter-date-fns.bundle.min.js"></script>
@@ -33,10 +34,7 @@
3334

3435
<!-- ── Page header ─────────────────────────────────────────────────────── -->
3536
<header class="page-header">
36-
<svg width="22" height="22" viewBox="0 0 24 24" fill="none" aria-hidden="true">
37-
<path d="M12 2L2 7l10 5 10-5-10-5zM2 17l10 5 10-5M2 12l10 5 10-5"
38-
stroke="#58a6ff" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
39-
</svg>
37+
<img src="logo.svg" width="32" height="32" alt="KAI Scheduler Logo" aria-hidden="true">
4038
<h1>KAI Scheduler — Scale Tests</h1>
4139
<div class="header-stats" id="header-stats" aria-live="polite"></div>
4240
</header>
@@ -91,33 +89,17 @@ <h3 class="chart-title">Fill Cluster with single GPU Jobs</h3>
9189
<div class="chart-wrapper"><canvas id="chart-1"></canvas></div>
9290
</div>
9391
<div class="chart-card">
94-
<h3 class="chart-title">Fill Cluster with single GPU Jobs (400 pending tasks)</h3>
92+
<h3 class="chart-title">Schedules Jobs with Pending Tasks in Background</h3>
9593
<div class="chart-wrapper"><canvas id="chart-2"></canvas></div>
9694
</div>
9795
<div class="chart-card">
98-
<h3 class="chart-title">Average time to unschedulable for distributed job</h3>
96+
<h3 class="chart-title">Allocate Single Distributed Job (with preferred topology)</h3>
9997
<div class="chart-wrapper"><canvas id="chart-3"></canvas></div>
10098
</div>
10199
<div class="chart-card">
102-
<h3 class="chart-title">Reclaim time for one very large job</h3>
100+
<h3 class="chart-title">Allocate Single Distributed Job (without preferred topology)</h3>
103101
<div class="chart-wrapper"><canvas id="chart-4"></canvas></div>
104102
</div>
105-
<div class="chart-card">
106-
<h3 class="chart-title">Measuring reclaim time for single GPU</h3>
107-
<div class="chart-wrapper"><canvas id="chart-5"></canvas></div>
108-
</div>
109-
<div class="chart-card">
110-
<h3 class="chart-title">Multi Node Reclaim for distributed jobs</h3>
111-
<div class="chart-wrapper"><canvas id="chart-6"></canvas></div>
112-
</div>
113-
<div class="chart-card">
114-
<h3 class="chart-title">Reclaim single GPU Jobs</h3>
115-
<div class="chart-wrapper"><canvas id="chart-7"></canvas></div>
116-
</div>
117-
<div class="chart-card">
118-
<h3 class="chart-title">Consolidation to run multiple distributed jobs</h3>
119-
<div class="chart-wrapper"><canvas id="chart-8"></canvas></div>
120-
</div>
121103
</div>
122104
</main>
123105

docs/scale-tests/logo.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/scale-tests/metrics.js

Lines changed: 76 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -11,96 +11,51 @@ const CHART_CONFIGS = [
1111
id: 'chart-1',
1212
testNamePattern: /^fill cluster with single GPU Jobs$/i,
1313
excludePattern: /pending tasks/i,
14-
extractMetric: (entries) => {
15-
const metrics = findMetrics(entries);
16-
return metrics?.time ? parseDuration(metrics.time) : null;
14+
extractMetric: (entries, metrics) => {
15+
if (!metrics) metrics = findMetrics(entries);
16+
return metrics?.time || metrics?.total_time ? parseDuration(metrics.time || metrics.total_time) : null;
1717
},
1818
label: 'Time (seconds)',
19-
legendBuilder: (metrics) => `${metrics.nodes || '?'} nodes, ${metrics.jobs || '?'} jobs`,
19+
legendBuilder: (metrics, containerHierarchy) => {
20+
const nodes = metrics.nodes || '?';
21+
const jobs = metrics.jobs || '?';
22+
// Use last container context to distinguish test variants
23+
const context = containerHierarchy?.[containerHierarchy.length - 1] || '';
24+
const variant = context.includes('scheduler disabled') ? 'scheduler disabled' :
25+
context.includes('running') ? 'scheduler running' : '';
26+
return variant ? `${nodes} nodes, ${jobs} jobs (${variant})` : `${nodes} nodes, ${jobs} jobs`;
27+
},
2028
},
2129
{
2230
id: 'chart-2',
23-
testNamePattern: /^fill cluster with single GPU Jobs.*400 pending tasks/i,
24-
extractMetric: (entries) => {
25-
const metrics = findMetrics(entries);
26-
return metrics?.time ? parseDuration(metrics.time) : null;
31+
testNamePattern: /schedules jobs with pending tasks in background/i,
32+
extractMetric: (entries, metrics) => {
33+
if (!metrics) metrics = findMetrics(entries);
34+
return metrics?.time || metrics?.total_time ? parseDuration(metrics.time || metrics.total_time) : null;
2735
},
2836
label: 'Time (seconds)',
29-
legendBuilder: (metrics) => `${metrics.nodes || '?'} nodes, ${metrics.jobs || '?'} jobs`,
37+
legendBuilder: (metrics, containerHierarchy) => `${metrics.nodes || '?'} nodes, ${metrics.jobs || '?'} jobs`,
3038
},
3139
{
3240
id: 'chart-3',
33-
testNamePattern: /average time to unschedulable for distributed job/i,
34-
extractMetric: (entries) => {
35-
const metrics = findMetrics(entries);
36-
return metrics?.average_time_to_unschedulable__seconds_ ||
37-
metrics?.details_average_time_to_unschedulable__seconds_ || null;
41+
testNamePattern: /Allocate single distributed job with preferred topology/i,
42+
extractMetric: (entries, metrics) => {
43+
if (!metrics) metrics = findMetrics(entries);
44+
return metrics?.time || metrics?.total_time || metrics?.duration ?
45+
parseDuration(metrics.time || metrics.total_time) || metrics.duration : null;
3846
},
3947
label: 'Time (seconds)',
40-
legendBuilder: (metrics) =>
41-
`${metrics.nodes || metrics.details_nodes || '?'} nodes, ` +
42-
`${metrics.total_requested_gpus || metrics.details_total_requested_gpus || '?'} GPUs requested`,
48+
legendBuilder: (metrics, containerHierarchy) => `${metrics.nodes || '?'} nodes, ${metrics.jobs || '?'} jobs`,
4349
},
4450
{
4551
id: 'chart-4',
46-
testNamePattern: /reclaim time for one very large job/i,
47-
extractMetric: (entries) => {
48-
const metrics = findMetrics(entries);
49-
return metrics?.time_to_reclaim__seconds_ ||
50-
metrics?.details_time_to_reclaim__seconds_ || null;
51-
},
52-
label: 'Time (seconds)',
53-
legendBuilder: (metrics) =>
54-
`${metrics.total_requested_gpus || metrics.details_total_requested_gpus || '?'} GPUs requested`,
55-
},
56-
{
57-
id: 'chart-5',
58-
testNamePattern: /measuring reclaim time for single GPU/i,
59-
extractMetric: (entries) => {
60-
const metrics = findMetrics(entries);
61-
return metrics?.average_time_to_reclaim_single_GPU__seconds_ ||
62-
metrics?.details_average_time_to_reclaim_single_GPU__seconds_ || null;
52+
testNamePattern: /Allocate single distributed job without preferred topology/i,
53+
extractMetric: (entries, metrics) => {
54+
if (!metrics) metrics = findMetrics(entries);
55+
return metrics?.time || metrics?.total_time ? parseDuration(metrics.time || metrics.total_time) : null;
6356
},
6457
label: 'Time (seconds)',
65-
legendBuilder: (metrics) =>
66-
`${metrics.running_jobs || metrics.details_running_jobs || '?'} running jobs`,
67-
},
68-
{
69-
id: 'chart-6',
70-
testNamePattern: /multi node reclaim for distributed jobs/i,
71-
extractMetric: (entries) => {
72-
const metrics = findMetrics(entries);
73-
return metrics?.time ? parseDuration(metrics.time) : null;
74-
},
75-
label: 'Time (seconds)',
76-
legendBuilder: (metrics) =>
77-
`${metrics.nodes || metrics.details_nodes || '?'} nodes, ` +
78-
`${metrics.pods || metrics.details_pods || '?'} pods`,
79-
},
80-
{
81-
id: 'chart-7',
82-
testNamePattern: /reclaim.*single GPU jobs/i,
83-
excludePattern: /measuring/i,
84-
extractMetric: (entries) => {
85-
const metrics = findMetrics(entries);
86-
return metrics?.time ? parseDuration(metrics.time) : null;
87-
},
88-
label: 'Time (seconds)',
89-
legendBuilder: (metrics) =>
90-
`${metrics.nodes || metrics.details_nodes || '?'} nodes, ` +
91-
`${metrics.jobs || metrics.details_jobs || '?'} jobs`,
92-
},
93-
{
94-
id: 'chart-8',
95-
testNamePattern: /consolidation to run multiple distributed jobs/i,
96-
extractMetric: (entries) => {
97-
const metrics = findMetrics(entries);
98-
return metrics?.time ? parseDuration(metrics.time) : null;
99-
},
100-
label: 'Time (seconds)',
101-
legendBuilder: (metrics) =>
102-
`${metrics.nodes || metrics.details_nodes || '?'} nodes, ` +
103-
`${metrics.pods || metrics.details_pods || '?'} pods`,
58+
legendBuilder: (metrics, containerHierarchy) => `${metrics.nodes || '?'} nodes, ${metrics.jobs || '?'} jobs`,
10459
},
10560
];
10661

@@ -121,6 +76,29 @@ function findMetrics(reportEntries) {
12176
return null;
12277
}
12378

79+
/**
 * Parse metrics from a spec's CapturedGinkgoWriterOutput log text.
 *
 * Recognizes `"key"="value"` and `"key"=number` pairs, for example:
 *   "Total time"="6m36.848389057s" "nodes"=500 "jobs"=4000
 * Keys are lower-cased and runs of whitespace become underscores
 * ("Total time" -> total_time). If a key appears more than once, the last
 * occurrence wins.
 *
 * @param {string|null|undefined} output - Raw captured log output.
 * @returns {Object<string, string|number>|null} Parsed metrics, or null when
 *   the output is empty or contains no recognizable pairs.
 */
function parseMetricsFromOutput(output) {
  if (!output) return null;

  // Match "key"="value" or "key"=<unquoted number>.
  const pairPattern = /"([^"]+)"=(?:"([^"]*)"|(\d+(?:\.\d+)?))/g;
  // A value is numeric only if the WHOLE string is a number. parseFloat() must
  // not be used here: it accepts a numeric prefix, so "6m36.8s" would become 6
  // and a duration string would silently turn into a wrong number of seconds.
  const numericPattern = /^[+-]?\d+(?:\.\d+)?(?:e[+-]?\d+)?$/i;

  const metrics = {};

  for (const [, rawKey, quotedValue, bareValue] of String(output).matchAll(pairPattern)) {
    const key = rawKey.toLowerCase().replace(/\s+/g, '_');
    // `??` rather than `||` so an empty quoted value ("") is kept as ''.
    const value = quotedValue ?? bareValue;
    const trimmed = value.trim();

    // Keep non-numeric values (e.g. duration strings) intact for the caller.
    metrics[key] = numericPattern.test(trimmed) ? Number(trimmed) : value;
  }

  return Object.keys(metrics).length > 0 ? metrics : null;
}
101+
124102
function parseDuration(durationStr) {
125103
if (!durationStr) return null;
126104
if (typeof durationStr === 'number') return durationStr;
@@ -142,9 +120,11 @@ function extractMetricsFromRuns(runs, config) {
142120
const dataPoints = [];
143121

144122
runs.forEach(run => {
145-
if (!run.specs || !Array.isArray(run.specs)) return;
123+
// Use suite.SpecReports which contains all test results
124+
const specs = run.suite?.SpecReports || run.specs || [];
125+
if (!Array.isArray(specs)) return;
146126

147-
run.specs.forEach(spec => {
127+
specs.forEach(spec => {
148128
const testName = spec.LeafNodeText || '';
149129

150130
// Check if this spec matches the pattern
@@ -154,11 +134,18 @@ function extractMetricsFromRuns(runs, config) {
154134
// Only include passed tests for metrics
155135
if (spec.State !== 'passed') return;
156136

157-
const metric = config.extractMetric(spec.ReportEntries);
137+
// Try ReportEntries first (new format), fall back to parsing output (current format)
138+
let metrics = findMetrics(spec.ReportEntries);
139+
if (!metrics) {
140+
metrics = parseMetricsFromOutput(spec.CapturedGinkgoWriterOutput);
141+
}
142+
143+
// Pass metrics to extractMetric for compatibility
144+
const metric = config.extractMetric(spec.ReportEntries, metrics);
158145
if (metric === null || metric === undefined) return;
159146

160-
const metrics = findMetrics(spec.ReportEntries);
161-
const legend = config.legendBuilder(metrics || {});
147+
// Include container hierarchy for distinguishing test variants
148+
const legend = config.legendBuilder(metrics || {}, spec.ContainerHierarchyTexts);
162149

163150
dataPoints.push({
164151
timestamp: new Date(run.timestamp),
@@ -268,7 +255,7 @@ function createChart(canvasId, dataPoints, config) {
268255
},
269256
label: (context) => {
270257
const value = context.parsed.y;
271-
return `${context.dataset.label}: ${value.toFixed(1)}s`;
258+
return `${context.dataset.label}: ${value.toFixed(3)}s`;
272259
},
273260
},
274261
},
@@ -292,15 +279,16 @@ function createChart(canvasId, dataPoints, config) {
292279
},
293280
},
294281
y: {
295-
beginAtZero: true,
282+
beginAtZero: false,
283+
grace: '10%',
296284
grid: {
297285
color: '#30363d',
298286
drawBorder: false,
299287
},
300288
ticks: {
301289
color: '#8b949e',
302290
font: { size: 10 },
303-
callback: (value) => `${value}s`,
291+
callback: (value) => `${value.toFixed(3)}s`,
304292
},
305293
title: {
306294
display: true,
@@ -372,5 +360,11 @@ initializeTabs();
372360
// Listen for data loaded event from app.js
373361
window.addEventListener('scale-tests:data-loaded', () => {
374362
console.log('[metrics] Data loaded event received');
375-
// Metrics will be initialized when user clicks the Metrics tab
363+
364+
// If metrics tab is currently visible, initialize now
365+
const metricsTab = document.getElementById('metrics-main');
366+
if (metricsTab && !metricsTab.classList.contains('hidden')) {
367+
window._metricsInitialized = false; // Reset flag to allow initialization
368+
initializeMetrics();
369+
}
376370
});

0 commit comments

Comments
 (0)