Skip to content

Commit e01c392

Browse files
authored
[HUD] Adds dashboard in Metrics page to track ephemeral experimentation % over time (#6420)
Adds a dashboard titled "Percentage of jobs running on experiment" to the metrics page of HUD. The query is adapted (copied with small changes) from the meta/lf runners one. This dashboard is temporary: once the experiment and the migration are finished, both this dashboard and the `nonephemeral` metric on the SLI page should be removed.
1 parent 51699c8 commit e01c392

File tree

3 files changed

+158
-0
lines changed

3 files changed

+158
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"params": {
3+
"days_ago": "Int64",
4+
"experiment_name": "String"
5+
},
6+
"tests": []
7+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
WITH
2+
normalized_jobs AS (
3+
SELECT
4+
l AS label,
5+
extract(j.name, '[^,]*') AS job_name, -- Remove shard number and label from job names
6+
j.workflow_name,
7+
toStartOfInterval(j.started_at, INTERVAL 1 HOUR) AS bucket
8+
FROM
9+
-- Deliberately not adding FINAL to this workflow_job.
10+
-- Risks of not using it:
11+
-- - You may get duplicate records for rows that were updated corresponding to their
12+
-- before/after states, but as long as there’s some mechanism in the query to account
13+
-- for that it’s okay (we check for j.status = 'completed').
14+
-- - In the worst case scenario, you may only see the ‘old’ version of the records for some rows
15+
-- Costs of using it:
16+
-- Query processing time increases from ~5 -> 16 seconds
17+
-- - Memory usage grows from 7.5 GB -> 32 GB
18+
-- So the tradeoff is worth it for this query.
19+
workflow_job AS j
20+
ARRAY JOIN j.labels as l
21+
WHERE
22+
j.created_at > now() - INTERVAL {days_ago: Int64} DAY
23+
AND j.status = 'completed'
24+
AND l != 'self-hosted'
25+
AND l NOT LIKE 'lf.c.%'
26+
AND l NOT LIKE '%.canary'
27+
AND l NOT LIKE 'c.%'
28+
),
29+
experiment_jobs AS (
30+
SELECT
31+
DISTINCT j.job_name
32+
FROM
33+
normalized_jobs AS j
34+
WHERE
35+
j.label LIKE concat('%.', {experiment_name: String}, '.%')
36+
),
37+
comparable_jobs AS (
38+
SELECT
39+
j.bucket,
40+
j.label,
41+
j.job_name,
42+
-- (job_name above already has the shard number and label removed in normalized_jobs)
43+
j.workflow_name
44+
FROM
45+
normalized_jobs AS j
46+
INNER JOIN
47+
experiment_jobs AS lfj ON j.job_name = lfj.job_name
48+
),
49+
success_stats AS (
50+
SELECT
51+
bucket,
52+
count(*) AS group_size,
53+
job_name,
54+
workflow_name,
55+
label,
56+
if(like(label, concat('%.', {experiment_name: String}, '.%')), True, False) AS is_ephemeral_exp
57+
FROM
58+
comparable_jobs
59+
GROUP BY
60+
bucket, job_name, workflow_name, label
61+
),
62+
comparison_stats AS (
63+
SELECT
64+
experiment.bucket,
65+
SUM(experiment.group_size + m.group_size) AS total_jobs,
66+
SUM(m.group_size) AS compliment_jobs,
67+
SUM(experiment.group_size) AS counted_jobs,
68+
m.is_ephemeral_exp AS c_fleet,
69+
experiment.is_ephemeral_exp AS m_fleet,
70+
CAST(SUM(experiment.group_size) AS Float32) / SUM(experiment.group_size + m.group_size) * 100 AS percentage,
71+
IF(experiment.is_ephemeral_exp, 'On experiment', 'Not on experiment') AS fleet
72+
FROM
73+
success_stats AS experiment
74+
INNER JOIN
75+
success_stats AS m ON experiment.bucket = m.bucket
76+
WHERE
77+
experiment.job_name = m.job_name
78+
AND experiment.workflow_name = m.workflow_name
79+
AND experiment.is_ephemeral_exp = 1 AND m.is_ephemeral_exp = 0
80+
AND experiment.group_size > 3
81+
AND m.group_size > 3
82+
GROUP BY
83+
experiment.bucket, experiment.is_ephemeral_exp, m.is_ephemeral_exp
84+
)
85+
SELECT * FROM comparison_stats
86+
ORDER BY bucket DESC, fleet

torchci/pages/metrics.tsx

+65
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,37 @@ export function TtsPercentilePicker({
336336
);
337337
}
338338

339+
/**
340+
* Dropdown that lets the user pick the experiment whose metrics are shown.
341+
*/
342+
export function ExperimentPicker({
343+
experimentName,
344+
setExperimentName,
345+
}: {
346+
experimentName: string;
347+
setExperimentName: any;
348+
}) {
349+
function handleChange(e: SelectChangeEvent<string>) {
350+
setExperimentName(e.target.value as string);
351+
}
352+
353+
return (
354+
<>
355+
<FormControl>
356+
<InputLabel id="experiment-picker-select-label">Experiment</InputLabel>
357+
<Select
358+
defaultValue={experimentName}
359+
label="Experiment Name"
360+
labelId="experiment-picker-select-label"
361+
onChange={handleChange}
362+
>
363+
<MenuItem value={"ephemeral"}>ephemeral</MenuItem>
364+
</Select>
365+
</FormControl>
366+
</>
367+
);
368+
}
369+
339370
function WorkflowDuration({
340371
percentile,
341372
timeParams,
@@ -425,6 +456,7 @@ export default function Page() {
425456
};
426457

427458
const [ttsPercentile, setTtsPercentile] = useState<number>(0.5);
459+
const [experimentName, setExperimentName] = useState<string>("ephemeral");
428460

429461
// Split the aggregated red % into broken trunk and flaky red %
430462
const queryName = "master_commit_red_avg";
@@ -918,6 +950,39 @@ export default function Page() {
918950
yAxisRenderer={(value) => value.toFixed(2).toString() + "%"}
919951
/>
920952
</Grid2>
953+
954+
<Grid2 size={{ xs: 12 }}>
955+
<Stack direction="row" spacing={2} sx={{ mb: 2 }}>
956+
<Typography variant="h3" gutterBottom>
957+
Percentage of jobs running on experiment
958+
</Typography>
959+
<ExperimentPicker
960+
experimentName={experimentName}
961+
setExperimentName={setExperimentName}
962+
/>
963+
</Stack>
964+
<p>
965+
This pannel shows the % of jobs that are running the selected
966+
experiment in the dropbox.
967+
</p>
968+
</Grid2>
969+
970+
<Grid2 size={{ xs: 12 }} height={ROW_HEIGHT}>
971+
<TimeSeriesPanel
972+
title={"Percentage of jobs running on experiment"}
973+
queryName={"experiment_rollover_percentage"}
974+
queryParams={{
975+
...timeParams,
976+
days_ago: timeRange,
977+
experiment_name: experimentName,
978+
}}
979+
granularity={"hour"}
980+
timeFieldName={"bucket"}
981+
yAxisFieldName={"percentage"}
982+
groupByFieldName={"fleet"}
983+
yAxisRenderer={(value) => value.toFixed(2).toString() + "%"}
984+
/>
985+
</Grid2>
921986
</Grid2>
922987
</div>
923988
);

0 commit comments

Comments
 (0)