[HUD] Adds dashboard in Metrics page to track ephemeral experimentation % over time (#6420)

jeanschmidt · web-flow · commit e01c39243ae0 · 2025-03-18T15:45:46.000+01:00
Adds a dashboard titled "Percentage of jobs running on experiment"
to the metrics page of HUD.

The query is a lightly modified copy of the meta/lf runners query. This
dashboard is temporary and should be removed once the experiment and the
migration are finished.

Once that happens, both this dashboard and the `nonephemeral`
metric on the SLI page should be removed.
diff --git a/torchci/clickhouse_queries/experiment_rollover_percentage/params.json b/torchci/clickhouse_queries/experiment_rollover_percentage/params.json
@@ -0,0 +1,7 @@
+{
+  "params": {
+    "days_ago": "Int64",
+    "experiment_name": "String"
+  },
+  "tests": []
+}
diff --git a/torchci/clickhouse_queries/experiment_rollover_percentage/query.sql b/torchci/clickhouse_queries/experiment_rollover_percentage/query.sql
@@ -0,0 +1,86 @@
+WITH -- Hourly share of completed jobs running on labels that match the experiment_name parameter
+    normalized_jobs AS ( -- one row per (completed job, label) pair within the last days_ago days
+        SELECT
+            l AS label,
+            extract(j.name, '[^,]*') AS job_name, -- Keep only the text before the first comma: drops shard number and label from job names
+            j.workflow_name,
+            toStartOfInterval(j.started_at, INTERVAL 1 HOUR) AS bucket
+        FROM
+            -- Deliberately not adding FINAL to this workflow_job.
+            -- Risks of not using it:
+            --   - You may get duplicate records for rows that were updated, corresponding to their
+            --     before/after states, but as long as there's some mechanism in the query to account
+            --     for that it's okay (we check for j.status = 'completed').
+            --   - In the worst case scenario, you may only see the 'old' version of the records for some rows
+            -- Costs of using it:
+            --   - Query processing time increases from ~5 -> 16 seconds
+            --   - Memory usage grows from 7.5 GB -> 32 GB
+            -- So skipping FINAL is the better tradeoff for this query.
+            workflow_job AS j
+            ARRAY JOIN j.labels as l
+        WHERE
+            j.created_at > now() - INTERVAL {days_ago: Int64} DAY
+            AND j.status = 'completed'
+            AND l != 'self-hosted'
+            AND l NOT LIKE 'lf.c.%'
+            AND l NOT LIKE '%.canary'
+            AND l NOT LIKE 'c.%'
+    ),
+    experiment_jobs AS ( -- job names seen on at least one label matching the experiment
+        SELECT
+            DISTINCT j.job_name
+        FROM
+            normalized_jobs AS j
+        WHERE
+            j.label LIKE concat('%.', {experiment_name: String}, '.%')
+    ),
+    comparable_jobs AS ( -- every row (on any label) whose job name appears in experiment_jobs
+        SELECT
+            j.bucket,
+            j.label,
+            j.job_name,
+            -- job_name was already stripped of shard number and label in normalized_jobs
+            j.workflow_name
+        FROM
+            normalized_jobs AS j
+        INNER JOIN
+            experiment_jobs AS lfj ON j.job_name = lfj.job_name
+    ),
+    success_stats AS ( -- despite the name, this only counts jobs per bucket/job/workflow/label; nothing filters on success
+        SELECT
+            bucket,
+            count(*) AS group_size,
+            job_name,
+            workflow_name,
+            label,
+            if(like(label, concat('%.', {experiment_name: String}, '.%')), True, False) AS is_ephemeral_exp
+        FROM
+            comparable_jobs
+        GROUP BY
+            bucket, job_name, workflow_name, label
+    ),
+    comparison_stats AS ( -- per bucket: experiment rows paired with non-experiment rows of the same job and workflow
+        SELECT
+            experiment.bucket,
+            SUM(experiment.group_size + m.group_size) AS total_jobs,
+            SUM(m.group_size) AS compliment_jobs, -- jobs NOT on the experiment (the complement)
+            SUM(experiment.group_size) AS counted_jobs, -- jobs on the experiment
+            m.is_ephemeral_exp AS c_fleet,
+            experiment.is_ephemeral_exp AS m_fleet,
+            CAST(SUM(experiment.group_size) AS Float32) / SUM(experiment.group_size + m.group_size) * 100 AS percentage,
+            IF(experiment.is_ephemeral_exp, 'On experiment', 'Not on experiment') AS fleet -- always 'On experiment' given the WHERE filter below
+        FROM
+            success_stats AS experiment
+        INNER JOIN
+            success_stats AS m ON experiment.bucket = m.bucket -- NOTE(review): rows pair per label, so group sizes repeat in the SUMs if a job has several labels on one side; confirm intended
+        WHERE
+            experiment.job_name = m.job_name
+            AND experiment.workflow_name = m.workflow_name
+            AND experiment.is_ephemeral_exp = 1 AND m.is_ephemeral_exp = 0
+            AND experiment.group_size > 3 -- ignore groups with 3 or fewer jobs
+            AND m.group_size > 3
+        GROUP BY
+            experiment.bucket, experiment.is_ephemeral_exp, m.is_ephemeral_exp
+    )
+SELECT * FROM comparison_stats
+ORDER BY  bucket DESC, fleet
diff --git a/torchci/pages/metrics.tsx b/torchci/pages/metrics.tsx
@@ -336,6 +336,37 @@ export function TtsPercentilePicker({
   );
 }
 
+/**
+ * Dropdown for choosing which experiment the experiment charts should track.
+ * @param experimentName - experiment selected when the picker first renders.
+ * @param setExperimentName - called with the newly selected experiment name.
+ */
+export function ExperimentPicker({
+  experimentName,
+  setExperimentName,
+}: {
+  experimentName: string;
+  setExperimentName: (experimentName: string) => void;
+}) {
+  function handleChange(e: SelectChangeEvent<string>) {
+    setExperimentName(e.target.value);
+  }
+  return (
+    <FormControl>
+      <InputLabel id="experiment-picker-select-label">Experiment</InputLabel>
+      <Select
+        defaultValue={experimentName}
+        // Layout-only prop: must match the InputLabel text so the notch fits.
+        label="Experiment"
+        labelId="experiment-picker-select-label"
+        onChange={handleChange}
+      >
+        <MenuItem value={"ephemeral"}>ephemeral</MenuItem>
+      </Select>
+    </FormControl>
+  );
+}
+
 function WorkflowDuration({
   percentile,
   timeParams,
@@ -425,6 +456,7 @@ export default function Page() {
   };
 
   const [ttsPercentile, setTtsPercentile] = useState<number>(0.5);
+  const [experimentName, setExperimentName] = useState<string>("ephemeral");
 
   // Split the aggregated red % into broken trunk and flaky red %
   const queryName = "master_commit_red_avg";
@@ -918,6 +950,39 @@ export default function Page() {
             yAxisRenderer={(value) => value.toFixed(2).toString() + "%"}
           />
         </Grid2>
+
+        <Grid2 size={{ xs: 12 }}>
+          <Stack direction="row" spacing={2} sx={{ mb: 2 }}>
+            <Typography variant="h3" gutterBottom>
+              Percentage of jobs running on experiment
+            </Typography>
+            <ExperimentPicker
+              experimentName={experimentName}
+              setExperimentName={setExperimentName}
+            />
+          </Stack>
+          <p>
+            This pannel shows the % of jobs that are running the selected
+            experiment in the dropbox.
+          </p>
+        </Grid2>
+
+        <Grid2 size={{ xs: 12 }} height={ROW_HEIGHT}>
+          <TimeSeriesPanel
+            title={"Percentage of jobs running on experiment"}
+            queryName={"experiment_rollover_percentage"}
+            queryParams={{
+              ...timeParams,
+              days_ago: timeRange,
+              experiment_name: experimentName,
+            }}
+            granularity={"hour"}
+            timeFieldName={"bucket"}
+            yAxisFieldName={"percentage"}
+            groupByFieldName={"fleet"}
+            yAxisRenderer={(value) => value.toFixed(2).toString() + "%"}
+          />
+        </Grid2>
       </Grid2>
     </div>
   );