|
| 1 | +WITH |
| 2 | + normalized_jobs AS ( |
| 3 | + SELECT |
| 4 | + l AS label, |
| 5 | + extract(j.name, '[^,]*') AS job_name, -- Remove shard number and label from job names |
| 6 | + j.workflow_name, |
| 7 | + toStartOfInterval(j.started_at, INTERVAL 1 HOUR) AS bucket |
| 8 | + FROM |
| 9 | + -- Deliberatly not adding FINAL to this workflow_job. |
| 10 | + -- Risks of not using it: |
| 11 | + -- - You may get duplicate records for rows that were updated corresponding to their |
| 12 | + -- before/after states, but as long as there’s some mechanism in the query to account |
| 13 | + -- for that it’s okay (we check for j.status = 'completed`). |
| 14 | + -- - In the worst case scenario, you may only see the ‘old’ version of the records for some rows |
| 15 | + -- Costs of using it: |
| 16 | + -- - Query procesing time increases from ~5 -> 16 seconds |
| 17 | + -- - Memory usage grows from 7.5 GB -> 32 GB |
| 18 | + -- So the tradeoff is worth it for this query. |
| 19 | + workflow_job AS j |
| 20 | + ARRAY JOIN j.labels as l |
| 21 | + WHERE |
| 22 | + j.created_at > now() - INTERVAL {days_ago: Int64} DAY |
| 23 | + AND j.status = 'completed' |
| 24 | + AND l != 'self-hosted' |
| 25 | + AND l NOT LIKE 'lf.c.%' |
| 26 | + AND l NOT LIKE '%.canary' |
| 27 | + AND l NOT LIKE 'c.%' |
| 28 | + ), |
| 29 | + experiment_jobs AS ( |
| 30 | + SELECT |
| 31 | + DISTINCT j.job_name |
| 32 | + FROM |
| 33 | + normalized_jobs AS j |
| 34 | + WHERE |
| 35 | + j.label LIKE concat('%.', {experiment_name: String}, '.%') |
| 36 | + ), |
| 37 | + comparable_jobs AS ( |
| 38 | + SELECT |
| 39 | + j.bucket, |
| 40 | + j.label, |
| 41 | + j.job_name, |
| 42 | + -- Remove shard number and label from job names |
| 43 | + j.workflow_name |
| 44 | + FROM |
| 45 | + normalized_jobs AS j |
| 46 | + INNER JOIN |
| 47 | + experiment_jobs AS lfj ON j.job_name = lfj.job_name |
| 48 | + ), |
| 49 | + success_stats AS ( |
| 50 | + SELECT |
| 51 | + bucket, |
| 52 | + count(*) AS group_size, |
| 53 | + job_name, |
| 54 | + workflow_name, |
| 55 | + label, |
| 56 | + if(like(label, concat('%.', {experiment_name: String}, '.%')), True, False) AS is_ephemeral_exp |
| 57 | + FROM |
| 58 | + comparable_jobs |
| 59 | + GROUP BY |
| 60 | + bucket, job_name, workflow_name, label |
| 61 | + ), |
| 62 | + comparison_stats AS ( |
| 63 | + SELECT |
| 64 | + experiment.bucket, |
| 65 | + SUM(experiment.group_size + m.group_size) AS total_jobs, |
| 66 | + SUM(m.group_size) AS compliment_jobs, |
| 67 | + SUM(experiment.group_size) AS counted_jobs, |
| 68 | + m.is_ephemeral_exp AS c_fleet, |
| 69 | + experiment.is_ephemeral_exp AS m_fleet, |
| 70 | + CAST(SUM(experiment.group_size) AS Float32) / SUM(experiment.group_size + m.group_size) * 100 AS percentage, |
| 71 | + IF(experiment.is_ephemeral_exp, 'On experiment', 'Not on experiment') AS fleet |
| 72 | + FROM |
| 73 | + success_stats AS experiment |
| 74 | + INNER JOIN |
| 75 | + success_stats AS m ON experiment.bucket = m.bucket |
| 76 | + WHERE |
| 77 | + experiment.job_name = m.job_name |
| 78 | + AND experiment.workflow_name = m.workflow_name |
| 79 | + AND experiment.is_ephemeral_exp = 1 AND m.is_ephemeral_exp = 0 |
| 80 | + AND experiment.group_size > 3 |
| 81 | + AND m.group_size > 3 |
| 82 | + GROUP BY |
| 83 | + experiment.bucket, experiment.is_ephemeral_exp, m.is_ephemeral_exp |
| 84 | + ) |
| 85 | +SELECT * FROM comparison_stats |
| 86 | +ORDER BY bucket DESC, fleet |
0 commit comments