Use oss_ci_benchmark_metadata materialized view (#6167)

I'm attempting to use a new materialized view called `oss_ci_benchmark_metadata` to make it faster to query benchmark metadata. I have already added the new view manually but its definition is included in this PR for review. The old query works, but it's slower because the original table doesn't keep several important columns, i.e. benchmark name, in order https://github.com/pytorch/test-infra/blob/main/torchci/clickhouse_queries/oss_ci_benchmark_v3/query.sql#L98-L107. According to https://clickhouse.com/docs/en/sql-reference/statements/alter/order-by, it seems that we cannot add existing columns into ORDER BY. ### Testing When loading the benchmark metadata: * Slooooow https://hud.pytorch.org/benchmark/llms?repoName=pytorch%2Fpytorch * Faster https://torchci-git-fork-huydhn-use-osscibenchmarkm-96b1e1-fbopensource.vercel.app/benchmark/llms?repoName=pytorch%2Fpytorch Loading the benchmark data is still relatively slow IMO, but I will try to improve that in a separate PR.
pytorch · Jan 21, 2025 · d83a62f · d83a62f
1 parent 983a018
commit d83a62f
Show file tree

Hide file tree

Showing 3 changed files with 143 additions and 101 deletions.
diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_branches/query.sql b/torchci/clickhouse_queries/oss_ci_benchmark_branches/query.sql
@@ -1,60 +1,39 @@
 -- This query is used to get the list of branches and commits used by different
 -- OSS CI benchmark experiments. This powers HUD benchmarks dashboards
-WITH benchmarks AS (
-    SELECT
-        o.head_branch AS head_branch,
-        o.head_sha AS head_sha,
-        o.workflow_id AS id,
-        IF(
-            empty(o.runners),
-            tupleElement(o.benchmark, 'extra_info') [ 'device' ],
-            tupleElement(o.runners [ 1 ], 'name')
-        ) AS device,
-        IF(
-            empty(o.runners),
-            tupleElement(o.benchmark, 'extra_info') [ 'arch' ],
-            tupleElement(o.runners [ 1 ], 'type')
-        ) AS arch,
-        o.timestamp AS timestamp,
-        toStartOfDay(fromUnixTimestamp(o.timestamp)) AS event_time
-    FROM
-        benchmark.oss_ci_benchmark_v3 o
-    WHERE
-        o.timestamp >= toUnixTimestamp({startTime: DateTime64(3) })
-        AND o.timestamp < toUnixTimestamp({stopTime: DateTime64(3) })
-        AND o.repo = {repo: String }
-        AND (
-            has({benchmarks: Array(String) }, o.benchmark.name)
-            OR empty({benchmarks: Array(String) })
-        )
-        AND (
-            has({models: Array(String) }, o.model.name)
-            OR empty({models: Array(String) })
-        )
-        AND (
-            has({backends: Array(String) }, o.model.backend)
-            OR empty({backends: Array(String) })
-        )
-        AND (
-            has({dtypes: Array(String) }, o.benchmark.dtype)
-            OR empty({dtypes: Array(String) })
-        )
-        AND (
-            NOT has({excludedMetrics: Array(String) }, o.metric.name)
-            OR empty({excludedMetrics: Array(String) })
-        )
-        AND notEmpty(o.metric.name)
-)
 SELECT
     DISTINCT replaceOne(head_branch, 'refs/heads/', '') AS head_branch,
     head_sha,
-    id,
-    event_time
+    workflow_id AS id,
+    toStartOfDay(fromUnixTimestamp(timestamp)) AS event_time
 FROM
-    benchmarks
+    benchmark.oss_ci_benchmark_metadata
 WHERE
+    timestamp >= toUnixTimestamp({startTime: DateTime64(3) })
+    AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) })
+    AND repo = {repo: String }
+    AND (
+        has({benchmarks: Array(String) }, benchmark_name)
+        OR empty({benchmarks: Array(String) })
+    )
+    AND (
+        has({models: Array(String) }, model_name)
+        OR empty({models: Array(String) })
+    )
+    AND (
+        has({backends: Array(String) }, model_backend)
+        OR empty({backends: Array(String) })
+    )
+    AND (
+        has({dtypes: Array(String) }, benchmark_dtype)
+        OR empty({dtypes: Array(String) })
+    )
+    AND (
+        NOT has({excludedMetrics: Array(String) }, metric_name)
+        OR empty({excludedMetrics: Array(String) })
+    )
+    AND notEmpty(metric_name)
     -- NB: DEVICE (ARCH) is the display format used by HUD when grouping together these two fields
-    (
+    AND (
         CONCAT(
             device,
             ' (',

diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_names/query.sql b/torchci/clickhouse_queries/oss_ci_benchmark_names/query.sql
@@ -1,62 +1,41 @@
 --- This query is used by HUD benchmarks dashboards to get the list of experiment names
-WITH benchmarks AS (
-    SELECT
-        o.benchmark.name AS benchmark,
-        o.model.name AS model,
-        o.model.backend AS backend,
-        o.metric.name AS metric,
-        o.benchmark.dtype AS dtype,
-        IF(
-            empty(o.runners),
-            tupleElement(o.benchmark, 'extra_info') [ 'device' ],
-            tupleElement(o.runners [ 1 ], 'name')
-        ) AS device,
-        IF(
-            empty(o.runners),
-            tupleElement(o.benchmark, 'extra_info') [ 'arch' ],
-            tupleElement(o.runners [ 1 ], 'type')
-        ) AS arch
-    FROM
-        benchmark.oss_ci_benchmark_v3 o
-    WHERE
-        o.timestamp >= toUnixTimestamp({startTime: DateTime64(3) })
-        AND o.timestamp < toUnixTimestamp({stopTime: DateTime64(3) })
-        AND o.repo = {repo: String }
-        AND (
-            has({benchmarks: Array(String) }, o.benchmark.name)
-            OR empty({benchmarks: Array(String) })
-        )
-        AND (
-            has({models: Array(String) }, o.model.name)
-            OR empty({models: Array(String) })
-        )
-        AND (
-            has({backends: Array(String) }, o.model.backend)
-            OR empty({backends: Array(String) })
-        )
-        AND (
-            has({dtypes: Array(String) }, o.benchmark.dtype)
-            OR empty({dtypes: Array(String) })
-        )
-        AND (
-            NOT has({excludedMetrics: Array(String) }, o.metric.name)
-            OR empty({excludedMetrics: Array(String) })
-        )
-        AND notEmpty(o.metric.name)
-)
 SELECT
-    DISTINCT benchmark,
-    model,
-    backend,
-    metric,
-    dtype,
+    DISTINCT benchmark_name AS benchmark,
+    model_name AS model,
+    model_backend AS backend,
+    metric_name AS metric,
+    benchmark_dtype AS dtype,
     device,
     arch
 FROM
-    benchmarks
+    benchmark.oss_ci_benchmark_metadata
 WHERE
+    timestamp >= toUnixTimestamp({startTime: DateTime64(3) })
+    AND timestamp < toUnixTimestamp({stopTime: DateTime64(3) })
+    AND repo = {repo: String }
+    AND (
+        has({benchmarks: Array(String) }, benchmark_name)
+        OR empty({benchmarks: Array(String) })
+    )
+    AND (
+        has({models: Array(String) }, model_name)
+        OR empty({models: Array(String) })
+    )
+    AND (
+        has({backends: Array(String) }, model_backend)
+        OR empty({backends: Array(String) })
+    )
+    AND (
+        has({dtypes: Array(String) }, benchmark_dtype)
+        OR empty({dtypes: Array(String) })
+    )
+    AND (
+        NOT has({excludedMetrics: Array(String) }, metric_name)
+        OR empty({excludedMetrics: Array(String) })
+    )
+    AND notEmpty(metric_name)
     -- NB: DEVICE (ARCH) is the display format used by HUD when grouping together these two fields
-    (
+    AND (
         CONCAT(
             device,
             ' (',

diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_v3_materialized_views/query.sql b/torchci/clickhouse_queries/oss_ci_benchmark_v3_materialized_views/query.sql
@@ -0,0 +1,84 @@
+-- This table is used to speed-up the performance of oss_ci_benchmark_names and
+-- and oss_ci_benchmark_branches queries
+CREATE TABLE benchmark.oss_ci_benchmark_metadata (
+    `repo` String,
+    `benchmark_name` String,
+    `benchmark_dtype` String,
+    `model_name` String,
+    `model_backend` String,
+    `device` String,
+    `arch` String,
+    `metric_name` String,
+    `head_branch` String,
+    `head_sha` String,
+    `workflow_id` UInt64,
+    `timestamp` UInt64,
+) ENGINE = MergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
+ORDER BY
+    (
+        repo,
+        benchmark_name,
+        benchmark_dtype,
+        model_name,
+        model_backend,
+        device,
+        arch,
+        metric_name,
+        head_branch,
+        workflow_id,
+        timestamp
+    ) SETTINGS index_granularity = 8192;
+
+CREATE MATERIALIZED VIEW benchmark.oss_ci_benchmark_metadata_mv TO benchmark.oss_ci_benchmark_metadata AS
+SELECT
+    repo AS repo,
+    tupleElement(benchmark, 'name') AS benchmark_name,
+    tupleElement(benchmark, 'dtype') AS benchmark_dtype,
+    tupleElement(model, 'name') AS model_name,
+    tupleElement(model, 'backend') AS model_backend,
+    IF(
+        empty(runners),
+        tupleElement(benchmark, 'extra_info') [ 'device' ],
+        tupleElement(runners [ 1 ], 'name')
+    ) AS device,
+    IF(
+        empty(runners),
+        tupleElement(benchmark, 'extra_info') [ 'arch' ],
+        tupleElement(runners [ 1 ], 'type')
+    ) AS arch,
+    tupleElement(metric, 'name') AS metric_name,
+    head_branch AS head_branch,
+    head_sha AS head_sha,
+    workflow_id AS workflow_id,
+    timestamp AS timestamp
+FROM
+    benchmark.oss_ci_benchmark_v3
+WHERE
+    timestamp >= toUnixTimestamp(toDateTime('2025-01-20 22:45:00'));
+
+-- Below is the SQL query to backfill the view with all data from 2024 onward
+INSERT INTO
+    benchmark.oss_ci_benchmark_metadata
+SELECT
+    repo AS repo,
+    tupleElement(benchmark, 'name') AS benchmark_name,
+    tupleElement(benchmark, 'dtype') AS benchmark_dtype,
+    tupleElement(model, 'name') AS model_name,
+    tupleElement(model, 'backend') AS model_backend,
+    IF(
+        empty(runners),
+        tupleElement(benchmark, 'extra_info') [ 'device' ],
+        tupleElement(runners [ 1 ], 'name')
+    ) AS device,
+    IF(
+        empty(runners),
+        tupleElement(benchmark, 'extra_info') [ 'arch' ],
+        tupleElement(runners [ 1 ], 'type')
+    ) AS arch,
+    tupleElement(metric, 'name') AS metric_name,
+    head_branch AS head_branch,
+    head_sha AS head_sha,
+    workflow_id AS workflow_id,
+    timestamp AS timestamp
+FROM
+    benchmark.oss_ci_benchmark_v3