fix(sql): preserve only probe-side cluster keys

forsaken628 · forsaken628 · commit 45b6c23bda12 · 2026-06-01T09:58:55.000+08:00
diff --git a/src/query/sql/src/planner/plans/join.rs b/src/query/sql/src/planner/plans/join.rs
@@ -510,8 +510,10 @@ impl Join {
         let inner_join_cardinality = join_estimation.join_card();
         let cardinality =
             self.join_cardinality(left_cardinality, right_cardinality, inner_join_cardinality);
-        let mut cluster_keys = left_statistics.cluster_keys.clone();
-        cluster_keys.extend(right_statistics.cluster_keys.clone());
+        // Hash join output follows the probe (left) side. Build-side (right)
+        // clustering is not preserved by hash table lookups, even though
+        // build-side columns remain available in the joined rows.
+        let cluster_keys = left_statistics.cluster_keys.clone();
         if let Some(columns) = join_estimation.updated_columns() {
             match self.join_type {
                 JoinType::LeftSemi => {
diff --git a/src/query/sql/tests/it/optimizer/cluster_key_join_order.rs b/src/query/sql/tests/it/optimizer/cluster_key_join_order.rs
@@ -43,10 +43,16 @@ use crate::framework::golden::write_case_title;
 struct JoinMemoCase<'a> {
     name: &'a str,
     description: &'a str,
+    table_columns: &'a str, // column list appended right after `CREATE TABLE <name>`
     cluster_by: &'a str,
     sql: &'a str,
+    column_statistics: fn(u64) -> HashMap<String, BasicColumnStatistics>, // rows -> column stats
 }
 
+const KEY_TABLE_COLUMNS: &str = "(k1 BIGINT, k2 BIGINT, v BIGINT)"; // schema without trace columns
+const TRACE_TABLE_COLUMNS: &str = "\
+    (k1 BIGINT, k2 BIGINT, v BIGINT, start_time TIMESTAMP, start_day UInt32, trace_id STRING)";
+
 fn table_statistics(rows: u64) -> TableStatistics {
     TableStatistics {
         num_rows: Some(rows),
@@ -78,6 +84,29 @@ fn column_statistics(rows: u64) -> HashMap<String, BasicColumnStatistics> {
         .collect()
 }
 
+fn trace_column_statistics(rows: u64) -> HashMap<String, BasicColumnStatistics> {
+    // Start from what `column_statistics` yields, then add `start_day` and `trace_id`.
+    let mut merged = column_statistics(rows);
+    let start_day = BasicColumnStatistics {
+        min: Some(Datum::UInt(20240101)),
+        max: Some(Datum::UInt(20241231)),
+        ndv: Some(NdvEstimate::exact(365.0)),
+        null_count: 0,
+        in_memory_size: rows.saturating_mul(4),
+    };
+    // Bounds are the all-'0' and all-'f' byte strings; NDV equals the row count.
+    let trace_id = BasicColumnStatistics {
+        min: Some(Datum::Bytes(b"0000000000000000000000000000000000000000".to_vec())),
+        max: Some(Datum::Bytes(b"ffffffffffffffffffffffffffffffffffffffff".to_vec())),
+        ndv: Some(NdvEstimate::exact(rows as f64)),
+        null_count: 0,
+        in_memory_size: rows.saturating_mul(40),
+    };
+    merged.insert("start_day".to_string(), start_day);
+    merged.insert("trace_id".to_string(), trace_id);
+    merged
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
 async fn test_cluster_key_order_join_memo_golden() -> Result<()> {
     let mut file = open_golden_file("optimizer", "cluster_key_join_order.txt")?;
@@ -86,46 +115,85 @@ async fn test_cluster_key_order_join_memo_golden() -> Result<()> {
         JoinMemoCase {
             name: "k1_k2_prefix",
             description: "Full memo output when the clustered probe can first match a.k1.",
+            table_columns: KEY_TABLE_COLUMNS,
             cluster_by: "CLUSTER BY (k1, k2)",
             sql: "
                     SELECT *
                     FROM a
                     JOIN b ON a.k1 = b.k1
                     JOIN c ON a.k2 = c.k2
                 ",
+            column_statistics,
         },
         JoinMemoCase {
             name: "k2_k1_prefix",
             description: "Full memo output when the clustered probe can first match a.k2.",
+            table_columns: KEY_TABLE_COLUMNS,
             cluster_by: "CLUSTER BY (k2, k1)",
             sql: "
                     SELECT *
                     FROM a
                     JOIN b ON a.k1 = b.k1
                     JOIN c ON a.k2 = c.k2
                 ",
+            column_statistics,
         },
         JoinMemoCase {
             name: "filter_preserves_cluster_keys",
             description: "Cluster keys still affect join order after a filter on the clustered table.",
+            table_columns: KEY_TABLE_COLUMNS,
             cluster_by: "CLUSTER BY (k1, k2)",
             sql: "
                     SELECT *
                     FROM (SELECT * FROM a WHERE v >= 0) a
                     JOIN b ON a.k1 = b.k1
                     JOIN c ON a.k2 = c.k2
                 ",
+            column_statistics,
         },
         JoinMemoCase {
             name: "limit_and_join_preserve_cluster_keys",
             description: "Cluster keys still affect join order after a limit subquery and a partial join.",
+            table_columns: KEY_TABLE_COLUMNS,
             cluster_by: "CLUSTER BY (k1, k2)",
             sql: "
                     SELECT *
                     FROM (SELECT * FROM a LIMIT 1000) a
                     JOIN b ON a.k1 = b.k1
                     JOIN c ON a.k2 = c.k2
                 ",
+            column_statistics,
+        },
+        JoinMemoCase {
+            name: "build_side_cluster_keys_do_not_propagate",
+            description: "Cluster keys from a build-side clustered table do not affect later join costs.",
+            table_columns: KEY_TABLE_COLUMNS,
+            cluster_by: "CLUSTER BY (k1, k2)",
+            sql: "
+                    SELECT *
+                    FROM b
+                    JOIN (SELECT * FROM a LIMIT 100) a ON b.k1 = a.k1
+                    JOIN (SELECT * FROM c LIMIT 10) c ON a.k2 = c.k2
+                ",
+            column_statistics,
+        },
+        JoinMemoCase {
+            name: "linear_expression_cluster_key",
+            description: "A LINEAR cluster key with to_yyyymmdd and substring expressions affects join costs.",
+            table_columns: TRACE_TABLE_COLUMNS,
+            cluster_by: "CLUSTER BY linear (
+                    to_yyyymmdd(start_time),
+                    SUBSTRING(trace_id FROM 1 FOR 40)
+                )",
+            sql: "
+                    SELECT *
+                    FROM a
+                    JOIN b
+                        ON to_yyyymmdd(a.start_time) = b.start_day
+                        AND SUBSTRING(a.trace_id FROM 1 FOR 40) = b.trace_id
+                    JOIN c ON a.k2 = c.k2
+                ",
+            column_statistics: trace_column_statistics,
         },
     ] {
         write_cluster_key_join_order_memo(&mut file, case).await?;
@@ -146,16 +214,19 @@ async fn write_cluster_key_join_order_memo(
     for table in ["a", "b", "c"] {
         let table_cluster_by = if table == "a" { case.cluster_by } else { "" };
         let setup_sql = match table_cluster_by {
-            "" => format!("CREATE TABLE {table}(k1 BIGINT, k2 BIGINT, v BIGINT)"),
+            "" => format!("CREATE TABLE {table}{}", case.table_columns),
             _ => {
-                format!("CREATE TABLE {table}(k1 BIGINT, k2 BIGINT, v BIGINT) {table_cluster_by}")
+                format!(
+                    "CREATE TABLE {table}{} {table_cluster_by}",
+                    case.table_columns
+                )
             }
         };
         writeln!(file, "setup: {setup_sql}")?;
         ctx.register_table_sql_with_stats(
             &setup_sql,
             Some(table_statistics(1000)),
-            column_statistics(1000),
+            (case.column_statistics)(1000),
         )
         .await?;
     }
diff --git a/src/query/sql/tests/it/optimizer/cluster_key_join_order.txt b/src/query/sql/tests/it/optimizer/cluster_key_join_order.txt
@@ -194,3 +194,116 @@ Memo
     └── #0 EvalScalar [#6]
 
 
+=== build_side_cluster_keys_do_not_propagate ===
+description: Cluster keys from a build-side clustered table do not affect later join costs.
+setup: CREATE TABLE a(k1 BIGINT, k2 BIGINT, v BIGINT) CLUSTER BY (k1, k2)
+setup: CREATE TABLE b(k1 BIGINT, k2 BIGINT, v BIGINT)
+setup: CREATE TABLE c(k1 BIGINT, k2 BIGINT, v BIGINT)
+sql: SELECT *
+FROM b
+JOIN (SELECT * FROM a LIMIT 100) a ON b.k1 = a.k1
+JOIN (SELECT * FROM c LIMIT 10) c ON a.k2 = c.k2
+memo:
+DPhpyOptimizer:
+join_order_candidate:
+- parent: [a, c], left: [a], right: [c], cost: 1000.000, previous best: -, probe factor: 1.000, selected: true
+- parent: [a, b], left: [b], right: [a], cost: 100000.000, previous best: -, probe factor: 1.000, selected: true
+- parent: [a, b, c], left: [b], right: [a, c], cost: 1001000.000, previous best: -, probe factor: 1.000, selected: true
+- parent: [a, b, c], left: [a, b], right: [c], cost: 1100000.000, previous best: 1001000.000, probe factor: 1.000, selected: false
+
+Memo
+├── root group: #9
+├── estimated memory: 6.25 KiB
+├── Group #0
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 1000.000, children: []
+│   └── #0 Scan []
+├── Group #1
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 1000.000, children: []
+│   └── #0 Scan []
+├── Group #2
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 2000.000, children: [{ dist: Any }]
+│   └── #0 Limit [#1]
+├── Group #3
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 2100.000, children: [{ dist: Any }]
+│   └── #0 EvalScalar [#2]
+├── Group #4
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 1000.000, children: []
+│   └── #0 Scan []
+├── Group #5
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 2000.000, children: [{ dist: Any }]
+│   └── #0 Limit [#4]
+├── Group #6
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 2010.000, children: [{ dist: Any }]
+│   └── #0 EvalScalar [#5]
+├── Group #7
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 4310.000, children: [{ dist: Any }, { dist: Any }]
+│   └── #0 Join [#3, #6]
+├── Group #8
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 16310.000, children: [{ dist: Any }, { dist: Any }]
+│   └── #0 Join [#0, #7]
+└── Group #9
+    ├── Best properties
+    │   └── { dist: Any }: expr: #0, cost: 1016310.000, children: [{ dist: Any }]
+    └── #0 EvalScalar [#8]
+
+
+=== linear_expression_cluster_key ===
+description: A LINEAR cluster key with to_yyyymmdd and substring expressions affects join costs.
+setup: CREATE TABLE a(k1 BIGINT, k2 BIGINT, v BIGINT, start_time TIMESTAMP, start_day UInt32, trace_id STRING) CLUSTER BY linear (
+                    to_yyyymmdd(start_time),
+                    SUBSTRING(trace_id FROM 1 FOR 40)
+                )
+setup: CREATE TABLE b(k1 BIGINT, k2 BIGINT, v BIGINT, start_time TIMESTAMP, start_day UInt32, trace_id STRING)
+setup: CREATE TABLE c(k1 BIGINT, k2 BIGINT, v BIGINT, start_time TIMESTAMP, start_day UInt32, trace_id STRING)
+sql: SELECT *
+FROM a
+JOIN b
+ON to_yyyymmdd(a.start_time) = b.start_day
+AND SUBSTRING(a.trace_id FROM 1 FOR 40) = b.trace_id
+JOIN c ON a.k2 = c.k2
+memo:
+DPhpyOptimizer:
+join_order_candidate:
+- parent: [a, c], left: [a], right: [c], cost: 1000.000, previous best: -, probe factor: 1.000, selected: true
+- parent: [a, b], left: [a], right: [b], cost: 902500.000, previous best: -, probe factor: 0.902, selected: true
+- parent: [a, b, c], left: [a, b], right: [c], cost: 903500.000, previous best: -, probe factor: 1.000, selected: true
+- parent: [a, b, c], left: [a, c], right: [b], cost: 1001000.000, previous best: 903500.000, probe factor: 1.000, selected: false
+
+Memo
+├── root group: #5
+├── estimated memory: 3.75 KiB
+├── Group #0
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 1000.000, children: []
+│   └── #0 Scan []
+├── Group #1
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 1000.000, children: []
+│   └── #0 Scan []
+├── Group #2
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 13000.000, children: [{ dist: Any }, { dist: Any }]
+│   └── #0 Join [#0, #1]
+├── Group #3
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 1000.000, children: []
+│   └── #0 Scan []
+├── Group #4
+│   ├── Best properties
+│   │   └── { dist: Any }: expr: #0, cost: 1024000.000, children: [{ dist: Any }, { dist: Any }]
+│   └── #0 Join [#2, #3]
+└── Group #5
+    ├── Best properties
+    │   └── { dist: Any }: expr: #0, cost: 1025000.000, children: [{ dist: Any }]
+    └── #0 EvalScalar [#4]
+
+