@@ -43,10 +43,16 @@ use crate::framework::golden::write_case_title;
4343struct JoinMemoCase < ' a > {
4444 name : & ' a str ,
4545 description : & ' a str ,
46+ table_columns : & ' a str ,
4647 cluster_by : & ' a str ,
4748 sql : & ' a str ,
49+ column_statistics : fn ( u64 ) -> HashMap < String , BasicColumnStatistics > ,
4850}
4951
/// Column list for the plain key tables: two join keys and one value column.
const KEY_TABLE_COLUMNS: &str = "(k1 BIGINT, k2 BIGINT, v BIGINT)";

/// Column list for the trace-style tables: the key-table columns plus
/// `start_time`, `start_day` and `trace_id`.
///
/// The trailing `\` continues the literal on the next line; the newline and the
/// next line's leading whitespace are not part of the value.
const TRACE_TABLE_COLUMNS: &str = "(k1 BIGINT, k2 BIGINT, v BIGINT, \
    start_time TIMESTAMP, start_day UInt32, trace_id STRING)";
55+
5056fn table_statistics ( rows : u64 ) -> TableStatistics {
5157 TableStatistics {
5258 num_rows : Some ( rows) ,
@@ -78,6 +84,29 @@ fn column_statistics(rows: u64) -> HashMap<String, BasicColumnStatistics> {
7884 . collect ( )
7985}
8086
87+ fn trace_column_statistics ( rows : u64 ) -> HashMap < String , BasicColumnStatistics > {
88+ let mut stats = column_statistics ( rows) ;
89+ stats. insert ( "start_day" . to_string ( ) , BasicColumnStatistics {
90+ min : Some ( Datum :: UInt ( 20240101 ) ) ,
91+ max : Some ( Datum :: UInt ( 20241231 ) ) ,
92+ ndv : Some ( NdvEstimate :: exact ( 365.0 ) ) ,
93+ null_count : 0 ,
94+ in_memory_size : rows. saturating_mul ( 4 ) ,
95+ } ) ;
96+ stats. insert ( "trace_id" . to_string ( ) , BasicColumnStatistics {
97+ min : Some ( Datum :: Bytes (
98+ b"0000000000000000000000000000000000000000" . to_vec ( ) ,
99+ ) ) ,
100+ max : Some ( Datum :: Bytes (
101+ b"ffffffffffffffffffffffffffffffffffffffff" . to_vec ( ) ,
102+ ) ) ,
103+ ndv : Some ( NdvEstimate :: exact ( rows as f64 ) ) ,
104+ null_count : 0 ,
105+ in_memory_size : rows. saturating_mul ( 40 ) ,
106+ } ) ;
107+ stats
108+ }
109+
81110#[ tokio:: test( flavor = "multi_thread" , worker_threads = 1 ) ]
82111async fn test_cluster_key_order_join_memo_golden ( ) -> Result < ( ) > {
83112 let mut file = open_golden_file ( "optimizer" , "cluster_key_join_order.txt" ) ?;
@@ -86,46 +115,85 @@ async fn test_cluster_key_order_join_memo_golden() -> Result<()> {
86115 JoinMemoCase {
87116 name : "k1_k2_prefix" ,
88117 description : "Full memo output when the clustered probe can first match a.k1." ,
118+ table_columns : KEY_TABLE_COLUMNS ,
89119 cluster_by : "CLUSTER BY (k1, k2)" ,
90120 sql : "
91121 SELECT *
92122 FROM a
93123 JOIN b ON a.k1 = b.k1
94124 JOIN c ON a.k2 = c.k2
95125 " ,
126+ column_statistics,
96127 } ,
97128 JoinMemoCase {
98129 name : "k2_k1_prefix" ,
99130 description : "Full memo output when the clustered probe can first match a.k2." ,
131+ table_columns : KEY_TABLE_COLUMNS ,
100132 cluster_by : "CLUSTER BY (k2, k1)" ,
101133 sql : "
102134 SELECT *
103135 FROM a
104136 JOIN b ON a.k1 = b.k1
105137 JOIN c ON a.k2 = c.k2
106138 " ,
139+ column_statistics,
107140 } ,
108141 JoinMemoCase {
109142 name : "filter_preserves_cluster_keys" ,
110143 description : "Cluster keys still affect join order after a filter on the clustered table." ,
144+ table_columns : KEY_TABLE_COLUMNS ,
111145 cluster_by : "CLUSTER BY (k1, k2)" ,
112146 sql : "
113147 SELECT *
114148 FROM (SELECT * FROM a WHERE v >= 0) a
115149 JOIN b ON a.k1 = b.k1
116150 JOIN c ON a.k2 = c.k2
117151 " ,
152+ column_statistics,
118153 } ,
119154 JoinMemoCase {
120155 name : "limit_and_join_preserve_cluster_keys" ,
121156 description : "Cluster keys still affect join order after a limit subquery and a partial join." ,
157+ table_columns : KEY_TABLE_COLUMNS ,
122158 cluster_by : "CLUSTER BY (k1, k2)" ,
123159 sql : "
124160 SELECT *
125161 FROM (SELECT * FROM a LIMIT 1000) a
126162 JOIN b ON a.k1 = b.k1
127163 JOIN c ON a.k2 = c.k2
128164 " ,
165+ column_statistics,
166+ } ,
167+ JoinMemoCase {
168+ name : "build_side_cluster_keys_do_not_propagate" ,
169+ description : "Cluster keys from a build-side clustered table do not affect later join costs." ,
170+ table_columns : KEY_TABLE_COLUMNS ,
171+ cluster_by : "CLUSTER BY (k1, k2)" ,
172+ sql : "
173+ SELECT *
174+ FROM b
175+ JOIN (SELECT * FROM a LIMIT 100) a ON b.k1 = a.k1
176+ JOIN (SELECT * FROM c LIMIT 10) c ON a.k2 = c.k2
177+ " ,
178+ column_statistics,
179+ } ,
180+ JoinMemoCase {
181+ name : "linear_expression_cluster_key" ,
182+ description : "A LINEAR cluster key with to_yyyymmdd and substring expressions affects join costs." ,
183+ table_columns : TRACE_TABLE_COLUMNS ,
184+ cluster_by : "CLUSTER BY linear (
185+ to_yyyymmdd(start_time),
186+ SUBSTRING(trace_id FROM 1 FOR 40)
187+ )" ,
188+ sql : "
189+ SELECT *
190+ FROM a
191+ JOIN b
192+ ON to_yyyymmdd(a.start_time) = b.start_day
193+ AND SUBSTRING(a.trace_id FROM 1 FOR 40) = b.trace_id
194+ JOIN c ON a.k2 = c.k2
195+ " ,
196+ column_statistics : trace_column_statistics,
129197 } ,
130198 ] {
131199 write_cluster_key_join_order_memo ( & mut file, case) . await ?;
@@ -146,16 +214,19 @@ async fn write_cluster_key_join_order_memo(
146214 for table in [ "a" , "b" , "c" ] {
147215 let table_cluster_by = if table == "a" { case. cluster_by } else { "" } ;
148216 let setup_sql = match table_cluster_by {
149- "" => format ! ( "CREATE TABLE {table}(k1 BIGINT, k2 BIGINT, v BIGINT)" ) ,
217+ "" => format ! ( "CREATE TABLE {table}{}" , case . table_columns ) ,
150218 _ => {
151- format ! ( "CREATE TABLE {table}(k1 BIGINT, k2 BIGINT, v BIGINT) {table_cluster_by}" )
219+ format ! (
220+ "CREATE TABLE {table}{} {table_cluster_by}" ,
221+ case. table_columns
222+ )
152223 }
153224 } ;
154225 writeln ! ( file, "setup: {setup_sql}" ) ?;
155226 ctx. register_table_sql_with_stats (
156227 & setup_sql,
157228 Some ( table_statistics ( 1000 ) ) ,
158- column_statistics ( 1000 ) ,
229+ ( case . column_statistics ) ( 1000 ) ,
159230 )
160231 . await ?;
161232 }
0 commit comments