@@ -277,7 +277,14 @@ def make_age_filter_stmt(self, min_age_to_optimize: int, max_age_to_optimize: in
277277
278278 return filter_stmt
279279
280- def create_summary_stmt (self ) -> str :
def _format_bytes_stmt(self, bytes_column: str) -> str:
    """Build a SQL expression that renders a byte count as a readable string.

    The generated expression divides the value by the largest power of 1024
    that does not exceed it, rounds the quotient to two decimals and appends
    the matching unit label (e.g. ``1536`` becomes ``1.5 KB``).

    Args:
        bytes_column: Name of the column (or any SQL expression) holding the
            size in bytes. It is interpolated verbatim into the statement.

    Returns:
        str: A ``CONCAT(...)`` SQL expression yielding ``'<value> <unit>'``.
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB")

    # Power of 1024 used to scale the value and to pick the unit label.
    # GREATEST(..., 1) keeps LOG defined when the size is 0. LEAST(...) pins
    # the exponent to the last unit so that sizes of 1024^6 bytes and above
    # are reported in PB instead of indexing past the end of the unit array.
    exponent = (
        f"LEAST(FLOOR(LOG(1024, GREATEST({bytes_column}, 1))), {len(units) - 1})"
    )
    unit_array = ", ".join(f"'{unit}'" for unit in units)

    # ELEMENT_AT is 1-based, hence the "+ 1" on the zero-based exponent.
    return f"""CONCAT(
        ROUND({bytes_column} / POWER(1024, {exponent}), 2),
        ' ',
        ELEMENT_AT(ARRAY({unit_array}), CAST({exponent} AS INT) + 1)
    )"""
286+
287+ def create_summary_stmt (self , * , estimate_optimization_results : bool = False ) -> str :
281288 """Generate an SQL query for creating a partition diagnostics summary.
282289
283290 This summary includes metrics such as correlation factors, number of files
@@ -286,8 +293,18 @@ def create_summary_stmt(self) -> str:
286293 Returns:
287294 str: SQL query for analyzing partition health and optimization readiness.
288295 """
289- min_file_size_bytes = int (self .mnt_props .target_file_size_bytes * 0.75 )
290- max_file_size_bytes = int (self .mnt_props .target_file_size_bytes * 1.8 )
296+ target_file_size_stmt = str (self .mnt_props .target_file_size_bytes )
297+ if self .mnt_props .target_file_size_bytes <= 0 :
298+ target_file_size_stmt = """
299+ case
300+ when sum_file_size < 16L * 16 * 1048576 then 16L * 1048576
301+ when sum_file_size < 32L * 32 * 1048576 then 32L * 1048576
302+ when sum_file_size < 64L * 64 * 1048576 then 64L * 1048576
303+ when sum_file_size < 128L * 128 * 1048576 then 128L * 1048576
304+ when sum_file_size < 256L * 256 * 1048576 then 256L * 1048576
305+ when sum_file_size < 512L * 512 * 1048576 then 512L * 1048576
306+ else 1024L end
307+ """
291308
292309 num_files_targetted_for_rewrite_threshold = 5
293310
@@ -302,6 +319,8 @@ def create_summary_stmt(self) -> str:
302319 order_by = self .spec .make_order_stmt ()
303320 age_filter_stmt = self .make_age_filter_stmt (min_age_to_optimize , max_age_to_optimize )
304321
322+ cte_to_use = "final_decision" if not estimate_optimization_results else "final_estimate"
323+
305324 return f"""
306325 -- Diagnosing partitioned table '{ self .mnt_props .full_name } ' for optimization
307326 -- All data files to consider for optimization.
@@ -344,6 +363,44 @@ def create_summary_stmt(self) -> str:
344363 where
345364 spec_id = { self .spec_id }
346365 ),
366+ file_stats_per_partition as (
367+ select
368+ { grouping_stmt } ,
369+ content,
370+ record_count,
371+ file_size_in_bytes,
372+ rn1,
373+ rn2,
374+ is_data_file_from_widening_src_partition,
375+ -- Aggregations for content = 0 (data files)
376+ count_if(content = 0) over (partition by { grouping_stmt } ) as n_files,
377+ sum(case when content = 0 then record_count end) over (partition by { grouping_stmt } ) as n_records,
378+ avg(case when content = 0 then file_size_in_bytes end) over (partition by { grouping_stmt } ) as avg_file_size,
379+ min(case when content = 0 then file_size_in_bytes end) over (partition by { grouping_stmt } ) as min_file_size,
380+ max(case when content = 0 then file_size_in_bytes end) over (partition by { grouping_stmt } ) as max_file_size,
381+ sum(case when content = 0 then file_size_in_bytes end) over (partition by { grouping_stmt } ) as sum_file_size
382+ from
383+ ranked_data_files
384+ ),
385+ target_file_size_per_partition as (
386+ select
387+ { grouping_stmt } ,
388+ content,
389+ record_count,
390+ file_size_in_bytes,
391+ rn1,
392+ rn2,
393+ is_data_file_from_widening_src_partition,
394+ n_files,
395+ n_records,
396+ avg_file_size,
397+ min_file_size,
398+ max_file_size,
399+ sum_file_size,
400+ { target_file_size_stmt } as target_file_size
401+ from
402+ file_stats_per_partition
403+ ),
347404 -- Aggregate the metrics per partition.
348405 agg_data_files as (
349406 select
@@ -354,20 +411,22 @@ def create_summary_stmt(self) -> str:
354411 { self .spec .make_to_json_stmt ()} as partition_desc,
355412
356413 -- Aggregations for content = 0 (data files)
357- count_if(content = 0) as n_files,
414+ first(n_files) as n_files,
415+
416+ first(n_records) as n_records,
417+ first(avg_file_size) as avg_file_size,
418+ first(min_file_size) as min_file_size,
419+ first(max_file_size) as max_file_size,
420+ first(sum_file_size) as sum_file_size,
421+
422+ first(target_file_size) as target_file_size,
358423
359424 count_if(
360425 content = 0 and
361- (file_size_in_bytes < { min_file_size_bytes }
362- or file_size_in_bytes > { max_file_size_bytes } )
426+ (file_size_in_bytes < int(target_file_size * 0.75)
427+ or file_size_in_bytes > int(target_file_size * 1.8) )
363428 ) as num_files_targetted_for_rewrite,
364429
365- sum(case when content = 0 then record_count end) as n_records,
366- avg(case when content = 0 then file_size_in_bytes end) as avg_file_size,
367- min(case when content = 0 then file_size_in_bytes end) as min_file_size,
368- max(case when content = 0 then file_size_in_bytes end) as max_file_size,
369- sum(case when content = 0 then file_size_in_bytes end) as sum_file_size,
370-
371430 count_if(
372431 content = 0 and
373432 is_data_file_from_widening_src_partition = true
@@ -387,19 +446,20 @@ def create_summary_stmt(self) -> str:
387446
388447 sum(case when content > 0 then record_count else 0 end) as n_delete_records
389448 from
390- ranked_data_files
449+ target_file_size_per_partition
391450 group by
392451 { grouping_stmt }
393452 ),
394453 -- Add should optimize flags to the aggregate.
395- final as (
454+ final_decision as (
396455 select
397456 { grouping_stmt } ,
398457 partition_age,
399458 partition_desc,
400459 n_files,
401460 num_files_targetted_for_rewrite,
402461 n_records,
462+ target_file_size,
403463 avg_file_size,
404464 min_file_size,
405465 max_file_size,
@@ -419,7 +479,27 @@ def create_summary_stmt(self) -> str:
419479 { age_filter_stmt }
420480 order by
421481 { order_by }
482+ ),
483+ final_estimate as (
484+ select
485+ { grouping_stmt } ,
486+ partition_age,
487+ { self ._format_bytes_stmt ("sum_file_size" )} as partition_size,
488+ { self ._format_bytes_stmt ("avg_file_size" )} as avg_file_size,
489+ { self ._format_bytes_stmt ("target_file_size" )} as target_file_size,
490+ n_files as partition_num_files,
491+ int(sum_file_size / target_file_size) as partition_target_num_files,
492+ sum(n_files) over(partition by partition_age) as num_files_per_age,
493+ sum(int(sum_file_size / target_file_size)) over(partition by partition_age) as target_num_files_per_age
494+ from
495+ agg_data_files
496+ where
497+ { age_filter_stmt }
498+ order by
499+ partition_age asc,
500+ n_files desc,
501+ avg_file_size desc
422502 )
423503
424- select * from final
504+ select * from { cte_to_use }
425505 """
0 commit comments