Add auto-sizing of the target data file size and a simulation mode to the diagnose command

cccs-jc · cccs-jc · commit 49a417ba6877 · 2026-03-27T20:11:37.000Z
diff --git a/ice_keeper/ice_keeper.py b/ice_keeper/ice_keeper.py
@@ -267,11 +267,21 @@ def reset(force: bool) -> None:  # noqa: FBT001
 @click.option("--min_age_to_diagnose", default=1, help="Minimum snapshot age (in partition rank) to diagnose (default: 1).")
 @click.option("--max_age_to_diagnose", default=72, help="Maximum snapshot age (in partition rank) to diagnose (default: 72).")
 @click.option("--optimization_strategy", help="Optional optimization strategy to use during diagnosis.")
+@click.option("--target_file_size_bytes", type=int, help="Optional target data file size in bytes; a negative value (e.g. -1) auto-sizes per partition.")
+@click.option(
+    "--mode",
+    type=click.Choice(["simulate", "dry_run"], case_sensitive=False),
+    default="simulate",
+    show_default=True,
+    help="Diagnosis mode: 'dry_run' reports the optimization decision per partition, 'simulate' estimates file counts after optimization.",
+)
 def diagnose(
     full_name: str,
     min_age_to_diagnose: int,
     max_age_to_diagnose: int,
+    mode: str,
     optimization_strategy: str | None,
+    target_file_size_bytes: int | None,
 ) -> int:
     """Diagnose table health by analyzing its partitions.
 
@@ -287,12 +297,25 @@ def diagnose(
         record_copy = record.model_copy()
         if optimization_strategy:
             record_copy.optimization_strategy = optimization_strategy
+        if target_file_size_bytes:
+            record_copy.target_file_size_bytes = target_file_size_bytes
         record_copy.min_age_to_optimize = min_age_to_diagnose
         record_copy.max_age_to_optimize = max_age_to_diagnose
         row = Row(**record_copy.model_dump(by_alias=True))
         entry = MaintenanceScheduleRecord.from_row(row).to_entry()
         strategy = OptimizationStrategy(entry)
-        strategy.find_and_optimize_specs(None)
+        try:
+            if mode == "dry_run":
+                strategy.diagnose_partition_specs()
+            elif mode == "simulate":
+                strategy.estimate_optimization_results_partition_specs()
+            else:  # defensive: click.Choice already restricts --mode to the two values above
+                msg = f"Invalid option for mode: {mode}"
+                raise ValueError(msg)
+        except Exception as e:  # noqa: BLE001
+            msg = f"An error occurred while diagnosing table '{full_name}': {e}"
+            raise click.ClickException(msg) from e  # chain so the original error stays available as __cause__
+
     else:
         # Preserve expected CLI behavior: emit an error and non-zero exit
         # when the table is not present in the maintenance schedule.
diff --git a/ice_keeper/task/action/optimization/datafile_summary.py b/ice_keeper/task/action/optimization/datafile_summary.py
@@ -277,7 +277,14 @@ def make_age_filter_stmt(self, min_age_to_optimize: int, max_age_to_optimize: in
 
         return filter_stmt
 
-    def create_summary_stmt(self) -> str:
+    def _format_bytes_stmt(self, bytes_column: str) -> str:
+        return f"""CONCAT(
+            ROUND({bytes_column} / POWER(1024, FLOOR(LOG(1024, GREATEST({bytes_column}, 1)))), 2),
+            ' ',
+            ELEMENT_AT(ARRAY('B', 'KB', 'MB', 'GB', 'TB', 'PB'), CAST(FLOOR(LOG(1024, GREATEST({bytes_column}, 1))) AS INT) + 1)
+        )"""
+
+    def create_summary_stmt(self, *, estimate_optimization_results: bool = False) -> str:
         """Generate an SQL query for creating a partition diagnostics summary.
 
         This summary includes metrics such as correlation factors, number of files
@@ -286,8 +293,18 @@ def create_summary_stmt(self) -> str:
         Returns:
             str: SQL query for analyzing partition health and optimization readiness.
         """
-        min_file_size_bytes = int(self.mnt_props.target_file_size_bytes * 0.75)
-        max_file_size_bytes = int(self.mnt_props.target_file_size_bytes * 1.8)
+        target_file_size_stmt = str(self.mnt_props.target_file_size_bytes)  # fixed target: inlined as a SQL literal
+        if self.mnt_props.target_file_size_bytes <= 0:  # non-positive target: derive it per partition from sum_file_size
+            target_file_size_stmt = """
+            case
+            when sum_file_size < 16L * 16 * 1048576 then 16L * 1048576
+            when sum_file_size < 32L * 32 * 1048576 then 32L * 1048576
+            when sum_file_size < 64L * 64 * 1048576 then 64L * 1048576
+            when sum_file_size < 128L * 128 * 1048576 then 128L * 1048576
+            when sum_file_size < 256L * 256 * 1048576 then 256L * 1048576
+            when sum_file_size < 512L * 512 * 1048576 then 512L * 1048576
+            else 1024L * 1048576 end
+            """
 
         num_files_targetted_for_rewrite_threshold = 5
 
@@ -302,6 +319,8 @@ def create_summary_stmt(self) -> str:
         order_by = self.spec.make_order_stmt()
         age_filter_stmt = self.make_age_filter_stmt(min_age_to_optimize, max_age_to_optimize)
 
+        cte_to_use = "final_decision" if not estimate_optimization_results else "final_estimate"
+
         return f"""
             -- Diagnosing partitioned table '{self.mnt_props.full_name}' for optimization
             -- All data files to consider for optimization.
@@ -344,6 +363,44 @@ def create_summary_stmt(self) -> str:
                 where
                     spec_id = {self.spec_id}
             ),
+            file_stats_per_partition as (
+                select
+                    {grouping_stmt},
+                    content,
+                    record_count,
+                    file_size_in_bytes,
+                    rn1,
+                    rn2,
+                    is_data_file_from_widening_src_partition,
+                    -- Aggregations for content = 0 (data files)
+                    count_if(content = 0) over (partition by {grouping_stmt}) as n_files,
+                    sum(case when content = 0 then record_count end) over (partition by {grouping_stmt}) as n_records,
+                    avg(case when content = 0 then file_size_in_bytes end) over (partition by {grouping_stmt}) as avg_file_size,
+                    min(case when content = 0 then file_size_in_bytes end) over (partition by {grouping_stmt}) as min_file_size,
+                    max(case when content = 0 then file_size_in_bytes end) over (partition by {grouping_stmt}) as max_file_size,
+                    sum(case when content = 0 then file_size_in_bytes end) over (partition by {grouping_stmt}) as sum_file_size
+                from
+                    ranked_data_files
+            ),
+            target_file_size_per_partition as (
+                select
+                    {grouping_stmt},
+                    content,
+                    record_count,
+                    file_size_in_bytes,
+                    rn1,
+                    rn2,
+                    is_data_file_from_widening_src_partition,
+                    n_files,
+                    n_records,
+                    avg_file_size,
+                    min_file_size,
+                    max_file_size,
+                    sum_file_size,
+                    {target_file_size_stmt} as target_file_size
+                from
+                    file_stats_per_partition
+            ),
             -- Aggregate the metrics per partition.
             agg_data_files as (
                 select
@@ -354,20 +411,22 @@ def create_summary_stmt(self) -> str:
                     {self.spec.make_to_json_stmt()} as partition_desc,
 
                     -- Aggregations for content = 0 (data files)
-                    count_if(content = 0) as n_files,
+                    first(n_files) as n_files,
+
+                    first(n_records) as n_records,
+                    first(avg_file_size) as avg_file_size,
+                    first(min_file_size) as min_file_size,
+                    first(max_file_size) as max_file_size,
+                    first(sum_file_size) as sum_file_size,
+
+                    first(target_file_size) as target_file_size,
 
                     count_if(
                         content = 0 and
-                        (file_size_in_bytes < {min_file_size_bytes}
-                         or file_size_in_bytes > {max_file_size_bytes})
+                        (file_size_in_bytes < int(target_file_size * 0.75)
+                         or file_size_in_bytes > int(target_file_size * 1.8))
                     ) as num_files_targetted_for_rewrite,
 
-                    sum(case when content = 0 then record_count end) as n_records,
-                    avg(case when content = 0 then file_size_in_bytes end) as avg_file_size,
-                    min(case when content = 0 then file_size_in_bytes end) as min_file_size,
-                    max(case when content = 0 then file_size_in_bytes end) as max_file_size,
-                    sum(case when content = 0 then file_size_in_bytes end) as sum_file_size,
-
                     count_if(
                         content = 0 and
                         is_data_file_from_widening_src_partition = true
@@ -387,19 +446,20 @@ def create_summary_stmt(self) -> str:
 
                     sum(case when content > 0 then record_count else 0 end) as n_delete_records
                 from
-                    ranked_data_files
+                    target_file_size_per_partition
                 group by
                     {grouping_stmt}
             ),
             -- Add should optimize flags to the aggregate.
-            final as (
+            final_decision as (
                 select
                     {grouping_stmt},
                     partition_age,
                     partition_desc,
                     n_files,
                     num_files_targetted_for_rewrite,
                     n_records,
+                    target_file_size,
                     avg_file_size,
                     min_file_size,
                     max_file_size,
@@ -419,7 +479,27 @@ def create_summary_stmt(self) -> str:
                     {age_filter_stmt}
                 order by
                     {order_by}
+            ),
+            final_estimate as (
+                select
+                    {grouping_stmt},
+                    partition_age,
+                    {self._format_bytes_stmt("sum_file_size")} as partition_size,
+                    {self._format_bytes_stmt("avg_file_size")} as avg_file_size,
+                    {self._format_bytes_stmt("target_file_size")} as target_file_size,
+                    n_files as partition_num_files,
+                    int(sum_file_size / target_file_size) as partition_target_num_files,
+                    sum(n_files) over(partition by partition_age) as num_files_per_age,
+                    sum(int(sum_file_size / target_file_size)) over(partition by partition_age) as target_num_files_per_age
+                from
+                    agg_data_files
+                where
+                    {age_filter_stmt}
+                order by
+                    partition_age asc,
+                    n_files desc,
+                    avg_file_size desc
             )
 
-            select * from final
+            select * from {cte_to_use}
         """
diff --git a/ice_keeper/task/action/optimization/optimization.py b/ice_keeper/task/action/optimization/optimization.py
@@ -12,6 +12,7 @@
 from ice_keeper.table import PartitionHealth
 from ice_keeper.task import SparkTask
 from ice_keeper.task.action.action import ActionStrategy, ActionTask
+from ice_keeper.task.action.optimization.partition_summary import DataFilesSummary
 from ice_keeper.task.task import SubTaskExecutor
 from ice_keeper.zorder_udf import zorder2Tuple
 
@@ -120,7 +121,7 @@ def execute_statement(self, sub_executor: SubTaskExecutor, sql_stm: str) -> dict
         self.disable_journaling()
         return {}
 
-    def find_and_optimize_specs(self, sub_executor: SubTaskExecutor | None) -> None:
+    def find_and_optimize_specs(self, sub_executor: SubTaskExecutor) -> None:
         # Register UDF in this new Spark session. We might use it to diagnose the table.
 
         udf = pandas_udf(zorder2Tuple, returnType=BinaryType())  # type: ignore[call-overload]
@@ -133,11 +134,7 @@ def find_and_optimize_specs(self, sub_executor: SubTaskExecutor | None) -> None:
             did_some_optimizations = False
             # Collect partition summary for the spec_id
             summary = PartitionSummary(self.mnt_props, spec_id, self.get_widening_rule(spec_id))
-            if sub_executor:
-                summary.show(100)
-            else:
-                # In diagnostic mode, we want to show the full summary in logs for debugging purposes
-                summary.show(10000)
+            summary.show(100)
 
             try:
                 # Diagnose the partitions for optimization opportunities
@@ -147,19 +144,47 @@ def find_and_optimize_specs(self, sub_executor: SubTaskExecutor | None) -> None:
                 if len(rows) > 0:
                     rows_log_debug(rows, f"Partitions to optimize in {self.mnt_props.full_name}")
                     did_some_optimizations = True
-                    if sub_executor:
-                        self._execute_sub_tasks(sub_executor, rows, spec_id)
+                    self._execute_sub_tasks(sub_executor, rows, spec_id)
                 else:
                     logger.debug("All partitions in spec_id: %s are healthy", spec_id)
 
-                if sub_executor:
-                    # In the context of executing optimization, we want to save the results back to the partition health table
-                    partition_health = PartitionHealth()
-                    summary.save_diff(partition_health, did_some_optimizations=did_some_optimizations)
+                # In the context of executing optimization, we want to save the results back to the partition health table
+                partition_health = PartitionHealth()
+                summary.save_diff(partition_health, did_some_optimizations=did_some_optimizations)
             finally:
                 logger.debug("END Optimizing spec_id: %s", spec_id)
                 summary.uncache_views(did_some_optimizations=did_some_optimizations)
 
+    def diagnose_partition_specs(self) -> None:
+        """Query the partition summary of each spec returned by _find_specs_to_optimize and log it at debug level."""
+        udf = pandas_udf(zorder2Tuple, returnType=BinaryType())  # type: ignore[call-overload]
+        STL.get().udf.register("zorder2Tuple", udf)  # register the UDF in this Spark session; the diagnosis might use it
+
+        unique_spec_ids = self._find_specs_to_optimize()
+        for spec_id in unique_spec_ids:
+            logger.debug("START Diagnosing spec_id: %s -> %s", spec_id, self.mnt_props.partition_specs[spec_id])
+            spec = self.mnt_props.partition_specs[spec_id]
+            widening_rule = self.get_widening_rule(spec_id)
+            datafiles_summary = DataFilesSummary(self.mnt_props, spec, spec_id, widening_rule)
+            sql = datafiles_summary.create_summary_stmt()  # default flag: statement selects the decision summary
+            rows = STL.sql_and_log(sql, "Retrieve rows from partition summary").take(10000)  # at most 10000 rows are fetched
+            rows_log_debug(rows, f"Diagnostic Partition Summary of {self.mnt_props.full_name}, spec: {spec}")
+
+    def estimate_optimization_results_partition_specs(self) -> None:
+        """Query the estimated optimization results of each spec returned by _find_specs_to_optimize and log them at debug level."""
+        udf = pandas_udf(zorder2Tuple, returnType=BinaryType())  # type: ignore[call-overload]
+        STL.get().udf.register("zorder2Tuple", udf)  # register the UDF in this Spark session; the diagnosis might use it
+
+        unique_spec_ids = self._find_specs_to_optimize()
+        for spec_id in unique_spec_ids:
+            logger.debug("START Diagnosing spec_id: %s -> %s", spec_id, self.mnt_props.partition_specs[spec_id])
+            spec = self.mnt_props.partition_specs[spec_id]
+            widening_rule = self.get_widening_rule(spec_id)
+            datafiles_summary = DataFilesSummary(self.mnt_props, spec, spec_id, widening_rule)
+            sql = datafiles_summary.create_summary_stmt(estimate_optimization_results=True)  # request the estimate variant of the statement
+            rows = STL.sql_and_log(sql, "Retrieve rows from partition summary").take(10000)  # at most 10000 rows are fetched
+            rows_log_debug(rows, f"Diagnostic Partition Summary of {self.mnt_props.full_name}, spec: {spec}")
+
     def create_widening_rule_if_any(self) -> None | WideningRule:
         """Attach a widening rule to the partition specs, if defined in the table configuration.
 
diff --git a/tests/config/ice-keeper.yaml b/tests/config/ice-keeper.yaml
@@ -4,4 +4,4 @@ partition_health_table_name: local.admin_test.partition_health
 journal_table_name: local.admin_test.journal
 storage_inventory_report_table_name: local.admin_test.storage_inventory_report
 notification_email_fallback: admin@hostname.com
-logging_config_file: ./config/logging_config.yaml
+logging_config_file: ./tests/config/logging_config.yaml
diff --git a/tests/config/logging_config.yaml b/tests/config/logging_config.yaml
@@ -9,13 +9,13 @@ formatters:
 handlers:
   console:
     class: logging.StreamHandler
-    level: INFO
+    level: DEBUG
     formatter: simple
     stream: ext://sys.stdout
 
 loggers:
   ice-keeper:
-    level: INFO
+    level: DEBUG
     handlers: [console]
     propagate: no
 
diff --git a/tests/integration/test_optimize.py b/tests/integration/test_optimize.py
@@ -964,6 +964,8 @@ def test_diagnose(executor: TaskExecutor) -> None:
             "14",
             "--optimization_strategy",
             "id ASC",
+            "--target_file_size_bytes",
+            "-1",
         ],
     )
 

Original file line number	Diff line number	Diff line change
`@@ -964,6 +964,8 @@ def test_diagnose(executor: TaskExecutor) -> None:`
`964`	`964`	`"14",`
`965`	`965`	`"--optimization_strategy",`
`966`	`966`	`"id ASC",`
	`967`	`+ "--target_file_size_bytes",`
	`968`	`+ "-1",`
`967`	`969`	`],`
`968`	`970`	`)`
`969`	`971`