Skip to content

Commit a03f097

Browse files
committed
Enable file merging by last modification time using preserve-insertion-order
This change leverages the previously unused `preserve-insertion-order` configuration to enable merging files sorted by their last modification time during compaction. This is particularly beneficial for append-only workloads, improving data locality after optimize runs by merging files that were created around similar times.
1 parent f67e828 commit a03f097

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

crates/core/src/operations/optimize.rs

+13-3
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,7 @@ impl<'a> std::future::IntoFuture for OptimizeBuilder<'a> {
331331
this.filters,
332332
this.target_size.to_owned(),
333333
writer_properties,
334+
this.preserve_insertion_order,
334335
)?;
335336
let metrics = plan
336337
.execute(
@@ -877,12 +878,15 @@ pub fn create_merge_plan(
877878
filters: &[PartitionFilter],
878879
target_size: Option<i64>,
879880
writer_properties: WriterProperties,
881+
preserve_insertion_order: bool,
880882
) -> Result<MergePlan, DeltaTableError> {
881883
let target_size = target_size.unwrap_or_else(|| snapshot.table_config().target_file_size());
882884
let partitions_keys = &snapshot.metadata().partition_columns;
883885

884886
let (operations, metrics) = match optimize_type {
885-
OptimizeType::Compact => build_compaction_plan(snapshot, filters, target_size)?,
887+
OptimizeType::Compact => {
888+
build_compaction_plan(snapshot, filters, target_size, preserve_insertion_order)?
889+
}
886890
OptimizeType::ZOrder(zorder_columns) => {
887891
build_zorder_plan(zorder_columns, snapshot, partitions_keys, filters)?
888892
}
@@ -958,6 +962,7 @@ fn build_compaction_plan(
958962
snapshot: &DeltaTableState,
959963
filters: &[PartitionFilter],
960964
target_size: i64,
965+
perserve_insertion_order: bool,
961966
) -> Result<(OptimizeOperations, Metrics), DeltaTableError> {
962967
let mut metrics = Metrics::default();
963968

@@ -985,8 +990,13 @@ fn build_compaction_plan(
985990
}
986991

987992
for (_, file) in partition_files.values_mut() {
988-
// Sort files by size: largest to smallest
989-
file.sort_by(|a, b| b.size.cmp(&a.size));
993+
if perserve_insertion_order {
994+
// Sort files by last modification time: newest to oldest
995+
file.sort_by(|a, b| b.last_modified.cmp(&a.last_modified));
996+
} else {
997+
// Sort files by size: largest to smallest
998+
file.sort_by(|a, b| b.size.cmp(&a.size));
999+
}
9901000
}
9911001

9921002
let mut operations: HashMap<String, (IndexMap<String, Scalar>, Vec<MergeBin>)> = HashMap::new();

crates/core/tests/command_optimize.rs

+3
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ async fn test_conflict_for_remove_actions() -> Result<(), Box<dyn Error>> {
289289
&filter,
290290
None,
291291
WriterProperties::builder().build(),
292+
false,
292293
)?;
293294

294295
let uri = context.tmp_dir.path().to_str().to_owned().unwrap();
@@ -351,6 +352,7 @@ async fn test_no_conflict_for_append_actions() -> Result<(), Box<dyn Error>> {
351352
&filter,
352353
None,
353354
WriterProperties::builder().build(),
355+
false,
354356
)?;
355357

356358
let uri = context.tmp_dir.path().to_str().to_owned().unwrap();
@@ -410,6 +412,7 @@ async fn test_commit_interval() -> Result<(), Box<dyn Error>> {
410412
&[],
411413
None,
412414
WriterProperties::builder().build(),
415+
false,
413416
)?;
414417

415418
let metrics = plan

0 commit comments

Comments
 (0)