From cf17bc948ef905f5af31edcd2744a98fd341e932 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Thu, 23 Oct 2025 11:43:13 -0700 Subject: [PATCH] arrow 57 support --- ffi/src/transaction/mod.rs | 2 +- kernel/Cargo.toml | 15 ++++++++++- .../read-table-multi-threaded/Cargo.toml | 5 ++-- .../read-table-single-threaded/Cargo.toml | 4 +-- kernel/examples/write-table/Cargo.toml | 5 ++-- kernel/src/arrow_compat.rs | 25 +++++++++++++++---- kernel/src/checkpoint/tests.rs | 9 +++---- kernel/src/engine/ensure_data_types.rs | 2 +- mem-test/Cargo.toml | 1 - mem-test/tests/dhat_large_table_data.rs | 6 ++--- 10 files changed, 51 insertions(+), 23 deletions(-) diff --git a/ffi/src/transaction/mod.rs b/ffi/src/transaction/mod.rs index 924271924..80b0f9c8b 100644 --- a/ffi/src/transaction/mod.rs +++ b/ffi/src/transaction/mod.rs @@ -241,7 +241,7 @@ mod tests { // writer must be closed to write footer let res = writer.close().unwrap(); - create_file_metadata(file_path, res.num_rows, metadata_schema) + create_file_metadata(file_path, res.file_metadata().num_rows(), metadata_schema) } #[tokio::test] diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml index eb21b3702..59fb8403b 100644 --- a/kernel/Cargo.toml +++ b/kernel/Cargo.toml @@ -90,6 +90,18 @@ version = "56" features = ["async", "object_store"] optional = true +# arrow 57 +[dependencies.arrow_57] +package = "arrow" +version = "57" +features = ["chrono-tz", "ffi", "json", "prettyprint"] +optional = true +[dependencies.parquet_57] +package = "parquet" +version = "57" +features = ["async", "object_store"] +optional = true + [features] # no default features default = [] @@ -99,11 +111,12 @@ internal-api = [] integration-test = ["hdfs-native-object-store/integration-test"] # The default versions for arrow/parquet/object_store -arrow = ["arrow-56"] # latest arrow version +arrow = ["arrow-57"] # latest arrow version need-arrow = [] # need-arrow is a marker that the feature needs arrow dep arrow-55 = ["dep:arrow_55", "dep:parquet_55", 
"object_store", "comfy-table"] arrow-56 = ["dep:arrow_56", "dep:parquet_56", "object_store", "comfy-table"] +arrow-57 = ["dep:arrow_57", "dep:parquet_57", "object_store", "comfy-table"] arrow-conversion = ["need-arrow"] arrow-expression = ["need-arrow"] diff --git a/kernel/examples/read-table-multi-threaded/Cargo.toml b/kernel/examples/read-table-multi-threaded/Cargo.toml index 17253b98d..0dbbae99a 100644 --- a/kernel/examples/read-table-multi-threaded/Cargo.toml +++ b/kernel/examples/read-table-multi-threaded/Cargo.toml @@ -5,11 +5,12 @@ edition = "2021" publish = false [dependencies] -arrow = { version = "56", features = ["prettyprint", "chrono-tz"] } +arrow = { version = "57", features = ["prettyprint", "chrono-tz"] } clap = { version = "4.5", features = ["derive"] } +# common pulls in arrow latest so we have to keep all these in sync here common = { path = "../common" } delta_kernel = { path = "../../../kernel", features = [ - "arrow-56", + "arrow", "default-engine-rustls", "internal-api", ] } diff --git a/kernel/examples/read-table-single-threaded/Cargo.toml b/kernel/examples/read-table-single-threaded/Cargo.toml index d13457a5a..8ecc5733b 100644 --- a/kernel/examples/read-table-single-threaded/Cargo.toml +++ b/kernel/examples/read-table-single-threaded/Cargo.toml @@ -5,11 +5,11 @@ edition = "2021" publish = false [dependencies] -arrow = { version = "56", features = ["prettyprint", "chrono-tz"] } +arrow = { version = "57", features = ["prettyprint", "chrono-tz"] } clap = { version = "4.5", features = ["derive"] } common = { path = "../common" } delta_kernel = { path = "../../../kernel", features = [ - "arrow-56", + "arrow", "default-engine-rustls", "internal-api", ] } diff --git a/kernel/examples/write-table/Cargo.toml b/kernel/examples/write-table/Cargo.toml index 3291944f6..8e1200d39 100644 --- a/kernel/examples/write-table/Cargo.toml +++ b/kernel/examples/write-table/Cargo.toml @@ -5,11 +5,12 @@ edition = "2021" publish = false [dependencies] -arrow = { 
version = "56", features = ["prettyprint", "chrono-tz"] } +arrow = { version = "57", features = ["prettyprint", "chrono-tz"] } clap = { version = "4.5", features = ["derive"] } +# NB: common depends on 'arrow' (latest) so have to match here common = { path = "../common" } delta_kernel = { path = "../../../kernel", features = [ - "arrow-56", + "arrow", "default-engine-rustls", "internal-api", ] } diff --git a/kernel/src/arrow_compat.rs b/kernel/src/arrow_compat.rs index e57e63404..58faafc17 100644 --- a/kernel/src/arrow_compat.rs +++ b/kernel/src/arrow_compat.rs @@ -1,12 +1,26 @@ //! This module re-exports the different versions of arrow, parquet, and object_store we support. -#[cfg(feature = "arrow-56")] +#[cfg(feature = "arrow-57")] +mod arrow_compat_shims { + pub use arrow_57 as arrow; + pub use parquet_57 as parquet; +} + +// Cargo features are additive, so more than one `arrow-NN` feature can be enabled at once. +// Exactly one `arrow_compat_shims` module must be defined in that case, so the highest +// enabled version wins. `arrow-56` therefore yields only to `arrow-57`; gating it on +// `not(arrow-55)` would leave no shim at all when `arrow-55` and `arrow-56` are both enabled. +#[cfg(all(feature = "arrow-56", not(feature = "arrow-57")))] mod arrow_compat_shims { pub use arrow_56 as arrow; pub use parquet_56 as parquet; } -#[cfg(all(feature = "arrow-55", not(feature = "arrow-56")))] +#[cfg(all( + feature = "arrow-55", + not(feature = "arrow-56"), + not(feature = "arrow-57") +))] mod arrow_compat_shims { pub use arrow_55 as arrow; pub use parquet_55 as parquet; @@ -17,9 +31,10 @@ mod arrow_compat_shims { #[cfg(all( feature = "need-arrow", not(feature = "arrow-55"), - not(feature = "arrow-56") + not(feature = "arrow-56"), + not(feature = "arrow-57") ))] -compile_error!("Requested a feature that needs arrow without enabling arrow. Please enable the `arrow-55` or `arrow-56` feature"); +compile_error!("Requested a feature that needs arrow without enabling arrow. 
Please enable the `arrow-55`, `arrow-56`, or `arrow-57` feature"); -#[cfg(any(feature = "arrow-55", feature = "arrow-56"))] +#[cfg(any(feature = "arrow-55", feature = "arrow-56", feature = "arrow-57"))] pub use arrow_compat_shims::*; diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 64fca3d2c..8feb5658d 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -6,6 +6,10 @@ use crate::action_reconciliation::{ use crate::actions::{Add, Metadata, Protocol, Remove}; use crate::arrow::array::{ArrayRef, StructArray}; use crate::arrow::datatypes::{DataType, Schema}; +use crate::arrow::{ + array::{create_array, RecordBatch}, + datatypes::Field, +}; use crate::checkpoint::create_last_checkpoint_data; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; @@ -14,11 +18,6 @@ use crate::schema::{DataType as KernelDataType, StructField, StructType}; use crate::utils::test_utils::Action; use crate::{DeltaResult, FileMeta, LogPath, Snapshot}; -use arrow_56::{ - array::{create_array, RecordBatch}, - datatypes::Field, -}; - use object_store::{memory::InMemory, path::Path, ObjectStore}; use serde_json::{from_slice, json, Value}; use test_utils::delta_path_for_version; diff --git a/kernel/src/engine/ensure_data_types.rs b/kernel/src/engine/ensure_data_types.rs index 2d5a660c3..54a16d03a 100644 --- a/kernel/src/engine/ensure_data_types.rs +++ b/kernel/src/engine/ensure_data_types.rs @@ -352,7 +352,7 @@ mod tests { &incorrect_variant_arrow_type(), true, ), - "Invalid argument error: Incorrect datatype. Expected Struct(metadata Binary, value Binary), got Struct(field_1 Binary, field_2 Binary)", + "Invalid argument error: Incorrect datatype. 
Expected Struct(\"metadata\": Binary, \"value\": Binary), got Struct(\"field_1\": nullable Binary, \"field_2\": nullable Binary)", ) } diff --git a/mem-test/Cargo.toml b/mem-test/Cargo.toml index 81e3d2724..d79e77370 100644 --- a/mem-test/Cargo.toml +++ b/mem-test/Cargo.toml @@ -14,7 +14,6 @@ version.workspace = true release = false [dependencies] -arrow = "56" delta_kernel = { path = "../kernel", features = ["arrow", "default-engine-rustls"] } dhat = "0.3" object_store = "0.12.3" diff --git a/mem-test/tests/dhat_large_table_data.rs b/mem-test/tests/dhat_large_table_data.rs index eabfa09ec..7e9e50f69 100644 --- a/mem-test/tests/dhat_large_table_data.rs +++ b/mem-test/tests/dhat_large_table_data.rs @@ -8,6 +8,7 @@ use std::path::Path; use std::sync::Arc; use delta_kernel::arrow::array::{ArrayRef, Int64Array, StringArray}; +use delta_kernel::arrow::compute::filter_record_batch; use delta_kernel::arrow::record_batch::RecordBatch; use delta_kernel::engine::arrow_data::ArrowEngineData; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; @@ -16,7 +17,6 @@ use delta_kernel::parquet::arrow::ArrowWriter; use delta_kernel::parquet::file::properties::WriterProperties; use delta_kernel::Snapshot; -use arrow::compute::filter_record_batch; use object_store::local::LocalFileSystem; use serde_json::json; use tempfile::tempdir; @@ -46,9 +46,9 @@ fn write_large_parquet_to(path: &Path) -> Result<(), Box> let metadata = std::fs::metadata(&path)?; let file_size = metadata.len(); let total_row_group_size: i64 = parquet_metadata - .row_groups + .row_groups() .iter() - .map(|rg| rg.total_byte_size) + .map(|rg| rg.total_byte_size()) .sum(); println!("File size (compressed file size): {} bytes", file_size); println!(