From a07e743f09458aeb7de109d545b39fa100ceb0d0 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 9 Oct 2025 10:21:07 +0200 Subject: [PATCH 1/3] Add accessor for `Binary` data Resolves #1382 --- kernel/src/engine_data.rs | 87 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/kernel/src/engine_data.rs b/kernel/src/engine_data.rs index 7edb70b7c..aeaa09c1d 100644 --- a/kernel/src/engine_data.rs +++ b/kernel/src/engine_data.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; +use bytes::Bytes; use tracing::debug; use crate::expressions::ArrayData; @@ -13,7 +14,7 @@ use crate::{AsAny, DeltaResult, Error}; /// /// A value of `true` in the selection vector means the corresponding row is selected (i.e., not deleted), /// while `false` means the row is logically deleted and should be ignored. If the selection vector is shorter -/// then the number of rows in `data` then all rows not covered by the selection vector are assumed to be selected. +/// than the number of rows in `data` then all rows not covered by the selection vector are assumed to be selected. /// /// Interpreting unselected (`false`) rows will result in incorrect/undefined behavior. pub struct FilteredEngineData { @@ -172,6 +173,7 @@ pub trait GetData<'a> { (get_int, i32), (get_long, i64), (get_str, &'a str), + (get_binary, Bytes), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -193,6 +195,7 @@ impl<'a> GetData<'a> for () { (get_int, i32), (get_long, i64), (get_str, &'a str), + (get_binary, Bytes), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -227,6 +230,7 @@ impl_typed_get_data!( (get_int, i32), (get_long, i64), (get_str, &'a str), + (get_binary, Bytes), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -557,4 +561,85 @@ mod tests { assert!(e.to_string().contains("3 > 2")); } } + + // Mock implementation of GetData that returns binary data + struct MockBinaryData { + data: Vec>, + } + + impl<'a> GetData<'a> for MockBinaryData { + fn get_binary(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.data.get(row_index).and_then(|v| v.clone())) + } + } + + #[test] + fn test_get_binary_some_value() { + let mock_data = MockBinaryData { + data: vec![ + Some(Bytes::from_static(b"hello")), + Some(Bytes::from_static(b"world")), + None, + ], + }; + + // Cast to dyn GetData to use TypedGetData trait + let getter: &dyn GetData<'_> = &mock_data; + + // Test getting first row + let result: Option = getter.get_opt(0, "binary_field").unwrap(); + assert_eq!(result, Some(Bytes::from_static(b"hello"))); + + // Test getting second row + let result: Option = getter.get_opt(1, "binary_field").unwrap(); + assert_eq!(result, Some(Bytes::from_static(b"world"))); + + // Test getting None value + let result: Option = getter.get_opt(2, "binary_field").unwrap(); + assert_eq!(result, None); + } + + #[test] + fn test_get_binary_required() { + let mock_data = MockBinaryData { + data: vec![Some(Bytes::from_static(b"hello"))], + }; + + // Cast to dyn GetData to use TypedGetData trait + let getter: &dyn GetData<'_> = &mock_data; + + // Test using get() for required field + let result: Bytes = getter.get(0, "binary_field").unwrap(); + assert_eq!(result, Bytes::from_static(b"hello")); + } + + #[test] + fn test_get_binary_required_missing() { + let mock_data = MockBinaryData { data: vec![None] }; + + // Cast to dyn GetData to use TypedGetData trait + let getter: &dyn GetData<'_> = &mock_data; + + // Test using get() for missing required field should error + let result: DeltaResult = getter.get(0, "binary_field"); + assert!(result.is_err()); + if let Err(e) = result { + assert!(e.to_string().contains("Data missing for field")); + } + } + + #[test] + fn test_get_binary_empty_bytes() { + let mock_data = MockBinaryData { + data: vec![Some(Bytes::new())], + }; + + // Cast to dyn GetData to use TypedGetData trait + let getter: &dyn GetData<'_> = &mock_data; + + // Test getting empty Bytes + let result: Option = getter.get_opt(0, "binary_field").unwrap(); + assert_eq!(result, Some(Bytes::new())); + assert_eq!(result.unwrap().len(), 0); + } } From 66cc299d6f38cb58862a3df9fb2273b05a42cd86 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Fri, 10 Oct 2025 12:48:01 +0200 Subject: [PATCH 2/3] Add the `GetData`, thanks Nick! --- kernel/src/engine/arrow_data.rs | 75 ++++++++++++++++++++++++++ kernel/src/engine/arrow_get_data.rs | 12 ++++- kernel/src/engine_data.rs | 83 ++++++++++++++--------------- 3 files changed, 125 insertions(+), 45 deletions(-) diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs index b6a48e581..cf925a752 100644 --- a/kernel/src/engine/arrow_data.rs +++ b/kernel/src/engine/arrow_data.rs @@ -301,6 +301,10 @@ impl ArrowEngineData { debug!("Pushing string array for {}", ColumnName::new(path)); col.as_string_opt().map(|a| a as _).ok_or("string") } + &DataType::BINARY => { + debug!("Pushing binary array for {}", ColumnName::new(path)); + col.as_binary_opt().map(|a| a as _).ok_or("binary") + } &DataType::INTEGER => { debug!("Pushing int32 array for {}", ColumnName::new(path)); col.as_primitive_opt::() @@ -724,4 +728,75 @@ mod tests { Ok(()) } + + #[test] + fn test_binary_column_extraction() -> DeltaResult<()> { + use crate::arrow::array::BinaryArray; + use crate::engine_data::{GetData, RowVisitor}; + use crate::schema::ColumnName; + use std::sync::LazyLock; + + // Create a RecordBatch with binary data + let binary_data: Vec> = vec![ + Some(b"hello"), + Some(b"world"), + None, + Some(b"\x00\x01\x02\x03"), + ]; + let binary_array = BinaryArray::from(binary_data.clone()); + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "data", + ArrowDataType::Binary, + true, + )])); + + let batch = RecordBatch::try_new(schema, vec![Arc::new(binary_array)])?; + let arrow_data = ArrowEngineData::new(batch); + + // Create a visitor to extract binary data + struct BinaryVisitor { + values: Vec>>, + } + + impl RowVisitor for BinaryVisitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES: LazyLock> = + LazyLock::new(|| vec![ColumnName::new(["data"])]); + static TYPES: LazyLock> = LazyLock::new(|| vec![DataType::BINARY]); + (&NAMES, &TYPES) + } + + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + assert_eq!(getters.len(), 1); + let getter = getters[0]; + + for i in 0..row_count { + self.values.push(getter.get_binary(i, "data")?.map(|b| b.to_vec())); + } + Ok(()) + } + } + + let mut visitor = BinaryVisitor { values: vec![] }; + arrow_data.visit_rows(&[ColumnName::new(["data"])], &mut visitor)?; + + // Verify the extracted values + assert_eq!(visitor.values.len(), 4); + assert_eq!(visitor.values[0].as_deref(), Some(b"hello".as_ref())); + assert_eq!(visitor.values[1].as_deref(), Some(b"world".as_ref())); + assert_eq!(visitor.values[2], None); + assert_eq!( + visitor.values[3].as_deref(), + Some(b"\x00\x01\x02\x03".as_ref()) + ); + + Ok(()) + } } diff --git a/kernel/src/engine/arrow_get_data.rs b/kernel/src/engine/arrow_get_data.rs index fbed64df1..6cc36a4c3 100644 --- a/kernel/src/engine/arrow_get_data.rs +++ b/kernel/src/engine/arrow_get_data.rs @@ -1,5 +1,5 @@ use crate::arrow::array::{ - types::{GenericStringType, Int32Type, Int64Type}, + types::{GenericBinaryType, GenericStringType, Int32Type, Int64Type}, Array, BooleanArray, GenericByteArray, GenericListArray, MapArray, OffsetSizeTrait, PrimitiveArray, }; @@ -51,6 +51,16 @@ impl<'a> GetData<'a> for GenericByteArray> { } } +impl<'a> GetData<'a> for GenericByteArray> { + fn get_binary(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { + if self.is_valid(row_index) { + Ok(Some(self.value(row_index))) + } else { + Ok(None) + } + } +} + impl<'a, OffsetSize> GetData<'a> for GenericListArray where OffsetSize: OffsetSizeTrait, diff --git a/kernel/src/engine_data.rs b/kernel/src/engine_data.rs index aeaa09c1d..5e89944cf 100644 --- a/kernel/src/engine_data.rs +++ b/kernel/src/engine_data.rs @@ -2,7 +2,6 @@ use std::collections::HashMap; -use bytes::Bytes; use tracing::debug; use crate::expressions::ArrayData; @@ -173,7 +172,7 @@ pub trait GetData<'a> { (get_int, i32), (get_long, i64), (get_str, &'a str), - (get_binary, Bytes), + (get_binary, &'a [u8]), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -195,7 +194,7 @@ impl<'a> GetData<'a> for () { (get_int, i32), (get_long, i64), (get_str, &'a str), - (get_binary, Bytes), + (get_binary, &'a [u8]), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -230,7 +229,7 @@ impl_typed_get_data!( (get_int, i32), (get_long, i64), (get_str, &'a str), - (get_binary, Bytes), + (get_binary, &'a [u8]), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -562,66 +561,61 @@ mod tests { } } - // Mock implementation of GetData that returns binary data - struct MockBinaryData { - data: Vec>, - } - - impl<'a> GetData<'a> for MockBinaryData { - fn get_binary(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { - Ok(self.data.get(row_index).and_then(|v| v.clone())) - } - } - #[test] fn test_get_binary_some_value() { - let mock_data = MockBinaryData { - data: vec![ - Some(Bytes::from_static(b"hello")), - Some(Bytes::from_static(b"world")), - None, - ], - }; + use crate::arrow::array::BinaryArray; + + // Use Arrow's BinaryArray implementation + let binary_data: Vec> = vec![ + Some(b"hello"), + Some(b"world"), + None, + ]; + let binary_array = BinaryArray::from(binary_data); // Cast to dyn GetData to use TypedGetData trait - let getter: &dyn GetData<'_> = &mock_data; + let getter: &dyn GetData<'_> = &binary_array; // Test getting first row - let result: Option = getter.get_opt(0, "binary_field").unwrap(); - assert_eq!(result, Some(Bytes::from_static(b"hello"))); + let result: Option<&[u8]> = getter.get_opt(0, "binary_field").unwrap(); + assert_eq!(result, Some(b"hello".as_ref())); // Test getting second row - let result: Option = getter.get_opt(1, "binary_field").unwrap(); - assert_eq!(result, Some(Bytes::from_static(b"world"))); + let result: Option<&[u8]> = getter.get_opt(1, "binary_field").unwrap(); + assert_eq!(result, Some(b"world".as_ref())); // Test getting None value - let result: Option = getter.get_opt(2, "binary_field").unwrap(); + let result: Option<&[u8]> = getter.get_opt(2, "binary_field").unwrap(); assert_eq!(result, None); } #[test] fn test_get_binary_required() { - let mock_data = MockBinaryData { - data: vec![Some(Bytes::from_static(b"hello"))], - }; + use crate::arrow::array::BinaryArray; + + let binary_data: Vec> = vec![Some(b"hello")]; + let binary_array = BinaryArray::from(binary_data); // Cast to dyn GetData to use TypedGetData trait - let getter: &dyn GetData<'_> = &mock_data; + let getter: &dyn GetData<'_> = &binary_array; // Test using get() for required field - let result: Bytes = getter.get(0, "binary_field").unwrap(); - assert_eq!(result, Bytes::from_static(b"hello")); + let result: &[u8] = getter.get(0, "binary_field").unwrap(); + assert_eq!(result, b"hello"); } #[test] fn test_get_binary_required_missing() { - let mock_data = MockBinaryData { data: vec![None] }; + use crate::arrow::array::BinaryArray; + + let binary_data: Vec> = vec![None]; + let binary_array = BinaryArray::from(binary_data); // Cast to dyn GetData to use TypedGetData trait - let getter: &dyn GetData<'_> = &mock_data; + let getter: &dyn GetData<'_> = &binary_array; // Test using get() for missing required field should error - let result: DeltaResult = getter.get(0, "binary_field"); + let result: DeltaResult<&[u8]> = getter.get(0, "binary_field"); assert!(result.is_err()); if let Err(e) = result { assert!(e.to_string().contains("Data missing for field")); @@ -630,16 +624,17 @@ mod tests { #[test] fn test_get_binary_empty_bytes() { - let mock_data = MockBinaryData { - data: vec![Some(Bytes::new())], - }; + use crate::arrow::array::BinaryArray; + + let binary_data: Vec> = vec![Some(b"")]; + let binary_array = BinaryArray::from(binary_data); // Cast to dyn GetData to use TypedGetData trait - let getter: &dyn GetData<'_> = &mock_data; + let getter: &dyn GetData<'_> = &binary_array; - // Test getting empty Bytes - let result: Option = getter.get_opt(0, "binary_field").unwrap(); - assert_eq!(result, Some(Bytes::new())); + // Test getting empty bytes + let result: Option<&[u8]> = getter.get_opt(0, "binary_field").unwrap(); + assert_eq!(result, Some([].as_ref())); assert_eq!(result.unwrap().len(), 0); } } From 0e955b9914c5dcefdc55e80cc5db540ec727bc5d Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Fri, 10 Oct 2025 12:50:08 +0200 Subject: [PATCH 3/3] `cargo fmt` --- kernel/src/engine/arrow_data.rs | 3 ++- kernel/src/engine_data.rs | 6 +----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs index cf925a752..c4415525d 100644 --- a/kernel/src/engine/arrow_data.rs +++ b/kernel/src/engine/arrow_data.rs @@ -778,7 +778,8 @@ mod tests { let getter = getters[0]; for i in 0..row_count { - self.values.push(getter.get_binary(i, "data")?.map(|b| b.to_vec())); + self.values + .push(getter.get_binary(i, "data")?.map(|b| b.to_vec())); } Ok(()) } diff --git a/kernel/src/engine_data.rs b/kernel/src/engine_data.rs index 5e89944cf..9c3b725ec 100644 --- a/kernel/src/engine_data.rs +++ b/kernel/src/engine_data.rs @@ -566,11 +566,7 @@ mod tests { use crate::arrow::array::BinaryArray; // Use Arrow's BinaryArray implementation - let binary_data: Vec> = vec![ - Some(b"hello"), - Some(b"world"), - None, - ]; + let binary_data: Vec> = vec![Some(b"hello"), Some(b"world"), None]; let binary_array = BinaryArray::from(binary_data); // Cast to dyn GetData to use TypedGetData trait