From db3e9699ce41c8daa41354a8574ae09c32ccf23e Mon Sep 17 00:00:00 2001 From: Ruzel Ibragimov Date: Tue, 14 Oct 2025 15:40:40 +0000 Subject: [PATCH 1/2] Make tags' values optional --- kernel/src/actions/mod.rs | 4 ++-- kernel/src/schema/derive_macro_utils.rs | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs index 77836d27b..3bb9f2d6b 100644 --- a/kernel/src/actions/mod.rs +++ b/kernel/src/actions/mod.rs @@ -721,7 +721,7 @@ pub(crate) struct Add { /// Map containing metadata about this logical file. #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] - pub tags: Option>, + pub tags: Option>>, /// Information about deletion vector (DV) associated with this add action #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] @@ -1075,7 +1075,7 @@ mod tests { StructField::nullable("stats", DataType::STRING), StructField::nullable( "tags", - MapType::new(DataType::STRING, DataType::STRING, false), + MapType::new(DataType::STRING, DataType::STRING, true), ), deletion_vector_field(), StructField::nullable("baseRowId", DataType::LONG), diff --git a/kernel/src/schema/derive_macro_utils.rs b/kernel/src/schema/derive_macro_utils.rs index e193898a6..ebf2691f9 100644 --- a/kernel/src/schema/derive_macro_utils.rs +++ b/kernel/src/schema/derive_macro_utils.rs @@ -66,6 +66,13 @@ impl ToDataType for HashMap { } } +// ToDataType impl for maps with nullable values +impl ToDataType for HashMap> { + fn to_data_type() -> DataType { + MapType::new(K::to_data_type(), V::to_data_type(), true).into() + } +} + /// The [`delta_kernel_derive::ToSchema`] macro uses this to convert a struct field's name + type /// into a `StructField` definition. A blanket impl for `Option` supports nullable /// struct fields, which otherwise default to non-nullable. From f3d028790be2465c1debe1be49f90b85a944ad01 Mon Sep 17 00:00:00 2001 From: Ruzel Ibragimov Date: Tue, 14 Oct 2025 15:45:38 +0000 Subject: [PATCH 2/2] add test --- kernel/src/actions/deletion_vector.rs | 4 +-- kernel/src/actions/mod.rs | 49 ++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/kernel/src/actions/deletion_vector.rs b/kernel/src/actions/deletion_vector.rs index 2c269f036..3b8d60d96 100644 --- a/kernel/src/actions/deletion_vector.rs +++ b/kernel/src/actions/deletion_vector.rs @@ -15,7 +15,7 @@ use crate::utils::require; use crate::{DeltaResult, Error, StorageHandler}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] -#[cfg_attr(test, derive(serde::Serialize))] +#[cfg_attr(test, derive(serde::Serialize, serde::Deserialize))] pub enum DeletionVectorStorageType { #[cfg_attr(test, serde(rename = "u"))] PersistedRelative, @@ -58,7 +58,7 @@ impl ToDataType for DeletionVectorStorageType { } #[derive(Debug, Clone, PartialEq, Eq, ToSchema)] -#[cfg_attr(test, derive(serde::Serialize), serde(rename_all = "camelCase"))] +#[cfg_attr(test, derive(serde::Serialize, serde::Deserialize), serde(rename_all = "camelCase"))] pub struct DeletionVectorDescriptor { /// A single character to indicate how to access the DV. Legal options are: ['u', 'i', 'p']. pub storage_type: DeletionVectorStorageType, diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs index 3bb9f2d6b..352b41ed3 100644 --- a/kernel/src/actions/mod.rs +++ b/kernel/src/actions/mod.rs @@ -683,7 +683,7 @@ impl IntoEngineData for CommitInfo { } #[derive(Debug, Clone, PartialEq, Eq, ToSchema)] -#[cfg_attr(test, derive(Serialize, Default), serde(rename_all = "camelCase"))] +#[cfg_attr(test, derive(Serialize, Deserialize, Default), serde(rename_all = "camelCase"))] #[internal_api] pub(crate) struct Add { /// A relative path to a data file from the root of the table or an absolute path to a file @@ -1974,4 +1974,51 @@ mod tests { assert!(record_batch.column(2).is_null(0)); assert!(record_batch.column(3).is_null(0)); } + + #[test] + fn test_add_tags_deserialization() { + // Test different cases for the tags field: + // 1. tags field is null (entire field is None) + // 2. tags map contains nullable values (some keys map to null) + // 3. tags map contains non-null string values only + + // Note about nullable values inside the tags map: + // The tags field type is Option>>, which means: + // - The outer Option makes the entire field nullable (field can be absent or null) + // - The inner Option makes individual map values nullable (value can be null) + // This preserves null values in the map, unlike #[allow_null_container_values] + // which would drop them during materialization. + + // Case 1: tags field is null + let json1 = r#"{"path":"file1.parquet","partitionValues":{},"size":100,"modificationTime":1234567890,"dataChange":true,"tags":null}"#; + let add1: Add = serde_json::from_str(json1).unwrap(); + assert_eq!(add1.tags, None); + + // Case 2: tags map contains nullable values (some keys map to null) + let json2 = r#"{"path":"file2.parquet","partitionValues":{},"size":200,"modificationTime":1234567890,"dataChange":true,"tags":{"INSERTION_TIME":"1677811178336000","NULLABLE_TAG":null}}"#; + let add2: Add = serde_json::from_str(json2).unwrap(); + assert!(add2.tags.is_some()); + let tags = add2.tags.unwrap(); + assert_eq!(tags.len(), 2); + assert_eq!( + tags.get("INSERTION_TIME"), + Some(&Some("1677811178336000".to_string())) + ); + assert_eq!(tags.get("NULLABLE_TAG"), Some(&None)); + + // Case 3: tags map contains non-null string values only + let json3 = r#"{"path":"file3.parquet","partitionValues":{},"size":300,"modificationTime":1234567890,"dataChange":true,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000"}}"#; + let add3: Add = serde_json::from_str(json3).unwrap(); + assert!(add3.tags.is_some()); + let tags = add3.tags.unwrap(); + assert_eq!(tags.len(), 2); + assert_eq!( + tags.get("INSERTION_TIME"), + Some(&Some("1677811178336000".to_string())) + ); + assert_eq!( + tags.get("MIN_INSERTION_TIME"), + Some(&Some("1677811178336000".to_string())) + ); + } }