Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions kernel/src/actions/deletion_vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use crate::utils::require;
use crate::{DeltaResult, Error, StorageHandler};

#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[cfg_attr(test, derive(serde::Serialize))]
#[cfg_attr(test, derive(serde::Serialize, serde::Deserialize))]
pub enum DeletionVectorStorageType {
#[cfg_attr(test, serde(rename = "u"))]
PersistedRelative,
Expand Down Expand Up @@ -58,7 +58,7 @@ impl ToDataType for DeletionVectorStorageType {
}

#[derive(Debug, Clone, PartialEq, Eq, ToSchema)]
#[cfg_attr(test, derive(serde::Serialize), serde(rename_all = "camelCase"))]
#[cfg_attr(test, derive(serde::Serialize, serde::Deserialize), serde(rename_all = "camelCase"))]
pub struct DeletionVectorDescriptor {
/// A single character to indicate how to access the DV. Legal options are: ['u', 'i', 'p'].
pub storage_type: DeletionVectorStorageType,
Expand Down
53 changes: 50 additions & 3 deletions kernel/src/actions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ impl IntoEngineData for CommitInfo {
}

#[derive(Debug, Clone, PartialEq, Eq, ToSchema)]
#[cfg_attr(test, derive(Serialize, Default), serde(rename_all = "camelCase"))]
#[cfg_attr(test, derive(Serialize, Deserialize, Default), serde(rename_all = "camelCase"))]
#[internal_api]
pub(crate) struct Add {
/// A relative path to a data file from the root of the table or an absolute path to a file
Expand Down Expand Up @@ -721,7 +721,7 @@ pub(crate) struct Add {

/// Map containing metadata about this logical file.
#[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))]
pub tags: Option<HashMap<String, String>>,
pub tags: Option<HashMap<String, Option<String>>>,

/// Information about deletion vector (DV) associated with this add action
#[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))]
Expand Down Expand Up @@ -1075,7 +1075,7 @@ mod tests {
StructField::nullable("stats", DataType::STRING),
StructField::nullable(
"tags",
MapType::new(DataType::STRING, DataType::STRING, false),
MapType::new(DataType::STRING, DataType::STRING, true),
),
deletion_vector_field(),
StructField::nullable("baseRowId", DataType::LONG),
Expand Down Expand Up @@ -1974,4 +1974,51 @@ mod tests {
assert!(record_batch.column(2).is_null(0));
assert!(record_batch.column(3).is_null(0));
}

#[test]
fn test_add_tags_deserialization() {
Copy link
Collaborator

@DrakeLin DrakeLin Oct 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I would separate these cases into 3 unit tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I'll do it

// Test different cases for the tags field:
// 1. tags field is null (entire field is None)
// 2. tags map contains nullable values (some keys map to null)
// 3. tags map contains non-null string values only

// Note about nullable values inside the tags map:
// The tags field type is Option<HashMap<String, Option<String>>>, which means:
// - The outer Option makes the entire field nullable (field can be absent or null)
// - The inner Option<String> makes individual map values nullable (value can be null)
// This preserves null values in the map, unlike #[allow_null_container_values]
// which would drop them during materialization.

// Case 1: tags field is null
let json1 = r#"{"path":"file1.parquet","partitionValues":{},"size":100,"modificationTime":1234567890,"dataChange":true,"tags":null}"#;
let add1: Add = serde_json::from_str(json1).unwrap();
assert_eq!(add1.tags, None);

// Case 2: tags map contains nullable values (some keys map to null)
let json2 = r#"{"path":"file2.parquet","partitionValues":{},"size":200,"modificationTime":1234567890,"dataChange":true,"tags":{"INSERTION_TIME":"1677811178336000","NULLABLE_TAG":null}}"#;
let add2: Add = serde_json::from_str(json2).unwrap();
assert!(add2.tags.is_some());
let tags = add2.tags.unwrap();
assert_eq!(tags.len(), 2);
assert_eq!(
tags.get("INSERTION_TIME"),
Some(&Some("1677811178336000".to_string()))
);
assert_eq!(tags.get("NULLABLE_TAG"), Some(&None));

// Case 3: tags map contains non-null string values only
let json3 = r#"{"path":"file3.parquet","partitionValues":{},"size":300,"modificationTime":1234567890,"dataChange":true,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000"}}"#;
let add3: Add = serde_json::from_str(json3).unwrap();
assert!(add3.tags.is_some());
let tags = add3.tags.unwrap();
assert_eq!(tags.len(), 2);
assert_eq!(
tags.get("INSERTION_TIME"),
Some(&Some("1677811178336000".to_string()))
);
assert_eq!(
tags.get("MIN_INSERTION_TIME"),
Some(&Some("1677811178336000".to_string()))
);
}
}
7 changes: 7 additions & 0 deletions kernel/src/schema/derive_macro_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ impl<K: ToDataType, V: ToDataType> ToDataType for HashMap<K, V> {
}
}

// ToDataType impl for maps with nullable values
impl<K: ToDataType, V: ToDataType> ToDataType for HashMap<K, Option<V>> {
fn to_data_type() -> DataType {
MapType::new(K::to_data_type(), V::to_data_type(), true).into()
}
}

/// The [`delta_kernel_derive::ToSchema`] macro uses this to convert a struct field's name + type
/// into a `StructField` definition. A blanket impl for `Option<T: ToDataType>` supports nullable
/// struct fields, which otherwise default to non-nullable.
Expand Down