From 7aa1d7f372ff8f2891e2be665d57154f968d45ac Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 3 Oct 2025 15:18:59 -0700 Subject: [PATCH 01/28] working! just need to add back a test --- kernel/examples/common/src/lib.rs | 56 ++++++- kernel/src/engine/arrow_utils.rs | 7 + kernel/src/scan/log_replay.rs | 147 ++++++++++-------- kernel/src/scan/mod.rs | 94 +++++++---- kernel/src/scan/state.rs | 2 +- .../src/table_changes/physical_to_logical.rs | 7 +- kernel/src/transforms.rs | 112 ++++++++----- kernel/tests/read.rs | 43 ++--- 8 files changed, 300 insertions(+), 168 deletions(-) diff --git a/kernel/examples/common/src/lib.rs b/kernel/examples/common/src/lib.rs index b5976c6a8..d488d2576 100644 --- a/kernel/examples/common/src/lib.rs +++ b/kernel/examples/common/src/lib.rs @@ -7,7 +7,7 @@ use delta_kernel::{ arrow::array::RecordBatch, engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, scan::Scan, - schema::Schema, + schema::{MetadataColumnSpec, Schema}, DeltaResult, SnapshotRef, }; @@ -61,9 +61,18 @@ pub struct ScanArgs { #[arg(long)] pub schema_only: bool, - /// Comma separated list of columns to select - #[arg(long, value_delimiter=',', num_args(0..))] - pub columns: Option>, + /// Comma separated list of columns to select. 
Must be passed as a single string, leading and + /// trailing spaces for each column name will be trimmed + #[arg(long)] + pub columns: Option, + + /// Include a _metadata.row_index field + #[arg(long)] + pub with_row_index: bool, + + /// Include a _metadata.row_id field if row-tracking is enabled + #[arg(long)] + pub with_row_id: bool, } pub trait ParseWithExamples { @@ -182,9 +191,10 @@ pub fn get_scan(snapshot: SnapshotRef, args: &ScanArgs) -> DeltaResult DeltaResult<_> { let table_schema = snapshot.schema(); + let cols: Vec<&str> = cols.split(",").map(str::trim).collect(); let selected_fields = cols.iter().map(|col| { table_schema .field(col) @@ -193,9 +203,43 @@ pub fn get_scan(snapshot: SnapshotRef, args: &ScanArgs) -> DeltaResult { + debug!("Inserting a row index column for row ids: {}", field.name()); + reorder_indices.push(ReorderIndex::row_index( + requested_position, + Arc::new(field.try_into_arrow()?), + )); + } Some(metadata_spec) => { return Err(Error::Generic(format!( "Metadata column {metadata_spec:?} is not supported by the default parquet reader" diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index a82d123d2..224c599c5 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -97,8 +97,9 @@ impl AddRemoveDedupVisitor<'_> { const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters const ADD_PARTITION_VALUES_INDEX: usize = 1; // Position of "add.partitionValues" in getters const ADD_DV_START_INDEX: usize = 2; // Start position of add deletion vector columns - const REMOVE_PATH_INDEX: usize = 5; // Position of "remove.path" in getters - const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns + const BASE_ROW_ID_INDEX: usize = 5; // Position of add.baseRowId in getters + const REMOVE_PATH_INDEX: usize = 6; // Position of "remove.path" in getters + const REMOVE_DV_START_INDEX: usize = 7; // Start position of remove deletion vector columns fn new( seen: 
&mut HashSet, @@ -187,10 +188,19 @@ impl AddRemoveDedupVisitor<'_> { if self.deduplicator.check_and_record_seen(file_key) || !is_add { return Ok(false); } + let base_row_id: Option = + getters[Self::BASE_ROW_ID_INDEX].get_opt(i, "add.baseRowId")?; let transform = self .transform_spec .as_ref() - .map(|transform| get_transform_expr(transform, partition_values, &self.physical_schema)) + .map(|transform| { + get_transform_expr( + transform, + partition_values, + &self.physical_schema, + base_row_id, + ) + }) .transpose()?; if transform.is_some() { // fill in any needed `None`s for previous rows @@ -207,6 +217,7 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { const STRING: DataType = DataType::STRING; const INTEGER: DataType = DataType::INTEGER; + const LONG: DataType = DataType::LONG; let ss_map: DataType = MapType::new(STRING, STRING, true).into(); let types_and_names = vec![ (STRING, column_name!("add.path")), @@ -214,6 +225,7 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { (STRING, column_name!("add.deletionVector.storageType")), (STRING, column_name!("add.deletionVector.pathOrInlineDv")), (INTEGER, column_name!("add.deletionVector.offset")), + (LONG, column_name!("add.baseRowId")), (STRING, column_name!("remove.path")), (STRING, column_name!("remove.deletionVector.storageType")), (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), @@ -228,13 +240,13 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } else { // All checkpoint actions are already reconciled and Remove actions in checkpoint files // only serve as tombstones for vacuum jobs. So we only need to examine the adds here. 
- (&names[..5], &types[..5]) + (&names[..6], &types[..6]) } } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { let is_log_batch = self.deduplicator.is_log_batch(); - let expected_getters = if is_log_batch { 9 } else { 5 }; + let expected_getters = if is_log_batch { 10 } else { 6 }; require!( getters.len() == expected_getters, Error::InternalError(format!( @@ -259,7 +271,10 @@ pub(crate) static SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| // Note that fields projected out of a nullable struct must be nullable let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); let file_constant_values = - StructType::new_unchecked([StructField::nullable("partitionValues", partition_values)]); + StructType::new_unchecked([ + StructField::nullable("partitionValues", partition_values), + StructField::nullable("baseRowId", DataType::LONG), + ]); Arc::new(StructType::new_unchecked([ StructField::nullable("path", DataType::STRING), StructField::nullable("size", DataType::LONG), @@ -282,9 +297,10 @@ fn get_add_transform_expr() -> ExpressionRef { column_expr_ref!("add.modificationTime"), column_expr_ref!("add.stats"), column_expr_ref!("add.deletionVector"), - Arc::new(Expression::Struct(vec![column_expr_ref!( - "add.partitionValues" - )])), + Arc::new(Expression::Struct(vec![ + column_expr_ref!("add.partitionValues"), + column_expr_ref!("add.baseRowId") + ])), ])) }); EXPR.clone() @@ -303,6 +319,7 @@ pub(crate) fn get_scan_metadata_transform_expr() -> ExpressionRef { column_expr_ref!("modificationTime"), column_expr_ref!("stats"), column_expr_ref!("deletionVector"), + column_expr_ref!("fileConstantValues.baseRowId"), ], ))])) }); @@ -466,60 +483,60 @@ mod tests { } } - #[test] - fn test_simple_transform() { - let schema: SchemaRef = Arc::new(StructType::new_unchecked([ - StructField::new("value", DataType::INTEGER, true), - StructField::new("date", DataType::DATE, true), - ])); - let partition_cols = 
["date".to_string()]; - let state_info = - StateInfo::try_new(schema.as_ref(), &partition_cols, ColumnMappingMode::None).unwrap(); - let static_transform = Some(Arc::new(get_transform_spec(&state_info.all_fields))); - let batch = vec![add_batch_with_partition_col()]; - let iter = scan_action_iter( - &SyncEngine::new(), - batch - .into_iter() - .map(|batch| Ok(ActionsBatch::new(batch as _, true))), - schema.clone(), - schema, - static_transform, - None, - ); - - fn validate_transform(transform: Option<&ExpressionRef>, expected_date_offset: i32) { - assert!(transform.is_some()); - let Expr::Transform(transform) = transform.unwrap().as_ref() else { - panic!("Transform should always be a Transform expr"); - }; - - // With sparse transforms, we expect only one insertion for the partition column - assert!(transform.prepended_fields.is_empty()); - let mut field_transforms = transform.field_transforms.iter(); - let (field_name, field_transform) = field_transforms.next().unwrap(); - assert_eq!(field_name, "value"); - assert!(!field_transform.is_replace); - let [expr] = &field_transform.exprs[..] 
else { - panic!("Expected a single insertion"); - }; - let Expr::Literal(Scalar::Date(date_offset)) = expr.as_ref() else { - panic!("Expected a literal date"); - }; - assert_eq!(*date_offset, expected_date_offset); - assert!(field_transforms.next().is_none()); - } - - for res in iter { - let scan_metadata = res.unwrap(); - let transforms = scan_metadata.scan_file_transforms; - // in this case we have a metadata action first and protocol 3rd, so we expect 4 items, - // the first and 3rd being a `None` - assert_eq!(transforms.len(), 4, "Should have 4 transforms"); - assert!(transforms[0].is_none(), "transform at [0] should be None"); - assert!(transforms[2].is_none(), "transform at [2] should be None"); - validate_transform(transforms[1].as_ref(), 17511); - validate_transform(transforms[3].as_ref(), 17510); - } - } + // #[test] + // fn test_simple_transform() { + // let schema: SchemaRef = Arc::new(StructType::new_unchecked([ + // StructField::new("value", DataType::INTEGER, true), + // StructField::new("date", DataType::DATE, true), + // ])); + // let partition_cols = ["date".to_string()]; + // let state_info = + // StateInfo::try_new(schema.as_ref(), &partition_cols, ColumnMappingMode::None).unwrap(); + // let static_transform = Some(Arc::new(get_transform_spec(&state_info.all_fields))); + // let batch = vec![add_batch_with_partition_col()]; + // let iter = scan_action_iter( + // &SyncEngine::new(), + // batch + // .into_iter() + // .map(|batch| Ok(ActionsBatch::new(batch as _, true))), + // schema.clone(), + // schema, + // static_transform, + // None, + // ); + + // fn validate_transform(transform: Option<&ExpressionRef>, expected_date_offset: i32) { + // assert!(transform.is_some()); + // let Expr::Transform(transform) = transform.unwrap().as_ref() else { + // panic!("Transform should always be a Transform expr"); + // }; + + // // With sparse transforms, we expect only one insertion for the partition column + // assert!(transform.prepended_fields.is_empty()); + 
// let mut field_transforms = transform.field_transforms.iter(); + // let (field_name, field_transform) = field_transforms.next().unwrap(); + // assert_eq!(field_name, "value"); + // assert!(!field_transform.is_replace); + // let [expr] = &field_transform.exprs[..] else { + // panic!("Expected a single insertion"); + // }; + // let Expr::Literal(Scalar::Date(date_offset)) = expr.as_ref() else { + // panic!("Expected a literal date"); + // }; + // assert_eq!(*date_offset, expected_date_offset); + // assert!(field_transforms.next().is_none()); + // } + + // for res in iter { + // let scan_metadata = res.unwrap(); + // let transforms = scan_metadata.scan_file_transforms; + // // in this case we have a metadata action first and protocol 3rd, so we expect 4 items, + // // the first and 3rd being a `None` + // assert_eq!(transforms.len(), 4, "Should have 4 transforms"); + // assert!(transforms[0].is_none(), "transform at [0] should be None"); + // assert!(transforms[2].is_none(), "transform at [2] should be None"); + // validate_transform(transforms[1].as_ref(), 17511); + // validate_transform(transforms[3].as_ref(), 17510); + // } + // } } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index bed1c1b9a..ad831c7cf 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -22,15 +22,15 @@ use crate::listed_log_files::ListedLogFiles; use crate::log_replay::{ActionsBatch, HasSelectionVector}; use crate::log_segment::LogSegment; use crate::scan::state::{DvInfo, Stats}; -use crate::schema::ToSchema as _; use crate::schema::{ ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, StructType, }; +use crate::schema::{MetadataColumnSpec, ToSchema as _}; use crate::snapshot::SnapshotRef; use crate::table_features::ColumnMappingMode; use crate::transforms::{get_transform_spec, ColumnType}; -use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, Version}; +use crate::{DeltaResult, Engine, EngineData, Error, 
FileMeta, Snapshot, Version}; use self::log_replay::scan_action_iter; @@ -116,11 +116,7 @@ impl ScanBuilder { pub fn build(self) -> DeltaResult { // if no schema is provided, use snapshot's entire schema (e.g. SELECT *) let logical_schema = self.schema.unwrap_or_else(|| self.snapshot.schema()); - let state_info = StateInfo::try_new( - logical_schema.as_ref(), - self.snapshot.metadata().partition_columns(), - self.snapshot.table_configuration().column_mapping_mode(), - )?; + let state_info = StateInfo::try_new(logical_schema.as_ref(), &self.snapshot)?; let physical_predicate = match self.predicate { Some(predicate) => PhysicalPredicate::try_new(&predicate, &logical_schema)?, @@ -133,7 +129,7 @@ impl ScanBuilder { physical_schema: Arc::new(StructType::try_new(state_info.read_fields)?), physical_predicate, all_fields: Arc::new(state_info.all_fields), - have_partition_cols: state_info.have_partition_cols, + need_transform: state_info.need_transform, }) } } @@ -367,7 +363,7 @@ pub struct Scan { physical_schema: SchemaRef, physical_predicate: PhysicalPredicate, all_fields: Arc>, - have_partition_cols: bool, + need_transform: bool, } impl std::fmt::Debug for Scan { @@ -510,6 +506,7 @@ impl Scan { StructField::nullable("modificationTime", DataType::LONG), StructField::nullable("stats", DataType::STRING), StructField::nullable("deletionVector", DeletionVectorDescriptor::to_schema()), + StructField::nullable("baseRowId", DataType::LONG), ]), )]) }); @@ -590,7 +587,7 @@ impl Scan { // needed. 
We need transforms for: // - Partition columns: Must be injected from partition values // - Column mapping: Physical field names must be mapped to logical field names via output schema - let static_transform = (self.have_partition_cols + let static_transform = (self.need_transform || self.snapshot.column_mapping_mode() != ColumnMappingMode::None) .then(|| Arc::new(get_transform_spec(&self.all_fields))); let physical_predicate = match self.physical_predicate.clone() { @@ -754,7 +751,8 @@ impl Scan { /// cardinality: long, /// }, /// fileConstantValues: { -/// partitionValues: map +/// partitionValues: map, +/// baseRowId: long /// } /// } /// ``` @@ -768,18 +766,16 @@ struct StateInfo { all_fields: Vec, /// The physical (parquet) read schema to use. read_fields: Vec, - /// True if this query references any partition columns. - have_partition_cols: bool, + /// True if this query needs physical -> logical transformation + need_transform: bool, } impl StateInfo { /// Get the state needed to process a scan. - fn try_new( - logical_schema: &Schema, - partition_columns: &[String], - column_mapping_mode: ColumnMappingMode, - ) -> DeltaResult { - let mut have_partition_cols = false; + fn try_new(logical_schema: &Schema, snapshot: &Snapshot) -> DeltaResult { + let partition_columns = snapshot.metadata().partition_columns(); + let column_mapping_mode = snapshot.table_configuration().column_mapping_mode(); + let mut need_transform = false; let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); let mut read_field_names = HashSet::with_capacity(logical_schema.num_fields()); @@ -801,21 +797,61 @@ impl StateInfo { // Store the index into the schema for this field. When we turn it into an // expression in the inner loop, we will index into the schema and get the name and // data type, which we need to properly materialize the column. 
- have_partition_cols = true; + need_transform = true; Ok(ColumnType::MetadataDerivedColumn(index)) } else { // Add to read schema, store field so we can build a `Column` expression later // if needed (i.e. if we have partition columns) - let physical_field = logical_field.make_physical(column_mapping_mode); - debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); - let physical_name = physical_field.name.clone(); - read_fields.push(physical_field); + let mut select_physical_field = || { + let physical_field = logical_field.make_physical(column_mapping_mode); + debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); + let physical_name = physical_field.name.clone(); + read_fields.push(physical_field); - if !logical_field.is_metadata_column() { - read_field_names.insert(physical_name.clone()); - } + if !logical_field.is_metadata_column() { + read_field_names.insert(physical_name.clone()); + } - Ok(ColumnType::Selected(physical_name)) + Ok(ColumnType::Selected(physical_name)) + }; + match logical_field.get_metadata_column_spec() { + Some(MetadataColumnSpec::RowId) => { + if snapshot + .table_configuration() + .table_properties() + .enable_row_tracking + == Some(true) + { + let row_id_col = snapshot + .metadata() + .configuration() + .get("delta.rowTracking.materializedRowIdColumnName") + .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; + let index_column_name = "_metadata.row_indexes_for_row_id".to_string(); + read_fields.push(StructField::create_metadata_column(&index_column_name, MetadataColumnSpec::RowIndex)); + read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); + need_transform = true; + Ok(ColumnType::RowId { + index_column_name: index_column_name, + physical_name: row_id_col.to_string(), + }) + } else { + return Err(Error::unsupported( + "Row ids are not enabled on this table", + )); + } + } + Some(MetadataColumnSpec::RowIndex) => { + // 
handled in parquet reader (TODO: include in output schema?) + select_physical_field() + } + Some(MetadataColumnSpec::RowCommitVersion) => { + return Err(Error::unsupported( + "Row commit versions not supported", + )); + } + _ => select_physical_field() + } } }) .try_collect()?; @@ -833,7 +869,7 @@ impl StateInfo { Ok(StateInfo { all_fields, read_fields, - have_partition_cols, + need_transform, }) } } diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs index 90a3422f8..752135323 100644 --- a/kernel/src/scan/state.rs +++ b/kernel/src/scan/state.rs @@ -179,7 +179,7 @@ impl RowVisitor for ScanFileVisitor<'_, T> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { require!( - getters.len() == 10, + getters.len() == 11, Error::InternalError(format!( "Wrong number of ScanFileVisitor getters: {}", getters.len() diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 173a32eaa..8323818ce 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -93,7 +93,12 @@ pub(crate) fn get_cdf_transform_expr( let cdf_values = get_cdf_columns(logical_schema, scan_file)?; partition_values.extend(cdf_values); - get_transform_expr(transform_spec, partition_values, physical_schema) + get_transform_expr( + transform_spec, + partition_values, + physical_schema, + None, /* base_row_id */ + ) } #[cfg(test)] diff --git a/kernel/src/transforms.rs b/kernel/src/transforms.rs index 502644301..9ee2ef1a6 100644 --- a/kernel/src/transforms.rs +++ b/kernel/src/transforms.rs @@ -9,7 +9,7 @@ use std::sync::Arc; use itertools::Itertools; -use crate::expressions::{Expression, ExpressionRef, Scalar, Transform}; +use crate::expressions::{BinaryExpressionOp, Expression, ExpressionRef, Scalar, Transform}; use crate::schema::{DataType, SchemaRef, StructType}; use crate::{DeltaResult, Error}; @@ -23,6 +23,12 @@ pub(crate) enum 
ColumnType { /// The string contains the physical column name (after any column mapping). Selected(String), + /// Get the rowid column, the usize is the location in the output schema + RowId { + index_column_name: String, // column name for row indexes + physical_name: String, // column name that contains stable row ids + }, + /// A metadata-derived column (e.g., partition columns, CDF version and timestamp columns). /// The usize is the index of this column in the logical schema. MetadataDerivedColumn(usize), @@ -58,19 +64,18 @@ pub(crate) enum FieldTransformSpec { insert_after: Option, expr: ExpressionRef, }, - /// Replace the named input column with an expression - // NOTE: Row tracking will eventually need to replace the physical rowid column with a COALESCE - // to compute non-materialized row ids and row commit versions. - #[allow(unused)] - StaticReplace { - field_name: String, - expr: ExpressionRef, - }, /// Drops the named input column - // NOTE: Row tracking will need to drop metadata columns that were used to compute rowids, since + // NOTE: Row tracking needs to drop metadata columns that were used to compute rowids, since // they should not appear in the query's output. #[allow(unused)] StaticDrop { field_name: String }, + /// Generate the RowId column. + GenerateRowId { + /// column name which should end up containing the RowId + field_name: String, + /// column name which contains row indexes + row_index_field_name: String, + }, /// Insert a partition column after the named input column. /// The partition column is identified by its field index in the logical table schema. /// Its value varies from file to file and is obtained from file metadata. @@ -122,7 +127,7 @@ pub(crate) fn parse_partition_values( ), FieldTransformSpec::DynamicColumn { .. } | FieldTransformSpec::StaticInsert { .. } - | FieldTransformSpec::StaticReplace { .. } + | FieldTransformSpec::GenerateRowId { .. } | FieldTransformSpec::StaticDrop { .. 
} => None, }) .try_collect() @@ -137,6 +142,7 @@ pub(crate) fn get_transform_expr( transform_spec: &TransformSpec, mut metadata_values: HashMap, physical_schema: &StructType, + base_row_id: Option, ) -> DeltaResult { let mut transform = Transform::new_top_level(); @@ -146,10 +152,27 @@ pub(crate) fn get_transform_expr( StaticInsert { insert_after, expr } => { transform.with_inserted_field(insert_after.clone(), expr.clone()) } - StaticReplace { field_name, expr } => { - transform.with_replaced_field(field_name.clone(), expr.clone()) - } StaticDrop { field_name } => transform.with_dropped_field(field_name.clone()), + GenerateRowId { + field_name, + row_index_field_name, + } => { + let base_row_id = base_row_id.ok_or_else(|| Error::Generic(format!( + "Asked to generate RowIds, but no baseRowId found." + )))?; + let expr = Arc::new(Expression::variadic( + crate::expressions::VariadicExpressionOp::Coalesce, + vec![ + Expression::column([field_name]), + Expression::binary( + BinaryExpressionOp::Plus, + Expression::literal(base_row_id), + Expression::column([row_index_field_name]), + ), + ], + )); + transform.with_replaced_field(field_name.clone(), expr) + } MetadataDerivedColumn { field_index, insert_after, @@ -221,6 +244,32 @@ pub(crate) fn get_transform_spec(all_fields: &[ColumnType]) -> TransformSpec { field_index: *logical_idx, }); } + ColumnType::RowId { + index_column_name, + physical_name, + } => { + // transform_spec.push(FieldTransformSpec::StaticReplace { + // field_name: physical_name.clone(), + // expr: Arc::new(Expression::variadic( + // crate::expressions::VariadicExpressionOp::Coalesce, + // vec![ + // Expression::column([physical_name]), + // Expression::binary( + // BinaryExpressionOp::Plus, + // Expression::literal(3i64), + // Expression::column([index_column_name]), + // ), + // ], + // )), + // }); + transform_spec.push(FieldTransformSpec::GenerateRowId { + field_name: physical_name.clone(), + row_index_field_name: index_column_name.clone(), + }); + 
transform_spec.push(FieldTransformSpec::StaticDrop { + field_name: index_column_name.clone(), + }); + } ColumnType::Dynamic { logical_index, physical_name, @@ -456,7 +505,7 @@ mod tests { // Create a minimal physical schema for test let physical_schema = StructType::new_unchecked(vec![]); - let result = get_transform_expr(&transform_spec, partition_values, &physical_schema); + let result = get_transform_expr(&transform_spec, partition_values, &physical_schema, None /* base_row_id */); assert_result_error_with_message(result, "missing partition value"); } @@ -468,12 +517,8 @@ mod tests { insert_after: Some("col1".to_string()), expr: expr.clone(), }, - FieldTransformSpec::StaticReplace { - field_name: "col2".to_string(), - expr: expr.clone(), - }, FieldTransformSpec::StaticDrop { - field_name: "col3".to_string(), + field_name: "col2".to_string(), }, ]; let metadata_values = HashMap::new(); @@ -481,11 +526,10 @@ mod tests { // Create a physical schema with the relevant columns let physical_schema = StructType::new_unchecked(vec![ StructField::nullable("col1", DataType::STRING), - StructField::nullable("col2", DataType::INTEGER), - StructField::nullable("col3", DataType::LONG), + StructField::nullable("col2", DataType::LONG), ]); let result = - get_transform_expr(&transform_spec, metadata_values, &physical_schema).unwrap(); + get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */).unwrap(); let Expression::Transform(transform) = result.as_ref() else { panic!("Expected Transform expression"); @@ -501,20 +545,10 @@ mod tests { }; assert_eq!(scalar, &Scalar::Integer(42)); - // Verify StaticReplace: should replace col2 with the expression + // Verify StaticDrop: should drop col2 (empty expressions and is_replace = true) assert!(transform.field_transforms.contains_key("col2")); assert!(transform.field_transforms["col2"].is_replace); - assert_eq!(transform.field_transforms["col2"].exprs.len(), 1); - let Expression::Literal(scalar) = 
transform.field_transforms["col2"].exprs[0].as_ref() - else { - panic!("Expected literal expression for replace"); - }; - assert_eq!(scalar, &Scalar::Integer(42)); - - // Verify StaticDrop: should drop col3 (empty expressions and is_replace = true) - assert!(transform.field_transforms.contains_key("col3")); - assert!(transform.field_transforms["col3"].is_replace); - assert!(transform.field_transforms["col3"].exprs.is_empty()); + assert!(transform.field_transforms["col2"].exprs.is_empty()); } #[test] @@ -532,7 +566,7 @@ mod tests { ]); let metadata_values = HashMap::new(); - let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema); + let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */); let transform_expr = result.expect("Transform expression should be created successfully"); let Expression::Transform(transform) = transform_expr.as_ref() else { @@ -575,7 +609,7 @@ mod tests { ), ); - let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema); + let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */); let transform_expr = result.expect("Transform expression should be created successfully"); let Expression::Transform(transform) = transform_expr.as_ref() else { @@ -607,7 +641,7 @@ mod tests { let mut metadata_values = HashMap::new(); metadata_values.insert(1, ("year".to_string(), Scalar::Integer(2024))); - let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema); + let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */); let transform_expr = result.expect("Transform expression should be created successfully"); let Expression::Transform(transform) = transform_expr.as_ref() else { @@ -646,7 +680,7 @@ mod tests { let metadata_values = HashMap::new(); // Should fail with missing data error - let result = get_transform_expr(&transform_spec, 
metadata_values, &physical_schema); + let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */); assert_result_error_with_message(result, "missing partition value for dynamic column"); } } diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 91fb2f052..9d94c5c97 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -17,7 +17,7 @@ use delta_kernel::parquet::file::properties::{EnabledStatistics, WriterPropertie use delta_kernel::scan::state::{transform_to_logical, DvInfo, Stats}; use delta_kernel::scan::{Scan, ScanResult}; use delta_kernel::schema::{DataType, MetadataColumnSpec, Schema, StructField, StructType}; -use delta_kernel::{Engine, FileMeta, Snapshot}; +use delta_kernel::{DeltaResult, Engine, FileMeta, Snapshot}; use itertools::Itertools; use object_store::{memory::InMemory, path::Path, ObjectStore}; @@ -1500,45 +1500,34 @@ async fn test_unsupported_metadata_columns() -> Result<(), Box { - let error_msg = e.to_string(); - if error_msg.contains(error_text) && error_msg.contains("not supported") { - found_error = true; - break; - } - } - Ok(_) => { - panic!( - "Expected error for {} metadata column, but scan succeeded", - error_text - ); - } + + let scan = snapshot.scan_builder().with_schema(schema).build(); + match scan { + Err(e) => { + let error_msg = e.to_string(); + assert!(error_msg.contains(error_text), "Expected {error_msg} to contain {error_text}"); + } + Ok(_) => { + panic!( + "Expected error for {} metadata column, but scan succeeded", + error_text + ); } } - assert!( - found_error, - "Expected error about {} not being supported", - error_text - ); } Ok(()) From 397af574eec59c9fd23f17154ffd4c5a2a107857 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 6 Oct 2025 17:17:22 -0700 Subject: [PATCH 02/28] cleanup, add back tests --- kernel/src/scan/log_replay.rs | 86 +++++--- kernel/src/scan/mod.rs | 403 ++++++++++++++++++---------------- kernel/src/transforms.rs | 82 ++++--- 
kernel/tests/read.rs | 13 +- 4 files changed, 316 insertions(+), 268 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b8d2a7187..da6b446d4 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -278,11 +278,10 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { pub(crate) static SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| { // Note that fields projected out of a nullable struct must be nullable let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); - let file_constant_values = - StructType::new_unchecked([ - StructField::nullable("partitionValues", partition_values), - StructField::nullable("baseRowId", DataType::LONG), - ]); + let file_constant_values = StructType::new_unchecked([ + StructField::nullable("partitionValues", partition_values), + StructField::nullable("baseRowId", DataType::LONG), + ]); Arc::new(StructType::new_unchecked([ StructField::nullable("path", DataType::STRING), StructField::nullable("size", DataType::LONG), @@ -307,7 +306,7 @@ fn get_add_transform_expr() -> ExpressionRef { column_expr_ref!("add.deletionVector"), Arc::new(Expression::Struct(vec![ column_expr_ref!("add.partitionValues"), - column_expr_ref!("add.baseRowId") + column_expr_ref!("add.baseRowId"), ])), ])) }); @@ -401,8 +400,8 @@ mod tests { add_batch_simple, add_batch_with_partition_col, add_batch_with_remove, run_with_validate_callback, }; + use crate::scan::tests::get_state_info; use crate::scan::{PhysicalPredicate, StateInfo}; - use crate::table_features::ColumnMappingMode; use crate::Expression as Expr; use crate::{ engine::sync::SyncEngine, @@ -484,26 +483,55 @@ mod tests { } } - // #[test] - // fn test_simple_transform() { - // let schema: SchemaRef = Arc::new(StructType::new_unchecked([ - // StructField::new("value", DataType::INTEGER, true), - // StructField::new("date", DataType::DATE, true), - // ])); - // let partition_cols = ["date".to_string()]; - // let state_info = - // 
StateInfo::try_new(schema.as_ref(), &partition_cols, ColumnMappingMode::None).unwrap(); - // let static_transform = Some(Arc::new(get_transform_spec(&state_info.all_fields))); - // let batch = vec![add_batch_with_partition_col()]; - // let iter = scan_action_iter( - // &SyncEngine::new(), - // batch - // .into_iter() - // .map(|batch| Ok(ActionsBatch::new(batch as _, true))), - // schema.clone(), - // schema, - // static_transform, - // None, - // ); - // } + #[test] + fn test_simple_transform() { + let schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::new("value", DataType::INTEGER, true), + StructField::new("date", DataType::DATE, true), + ])); + let partition_cols = vec!["date".to_string()]; + let state_info = get_state_info(schema, partition_cols, None); + let batch = vec![add_batch_with_partition_col()]; + let iter = scan_action_iter( + &SyncEngine::new(), + batch + .into_iter() + .map(|batch| Ok(ActionsBatch::new(batch as _, true))), + Arc::new(state_info), + ); + + fn validate_transform(transform: Option<&ExpressionRef>, expected_date_offset: i32) { + assert!(transform.is_some()); + let Expr::Transform(transform) = transform.unwrap().as_ref() else { + panic!("Transform should always be a Transform expr"); + }; + + // With sparse transforms, we expect only one insertion for the partition column + assert!(transform.prepended_fields.is_empty()); + let mut field_transforms = transform.field_transforms.iter(); + let (field_name, field_transform) = field_transforms.next().unwrap(); + assert_eq!(field_name, "value"); + assert!(!field_transform.is_replace); + let [expr] = &field_transform.exprs[..] 
else { + panic!("Expected a single insertion"); + }; + let Expr::Literal(Scalar::Date(date_offset)) = expr.as_ref() else { + panic!("Expected a literal date"); + }; + assert_eq!(*date_offset, expected_date_offset); + assert!(field_transforms.next().is_none()); + } + + for res in iter { + let scan_metadata = res.unwrap(); + let transforms = scan_metadata.scan_file_transforms; + // in this case we have a metadata action first and protocol 3rd, so we expect 4 items, + // the first and 3rd being a `None` + assert_eq!(transforms.len(), 4, "Should have 4 transforms"); + assert!(transforms[0].is_none(), "transform at [0] should be None"); + assert!(transforms[2].is_none(), "transform at [2] should be None"); + validate_transform(transforms[1].as_ref(), 17511); + validate_transform(transforms[3].as_ref(), 17510); + } + } } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 94fc1ebfd..ec9419856 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -28,9 +28,10 @@ use crate::schema::{ }; use crate::schema::{MetadataColumnSpec, ToSchema as _}; use crate::snapshot::SnapshotRef; +use crate::table_configuration::TableConfiguration; use crate::table_features::ColumnMappingMode; use crate::transforms::{FieldTransformSpec, TransformSpec}; -use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, Snapshot, Version}; +use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, Version}; use self::log_replay::scan_action_iter; @@ -116,7 +117,11 @@ impl ScanBuilder { pub fn build(self) -> DeltaResult { // if no schema is provided, use snapshot's entire schema (e.g. 
SELECT *) let logical_schema = self.schema.unwrap_or_else(|| self.snapshot.schema()); - let state_info = StateInfo::try_new(logical_schema, &self.snapshot, self.predicate)?; + let state_info = StateInfo::try_new( + logical_schema, + self.snapshot.table_configuration(), + self.predicate, + )?; Ok(Scan { snapshot: self.snapshot, @@ -753,11 +758,11 @@ impl StateInfo { /// `predicate` - Optional predicate to filter data during the scan pub(crate) fn try_new( logical_schema: SchemaRef, - snapshot: &Snapshot, + table_configuration: &TableConfiguration, predicate: Option, ) -> DeltaResult { - let partition_columns = snapshot.metadata().partition_columns(); - let column_mapping_mode = snapshot.table_configuration().column_mapping_mode(); + let partition_columns = table_configuration.metadata().partition_columns(); + let column_mapping_mode = table_configuration.column_mapping_mode(); let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); let mut read_field_names = HashSet::with_capacity(logical_schema.num_fields()); let mut transform_spec = Vec::new(); @@ -771,7 +776,6 @@ impl StateInfo { "Metadata column names must not match partition columns: {}", logical_field.name() ))); - } // Partition column: needs to be injected via transform @@ -794,19 +798,18 @@ impl StateInfo { }; match logical_field.get_metadata_column_spec() { Some(MetadataColumnSpec::RowId) => { - if snapshot - .table_configuration() - .table_properties() - .enable_row_tracking - == Some(true) + if table_configuration.table_properties().enable_row_tracking == Some(true) { - let row_id_col = snapshot + let row_id_col = table_configuration .metadata() .configuration() .get("delta.rowTracking.materializedRowIdColumnName") .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; let index_column_name = "_metadata.row_indexes_for_row_id".to_string(); - read_fields.push(StructField::create_metadata_column(&index_column_name, 
MetadataColumnSpec::RowIndex)); + read_fields.push(StructField::create_metadata_column( + &index_column_name, + MetadataColumnSpec::RowIndex, + )); read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); transform_spec.push(FieldTransformSpec::GenerateRowId { field_name: row_id_col.to_string(), @@ -818,7 +821,7 @@ impl StateInfo { } else { return Err(Error::unsupported( "Row ids are not enabled on this table", - )); + )); } } Some(MetadataColumnSpec::RowIndex) => { @@ -826,9 +829,7 @@ impl StateInfo { select_physical_field() } Some(MetadataColumnSpec::RowCommitVersion) => { - return Err(Error::unsupported( - "Row commit versions not supported", - )); + return Err(Error::unsupported("Row commit versions not supported")); } _ => select_physical_field(), } @@ -1027,6 +1028,7 @@ pub(crate) mod test_utils { mod tests { use std::path::PathBuf; + use crate::actions::{Metadata, Protocol}; use crate::arrow::array::BooleanArray; use crate::arrow::compute::filter_record_batch; use crate::arrow::record_batch::RecordBatch; @@ -1525,181 +1527,194 @@ mod tests { Ok(()) } - // #[test] - // fn test_state_info_no_partition_columns() { - // // Test case: No partition columns, no column mapping - // let schema = Arc::new(StructType::new_unchecked(vec![ - // StructField::nullable("id", DataType::STRING), - // StructField::nullable("value", DataType::LONG), - // ])); - - // let state_info = StateInfo::try_new( - // schema.clone(), - // &[], // No partition columns - // ColumnMappingMode::None, - // None, // No predicate - // ) - // .unwrap(); - - // // Should have no transform spec (no partitions, no column mapping) - // assert!(state_info.transform_spec.is_none()); - - // // Physical schema should match logical schema - // assert_eq!(state_info.logical_schema, schema); - // assert_eq!(state_info.physical_schema.fields().len(), 2); - - // // No predicate - // assert_eq!(state_info.physical_predicate, PhysicalPredicate::None); - // } - - // #[test] - // fn 
test_state_info_with_partition_columns() { - // // Test case: With partition columns - // let schema = Arc::new(StructType::new_unchecked(vec![ - // StructField::nullable("id", DataType::STRING), - // StructField::nullable("date", DataType::DATE), // Partition column - // StructField::nullable("value", DataType::LONG), - // ])); - - // let state_info = StateInfo::try_new( - // schema.clone(), - // &["date".to_string()], // date is a partition column - // ColumnMappingMode::None, - // None, - // ) - // .unwrap(); - - // // Should have a transform spec for the partition column - // assert!(state_info.transform_spec.is_some()); - // let transform_spec = state_info.transform_spec.as_ref().unwrap(); - // assert_eq!(transform_spec.len(), 1); - - // // Check the transform spec for the partition column - // match &transform_spec[0] { - // FieldTransformSpec::MetadataDerivedColumn { - // field_index, - // insert_after, - // } => { - // assert_eq!(*field_index, 1); // Index of "date" in logical schema - // assert_eq!(insert_after, &Some("id".to_string())); // After "id" which is physical - // } - // _ => panic!("Expected MetadataDerivedColumn transform"), - // } - - // // Physical schema should not include partition column - // assert_eq!(state_info.logical_schema, schema); - // assert_eq!(state_info.physical_schema.fields().len(), 2); // Only id and value - // } - - // #[test] - // fn test_state_info_multiple_partition_columns() { - // // Test case: Multiple partition columns interspersed with regular columns - // let schema = Arc::new(StructType::new_unchecked(vec![ - // StructField::nullable("col1", DataType::STRING), - // StructField::nullable("part1", DataType::STRING), // Partition - // StructField::nullable("col2", DataType::LONG), - // StructField::nullable("part2", DataType::INTEGER), // Partition - // ])); - - // let state_info = StateInfo::try_new( - // schema.clone(), - // &["part1".to_string(), "part2".to_string()], - // ColumnMappingMode::None, - // None, - // 
) - // .unwrap(); - - // // Should have transforms for both partition columns - // assert!(state_info.transform_spec.is_some()); - // let transform_spec = state_info.transform_spec.as_ref().unwrap(); - // assert_eq!(transform_spec.len(), 2); - - // // Check first partition column transform - // match &transform_spec[0] { - // FieldTransformSpec::MetadataDerivedColumn { - // field_index, - // insert_after, - // } => { - // assert_eq!(*field_index, 1); // Index of "part1" - // assert_eq!(insert_after, &Some("col1".to_string())); - // } - // _ => panic!("Expected MetadataDerivedColumn transform"), - // } - - // // Check second partition column transform - // match &transform_spec[1] { - // FieldTransformSpec::MetadataDerivedColumn { - // field_index, - // insert_after, - // } => { - // assert_eq!(*field_index, 3); // Index of "part2" - // assert_eq!(insert_after, &Some("col2".to_string())); - // } - // _ => panic!("Expected MetadataDerivedColumn transform"), - // } - - // // Physical schema should only have non-partition columns - // assert_eq!(state_info.physical_schema.fields().len(), 2); // col1 and col2 - // } - - // #[test] - // fn test_state_info_with_predicate() { - // // Test case: With a valid predicate - // let schema = Arc::new(StructType::new_unchecked(vec![ - // StructField::nullable("id", DataType::STRING), - // StructField::nullable("value", DataType::LONG), - // ])); - - // let predicate = Arc::new(column_expr!("value").gt(Expr::literal(10i64))); - - // let state_info = StateInfo::try_new( - // schema.clone(), - // &[], - // ColumnMappingMode::None, - // Some(predicate), - // ) - // .unwrap(); - - // // Should have a physical predicate - // match &state_info.physical_predicate { - // PhysicalPredicate::Some(_pred, schema) => { - // // Physical predicate exists - // assert_eq!(schema.fields().len(), 1); // Only "value" is referenced - // } - // _ => panic!("Expected PhysicalPredicate::Some"), - // } - // } - - // #[test] - // fn 
test_state_info_partition_at_beginning() { - // // Test case: Partition column at the beginning - // let schema = Arc::new(StructType::new_unchecked(vec![ - // StructField::nullable("date", DataType::DATE), // Partition column - // StructField::nullable("id", DataType::STRING), - // StructField::nullable("value", DataType::LONG), - // ])); - - // let state_info = StateInfo::try_new( - // schema.clone(), - // &["date".to_string()], - // ColumnMappingMode::None, - // None, - // ) - // .unwrap(); - - // // Should have a transform spec for the partition column - // let transform_spec = state_info.transform_spec.as_ref().unwrap(); - // assert_eq!(transform_spec.len(), 1); - - // match &transform_spec[0] { - // FieldTransformSpec::MetadataDerivedColumn { - // field_index, - // insert_after, - // } => { - // assert_eq!(*field_index, 0); // Index of "date" - // assert_eq!(insert_after, &None); // No physical field before it, so prepend - // } - // _ => panic!("Expected MetadataDerivedColumn transform"), - // } - // } + pub(crate) fn get_state_info( + schema: SchemaRef, + partition_columns: Vec, + predicate: Option, + ) -> StateInfo { + let metadata = Metadata::try_new( + None, + None, + schema.as_ref().clone(), + partition_columns, + 10, + HashMap::new(), + ) + .unwrap(); + let no_features: Option> = None; // needed for type annotation + let protocol = Protocol::try_new(1, 2, no_features.clone(), no_features).unwrap(); + let table_configuration = TableConfiguration::try_new( + metadata, + protocol, + Url::parse("s3://my-table").unwrap(), + 1, + ) + .unwrap(); + + StateInfo::try_new(schema.clone(), &table_configuration, predicate).unwrap() + } + + #[test] + fn test_state_info_no_partition_columns() { + // Test case: No partition columns, no column mapping + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + ])); + + let state_info = get_state_info( + schema.clone(), + 
vec![], // no partition columns + None, // No predicate + ); + + // Should have no transform spec (no partitions, no column mapping) + assert!(state_info.transform_spec.is_none()); + + // Physical schema should match logical schema + assert_eq!(state_info.logical_schema, schema); + assert_eq!(state_info.physical_schema.fields().len(), 2); + + // No predicate + assert_eq!(state_info.physical_predicate, PhysicalPredicate::None); + } + + #[test] + fn test_state_info_with_partition_columns() { + // Test case: With partition columns + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("date", DataType::DATE), // Partition column + StructField::nullable("value", DataType::LONG), + ])); + + let state_info = get_state_info( + schema.clone(), + vec!["date".to_string()], // date is a partition column + None, // no predicate + ); + + // Should have a transform spec for the partition column + assert!(state_info.transform_spec.is_some()); + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 1); + + // Check the transform spec for the partition column + match &transform_spec[0] { + FieldTransformSpec::MetadataDerivedColumn { + field_index, + insert_after, + } => { + assert_eq!(*field_index, 1); // Index of "date" in logical schema + assert_eq!(insert_after, &Some("id".to_string())); // After "id" which is physical + } + _ => panic!("Expected MetadataDerivedColumn transform"), + } + + // Physical schema should not include partition column + assert_eq!(state_info.logical_schema, schema); + assert_eq!(state_info.physical_schema.fields().len(), 2); // Only id and value + } + + #[test] + fn test_state_info_multiple_partition_columns() { + // Test case: Multiple partition columns interspersed with regular columns + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("col1", DataType::STRING), + StructField::nullable("part1", 
DataType::STRING), // Partition + StructField::nullable("col2", DataType::LONG), + StructField::nullable("part2", DataType::INTEGER), // Partition + ])); + + let state_info = get_state_info( + schema.clone(), + vec!["part1".to_string(), "part2".to_string()], + None, // no predicate + ); + + // Should have transforms for both partition columns + assert!(state_info.transform_spec.is_some()); + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 2); + + // Check first partition column transform + match &transform_spec[0] { + FieldTransformSpec::MetadataDerivedColumn { + field_index, + insert_after, + } => { + assert_eq!(*field_index, 1); // Index of "part1" + assert_eq!(insert_after, &Some("col1".to_string())); + } + _ => panic!("Expected MetadataDerivedColumn transform"), + } + + // Check second partition column transform + match &transform_spec[1] { + FieldTransformSpec::MetadataDerivedColumn { + field_index, + insert_after, + } => { + assert_eq!(*field_index, 3); // Index of "part2" + assert_eq!(insert_after, &Some("col2".to_string())); + } + _ => panic!("Expected MetadataDerivedColumn transform"), + } + + // Physical schema should only have non-partition columns + assert_eq!(state_info.physical_schema.fields().len(), 2); // col1 and col2 + } + + #[test] + fn test_state_info_with_predicate() { + // Test case: With a valid predicate + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + ])); + + let predicate = Arc::new(column_expr!("value").gt(Expr::literal(10i64))); + + let state_info = get_state_info( + schema.clone(), + vec![], // no partition columns + Some(predicate), + ); + + // Should have a physical predicate + match &state_info.physical_predicate { + PhysicalPredicate::Some(_pred, schema) => { + // Physical predicate exists + assert_eq!(schema.fields().len(), 1); // Only "value" is referenced + } + _ => 
panic!("Expected PhysicalPredicate::Some"), + } + } + + #[test] + fn test_state_info_partition_at_beginning() { + // Test case: Partition column at the beginning + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("date", DataType::DATE), // Partition column + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + ])); + + let state_info = get_state_info(schema.clone(), vec!["date".to_string()], None); + + // Should have a transform spec for the partition column + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 1); + + match &transform_spec[0] { + FieldTransformSpec::MetadataDerivedColumn { + field_index, + insert_after, + } => { + assert_eq!(*field_index, 0); // Index of "date" + assert_eq!(insert_after, &None); // No physical field before it, so prepend + } + _ => panic!("Expected MetadataDerivedColumn transform"), + } + } } diff --git a/kernel/src/transforms.rs b/kernel/src/transforms.rs index f84c004bb..be3a0278b 100644 --- a/kernel/src/transforms.rs +++ b/kernel/src/transforms.rs @@ -24,12 +24,6 @@ pub(crate) enum ColumnType { /// The string contains the physical column name (after any column mapping). Selected(String), - /// Get the rowid column, the usize is the location in the output schema - RowId { - index_column_name: String, // column name for row indexes - physical_name: String, // column name that contains stable row ids - }, - /// A metadata-derived column (e.g., partition columns, CDF version and timestamp columns). /// The usize is the index of this column in the logical schema. MetadataDerivedColumn(usize), @@ -158,9 +152,9 @@ pub(crate) fn get_transform_expr( field_name, row_index_field_name, } => { - let base_row_id = base_row_id.ok_or_else(|| Error::Generic(format!( - "Asked to generate RowIds, but no baseRowId found." 
- )))?; + let base_row_id = base_row_id.ok_or_else(|| { + Error::generic("Asked to generate RowIds, but no baseRowId found.") + })?; let expr = Arc::new(Expression::variadic( crate::expressions::VariadicExpressionOp::Coalesce, vec![ @@ -246,32 +240,6 @@ pub(crate) fn get_transform_spec(all_fields: &[ColumnType]) -> TransformSpec { field_index: *logical_idx, }); } - ColumnType::RowId { - index_column_name, - physical_name, - } => { - // transform_spec.push(FieldTransformSpec::StaticReplace { - // field_name: physical_name.clone(), - // expr: Arc::new(Expression::variadic( - // crate::expressions::VariadicExpressionOp::Coalesce, - // vec![ - // Expression::column([physical_name]), - // Expression::binary( - // BinaryExpressionOp::Plus, - // Expression::literal(3i64), - // Expression::column([index_column_name]), - // ), - // ], - // )), - // }); - transform_spec.push(FieldTransformSpec::GenerateRowId { - field_name: physical_name.clone(), - row_index_field_name: index_column_name.clone(), - }); - transform_spec.push(FieldTransformSpec::StaticDrop { - field_name: index_column_name.clone(), - }); - } ColumnType::Dynamic { logical_index, physical_name, @@ -507,7 +475,12 @@ mod tests { // Create a minimal physical schema for test let physical_schema = StructType::new_unchecked(vec![]); - let result = get_transform_expr(&transform_spec, partition_values, &physical_schema, None /* base_row_id */); + let result = get_transform_expr( + &transform_spec, + partition_values, + &physical_schema, + None, /* base_row_id */ + ); assert_result_error_with_message(result, "missing partition value"); } @@ -530,8 +503,13 @@ mod tests { StructField::nullable("col1", DataType::STRING), StructField::nullable("col2", DataType::LONG), ]); - let result = - get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */).unwrap(); + let result = get_transform_expr( + &transform_spec, + metadata_values, + &physical_schema, + None, /* base_row_id */ + ) + .unwrap(); 
let Expression::Transform(transform) = result.as_ref() else { panic!("Expected Transform expression"); @@ -568,7 +546,12 @@ mod tests { ]); let metadata_values = HashMap::new(); - let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */); + let result = get_transform_expr( + &transform_spec, + metadata_values, + &physical_schema, + None, /* base_row_id */ + ); let transform_expr = result.expect("Transform expression should be created successfully"); let Expression::Transform(transform) = transform_expr.as_ref() else { @@ -611,7 +594,12 @@ mod tests { ), ); - let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */); + let result = get_transform_expr( + &transform_spec, + metadata_values, + &physical_schema, + None, /* base_row_id */ + ); let transform_expr = result.expect("Transform expression should be created successfully"); let Expression::Transform(transform) = transform_expr.as_ref() else { @@ -643,7 +631,12 @@ mod tests { let mut metadata_values = HashMap::new(); metadata_values.insert(1, ("year".to_string(), Scalar::Integer(2024))); - let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */); + let result = get_transform_expr( + &transform_spec, + metadata_values, + &physical_schema, + None, /* base_row_id */ + ); let transform_expr = result.expect("Transform expression should be created successfully"); let Expression::Transform(transform) = transform_expr.as_ref() else { @@ -682,7 +675,12 @@ mod tests { let metadata_values = HashMap::new(); // Should fail with missing data error - let result = get_transform_expr(&transform_spec, metadata_values, &physical_schema, None /* base_row_id */); + let result = get_transform_expr( + &transform_spec, + metadata_values, + &physical_schema, + None, /* base_row_id */ + ); assert_result_error_with_message(result, "missing partition value for dynamic column"); } } diff --git 
a/kernel/tests/read.rs b/kernel/tests/read.rs index 9d94c5c97..0edb33765 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -17,7 +17,7 @@ use delta_kernel::parquet::file::properties::{EnabledStatistics, WriterPropertie use delta_kernel::scan::state::{transform_to_logical, DvInfo, Stats}; use delta_kernel::scan::{Scan, ScanResult}; use delta_kernel::schema::{DataType, MetadataColumnSpec, Schema, StructField, StructType}; -use delta_kernel::{DeltaResult, Engine, FileMeta, Snapshot}; +use delta_kernel::{Engine, FileMeta, Snapshot}; use itertools::Itertools; use object_store::{memory::InMemory, path::Path, ObjectStore}; @@ -1500,7 +1500,11 @@ async fn test_unsupported_metadata_columns() -> Result<(), Box Result<(), Box { let error_msg = e.to_string(); - assert!(error_msg.contains(error_text), "Expected {error_msg} to contain {error_text}"); + assert!( + error_msg.contains(error_text), + "Expected {error_msg} to contain {error_text}" + ); } Ok(_) => { panic!( From aa68b80596ffc2ef1090694ab24dac9b588efbf2 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 7 Oct 2025 09:28:28 -0700 Subject: [PATCH 03/28] handle selecting row index and row id --- kernel/src/scan/mod.rs | 63 ++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index ec9419856..480342eac 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -768,6 +768,25 @@ impl StateInfo { let mut transform_spec = Vec::new(); let mut last_physical_field: Option = None; + // We remember the name of the column that's selecting row indexes, if it's been requested + // explicitly. 
this is so we can reference this column and not re-add it as a requested + // column if we're _also_ requesting row-ids + let mut selected_row_index_col_name = None; + + // This iteration runs in O(supported_number_of_metadata_columns) time since each metadata + // column can appear at most once in the schema + for metadata_column in logical_schema.metadata_columns() { + if read_field_names.contains(metadata_column.name()) { + return Err(Error::Schema(format!( + "Metadata column names must not match physical columns: {}", + metadata_column.name() + ))); + } + if let Some(MetadataColumnSpec::RowIndex) = metadata_column.get_metadata_column_spec() { + selected_row_index_col_name = Some(metadata_column.name().to_string()); + } + } + // Loop over all selected fields and build both the physical schema and transform spec for (index, logical_field) in logical_schema.fields().enumerate() { if partition_columns.contains(logical_field.name()) { @@ -805,18 +824,32 @@ impl StateInfo { .configuration() .get("delta.rowTracking.materializedRowIdColumnName") .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; - let index_column_name = "_metadata.row_indexes_for_row_id".to_string(); - read_fields.push(StructField::create_metadata_column( - &index_column_name, - MetadataColumnSpec::RowIndex, - )); + // we can `take` as we should only have one RowId col + let index_column_name = match selected_row_index_col_name.take() { + Some(index_column_name) => index_column_name, + None => { + // ensure we have a column name that isn't already in our schema + let index_column_name = (0..) 
+ .map(|i| format!("row_indexes_for_row_id_{}", i)) + .find(|name| logical_schema.field(name).is_none()) + .ok_or(Error::generic( + "Couldn't generate row index column name", + ))?; + read_fields.push(StructField::create_metadata_column( + &index_column_name, + MetadataColumnSpec::RowIndex, + )); + transform_spec.push(FieldTransformSpec::StaticDrop { + field_name: index_column_name.clone(), + }); + index_column_name + } + }; + read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); transform_spec.push(FieldTransformSpec::GenerateRowId { field_name: row_id_col.to_string(), - row_index_field_name: index_column_name.clone(), - }); - transform_spec.push(FieldTransformSpec::StaticDrop { - field_name: index_column_name, + row_index_field_name: index_column_name, }); } else { return Err(Error::unsupported( @@ -825,7 +858,7 @@ impl StateInfo { } } Some(MetadataColumnSpec::RowIndex) => { - // handled in parquet reader (TODO: include in output schema?) + // handled in parquet reader select_physical_field() } Some(MetadataColumnSpec::RowCommitVersion) => { @@ -836,16 +869,6 @@ impl StateInfo { } } - // This iteration runs in O(3) time since each metadata column can appear at most once in the schema - for metadata_column in logical_schema.metadata_columns() { - if read_field_names.contains(metadata_column.name()) { - return Err(Error::Schema(format!( - "Metadata column names must not match physical columns: {}", - metadata_column.name() - ))); - } - } - let physical_schema = Arc::new(StructType::try_new(read_fields)?); let physical_predicate = match predicate { From a0f11a60a2cf8a7279659f8f7b5db0cba21c9217 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 7 Oct 2025 10:06:34 -0700 Subject: [PATCH 04/28] initial tests --- kernel/src/scan/log_replay.rs | 4 +- kernel/src/scan/mod.rs | 127 ++++++++++++++++++++++++++++------ 2 files changed, 108 insertions(+), 23 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 
da6b446d4..ffc0119ac 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -400,7 +400,7 @@ mod tests { add_batch_simple, add_batch_with_partition_col, add_batch_with_remove, run_with_validate_callback, }; - use crate::scan::tests::get_state_info; + use crate::scan::tests::get_simple_state_info; use crate::scan::{PhysicalPredicate, StateInfo}; use crate::Expression as Expr; use crate::{ @@ -490,7 +490,7 @@ mod tests { StructField::new("date", DataType::DATE, true), ])); let partition_cols = vec!["date".to_string()]; - let state_info = get_state_info(schema, partition_cols, None); + let state_info = get_simple_state_info(schema, partition_cols).unwrap(); let batch = vec![add_batch_with_partition_col()]; let iter = scan_action_iter( &SyncEngine::new(), diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 480342eac..15bcce3ab 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -1550,31 +1550,48 @@ mod tests { Ok(()) } + // get a state info with no predicate or extra metadata + pub(crate) fn get_simple_state_info( + schema: SchemaRef, + partition_columns: Vec, + ) -> DeltaResult { + get_state_info(schema, partition_columns, None, HashMap::new(), vec![]) + } + pub(crate) fn get_state_info( schema: SchemaRef, partition_columns: Vec, predicate: Option, - ) -> StateInfo { + metadata_configuration: HashMap, + metadata_cols: Vec<(&str, MetadataColumnSpec)>, + ) -> DeltaResult { let metadata = Metadata::try_new( None, None, schema.as_ref().clone(), partition_columns, 10, - HashMap::new(), - ) - .unwrap(); + metadata_configuration, + )?; let no_features: Option> = None; // needed for type annotation - let protocol = Protocol::try_new(1, 2, no_features.clone(), no_features).unwrap(); + let protocol = Protocol::try_new(1, 2, no_features.clone(), no_features)?; let table_configuration = TableConfiguration::try_new( metadata, protocol, Url::parse("s3://my-table").unwrap(), 1, - ) - .unwrap(); + )?; + + let mut schema = 
schema; + for (name, spec) in metadata_cols.into_iter() { + schema = Arc::new( + schema + .add_metadata_column(name, spec) + .expect("Couldn't add metadata col"), + ); + } - StateInfo::try_new(schema.clone(), &table_configuration, predicate).unwrap() + StateInfo::try_new(schema.clone(), &table_configuration, predicate) } #[test] @@ -1585,11 +1602,7 @@ mod tests { StructField::nullable("value", DataType::LONG), ])); - let state_info = get_state_info( - schema.clone(), - vec![], // no partition columns - None, // No predicate - ); + let state_info = get_simple_state_info(schema.clone(), vec![]).unwrap(); // Should have no transform spec (no partitions, no column mapping) assert!(state_info.transform_spec.is_none()); @@ -1611,11 +1624,10 @@ mod tests { StructField::nullable("value", DataType::LONG), ])); - let state_info = get_state_info( + let state_info = get_simple_state_info( schema.clone(), vec!["date".to_string()], // date is a partition column - None, // no predicate - ); + ).unwrap(); // Should have a transform spec for the partition column assert!(state_info.transform_spec.is_some()); @@ -1649,11 +1661,10 @@ mod tests { StructField::nullable("part2", DataType::INTEGER), // Partition ])); - let state_info = get_state_info( + let state_info = get_simple_state_info( schema.clone(), vec!["part1".to_string(), "part2".to_string()], - None, // no predicate - ); + ).unwrap(); // Should have transforms for both partition columns assert!(state_info.transform_spec.is_some()); @@ -1702,7 +1713,9 @@ mod tests { schema.clone(), vec![], // no partition columns Some(predicate), - ); + HashMap::new(), // no extra metadata + vec![], // no metadata + ).unwrap(); // Should have a physical predicate match &state_info.physical_predicate { @@ -1723,7 +1736,7 @@ mod tests { StructField::nullable("value", DataType::LONG), ])); - let state_info = get_state_info(schema.clone(), vec!["date".to_string()], None); + let state_info = get_simple_state_info(schema.clone(), 
vec!["date".to_string()]).unwrap(); // Should have a transform spec for the partition column let transform_spec = state_info.transform_spec.as_ref().unwrap(); @@ -1740,4 +1753,76 @@ mod tests { _ => panic!("Expected MetadataDerivedColumn transform"), } } + + #[test] + fn test_state_info_request_row_ids() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "id", + DataType::STRING, + )])); + + let state_info = get_state_info( + schema.clone(), + vec![], + None, + vec![ + ("delta.enableRowTracking", "true"), + ( + "delta.rowTracking.materializedRowIdColumnName", + "some-row-id_col", + ), + ] + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + vec![("row_id", MetadataColumnSpec::RowId)], + ).unwrap(); + + // Should have a transform spec for the row_id column + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 2); // one for rowid col, one to drop indexes + + match &transform_spec[0] { + FieldTransformSpec::StaticDrop { + field_name, + } => { + assert_eq!(field_name, "row_indexes_for_row_id_0"); + } + _ => panic!("Expected StaticDrop transform"), + } + + match &transform_spec[1] { + FieldTransformSpec::GenerateRowId { + field_name, + row_index_field_name, + } => { + assert_eq!(field_name, "some-row-id_col"); + assert_eq!(row_index_field_name, "row_indexes_for_row_id_0"); + } + _ => panic!("Expected GenerateRowId transform"), + } + } + + #[test] + fn test_state_info_request_row_ids_not_enabled() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "id", + DataType::STRING, + )])); + + match get_state_info( + schema.clone(), + vec![], + None, + HashMap::new(), + vec![("row_id", MetadataColumnSpec::RowId)], + ) { + Ok(_) => { + panic!("Should not have succeeded generating state info without rowids enabled") + } + Err(e) => { + assert_eq!(e.to_string(), "Unsupported: Row ids are not enabled on this table") + } + } + } } From 
4d27d6dea3c264f51ab4cf4869c8aff6015e0bcc Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 7 Oct 2025 11:23:29 -0700 Subject: [PATCH 05/28] fix clippy --- kernel/src/scan/mod.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 15bcce3ab..33f752e2a 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -1627,7 +1627,8 @@ mod tests { let state_info = get_simple_state_info( schema.clone(), vec!["date".to_string()], // date is a partition column - ).unwrap(); + ) + .unwrap(); // Should have a transform spec for the partition column assert!(state_info.transform_spec.is_some()); @@ -1664,7 +1665,8 @@ mod tests { let state_info = get_simple_state_info( schema.clone(), vec!["part1".to_string(), "part2".to_string()], - ).unwrap(); + ) + .unwrap(); // Should have transforms for both partition columns assert!(state_info.transform_spec.is_some()); @@ -1715,7 +1717,8 @@ mod tests { Some(predicate), HashMap::new(), // no extra metadata vec![], // no metadata - ).unwrap(); + ) + .unwrap(); // Should have a physical predicate match &state_info.physical_predicate { @@ -1765,7 +1768,7 @@ mod tests { schema.clone(), vec![], None, - vec![ + [ ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", @@ -1776,16 +1779,15 @@ mod tests { .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), vec![("row_id", MetadataColumnSpec::RowId)], - ).unwrap(); + ) + .unwrap(); // Should have a transform spec for the row_id column let transform_spec = state_info.transform_spec.as_ref().unwrap(); assert_eq!(transform_spec.len(), 2); // one for rowid col, one to drop indexes match &transform_spec[0] { - FieldTransformSpec::StaticDrop { - field_name, - } => { + FieldTransformSpec::StaticDrop { field_name } => { assert_eq!(field_name, "row_indexes_for_row_id_0"); } _ => panic!("Expected StaticDrop transform"), @@ -1821,7 +1823,10 @@ mod tests { panic!("Should 
not have succeeded generating state info without rowids enabled") } Err(e) => { - assert_eq!(e.to_string(), "Unsupported: Row ids are not enabled on this table") + assert_eq!( + e.to_string(), + "Unsupported: Row ids are not enabled on this table" + ) } } } From 723ab1a4d130a53e5932789a71eb087d5a74cad2 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 7 Oct 2025 11:37:00 -0700 Subject: [PATCH 06/28] finish up state info tests --- kernel/src/scan/mod.rs | 74 +++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 33f752e2a..041e9cbf0 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -1806,27 +1806,77 @@ mod tests { } #[test] - fn test_state_info_request_row_ids_not_enabled() { + fn test_state_info_request_row_ids_and_indexes() { let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( "id", DataType::STRING, )])); - match get_state_info( + let state_info = get_state_info( schema.clone(), vec![], None, - HashMap::new(), - vec![("row_id", MetadataColumnSpec::RowId)], - ) { - Ok(_) => { - panic!("Should not have succeeded generating state info without rowids enabled") + [ + ("delta.enableRowTracking", "true"), + ( + "delta.rowTracking.materializedRowIdColumnName", + "some-row-id_col", + ), + ] + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + vec![ + ("row_id", MetadataColumnSpec::RowId), + ("row_index", MetadataColumnSpec::RowIndex), + ], + ) + .unwrap(); + + // Should have a transform spec for the row_id column + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 1); // just one because we don't want to drop indexes now + + match &transform_spec[0] { + FieldTransformSpec::GenerateRowId { + field_name, + row_index_field_name, + } => { + assert_eq!(field_name, "some-row-id_col"); + assert_eq!(row_index_field_name, "row_index"); } - Err(e) => { - 
assert_eq!( - e.to_string(), - "Unsupported: Row ids are not enabled on this table" - ) + _ => panic!("Expected GenerateRowId transform"), + } + } + + #[test] + fn test_state_info_invalid_rowtracking_config() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "id", + DataType::STRING, + )])); + + for (metadata_config, metadata_cols, expected_error) in [ + (HashMap::new(), vec![("row_id", MetadataColumnSpec::RowId)], "Unsupported: Row ids are not enabled on this table"), + ( + [("delta.enableRowTracking", "true")] + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + vec![("row_id", MetadataColumnSpec::RowId)], + "Generic delta kernel error: No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration", + ), + ] { + match get_state_info(schema.clone(), vec![], None, metadata_config, metadata_cols) { + Ok(_) => { + panic!("Should not have succeeded generating state info with invalid config") + } + Err(e) => { + assert_eq!( + e.to_string(), + expected_error, + ) + } } } } From bffa4e8332b22b909f03081fe5615f30bdfc64f9 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 7 Oct 2025 11:40:08 -0700 Subject: [PATCH 07/28] clippy --- kernel/examples/common/src/lib.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/examples/common/src/lib.rs b/kernel/examples/common/src/lib.rs index d488d2576..a56c93db1 100644 --- a/kernel/examples/common/src/lib.rs +++ b/kernel/examples/common/src/lib.rs @@ -227,7 +227,7 @@ pub fn get_scan(snapshot: SnapshotRef, args: &ScanArgs) -> DeltaResult DeltaResult Date: Tue, 7 Oct 2025 11:52:28 -0700 Subject: [PATCH 08/28] add one transform test --- kernel/src/transforms.rs | 49 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/kernel/src/transforms.rs b/kernel/src/transforms.rs index be3a0278b..367e6e57c 100644 --- a/kernel/src/transforms.rs +++ b/kernel/src/transforms.rs @@ -683,4 +683,53 @@ mod tests { ); 
assert_result_error_with_message(result, "missing partition value for dynamic column"); } + + #[test] + fn test_get_transform_expr_generate_row_ids() { + let transform_spec = vec![FieldTransformSpec::GenerateRowId { + field_name: "row_id_col".to_string(), + row_index_field_name: "row_index_col".to_string(), + }]; + + // Physical schema contains row index col, but no row-id col + let physical_schema = StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::not_null("row_index_col", DataType::LONG), + ]); + let metadata_values = HashMap::new(); + + let result = get_transform_expr( + &transform_spec, + metadata_values, + &physical_schema, + Some(4), /* base_row_id */ + ); + let transform_expr = result.expect("Transform expression should be created successfully"); + + let Expression::Transform(transform) = transform_expr.as_ref() else { + panic!("Expected Transform expression"); + }; + + assert!(transform.input_path.is_none()); + let row_id_transform = transform + .field_transforms + .get("row_id_col") + .expect("Should have row_id_col transform"); + assert!(row_id_transform.is_replace); + + let expeceted_expr = Arc::new(Expression::variadic( + crate::expressions::VariadicExpressionOp::Coalesce, + vec![ + Expression::column(["row_id_col"]), + Expression::binary( + BinaryExpressionOp::Plus, + Expression::literal(4i64), + Expression::column(["row_index_col"]), + ), + ], + )); + assert_eq!(row_id_transform.exprs.len(), 1); + let expr = &row_id_transform.exprs[0]; + assert_eq!(expr, &expeceted_expr); + } } From be7d60dd141d75d114c04e6170f75a31b9810845 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 8 Oct 2025 16:29:58 -0700 Subject: [PATCH 09/28] Add log_replay transform test --- kernel/src/scan/log_replay.rs | 71 +++++++++++++++++++++++++++++++++-- kernel/src/scan/mod.rs | 15 ++++++++ 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 
ffc0119ac..a97426795 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -397,11 +397,12 @@ mod tests { use crate::log_replay::ActionsBatch; use crate::scan::state::{DvInfo, Stats}; use crate::scan::test_utils::{ - add_batch_simple, add_batch_with_partition_col, add_batch_with_remove, - run_with_validate_callback, + add_batch_for_row_id, add_batch_simple, add_batch_with_partition_col, + add_batch_with_remove, run_with_validate_callback, }; - use crate::scan::tests::get_simple_state_info; + use crate::scan::tests::{get_simple_state_info, get_state_info}; use crate::scan::{PhysicalPredicate, StateInfo}; + use crate::schema::MetadataColumnSpec; use crate::Expression as Expr; use crate::{ engine::sync::SyncEngine, @@ -534,4 +535,68 @@ mod tests { validate_transform(transforms[3].as_ref(), 17510); } } + + #[test] + fn test_row_id_transform() { + let schema: SchemaRef = Arc::new(StructType::new_unchecked([StructField::new( + "value", + DataType::INTEGER, + true, + )])); + let state_info = get_state_info( + schema.clone(), + vec![], + None, + [ + ("delta.enableRowTracking", "true"), + ( + "delta.rowTracking.materializedRowIdColumnName", + "row_id_col", + ), + ] + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + vec![("row_id", MetadataColumnSpec::RowId)], + ) + .unwrap(); + let batch = vec![add_batch_for_row_id(get_log_schema().clone())]; + let iter = scan_action_iter( + &SyncEngine::new(), + batch + .into_iter() + .map(|batch| Ok(ActionsBatch::new(batch as _, true))), + Arc::new(state_info), + ); + + for res in iter { + let scan_metadata = res.unwrap(); + let transforms = scan_metadata.scan_file_transforms; + assert_eq!(transforms.len(), 1, "Should have 1 transform"); + if let Some(Expr::Transform(transform_expr)) = transforms[0].as_ref().map(Arc::as_ref) { + assert!(transform_expr.input_path.is_none()); + let row_id_transform = transform_expr + .field_transforms + .get("row_id_col") + .expect("Should have row_id_col 
transform"); + assert!(row_id_transform.is_replace); + assert_eq!(row_id_transform.exprs.len(), 1); + let expr = &row_id_transform.exprs[0]; + let expeceted_expr = Arc::new(Expr::variadic( + crate::expressions::VariadicExpressionOp::Coalesce, + vec![ + Expr::column(["row_id_col"]), + Expr::binary( + crate::expressions::BinaryExpressionOp::Plus, + Expr::literal(42i64), + Expr::column(["row_indexes_for_row_id_0"]), + ), + ], + )); + assert_eq!(expr, &expeceted_expr); + } else { + panic!("Should have been a transform expression"); + } + } + } } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 041e9cbf0..668fd887a 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -972,6 +972,21 @@ pub(crate) mod test_utils { ArrowEngineData::try_from_engine_data(parsed).unwrap() } + // Generates a batch with an add action. + // The schema is provided as null columns affect equality checks. + pub(crate) fn add_batch_for_row_id(output_schema: SchemaRef) -> Box { + let handler = SyncJsonHandler {}; + let json_strings: StringArray = vec![ + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","baseRowId": 42, "tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + 
r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableRowTracking": "true", "delta.rowTracking.materializedRowIdColumnName":"row_id_col"},"createdTime":1677811175819}}"#, + ] + .into(); + let parsed = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap(); + ArrowEngineData::try_from_engine_data(parsed).unwrap() + } + // An add batch with a removed file parsed with the schema provided pub(crate) fn add_batch_with_remove(output_schema: SchemaRef) -> Box { let handler = SyncJsonHandler {}; From 6e96dc2e121beb6f4cfa897fefe7be3c832d2b84 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 8 Oct 2025 16:37:33 -0700 Subject: [PATCH 10/28] fmt --- kernel/src/scan/log_replay.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index a97426795..ef6944d9f 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -583,7 +583,7 @@ mod tests { assert_eq!(row_id_transform.exprs.len(), 1); let expr = &row_id_transform.exprs[0]; let expeceted_expr = Arc::new(Expr::variadic( - crate::expressions::VariadicExpressionOp::Coalesce, + crate::expressions::VariadicExpressionOp::Coalesce, vec![ Expr::column(["row_id_col"]), Expr::binary( From b91e50d074b0818fcc0b85bca922d333cf2183b9 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 14:44:38 -0700 Subject: [PATCH 11/28] move StateInfo into its own module --- kernel/src/scan/field_classifiers.rs | 41 +- kernel/src/scan/log_replay.rs | 3 +- kernel/src/scan/mod.rs | 543 +---------------- kernel/src/scan/state_info.rs | 547 ++++++++++++++++++ .../src/table_changes/physical_to_logical.rs | 5 +- 
kernel/src/table_changes/scan.rs | 3 +- 6 files changed, 583 insertions(+), 559 deletions(-) create mode 100644 kernel/src/scan/state_info.rs diff --git a/kernel/src/scan/field_classifiers.rs b/kernel/src/scan/field_classifiers.rs index b8dbcd510..4f3c559f7 100644 --- a/kernel/src/scan/field_classifiers.rs +++ b/kernel/src/scan/field_classifiers.rs @@ -41,23 +41,26 @@ impl TransformFieldClassifier for ScanTransformFieldClassifierieldClassifier { table_configuration: &TableConfiguration, last_physical_field: &Option, ) -> DeltaResult> { - if table_configuration.metadata().partition_columns().contains(field.name()) { + if table_configuration + .metadata() + .partition_columns() + .contains(field.name()) + { // Partition column: needs transform to inject metadata - let transform_specs = vec!(FieldTransformSpec::MetadataDerivedColumn { + let transform_specs = vec![FieldTransformSpec::MetadataDerivedColumn { field_index, insert_after: last_physical_field.clone(), - }); + }]; Ok(Some(FieldRequirements { transform_specs, - extra_read_fields: vec!(), + extra_read_fields: vec![], })) } else { match field.get_metadata_column_spec() { Some(MetadataColumnSpec::RowId) => { - if table_configuration.table_properties().enable_row_tracking == Some(true) - { - let mut transform_specs = vec!(); - let mut extra_read_fields = vec!(); + if table_configuration.table_properties().enable_row_tracking == Some(true) { + let mut transform_specs = vec![]; + let mut extra_read_fields = vec![]; let row_id_col = table_configuration .metadata() .configuration() @@ -93,12 +96,10 @@ impl TransformFieldClassifier for ScanTransformFieldClassifierieldClassifier { }); Ok(Some(FieldRequirements { transform_specs, - extra_read_fields + extra_read_fields, })) } else { - Err(Error::unsupported( - "Row ids are not enabled on this table", - )) + Err(Error::unsupported("Row ids are not enabled on this table")) } } Some(MetadataColumnSpec::RowIndex) => { @@ -143,16 +144,18 @@ impl TransformFieldClassifier for 
CdfTransformFieldClassifier { } } // Defer to default classifier for partition columns and physical fields - _ => return ScanTransformFieldClassifierieldClassifier.classify_field( - field, - field_index, - table_configuration, - last_physical_field, - ), + _ => { + return ScanTransformFieldClassifierieldClassifier.classify_field( + field, + field_index, + table_configuration, + last_physical_field, + ) + } }; Ok(Some(FieldRequirements { transform_specs: vec![transform_spec], - extra_read_fields: vec!(), + extra_read_fields: vec![], })) } } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index ef6944d9f..a1bc16eec 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,7 +3,8 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use super::data_skipping::DataSkippingFilter; -use super::{PhysicalPredicate, ScanMetadata, StateInfo}; +use super::state_info::StateInfo; +use super::{PhysicalPredicate, ScanMetadata}; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 62dadb95a..ff7c779be 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -9,9 +9,7 @@ use itertools::Itertools; use tracing::debug; use url::Url; -use self::field_classifiers::{ - ScanTransformFieldClassifierieldClassifier, TransformFieldClassifier, -}; +use self::field_classifiers::ScanTransformFieldClassifierieldClassifier; use self::log_replay::get_scan_metadata_transform_expr; use crate::actions::deletion_vector::{ deletion_treemap_to_bools, split_vector, DeletionVectorDescriptor, @@ -27,14 +25,8 @@ use crate::log_segment::LogSegment; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{ ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, - StructType, }; -use 
crate::schema::{MetadataColumnSpec, ToSchema as _}; -use crate::snapshot::SnapshotRef; -use crate::table_configuration::TableConfiguration; -use crate::table_features::ColumnMappingMode; -use crate::transforms::TransformSpec; -use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, Version}; +use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, SnapshotRef, Version}; use self::log_replay::scan_action_iter; @@ -42,6 +34,7 @@ pub(crate) mod data_skipping; pub(crate) mod field_classifiers; pub mod log_replay; pub mod state; +pub(crate) mod state_info; // safety: we define get_log_schema() and _know_ it contains ADD_NAME and REMOVE_NAME #[allow(clippy::unwrap_used)] @@ -122,7 +115,7 @@ impl ScanBuilder { // if no schema is provided, use snapshot's entire schema (e.g. SELECT *) let logical_schema = self.schema.unwrap_or_else(|| self.snapshot.schema()); - let state_info = StateInfo::try_new( + let state_info = state_info::StateInfo::try_new( logical_schema, self.snapshot.table_configuration(), self.predicate, @@ -361,7 +354,7 @@ impl HasSelectionVector for ScanMetadata { /// scanning the table. pub struct Scan { snapshot: SnapshotRef, - state_info: Arc, + state_info: Arc, } impl std::fmt::Debug for Scan { @@ -494,6 +487,8 @@ impl Scan { _existing_predicate: Option, ) -> DeltaResult>>> { static RESTORED_ADD_SCHEMA: LazyLock = LazyLock::new(|| { + use crate::schema::ToSchema as _; + let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); DataType::struct_type_unchecked(vec![StructField::nullable( "add", @@ -743,199 +738,6 @@ pub fn scan_row_schema() -> SchemaRef { log_replay::SCAN_ROW_SCHEMA.clone() } -/// All the state needed to process a scan. 
-#[derive(Debug)] -pub(crate) struct StateInfo { - /// The logical schema for this scan - pub(crate) logical_schema: SchemaRef, - /// The physical schema to read from parquet files - pub(crate) physical_schema: SchemaRef, - /// The physical predicate for data skipping - pub(crate) physical_predicate: PhysicalPredicate, - /// Transform specification for converting physical to logical data - pub(crate) transform_spec: Option>, -} - -impl StateInfo { - /// Create StateInfo with a custom field classifier for different scan types. - /// Get the state needed to process a scan. - /// - /// `logical_schema` - The logical schema of the scan output, which includes partition columns - /// `partition_columns` - List of column names that are partition columns in the table - /// `column_mapping_mode` - The column mapping mode used by the table for physical to logical mapping - /// `predicate` - Optional predicate to filter data during the scan - /// `classifier` - The classifier to use for different scan types - pub(crate) fn try_new( - logical_schema: SchemaRef, - table_configuration: &TableConfiguration, - predicate: Option, - classifier: C, - ) -> DeltaResult { - let partition_columns = table_configuration.metadata().partition_columns(); - let column_mapping_mode = table_configuration.column_mapping_mode(); - let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); - let mut read_field_names = HashSet::with_capacity(logical_schema.num_fields()); - let mut transform_spec = Vec::new(); - let mut last_physical_field: Option = None; - - // We remember the name of the column that's selecting row indexes, if it's been requested - // explicitly. 
this is so we can reference this column and not re-add it as a requested - // column if we're _also_ requesting row-ids - let mut selected_row_index_col_name = None; - - // This iteration runs in O(supported_number_of_metadata_columns) time since each metadata - // column can appear at most once in the schema - for metadata_column in logical_schema.metadata_columns() { - if read_field_names.contains(metadata_column.name()) { - return Err(Error::Schema(format!( - "Metadata column names must not match physical columns: {}", - metadata_column.name() - ))); - } - if let Some(MetadataColumnSpec::RowIndex) = metadata_column.get_metadata_column_spec() { - selected_row_index_col_name = Some(metadata_column.name().to_string()); - } - } - - // Loop over all selected fields and build both the physical schema and transform spec - for (index, logical_field) in logical_schema.fields().enumerate() { - let requirements = classifier.classify_field( - logical_field, - index, - table_configuration, - &last_physical_field, - )?; - - if let Some(requirements) = requirements { - // Field needs transformation - not in physical schema - for spec in requirements.transform_specs.into_iter() { - transform_spec.push(spec); - } - } else { - // Physical field - should be read from parquet - // Validate metadata column doesn't conflict with partition columns - if logical_field.is_metadata_column() - && partition_columns.contains(logical_field.name()) - { - return Err(Error::Schema(format!( - "Metadata column names must not match partition columns: {}", - logical_field.name() - ))); - } - -// <<<<<<< HEAD -// // Partition column: needs to be injected via transform -// transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { -// field_index: index, -// insert_after: last_physical_field.clone(), -// }); -// } else { -// // Regular field field, or metadata columns: add to physical schema -// let mut select_physical_field = || { -// let physical_field = 
logical_field.make_physical(column_mapping_mode); -// debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); -// let physical_name = physical_field.name.clone(); -// ||||||| f431de0c -// // Partition column: needs to be injected via transform -// transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { -// field_index: index, -// insert_after: last_physical_field.clone(), -// }); -// } else { -// // Regular field: add to physical schema -// let physical_field = logical_field.make_physical(column_mapping_mode); -// debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); -// let physical_name = physical_field.name.clone(); -// ======= - // Add to physical schema - let physical_field = logical_field.make_physical(column_mapping_mode); - debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); - let physical_name = physical_field.name.clone(); - - - if !logical_field.is_metadata_column() { - read_field_names.insert(physical_name.clone()); - } - last_physical_field = Some(physical_name); - read_fields.push(physical_field); - } - // match logical_field.get_metadata_column_spec() { - // Some(MetadataColumnSpec::RowId) => { - // if table_configuration.table_properties().enable_row_tracking == Some(true) - // { - // let row_id_col = table_configuration - // .metadata() - // .configuration() - // .get("delta.rowTracking.materializedRowIdColumnName") - // .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; - // // we can `take` as we should only have one RowId col - // let index_column_name = match selected_row_index_col_name.take() { - // Some(index_column_name) => index_column_name, - // None => { - // // ensure we have a column name that isn't already in our schema - // let index_column_name = (0..) 
- // .map(|i| format!("row_indexes_for_row_id_{}", i)) - // .find(|name| logical_schema.field(name).is_none()) - // .ok_or(Error::generic( - // "Couldn't generate row index column name", - // ))?; - // read_fields.push(StructField::create_metadata_column( - // &index_column_name, - // MetadataColumnSpec::RowIndex, - // )); - // transform_spec.push(FieldTransformSpec::StaticDrop { - // field_name: index_column_name.clone(), - // }); - // index_column_name - // } - // }; - - // read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); - // transform_spec.push(FieldTransformSpec::GenerateRowId { - // field_name: row_id_col.to_string(), - // row_index_field_name: index_column_name, - // }); - // } else { - // return Err(Error::unsupported( - // "Row ids are not enabled on this table", - // )); - // } - // } - // Some(MetadataColumnSpec::RowIndex) => { - // // handled in parquet reader - // select_physical_field() - // } - // Some(MetadataColumnSpec::RowCommitVersion) => { - // return Err(Error::unsupported("Row commit versions not supported")); - // } - // _ => select_physical_field(), - // } - // } - } - - let physical_schema = Arc::new(StructType::try_new(read_fields)?); - - let physical_predicate = match predicate { - Some(pred) => PhysicalPredicate::try_new(&pred, &logical_schema)?, - None => PhysicalPredicate::None, - }; - - let transform_spec = - if !transform_spec.is_empty() || column_mapping_mode != ColumnMappingMode::None { - Some(Arc::new(transform_spec)) - } else { - None - }; - - Ok(StateInfo { - logical_schema, - physical_schema, - physical_predicate, - transform_spec, - }) - } -} - pub fn selection_vector( engine: &dyn Engine, descriptor: &DeletionVectorDescriptor, @@ -1609,335 +1411,4 @@ mod tests { ); Ok(()) } - - // get a state info with no predicate or extra metadata - pub(crate) fn get_simple_state_info( - schema: SchemaRef, - partition_columns: Vec, - ) -> DeltaResult { - get_state_info(schema, partition_columns, None, HashMap::new(), 
vec![]) - } - - pub(crate) fn get_state_info( - schema: SchemaRef, - partition_columns: Vec, - predicate: Option, - metadata_configuration: HashMap, - metadata_cols: Vec<(&str, MetadataColumnSpec)>, - ) -> DeltaResult { - let metadata = Metadata::try_new( - None, - None, - schema.as_ref().clone(), - partition_columns, - 10, - metadata_configuration, - )?; - let no_features: Option> = None; // needed for type annotation - let protocol = Protocol::try_new(1, 2, no_features.clone(), no_features)?; - let table_configuration = TableConfiguration::try_new( - metadata, - protocol, - Url::parse("s3://my-table").unwrap(), - 1, - )?; - - let mut schema = schema; - for (name, spec) in metadata_cols.into_iter() { - schema = Arc::new( - schema - .add_metadata_column(name, spec) - .expect("Couldn't add metadata col"), - ); - } - - StateInfo::try_new(schema.clone(), &table_configuration, predicate, ScanTransformFieldClassifierieldClassifier) - } - - #[test] - fn test_state_info_no_partition_columns() { - // Test case: No partition columns, no column mapping - let schema = Arc::new(StructType::new_unchecked(vec![ - StructField::nullable("id", DataType::STRING), - StructField::nullable("value", DataType::LONG), - ])); - - let state_info = get_simple_state_info(schema.clone(), vec![]).unwrap(); - - // Should have no transform spec (no partitions, no column mapping) - assert!(state_info.transform_spec.is_none()); - - // Physical schema should match logical schema - assert_eq!(state_info.logical_schema, schema); - assert_eq!(state_info.physical_schema.fields().len(), 2); - - // No predicate - assert_eq!(state_info.physical_predicate, PhysicalPredicate::None); - } - - #[test] - fn test_state_info_with_partition_columns() { - // Test case: With partition columns - let schema = Arc::new(StructType::new_unchecked(vec![ - StructField::nullable("id", DataType::STRING), - StructField::nullable("date", DataType::DATE), // Partition column - StructField::nullable("value", DataType::LONG), - 
])); - - let state_info = get_simple_state_info( - schema.clone(), - vec!["date".to_string()], // date is a partition column - ) - .unwrap(); - - // Should have a transform spec for the partition column - assert!(state_info.transform_spec.is_some()); - let transform_spec = state_info.transform_spec.as_ref().unwrap(); - assert_eq!(transform_spec.len(), 1); - - // Check the transform spec for the partition column - match &transform_spec[0] { - FieldTransformSpec::MetadataDerivedColumn { - field_index, - insert_after, - } => { - assert_eq!(*field_index, 1); // Index of "date" in logical schema - assert_eq!(insert_after, &Some("id".to_string())); // After "id" which is physical - } - _ => panic!("Expected MetadataDerivedColumn transform"), - } - - // Physical schema should not include partition column - assert_eq!(state_info.logical_schema, schema); - assert_eq!(state_info.physical_schema.fields().len(), 2); // Only id and value - } - - #[test] - fn test_state_info_multiple_partition_columns() { - // Test case: Multiple partition columns interspersed with regular columns - let schema = Arc::new(StructType::new_unchecked(vec![ - StructField::nullable("col1", DataType::STRING), - StructField::nullable("part1", DataType::STRING), // Partition - StructField::nullable("col2", DataType::LONG), - StructField::nullable("part2", DataType::INTEGER), // Partition - ])); - - let state_info = get_simple_state_info( - schema.clone(), - vec!["part1".to_string(), "part2".to_string()], - ) - .unwrap(); - - // Should have transforms for both partition columns - assert!(state_info.transform_spec.is_some()); - let transform_spec = state_info.transform_spec.as_ref().unwrap(); - assert_eq!(transform_spec.len(), 2); - - // Check first partition column transform - match &transform_spec[0] { - FieldTransformSpec::MetadataDerivedColumn { - field_index, - insert_after, - } => { - assert_eq!(*field_index, 1); // Index of "part1" - assert_eq!(insert_after, &Some("col1".to_string())); - } - _ => 
panic!("Expected MetadataDerivedColumn transform"), - } - - // Check second partition column transform - match &transform_spec[1] { - FieldTransformSpec::MetadataDerivedColumn { - field_index, - insert_after, - } => { - assert_eq!(*field_index, 3); // Index of "part2" - assert_eq!(insert_after, &Some("col2".to_string())); - } - _ => panic!("Expected MetadataDerivedColumn transform"), - } - - // Physical schema should only have non-partition columns - assert_eq!(state_info.physical_schema.fields().len(), 2); // col1 and col2 - } - - #[test] - fn test_state_info_with_predicate() { - // Test case: With a valid predicate - let schema = Arc::new(StructType::new_unchecked(vec![ - StructField::nullable("id", DataType::STRING), - StructField::nullable("value", DataType::LONG), - ])); - - let predicate = Arc::new(column_expr!("value").gt(Expr::literal(10i64))); - - let state_info = get_state_info( - schema.clone(), - vec![], // no partition columns - Some(predicate), - HashMap::new(), // no extra metadata - vec![], // no metadata - ) - .unwrap(); - - // Should have a physical predicate - match &state_info.physical_predicate { - PhysicalPredicate::Some(_pred, schema) => { - // Physical predicate exists - assert_eq!(schema.fields().len(), 1); // Only "value" is referenced - } - _ => panic!("Expected PhysicalPredicate::Some"), - } - } - - #[test] - fn test_state_info_partition_at_beginning() { - // Test case: Partition column at the beginning - let schema = Arc::new(StructType::new_unchecked(vec![ - StructField::nullable("date", DataType::DATE), // Partition column - StructField::nullable("id", DataType::STRING), - StructField::nullable("value", DataType::LONG), - ])); - - let state_info = get_simple_state_info(schema.clone(), vec!["date".to_string()]).unwrap(); - - // Should have a transform spec for the partition column - let transform_spec = state_info.transform_spec.as_ref().unwrap(); - assert_eq!(transform_spec.len(), 1); - - match &transform_spec[0] { - 
FieldTransformSpec::MetadataDerivedColumn { - field_index, - insert_after, - } => { - assert_eq!(*field_index, 0); // Index of "date" - assert_eq!(insert_after, &None); // No physical field before it, so prepend - } - _ => panic!("Expected MetadataDerivedColumn transform"), - } - } - - #[test] - fn test_state_info_request_row_ids() { - let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( - "id", - DataType::STRING, - )])); - - let state_info = get_state_info( - schema.clone(), - vec![], - None, - [ - ("delta.enableRowTracking", "true"), - ( - "delta.rowTracking.materializedRowIdColumnName", - "some-row-id_col", - ), - ] - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(), - vec![("row_id", MetadataColumnSpec::RowId)], - ) - .unwrap(); - - // Should have a transform spec for the row_id column - let transform_spec = state_info.transform_spec.as_ref().unwrap(); - assert_eq!(transform_spec.len(), 2); // one for rowid col, one to drop indexes - - match &transform_spec[0] { - FieldTransformSpec::StaticDrop { field_name } => { - assert_eq!(field_name, "row_indexes_for_row_id_0"); - } - _ => panic!("Expected StaticDrop transform"), - } - - match &transform_spec[1] { - FieldTransformSpec::GenerateRowId { - field_name, - row_index_field_name, - } => { - assert_eq!(field_name, "some-row-id_col"); - assert_eq!(row_index_field_name, "row_indexes_for_row_id_0"); - } - _ => panic!("Expected GenerateRowId transform"), - } - } - - #[test] - fn test_state_info_request_row_ids_and_indexes() { - let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( - "id", - DataType::STRING, - )])); - - let state_info = get_state_info( - schema.clone(), - vec![], - None, - [ - ("delta.enableRowTracking", "true"), - ( - "delta.rowTracking.materializedRowIdColumnName", - "some-row-id_col", - ), - ] - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(), - vec![ - ("row_id", MetadataColumnSpec::RowId), - ("row_index", 
MetadataColumnSpec::RowIndex), - ], - ) - .unwrap(); - - // Should have a transform spec for the row_id column - let transform_spec = state_info.transform_spec.as_ref().unwrap(); - assert_eq!(transform_spec.len(), 1); // just one because we don't want to drop indexes now - - match &transform_spec[0] { - FieldTransformSpec::GenerateRowId { - field_name, - row_index_field_name, - } => { - assert_eq!(field_name, "some-row-id_col"); - assert_eq!(row_index_field_name, "row_index"); - } - _ => panic!("Expected GenerateRowId transform"), - } - } - - #[test] - fn test_state_info_invalid_rowtracking_config() { - let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( - "id", - DataType::STRING, - )])); - - for (metadata_config, metadata_cols, expected_error) in [ - (HashMap::new(), vec![("row_id", MetadataColumnSpec::RowId)], "Unsupported: Row ids are not enabled on this table"), - ( - [("delta.enableRowTracking", "true")] - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(), - vec![("row_id", MetadataColumnSpec::RowId)], - "Generic delta kernel error: No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration", - ), - ] { - match get_state_info(schema.clone(), vec![], None, metadata_config, metadata_cols) { - Ok(_) => { - panic!("Should not have succeeded generating state info with invalid config") - } - Err(e) => { - assert_eq!( - e.to_string(), - expected_error, - ) - } - } - } - } } diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs new file mode 100644 index 000000000..4e63ddb4d --- /dev/null +++ b/kernel/src/scan/state_info.rs @@ -0,0 +1,547 @@ +//! StateInfo handles the state that we use through log-replay in order to correctly construct all +//! 
the physical->logical transforms needed for each add file + +use std::collections::HashSet; +use std::sync::Arc; + +use tracing::debug; + +use crate::scan::field_classifiers::TransformFieldClassifier; +use crate::scan::PhysicalPredicate; +use crate::schema::{MetadataColumnSpec, SchemaRef, StructType}; +use crate::table_configuration::TableConfiguration; +use crate::table_features::ColumnMappingMode; +use crate::transforms::TransformSpec; +use crate::{DeltaResult, Error, PredicateRef}; + +/// All the state needed to process a scan. +#[derive(Debug)] +pub(crate) struct StateInfo { + /// The logical schema for this scan + pub(crate) logical_schema: SchemaRef, + /// The physical schema to read from parquet files + pub(crate) physical_schema: SchemaRef, + /// The physical predicate for data skipping + pub(crate) physical_predicate: PhysicalPredicate, + /// Transform specification for converting physical to logical data + pub(crate) transform_spec: Option>, +} + +impl StateInfo { + /// Create StateInfo with a custom field classifier for different scan types. + /// Get the state needed to process a scan. 
+ /// + /// `logical_schema` - The logical schema of the scan output, which includes partition columns + /// `partition_columns` - List of column names that are partition columns in the table + /// `column_mapping_mode` - The column mapping mode used by the table for physical to logical mapping + /// `predicate` - Optional predicate to filter data during the scan + /// `classifier` - The classifier to use for different scan types + pub(crate) fn try_new( + logical_schema: SchemaRef, + table_configuration: &TableConfiguration, + predicate: Option, + classifier: C, + ) -> DeltaResult { + let partition_columns = table_configuration.metadata().partition_columns(); + let column_mapping_mode = table_configuration.column_mapping_mode(); + let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); + let mut read_field_names = HashSet::with_capacity(logical_schema.num_fields()); + let mut transform_spec = Vec::new(); + let mut last_physical_field: Option = None; + + // We remember the name of the column that's selecting row indexes, if it's been requested + // explicitly. 
this is so we can reference this column and not re-add it as a requested + // column if we're _also_ requesting row-ids + let mut selected_row_index_col_name = None; + + // This iteration runs in O(supported_number_of_metadata_columns) time since each metadata + // column can appear at most once in the schema + for metadata_column in logical_schema.metadata_columns() { + if read_field_names.contains(metadata_column.name()) { + return Err(Error::Schema(format!( + "Metadata column names must not match physical columns: {}", + metadata_column.name() + ))); + } + if let Some(MetadataColumnSpec::RowIndex) = metadata_column.get_metadata_column_spec() { + selected_row_index_col_name = Some(metadata_column.name().to_string()); + } + } + + // Loop over all selected fields and build both the physical schema and transform spec + for (index, logical_field) in logical_schema.fields().enumerate() { + let requirements = classifier.classify_field( + logical_field, + index, + table_configuration, + &last_physical_field, + )?; + + if let Some(requirements) = requirements { + // Field needs transformation - not in physical schema + for spec in requirements.transform_specs.into_iter() { + transform_spec.push(spec); + } + } else { + // Physical field - should be read from parquet + // Validate metadata column doesn't conflict with partition columns + if logical_field.is_metadata_column() + && partition_columns.contains(logical_field.name()) + { + return Err(Error::Schema(format!( + "Metadata column names must not match partition columns: {}", + logical_field.name() + ))); + } + + // <<<<<<< HEAD + // // Partition column: needs to be injected via transform + // transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { + // field_index: index, + // insert_after: last_physical_field.clone(), + // }); + // } else { + // // Regular field field, or metadata columns: add to physical schema + // let mut select_physical_field = || { + // let physical_field = 
logical_field.make_physical(column_mapping_mode); + // debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); + // let physical_name = physical_field.name.clone(); + // ||||||| f431de0c + // // Partition column: needs to be injected via transform + // transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { + // field_index: index, + // insert_after: last_physical_field.clone(), + // }); + // } else { + // // Regular field: add to physical schema + // let physical_field = logical_field.make_physical(column_mapping_mode); + // debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); + // let physical_name = physical_field.name.clone(); + // ======= + // Add to physical schema + let physical_field = logical_field.make_physical(column_mapping_mode); + debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); + let physical_name = physical_field.name.clone(); + + if !logical_field.is_metadata_column() { + read_field_names.insert(physical_name.clone()); + } + last_physical_field = Some(physical_name); + read_fields.push(physical_field); + } + // match logical_field.get_metadata_column_spec() { + // Some(MetadataColumnSpec::RowId) => { + // if table_configuration.table_properties().enable_row_tracking == Some(true) + // { + // let row_id_col = table_configuration + // .metadata() + // .configuration() + // .get("delta.rowTracking.materializedRowIdColumnName") + // .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; + // // we can `take` as we should only have one RowId col + // let index_column_name = match selected_row_index_col_name.take() { + // Some(index_column_name) => index_column_name, + // None => { + // // ensure we have a column name that isn't already in our schema + // let index_column_name = (0..) 
+ // .map(|i| format!("row_indexes_for_row_id_{}", i)) + // .find(|name| logical_schema.field(name).is_none()) + // .ok_or(Error::generic( + // "Couldn't generate row index column name", + // ))?; + // read_fields.push(StructField::create_metadata_column( + // &index_column_name, + // MetadataColumnSpec::RowIndex, + // )); + // transform_spec.push(FieldTransformSpec::StaticDrop { + // field_name: index_column_name.clone(), + // }); + // index_column_name + // } + // }; + + // read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); + // transform_spec.push(FieldTransformSpec::GenerateRowId { + // field_name: row_id_col.to_string(), + // row_index_field_name: index_column_name, + // }); + // } else { + // return Err(Error::unsupported( + // "Row ids are not enabled on this table", + // )); + // } + // } + // Some(MetadataColumnSpec::RowIndex) => { + // // handled in parquet reader + // select_physical_field() + // } + // Some(MetadataColumnSpec::RowCommitVersion) => { + // return Err(Error::unsupported("Row commit versions not supported")); + // } + // _ => select_physical_field(), + // } + // } + } + + let physical_schema = Arc::new(StructType::try_new(read_fields)?); + + let physical_predicate = match predicate { + Some(pred) => PhysicalPredicate::try_new(&pred, &logical_schema)?, + None => PhysicalPredicate::None, + }; + + let transform_spec = + if !transform_spec.is_empty() || column_mapping_mode != ColumnMappingMode::None { + Some(Arc::new(transform_spec)) + } else { + None + }; + + Ok(StateInfo { + logical_schema, + physical_schema, + physical_predicate, + transform_spec, + }) + } +} + +#[cfg(test)] +mod tests { + + // get a state info with no predicate or extra metadata + pub(crate) fn get_simple_state_info( + schema: SchemaRef, + partition_columns: Vec, + ) -> DeltaResult { + get_state_info(schema, partition_columns, None, HashMap::new(), vec![]) + } + + pub(crate) fn get_state_info( + schema: SchemaRef, + partition_columns: Vec, + predicate: 
Option, + metadata_configuration: HashMap, + metadata_cols: Vec<(&str, MetadataColumnSpec)>, + ) -> DeltaResult { + let metadata = Metadata::try_new( + None, + None, + schema.as_ref().clone(), + partition_columns, + 10, + metadata_configuration, + )?; + let no_features: Option> = None; // needed for type annotation + let protocol = Protocol::try_new(1, 2, no_features.clone(), no_features)?; + let table_configuration = TableConfiguration::try_new( + metadata, + protocol, + Url::parse("s3://my-table").unwrap(), + 1, + )?; + + let mut schema = schema; + for (name, spec) in metadata_cols.into_iter() { + schema = Arc::new( + schema + .add_metadata_column(name, spec) + .expect("Couldn't add metadata col"), + ); + } + + StateInfo::try_new( + schema.clone(), + &table_configuration, + predicate, + ScanTransformFieldClassifierieldClassifier, + ) + } + + #[test] + fn test_state_info_no_partition_columns() { + // Test case: No partition columns, no column mapping + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + ])); + + let state_info = get_simple_state_info(schema.clone(), vec![]).unwrap(); + + // Should have no transform spec (no partitions, no column mapping) + assert!(state_info.transform_spec.is_none()); + + // Physical schema should match logical schema + assert_eq!(state_info.logical_schema, schema); + assert_eq!(state_info.physical_schema.fields().len(), 2); + + // No predicate + assert_eq!(state_info.physical_predicate, PhysicalPredicate::None); + } + + #[test] + fn test_state_info_with_partition_columns() { + // Test case: With partition columns + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("date", DataType::DATE), // Partition column + StructField::nullable("value", DataType::LONG), + ])); + + let state_info = get_simple_state_info( + schema.clone(), + vec!["date".to_string()], 
// date is a partition column + ) + .unwrap(); + + // Should have a transform spec for the partition column + assert!(state_info.transform_spec.is_some()); + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 1); + + // Check the transform spec for the partition column + match &transform_spec[0] { + FieldTransformSpec::MetadataDerivedColumn { + field_index, + insert_after, + } => { + assert_eq!(*field_index, 1); // Index of "date" in logical schema + assert_eq!(insert_after, &Some("id".to_string())); // After "id" which is physical + } + _ => panic!("Expected MetadataDerivedColumn transform"), + } + + // Physical schema should not include partition column + assert_eq!(state_info.logical_schema, schema); + assert_eq!(state_info.physical_schema.fields().len(), 2); // Only id and value + } + + #[test] + fn test_state_info_multiple_partition_columns() { + // Test case: Multiple partition columns interspersed with regular columns + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("col1", DataType::STRING), + StructField::nullable("part1", DataType::STRING), // Partition + StructField::nullable("col2", DataType::LONG), + StructField::nullable("part2", DataType::INTEGER), // Partition + ])); + + let state_info = get_simple_state_info( + schema.clone(), + vec!["part1".to_string(), "part2".to_string()], + ) + .unwrap(); + + // Should have transforms for both partition columns + assert!(state_info.transform_spec.is_some()); + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 2); + + // Check first partition column transform + match &transform_spec[0] { + FieldTransformSpec::MetadataDerivedColumn { + field_index, + insert_after, + } => { + assert_eq!(*field_index, 1); // Index of "part1" + assert_eq!(insert_after, &Some("col1".to_string())); + } + _ => panic!("Expected MetadataDerivedColumn transform"), + } + + // Check second partition column 
transform + match &transform_spec[1] { + FieldTransformSpec::MetadataDerivedColumn { + field_index, + insert_after, + } => { + assert_eq!(*field_index, 3); // Index of "part2" + assert_eq!(insert_after, &Some("col2".to_string())); + } + _ => panic!("Expected MetadataDerivedColumn transform"), + } + + // Physical schema should only have non-partition columns + assert_eq!(state_info.physical_schema.fields().len(), 2); // col1 and col2 + } + + #[test] + fn test_state_info_with_predicate() { + // Test case: With a valid predicate + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + ])); + + let predicate = Arc::new(column_expr!("value").gt(Expr::literal(10i64))); + + let state_info = get_state_info( + schema.clone(), + vec![], // no partition columns + Some(predicate), + HashMap::new(), // no extra metadata + vec![], // no metadata + ) + .unwrap(); + + // Should have a physical predicate + match &state_info.physical_predicate { + PhysicalPredicate::Some(_pred, schema) => { + // Physical predicate exists + assert_eq!(schema.fields().len(), 1); // Only "value" is referenced + } + _ => panic!("Expected PhysicalPredicate::Some"), + } + } + + #[test] + fn test_state_info_partition_at_beginning() { + // Test case: Partition column at the beginning + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("date", DataType::DATE), // Partition column + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + ])); + + let state_info = get_simple_state_info(schema.clone(), vec!["date".to_string()]).unwrap(); + + // Should have a transform spec for the partition column + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 1); + + match &transform_spec[0] { + FieldTransformSpec::MetadataDerivedColumn { + field_index, + insert_after, + } => { + assert_eq!(*field_index, 
0); // Index of "date" + assert_eq!(insert_after, &None); // No physical field before it, so prepend + } + _ => panic!("Expected MetadataDerivedColumn transform"), + } + } + + #[test] + fn test_state_info_request_row_ids() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "id", + DataType::STRING, + )])); + + let state_info = get_state_info( + schema.clone(), + vec![], + None, + [ + ("delta.enableRowTracking", "true"), + ( + "delta.rowTracking.materializedRowIdColumnName", + "some-row-id_col", + ), + ] + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + vec![("row_id", MetadataColumnSpec::RowId)], + ) + .unwrap(); + + // Should have a transform spec for the row_id column + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 2); // one for rowid col, one to drop indexes + + match &transform_spec[0] { + FieldTransformSpec::StaticDrop { field_name } => { + assert_eq!(field_name, "row_indexes_for_row_id_0"); + } + _ => panic!("Expected StaticDrop transform"), + } + + match &transform_spec[1] { + FieldTransformSpec::GenerateRowId { + field_name, + row_index_field_name, + } => { + assert_eq!(field_name, "some-row-id_col"); + assert_eq!(row_index_field_name, "row_indexes_for_row_id_0"); + } + _ => panic!("Expected GenerateRowId transform"), + } + } + + #[test] + fn test_state_info_request_row_ids_and_indexes() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "id", + DataType::STRING, + )])); + + let state_info = get_state_info( + schema.clone(), + vec![], + None, + [ + ("delta.enableRowTracking", "true"), + ( + "delta.rowTracking.materializedRowIdColumnName", + "some-row-id_col", + ), + ] + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + vec![ + ("row_id", MetadataColumnSpec::RowId), + ("row_index", MetadataColumnSpec::RowIndex), + ], + ) + .unwrap(); + + // Should have a transform spec for the row_id column + let 
transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_eq!(transform_spec.len(), 1); // just one because we don't want to drop indexes now + + match &transform_spec[0] { + FieldTransformSpec::GenerateRowId { + field_name, + row_index_field_name, + } => { + assert_eq!(field_name, "some-row-id_col"); + assert_eq!(row_index_field_name, "row_index"); + } + _ => panic!("Expected GenerateRowId transform"), + } + } + + #[test] + fn test_state_info_invalid_rowtracking_config() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "id", + DataType::STRING, + )])); + + for (metadata_config, metadata_cols, expected_error) in [ + (HashMap::new(), vec![("row_id", MetadataColumnSpec::RowId)], "Unsupported: Row ids are not enabled on this table"), + ( + [("delta.enableRowTracking", "true")] + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + vec![("row_id", MetadataColumnSpec::RowId)], + "Generic delta kernel error: No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration", + ), + ] { + match get_state_info(schema.clone(), vec![], None, metadata_config, metadata_cols) { + Ok(_) => { + panic!("Should not have succeeded generating state info with invalid config") + } + Err(e) => { + assert_eq!( + e.to_string(), + expected_error, + ) + } + } + } + } +} diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 5c6553f8e..a9f8a21a4 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use crate::expressions::Scalar; -use crate::scan::StateInfo; +use crate::scan::state_info::StateInfo; use crate::schema::{DataType, SchemaRef, StructField, StructType}; use crate::transforms::{get_transform_expr, parse_partition_values}; use crate::{DeltaResult, Error, ExpressionRef}; @@ -117,7 +117,8 @@ mod tests { use super::*; use 
crate::expressions::Expression; use crate::scan::state::DvInfo; - use crate::scan::{PhysicalPredicate, StateInfo}; + use crate::scan::state_info::StateInfo; + use crate::scan::PhysicalPredicate; use crate::schema::{DataType, StructField, StructType}; use crate::transforms::FieldTransformSpec; use std::collections::HashMap; diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index 100b75647..ebc2bd166 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -7,7 +7,8 @@ use url::Url; use crate::actions::deletion_vector::split_vector; use crate::scan::field_classifiers::CdfTransformFieldClassifier; -use crate::scan::{PhysicalPredicate, ScanResult, StateInfo}; +use crate::scan::state_info::StateInfo; +use crate::scan::{PhysicalPredicate, ScanResult}; use crate::schema::SchemaRef; use crate::{DeltaResult, Engine, FileMeta, PredicateRef}; From 798abbbfbff39b5cb4014c974289f5b3a44dfb15 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 16:08:18 -0700 Subject: [PATCH 12/28] working, needs simplification --- kernel/src/scan/field_classifiers.rs | 13 ++ kernel/src/scan/log_replay.rs | 5 +- kernel/src/scan/mod.rs | 15 +- kernel/src/scan/state_info.rs | 223 +++++++++++++-------------- kernel/src/table_changes/scan.rs | 2 +- 5 files changed, 132 insertions(+), 126 deletions(-) diff --git a/kernel/src/scan/field_classifiers.rs b/kernel/src/scan/field_classifiers.rs index 4f3c559f7..6af895741 100644 --- a/kernel/src/scan/field_classifiers.rs +++ b/kernel/src/scan/field_classifiers.rs @@ -30,6 +30,19 @@ pub(crate) trait TransformFieldClassifier { ) -> DeltaResult>; } +// Empty classifier, always returns None +impl TransformFieldClassifier for () { + fn classify_field( + &self, + _: &StructField, + _: usize, + _: &TableConfiguration, + _: &Option, + ) -> DeltaResult> { + Ok(None) + } +} + /// Regular scan field classifier for standard Delta table scans. 
 /// Handles partition columns as metadata-derived fields.
 pub(crate) struct ScanTransformFieldClassifierieldClassifier;
diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs
index a1bc16eec..12d339b97 100644
--- a/kernel/src/scan/log_replay.rs
+++ b/kernel/src/scan/log_replay.rs
@@ -397,12 +397,13 @@ mod tests {
     use crate::expressions::Scalar;
     use crate::log_replay::ActionsBatch;
     use crate::scan::state::{DvInfo, Stats};
+    use crate::scan::state_info::tests::{get_simple_state_info, get_state_info};
+    use crate::scan::state_info::StateInfo;
     use crate::scan::test_utils::{
         add_batch_for_row_id, add_batch_simple, add_batch_with_partition_col,
         add_batch_with_remove, run_with_validate_callback,
     };
-    use crate::scan::tests::{get_simple_state_info, get_state_info};
-    use crate::scan::{PhysicalPredicate, StateInfo};
+    use crate::scan::PhysicalPredicate;
     use crate::schema::MetadataColumnSpec;
     use crate::Expression as Expr;
     use crate::{
diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs
index ff7c779be..fd2b1d2f1 100644
--- a/kernel/src/scan/mod.rs
+++ b/kernel/src/scan/mod.rs
@@ -9,7 +9,6 @@ use itertools::Itertools;
 use tracing::debug;
 use url::Url;
 
-use self::field_classifiers::ScanTransformFieldClassifierieldClassifier;
 use self::log_replay::get_scan_metadata_transform_expr;
 use crate::actions::deletion_vector::{
     deletion_treemap_to_bools, split_vector, DeletionVectorDescriptor,
@@ -119,7 +118,7 @@ impl ScanBuilder {
             logical_schema,
             self.snapshot.table_configuration(),
             self.predicate,
-            ScanTransformFieldClassifierieldClassifier,
+            None::<()>, // No classifier, default is for scans
         )?;
 
         Ok(Scan {
@@ -752,6 +751,8 @@ pub fn selection_vector(
 #[cfg(test)]
 pub(crate) mod test_utils {
     use crate::arrow::array::StringArray;
+    use crate::scan::state_info::StateInfo;
+    use crate::schema::StructType;
     use crate::utils::test_utils::string_array_to_engine_data;
     use itertools::Itertools;
     use std::sync::Arc;
@@ -769,7 +770,7 @@ pub(crate) mod test_utils {
     };
use super::state::ScanCallback; - use super::{PhysicalPredicate, StateInfo}; + use super::PhysicalPredicate; use crate::transforms::TransformSpec; // Generates a batch of sidecar actions with the given paths. @@ -877,8 +878,8 @@ pub(crate) mod test_utils { context: T, validate_callback: ScanCallback, ) { - let logical_schema = logical_schema - .unwrap_or_else(|| Arc::new(crate::schema::StructType::new_unchecked(vec![]))); + let logical_schema = + logical_schema.unwrap_or_else(|| Arc::new(StructType::new_unchecked(vec![]))); let state_info = Arc::new(StateInfo { logical_schema: logical_schema.clone(), physical_schema: logical_schema, @@ -912,15 +913,13 @@ pub(crate) mod test_utils { mod tests { use std::path::PathBuf; - use crate::actions::{Metadata, Protocol}; use crate::arrow::array::BooleanArray; use crate::arrow::compute::filter_record_batch; use crate::arrow::record_batch::RecordBatch; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::sync::SyncEngine; use crate::expressions::{column_expr, column_pred, Expression as Expr, Predicate as Pred}; - use crate::schema::{ColumnMetadataKey, PrimitiveType}; - use crate::transforms::FieldTransformSpec; + use crate::schema::{ColumnMetadataKey, PrimitiveType, StructType}; use crate::Snapshot; use super::*; diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index 4e63ddb4d..ca17866ed 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -8,11 +8,11 @@ use tracing::debug; use crate::scan::field_classifiers::TransformFieldClassifier; use crate::scan::PhysicalPredicate; -use crate::schema::{MetadataColumnSpec, SchemaRef, StructType}; +use crate::schema::{DataType, MetadataColumnSpec, SchemaRef, StructType}; use crate::table_configuration::TableConfiguration; use crate::table_features::ColumnMappingMode; -use crate::transforms::TransformSpec; -use crate::{DeltaResult, Error, PredicateRef}; +use crate::transforms::{FieldTransformSpec, TransformSpec}; +use 
crate::{DeltaResult, Error, PredicateRef, StructField}; /// All the state needed to process a scan. #[derive(Debug)] @@ -40,12 +40,12 @@ impl StateInfo { logical_schema: SchemaRef, table_configuration: &TableConfiguration, predicate: Option, - classifier: C, + classifier_opt: Option, ) -> DeltaResult { let partition_columns = table_configuration.metadata().partition_columns(); let column_mapping_mode = table_configuration.column_mapping_mode(); let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); - let mut read_field_names = HashSet::with_capacity(logical_schema.num_fields()); + let mut metadata_field_names = HashSet::new(); let mut transform_spec = Vec::new(); let mut last_physical_field: Option = None; @@ -57,130 +57,120 @@ impl StateInfo { // This iteration runs in O(supported_number_of_metadata_columns) time since each metadata // column can appear at most once in the schema for metadata_column in logical_schema.metadata_columns() { - if read_field_names.contains(metadata_column.name()) { - return Err(Error::Schema(format!( - "Metadata column names must not match physical columns: {}", - metadata_column.name() - ))); - } if let Some(MetadataColumnSpec::RowIndex) = metadata_column.get_metadata_column_spec() { selected_row_index_col_name = Some(metadata_column.name().to_string()); } + metadata_field_names.insert(metadata_column.name()); } // Loop over all selected fields and build both the physical schema and transform spec for (index, logical_field) in logical_schema.fields().enumerate() { - let requirements = classifier.classify_field( - logical_field, - index, - table_configuration, - &last_physical_field, - )?; + let requirements = classifier_opt + .as_ref() + .map(|classifier| { + classifier.classify_field( + logical_field, + index, + table_configuration, + &last_physical_field, + ) + }) + .transpose()? 
+ .flatten(); if let Some(requirements) = requirements { // Field needs transformation - not in physical schema for spec in requirements.transform_specs.into_iter() { transform_spec.push(spec); } - } else { - // Physical field - should be read from parquet - // Validate metadata column doesn't conflict with partition columns - if logical_field.is_metadata_column() - && partition_columns.contains(logical_field.name()) - { + } else if partition_columns.contains(logical_field.name()) { + // this is a partition column, add the transform + if logical_field.is_metadata_column() { return Err(Error::Schema(format!( "Metadata column names must not match partition columns: {}", logical_field.name() ))); } - // <<<<<<< HEAD - // // Partition column: needs to be injected via transform - // transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { - // field_index: index, - // insert_after: last_physical_field.clone(), - // }); - // } else { - // // Regular field field, or metadata columns: add to physical schema - // let mut select_physical_field = || { - // let physical_field = logical_field.make_physical(column_mapping_mode); - // debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); - // let physical_name = physical_field.name.clone(); - // ||||||| f431de0c - // // Partition column: needs to be injected via transform - // transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { - // field_index: index, - // insert_after: last_physical_field.clone(), - // }); - // } else { - // // Regular field: add to physical schema - // let physical_field = logical_field.make_physical(column_mapping_mode); - // debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); - // let physical_name = physical_field.name.clone(); - // ======= - // Add to physical schema - let physical_field = logical_field.make_physical(column_mapping_mode); - debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); - let physical_name = 
physical_field.name.clone(); - - if !logical_field.is_metadata_column() { - read_field_names.insert(physical_name.clone()); + // Partition column: needs to be injected via transform + transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { + field_index: index, + insert_after: last_physical_field.clone(), + }); + } else { + // Regular field field or a metadata column + let mut select_physical_field = || { + let physical_field = logical_field.make_physical(column_mapping_mode); + debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); + let physical_name = physical_field.name.clone(); + + if logical_field.is_metadata_column() + && partition_columns.contains(logical_field.name()) + { + return Err(Error::Schema(format!( + "Metadata column names must not match partition columns: {}", + logical_field.name() + ))); + } + last_physical_field = Some(physical_name); + read_fields.push(physical_field); + Ok(()) + }; + + match logical_field.get_metadata_column_spec() { + Some(MetadataColumnSpec::RowId) => { + if table_configuration.table_properties().enable_row_tracking == Some(true) + { + let row_id_col = table_configuration + .metadata() + .configuration() + .get("delta.rowTracking.materializedRowIdColumnName") + .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; + // we can `take` as we should only have one RowId col + let index_column_name = match selected_row_index_col_name.take() { + Some(index_column_name) => index_column_name, + None => { + // ensure we have a column name that isn't already in our schema + let index_column_name = (0..) 
+ .map(|i| format!("row_indexes_for_row_id_{}", i)) + .find(|name| logical_schema.field(name).is_none()) + .ok_or(Error::generic( + "Couldn't generate row index column name", + ))?; + read_fields.push(StructField::create_metadata_column( + &index_column_name, + MetadataColumnSpec::RowIndex, + )); + transform_spec.push(FieldTransformSpec::StaticDrop { + field_name: index_column_name.clone(), + }); + index_column_name + } + }; + + read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); + transform_spec.push(FieldTransformSpec::GenerateRowId { + field_name: row_id_col.to_string(), + row_index_field_name: index_column_name, + }); + } else { + return Err(Error::unsupported( + "Row ids are not enabled on this table", + )); + } + } + Some(MetadataColumnSpec::RowIndex) => { + // handled in parquet reader + select_physical_field()?; + } + Some(MetadataColumnSpec::RowCommitVersion) => { + return Err(Error::unsupported("Row commit versions not supported")); + } + _ => { + select_physical_field()?; + } } - last_physical_field = Some(physical_name); - read_fields.push(physical_field); } - // match logical_field.get_metadata_column_spec() { - // Some(MetadataColumnSpec::RowId) => { - // if table_configuration.table_properties().enable_row_tracking == Some(true) - // { - // let row_id_col = table_configuration - // .metadata() - // .configuration() - // .get("delta.rowTracking.materializedRowIdColumnName") - // .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; - // // we can `take` as we should only have one RowId col - // let index_column_name = match selected_row_index_col_name.take() { - // Some(index_column_name) => index_column_name, - // None => { - // // ensure we have a column name that isn't already in our schema - // let index_column_name = (0..) 
- // .map(|i| format!("row_indexes_for_row_id_{}", i)) - // .find(|name| logical_schema.field(name).is_none()) - // .ok_or(Error::generic( - // "Couldn't generate row index column name", - // ))?; - // read_fields.push(StructField::create_metadata_column( - // &index_column_name, - // MetadataColumnSpec::RowIndex, - // )); - // transform_spec.push(FieldTransformSpec::StaticDrop { - // field_name: index_column_name.clone(), - // }); - // index_column_name - // } - // }; - - // read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); - // transform_spec.push(FieldTransformSpec::GenerateRowId { - // field_name: row_id_col.to_string(), - // row_index_field_name: index_column_name, - // }); - // } else { - // return Err(Error::unsupported( - // "Row ids are not enabled on this table", - // )); - // } - // } - // Some(MetadataColumnSpec::RowIndex) => { - // // handled in parquet reader - // select_physical_field() - // } - // Some(MetadataColumnSpec::RowCommitVersion) => { - // return Err(Error::unsupported("Row commit versions not supported")); - // } - // _ => select_physical_field(), - // } - // } } let physical_schema = Arc::new(StructType::try_new(read_fields)?); @@ -207,7 +197,15 @@ impl StateInfo { } #[cfg(test)] -mod tests { +pub(crate) mod tests { + use std::{collections::HashMap, sync::Arc}; + + use url::Url; + + use crate::actions::{Metadata, Protocol}; + use crate::expressions::{column_expr, Expression as Expr}; + + use super::*; // get a state info with no predicate or extra metadata pub(crate) fn get_simple_state_info( @@ -250,12 +248,7 @@ mod tests { ); } - StateInfo::try_new( - schema.clone(), - &table_configuration, - predicate, - ScanTransformFieldClassifierieldClassifier, - ) + StateInfo::try_new(schema.clone(), &table_configuration, predicate, None::<()>) } #[test] diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index ebc2bd166..e5c31e29a 100644 --- a/kernel/src/table_changes/scan.rs +++ 
b/kernel/src/table_changes/scan.rs @@ -118,7 +118,7 @@ impl TableChangesScanBuilder { logical_schema, self.table_changes.end_snapshot.table_configuration(), self.predicate, - CdfTransformFieldClassifier, + Some(CdfTransformFieldClassifier), )?; Ok(TableChangesScan { From 2f1cba7546172ee7fe3844759ce862f6759e3e44 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 16:16:12 -0700 Subject: [PATCH 13/28] cleanup --- kernel/src/scan/field_classifiers.rs | 136 +++------------------------ kernel/src/scan/mod.rs | 5 +- kernel/src/scan/state_info.rs | 27 ++---- kernel/src/table_changes/mod.rs | 7 -- kernel/src/table_changes/scan.rs | 2 +- 5 files changed, 23 insertions(+), 154 deletions(-) diff --git a/kernel/src/scan/field_classifiers.rs b/kernel/src/scan/field_classifiers.rs index 6af895741..dc7278ec6 100644 --- a/kernel/src/scan/field_classifiers.rs +++ b/kernel/src/scan/field_classifiers.rs @@ -1,19 +1,10 @@ //! Field classifier implementations for different scan types (regular and CDF scans) -use crate::schema::{DataType, MetadataColumnSpec, StructField}; +use crate::schema::StructField; use crate::table_changes::{ CHANGE_TYPE_COL_NAME, COMMIT_TIMESTAMP_COL_NAME, COMMIT_VERSION_COL_NAME, }; -use crate::table_configuration::TableConfiguration; use crate::transforms::FieldTransformSpec; -use crate::{DeltaResult, Error}; - -/// If a field requires transformation(s) or extra read fields, we use this struct to communicate -/// them together -pub(crate) struct FieldRequirements { - pub(crate) transform_specs: Vec, - pub(crate) extra_read_fields: Vec, -} /// Trait for classifying fields during StateInfo construction. /// Allows different scan types (regular, CDF) to customize field handling. 
@@ -25,9 +16,8 @@ pub(crate) trait TransformFieldClassifier { &self, field: &StructField, field_index: usize, - table_configuration: &TableConfiguration, last_physical_field: &Option, - ) -> DeltaResult>; + ) -> Option; } // Empty classifier, always returns None @@ -36,98 +26,9 @@ impl TransformFieldClassifier for () { &self, _: &StructField, _: usize, - _: &TableConfiguration, _: &Option, - ) -> DeltaResult> { - Ok(None) - } -} - -/// Regular scan field classifier for standard Delta table scans. -/// Handles partition columns as metadata-derived fields. -pub(crate) struct ScanTransformFieldClassifierieldClassifier; -impl TransformFieldClassifier for ScanTransformFieldClassifierieldClassifier { - fn classify_field( - &self, - field: &StructField, - field_index: usize, - table_configuration: &TableConfiguration, - last_physical_field: &Option, - ) -> DeltaResult> { - if table_configuration - .metadata() - .partition_columns() - .contains(field.name()) - { - // Partition column: needs transform to inject metadata - let transform_specs = vec![FieldTransformSpec::MetadataDerivedColumn { - field_index, - insert_after: last_physical_field.clone(), - }]; - Ok(Some(FieldRequirements { - transform_specs, - extra_read_fields: vec![], - })) - } else { - match field.get_metadata_column_spec() { - Some(MetadataColumnSpec::RowId) => { - if table_configuration.table_properties().enable_row_tracking == Some(true) { - let mut transform_specs = vec![]; - let mut extra_read_fields = vec![]; - let row_id_col = table_configuration - .metadata() - .configuration() - .get("delta.rowTracking.materializedRowIdColumnName") - .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; - // we can `take` as we should only have one RowId col - let mut selected_row_index_col_name = None; - let index_column_name = match selected_row_index_col_name.take() { - Some(index_column_name) => index_column_name, - None => { - // ensure we have a column 
name that isn't already in our schema - let index_column_name = (0..) - .map(|i| format!("row_indexes_for_row_id_{}", i)) - .find(|_name| true) //logical_schema.field(name).is_none()) - .ok_or(Error::generic( - "Couldn't generate row index column name", - ))?; - extra_read_fields.push(StructField::create_metadata_column( - &index_column_name, - MetadataColumnSpec::RowIndex, - )); - transform_specs.push(FieldTransformSpec::StaticDrop { - field_name: index_column_name.clone(), - }); - index_column_name - } - }; - - extra_read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); - transform_specs.push(FieldTransformSpec::GenerateRowId { - field_name: row_id_col.to_string(), - row_index_field_name: index_column_name, - }); - Ok(Some(FieldRequirements { - transform_specs, - extra_read_fields, - })) - } else { - Err(Error::unsupported("Row ids are not enabled on this table")) - } - } - Some(MetadataColumnSpec::RowIndex) => { - // handled in parquet reader, treat as regular physical field - Ok(None) - } - Some(MetadataColumnSpec::RowCommitVersion) => { - Err(Error::unsupported("Row commit versions not supported")) - } - _ => { - // Regular physical field - no transform needed - Ok(None) - } - } - } + ) -> Option { + None } } @@ -139,36 +40,23 @@ impl TransformFieldClassifier for CdfTransformFieldClassifier { &self, field: &StructField, field_index: usize, - table_configuration: &TableConfiguration, last_physical_field: &Option, - ) -> DeltaResult> { - let transform_spec = match field.name().as_str() { + ) -> Option { + match field.name().as_str() { // _change_type is dynamic - physical in CDC files, metadata in Add/Remove files - CHANGE_TYPE_COL_NAME => FieldTransformSpec::DynamicColumn { + CHANGE_TYPE_COL_NAME => Some(FieldTransformSpec::DynamicColumn { field_index, physical_name: CHANGE_TYPE_COL_NAME.to_string(), insert_after: last_physical_field.clone(), - }, + }), // _commit_version and _commit_timestamp are always derived from metadata 
COMMIT_VERSION_COL_NAME | COMMIT_TIMESTAMP_COL_NAME => { - FieldTransformSpec::MetadataDerivedColumn { + Some(FieldTransformSpec::MetadataDerivedColumn { field_index, insert_after: last_physical_field.clone(), - } - } - // Defer to default classifier for partition columns and physical fields - _ => { - return ScanTransformFieldClassifierieldClassifier.classify_field( - field, - field_index, - table_configuration, - last_physical_field, - ) + }) } - }; - Ok(Some(FieldRequirements { - transform_specs: vec![transform_spec], - extra_read_fields: vec![], - })) + _ => None, + } } } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index fd2b1d2f1..3d0ad66d6 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -22,6 +22,7 @@ use crate::listed_log_files::ListedLogFiles; use crate::log_replay::{ActionsBatch, HasSelectionVector}; use crate::log_segment::LogSegment; use crate::scan::state::{DvInfo, Stats}; +use crate::scan::state_info::StateInfo; use crate::schema::{ ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, }; @@ -114,11 +115,11 @@ impl ScanBuilder { // if no schema is provided, use snapshot's entire schema (e.g. 
SELECT *) let logical_schema = self.schema.unwrap_or_else(|| self.snapshot.schema()); - let state_info = state_info::StateInfo::try_new( + let state_info = StateInfo::try_new( logical_schema, self.snapshot.table_configuration(), self.predicate, - None::<()>, // No classifer, default is for scans + (), // No classifer, default is for scans )?; Ok(Scan { diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index ca17866ed..23f134a3a 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -35,12 +35,12 @@ impl StateInfo { /// `partition_columns` - List of column names that are partition columns in the table /// `column_mapping_mode` - The column mapping mode used by the table for physical to logical mapping /// `predicate` - Optional predicate to filter data during the scan - /// `classifier` - The classifier to use for different scan types + /// `classifier` - The classifier to use for different scan types. Use `()` if not needed pub(crate) fn try_new( logical_schema: SchemaRef, table_configuration: &TableConfiguration, predicate: Option, - classifier_opt: Option, + classifier: C, ) -> DeltaResult { let partition_columns = table_configuration.metadata().partition_columns(); let column_mapping_mode = table_configuration.column_mapping_mode(); @@ -65,24 +65,11 @@ impl StateInfo { // Loop over all selected fields and build both the physical schema and transform spec for (index, logical_field) in logical_schema.fields().enumerate() { - let requirements = classifier_opt - .as_ref() - .map(|classifier| { - classifier.classify_field( - logical_field, - index, - table_configuration, - &last_physical_field, - ) - }) - .transpose()? 
- .flatten(); - - if let Some(requirements) = requirements { + if let Some(spec) = + classifier.classify_field(logical_field, index, &last_physical_field) + { // Field needs transformation - not in physical schema - for spec in requirements.transform_specs.into_iter() { - transform_spec.push(spec); - } + transform_spec.push(spec); } else if partition_columns.contains(logical_field.name()) { // this is a partition column, add the transform if logical_field.is_metadata_column() { @@ -248,7 +235,7 @@ pub(crate) mod tests { ); } - StateInfo::try_new(schema.clone(), &table_configuration, predicate, None::<()>) + StateInfo::try_new(schema.clone(), &table_configuration, predicate, ()) } #[test] diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index c5898e0eb..9172dc883 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -240,13 +240,6 @@ impl TableChanges { pub fn table_root(&self) -> &Url { &self.table_root } - /// The partition columns that will be read. - pub(crate) fn partition_columns(&self) -> &Vec { - self.end_snapshot - .table_configuration() - .metadata() - .partition_columns() - } /// Create a [`TableChangesScanBuilder`] for an `Arc`. 
pub fn scan_builder(self: Arc) -> TableChangesScanBuilder { diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index e5c31e29a..ebc2bd166 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -118,7 +118,7 @@ impl TableChangesScanBuilder { logical_schema, self.table_changes.end_snapshot.table_configuration(), self.predicate, - Some(CdfTransformFieldClassifier), + CdfTransformFieldClassifier, )?; Ok(TableChangesScan { From 693ab5ee9ca6db991469546b64e4cefc7b3d0756 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 16:27:42 -0700 Subject: [PATCH 14/28] cleanup --- kernel/src/scan/state_info.rs | 125 +++++++++++++++++----------------- 1 file changed, 62 insertions(+), 63 deletions(-) diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index 23f134a3a..bccbfa53e 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -68,93 +68,92 @@ impl StateInfo { if let Some(spec) = classifier.classify_field(logical_field, index, &last_physical_field) { - // Field needs transformation - not in physical schema + // Classifier has handled this field via a transformation, just push it and move on transform_spec.push(spec); } else if partition_columns.contains(logical_field.name()) { - // this is a partition column, add the transform + // This is a partition column: needs to be injected via transform + + // First ensure we don't have a metadata column with same name as a partition column if logical_field.is_metadata_column() { return Err(Error::Schema(format!( "Metadata column names must not match partition columns: {}", logical_field.name() ))); } - - // Partition column: needs to be injected via transform + // push the transform for this partition column transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { field_index: index, insert_after: last_physical_field.clone(), }); } else { - // Regular field field or a metadata column - let mut 
select_physical_field = || { - let physical_field = logical_field.make_physical(column_mapping_mode); - debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); - let physical_name = physical_field.name.clone(); - - if logical_field.is_metadata_column() - && partition_columns.contains(logical_field.name()) - { - return Err(Error::Schema(format!( - "Metadata column names must not match partition columns: {}", - logical_field.name() - ))); - } - last_physical_field = Some(physical_name); - read_fields.push(physical_field); - Ok(()) - }; - + // Regular field field or a metadata column, figure out which and handle it match logical_field.get_metadata_column_spec() { Some(MetadataColumnSpec::RowId) => { - if table_configuration.table_properties().enable_row_tracking == Some(true) + if table_configuration.table_properties().enable_row_tracking != Some(true) { - let row_id_col = table_configuration - .metadata() - .configuration() - .get("delta.rowTracking.materializedRowIdColumnName") - .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; - // we can `take` as we should only have one RowId col - let index_column_name = match selected_row_index_col_name.take() { - Some(index_column_name) => index_column_name, - None => { - // ensure we have a column name that isn't already in our schema - let index_column_name = (0..) 
- .map(|i| format!("row_indexes_for_row_id_{}", i)) - .find(|name| logical_schema.field(name).is_none()) - .ok_or(Error::generic( - "Couldn't generate row index column name", - ))?; - read_fields.push(StructField::create_metadata_column( - &index_column_name, - MetadataColumnSpec::RowIndex, - )); - transform_spec.push(FieldTransformSpec::StaticDrop { - field_name: index_column_name.clone(), - }); - index_column_name - } - }; - - read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); - transform_spec.push(FieldTransformSpec::GenerateRowId { - field_name: row_id_col.to_string(), - row_index_field_name: index_column_name, - }); - } else { return Err(Error::unsupported( "Row ids are not enabled on this table", )); } - } - Some(MetadataColumnSpec::RowIndex) => { - // handled in parquet reader - select_physical_field()?; + + let row_id_col = table_configuration + .metadata() + .configuration() + .get("delta.rowTracking.materializedRowIdColumnName") + .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; + + // we can `take` as we should only have one RowId col + let index_column_name = match selected_row_index_col_name.take() { + Some(index_column_name) => index_column_name, + None => { + // the index column isn't being explicitly requested, so add it to + // `read_fields` so the parquet_reader will generate it, and add a + // transform to drop it before returning logical data + + // ensure we have a column name that isn't already in our schema + let index_column_name = (0..) 
+ .map(|i| format!("row_indexes_for_row_id_{}", i)) + .find(|name| logical_schema.field(name).is_none()) + .ok_or(Error::generic( + "Couldn't generate row index column name", + ))?; + read_fields.push(StructField::create_metadata_column( + &index_column_name, + MetadataColumnSpec::RowIndex, + )); + transform_spec.push(FieldTransformSpec::StaticDrop { + field_name: index_column_name.clone(), + }); + index_column_name + } + }; + + read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); + transform_spec.push(FieldTransformSpec::GenerateRowId { + field_name: row_id_col.to_string(), + row_index_field_name: index_column_name, + }); } Some(MetadataColumnSpec::RowCommitVersion) => { return Err(Error::unsupported("Row commit versions not supported")); } - _ => { - select_physical_field()?; + Some(MetadataColumnSpec::RowIndex) | None => { + // note that RowIndex is handled in the parquet reader so we just add it as + // if it's a normal physical column + let physical_field = logical_field.make_physical(column_mapping_mode); + debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); + let physical_name = physical_field.name.clone(); + + if logical_field.is_metadata_column() + && partition_columns.contains(logical_field.name()) + { + return Err(Error::Schema(format!( + "Metadata column names must not match partition columns: {}", + logical_field.name() + ))); + } + last_physical_field = Some(physical_name); + read_fields.push(physical_field); } } } From 8ef94870ac8cf9e2de7ad32f013989b02a11c1e3 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 16:55:01 -0700 Subject: [PATCH 15/28] add some more tests --- kernel/src/scan/state_info.rs | 114 +++++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 29 deletions(-) diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index bccbfa53e..f806f0ab6 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -32,8 +32,7 @@ impl StateInfo 
{ /// Get the state needed to process a scan. /// /// `logical_schema` - The logical schema of the scan output, which includes partition columns - /// `partition_columns` - List of column names that are partition columns in the table - /// `column_mapping_mode` - The column mapping mode used by the table for physical to logical mapping + /// `table_configuration` - The TableConfiguration for this table /// `predicate` - Optional predicate to filter data during the scan /// `classifier` - The classifier to use for different scan types. Use `()` if not needed pub(crate) fn try_new( @@ -144,12 +143,12 @@ impl StateInfo { debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); let physical_name = physical_field.name.clone(); - if logical_field.is_metadata_column() - && partition_columns.contains(logical_field.name()) + if !logical_field.is_metadata_column() + && metadata_field_names.contains(&physical_name) { return Err(Error::Schema(format!( - "Metadata column names must not match partition columns: {}", - logical_field.name() + "Metadata column names must not match physical columns, but logical column '{}' has physical name '{}'", + logical_field.name(), physical_name, ))); } last_physical_field = Some(physical_name); @@ -190,6 +189,7 @@ pub(crate) mod tests { use crate::actions::{Metadata, Protocol}; use crate::expressions::{column_expr, Expression as Expr}; + use crate::schema::{ColumnMetadataKey, MetadataValue}; use super::*; @@ -217,7 +217,7 @@ pub(crate) mod tests { metadata_configuration, )?; let no_features: Option> = None; // needed for type annotation - let protocol = Protocol::try_new(1, 2, no_features.clone(), no_features)?; + let protocol = Protocol::try_new(2, 2, no_features.clone(), no_features)?; let table_configuration = TableConfiguration::try_new( metadata, protocol, @@ -238,7 +238,7 @@ pub(crate) mod tests { } #[test] - fn test_state_info_no_partition_columns() { + fn no_partition_columns() { // Test case: No partition columns, no 
column mapping let schema = Arc::new(StructType::new_unchecked(vec![ StructField::nullable("id", DataType::STRING), @@ -259,7 +259,7 @@ pub(crate) mod tests { } #[test] - fn test_state_info_with_partition_columns() { + fn with_partition_columns() { // Test case: With partition columns let schema = Arc::new(StructType::new_unchecked(vec![ StructField::nullable("id", DataType::STRING), @@ -296,7 +296,7 @@ pub(crate) mod tests { } #[test] - fn test_state_info_multiple_partition_columns() { + fn multiple_partition_columns() { // Test case: Multiple partition columns interspersed with regular columns let schema = Arc::new(StructType::new_unchecked(vec![ StructField::nullable("col1", DataType::STRING), @@ -345,7 +345,7 @@ pub(crate) mod tests { } #[test] - fn test_state_info_with_predicate() { + fn with_predicate() { // Test case: With a valid predicate let schema = Arc::new(StructType::new_unchecked(vec![ StructField::nullable("id", DataType::STRING), @@ -374,7 +374,7 @@ pub(crate) mod tests { } #[test] - fn test_state_info_partition_at_beginning() { + fn partition_at_beginning() { // Test case: Partition column at the beginning let schema = Arc::new(StructType::new_unchecked(vec![ StructField::nullable("date", DataType::DATE), // Partition column @@ -400,8 +400,15 @@ pub(crate) mod tests { } } + fn get_string_map(slice: &[(&str, &str)]) -> HashMap { + slice + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() + } + #[test] - fn test_state_info_request_row_ids() { + fn request_row_ids() { let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( "id", DataType::STRING, @@ -411,16 +418,13 @@ pub(crate) mod tests { schema.clone(), vec![], None, - [ + get_string_map(&[ ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", "some-row-id_col", ), - ] - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(), + ]), vec![("row_id", MetadataColumnSpec::RowId)], ) .unwrap(); @@ -449,7 +453,7 @@ 
pub(crate) mod tests { } #[test] - fn test_state_info_request_row_ids_and_indexes() { + fn request_row_ids_and_indexes() { let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( "id", DataType::STRING, @@ -459,16 +463,13 @@ pub(crate) mod tests { schema.clone(), vec![], None, - [ + get_string_map(&[ ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", "some-row-id_col", ), - ] - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(), + ]), vec![ ("row_id", MetadataColumnSpec::RowId), ("row_index", MetadataColumnSpec::RowIndex), @@ -493,7 +494,7 @@ pub(crate) mod tests { } #[test] - fn test_state_info_invalid_rowtracking_config() { + fn invalid_rowtracking_config() { let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( "id", DataType::STRING, @@ -502,10 +503,7 @@ pub(crate) mod tests { for (metadata_config, metadata_cols, expected_error) in [ (HashMap::new(), vec![("row_id", MetadataColumnSpec::RowId)], "Unsupported: Row ids are not enabled on this table"), ( - [("delta.enableRowTracking", "true")] - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(), + get_string_map(&[("delta.enableRowTracking", "true")]), vec![("row_id", MetadataColumnSpec::RowId)], "Generic delta kernel error: No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration", ), @@ -523,4 +521,62 @@ pub(crate) mod tests { } } } + + #[test] + fn metadata_column_matches_partition_column() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "id", + DataType::STRING, + )])); + match get_state_info( + schema.clone(), + vec!["part_col".to_string()], + None, + HashMap::new(), + vec![("part_col", MetadataColumnSpec::RowId)], + ) { + Ok(_) => { + panic!("Should not have succeeded generating state info with invalid config") + } + Err(e) => { + assert_eq!(e.to_string(), + "Schema error: Metadata column names must not match partition columns: 
part_col") + } + } + } + + #[test] + fn metadata_column_matches_read_field() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "id", + DataType::STRING, + ) + .with_metadata(HashMap::::from([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref().to_string(), + 1.into(), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName + .as_ref() + .to_string(), + "other".into(), + ), + ]))])); + match get_state_info( + schema.clone(), + vec![], + None, + get_string_map(&[("delta.columnMapping.mode", "name")]), + vec![("other", MetadataColumnSpec::RowId)], + ) { + Ok(_) => { + panic!("Should not have succeeded generating state info with invalid config") + } + Err(e) => { + assert_eq!(e.to_string(), + "Schema error: Metadata column names must not match physical columns, but logical column 'id' has physical name 'other'"); + } + } + } } From d707a5f5c90e6675da0cb6b3d185875c62668b8a Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 17:00:25 -0700 Subject: [PATCH 16/28] unneeded mod path --- kernel/src/scan/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 3d0ad66d6..8de13e7e9 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -354,7 +354,7 @@ impl HasSelectionVector for ScanMetadata { /// scanning the table. 
pub struct Scan { snapshot: SnapshotRef, - state_info: Arc, + state_info: Arc, } impl std::fmt::Debug for Scan { From fa9c6b130e67c2030b6dae11bc6ce7c094053245 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 17:05:54 -0700 Subject: [PATCH 17/28] address comment --- kernel/src/scan/mod.rs | 3 +-- kernel/src/transforms.rs | 6 ++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 8de13e7e9..22c87324d 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -25,6 +25,7 @@ use crate::scan::state::{DvInfo, Stats}; use crate::scan::state_info::StateInfo; use crate::schema::{ ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, + ToSchema as _, }; use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, SnapshotRef, Version}; @@ -487,8 +488,6 @@ impl Scan { _existing_predicate: Option, ) -> DeltaResult>>> { static RESTORED_ADD_SCHEMA: LazyLock = LazyLock::new(|| { - use crate::schema::ToSchema as _; - let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); DataType::struct_type_unchecked(vec![StructField::nullable( "add", diff --git a/kernel/src/transforms.rs b/kernel/src/transforms.rs index 7a1d6b960..e1fbb2087 100644 --- a/kernel/src/transforms.rs +++ b/kernel/src/transforms.rs @@ -9,7 +9,9 @@ use std::sync::Arc; use itertools::Itertools; -use crate::expressions::{BinaryExpressionOp, Expression, ExpressionRef, Scalar, Transform}; +use crate::expressions::{ + BinaryExpressionOp, Expression, ExpressionRef, Scalar, Transform, VariadicExpressionOp, +}; use crate::schema::{DataType, SchemaRef, StructType}; use crate::{DeltaResult, Error}; @@ -127,7 +129,7 @@ pub(crate) fn get_transform_expr( Error::generic("Asked to generate RowIds, but no baseRowId found.") })?; let expr = Arc::new(Expression::variadic( - crate::expressions::VariadicExpressionOp::Coalesce, + VariadicExpressionOp::Coalesce, vec![ 
Expression::column([field_name]), Expression::binary( From 0d57b88a419345555a2ee4841c7d94d75f4220eb Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 17:07:47 -0700 Subject: [PATCH 18/28] remove unneeded --- kernel/src/engine/arrow_utils.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 48ab51df5..21cb246f3 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -595,13 +595,6 @@ fn get_indices( Arc::new(field.try_into_arrow()?), )); } - Some(MetadataColumnSpec::RowId) => { - debug!("Inserting a row index column for row ids: {}", field.name()); - reorder_indices.push(ReorderIndex::row_index( - requested_position, - Arc::new(field.try_into_arrow()?), - )); - } Some(metadata_spec) => { return Err(Error::Generic(format!( "Metadata column {metadata_spec:?} is not supported by the default parquet reader" From 6100e6fbfdd22a63d0b592845aea25cca3b39b5d Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 10 Oct 2025 17:13:44 -0700 Subject: [PATCH 19/28] more coverage --- kernel/src/transforms.rs | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel/src/transforms.rs b/kernel/src/transforms.rs index e1fbb2087..08c991535 100644 --- a/kernel/src/transforms.rs +++ b/kernel/src/transforms.rs @@ -539,7 +539,7 @@ mod tests { } #[test] - fn test_get_transform_expr_generate_row_ids() { + fn get_transform_expr_generate_row_ids() { let transform_spec = vec![FieldTransformSpec::GenerateRowId { field_name: "row_id_col".to_string(), row_index_field_name: "row_index_col".to_string(), @@ -586,4 +586,27 @@ mod tests { let expr = &row_id_transform.exprs[0]; assert_eq!(expr, &expeceted_expr); } + + #[test] + fn get_transform_expr_generate_row_ids_no_base_id() { + let transform_spec = vec![FieldTransformSpec::GenerateRowId { + field_name: "row_id_col".to_string(), + row_index_field_name: "row_index_col".to_string(), + 
}]; + + // Physical schema contains row index col, but no row-id col + let physical_schema = StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::not_null("row_index_col", DataType::LONG), + ]); + let metadata_values = HashMap::new(); + + assert!(get_transform_expr( + &transform_spec, + metadata_values, + &physical_schema, + None, /* base_row_id */ + ) + .is_err()); + } } From cae7eaf67f9b560d03fb3a0d63bc076c0bfb8463 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Thu, 16 Oct 2025 17:38:37 -0700 Subject: [PATCH 20/28] address comment --- kernel/src/scan/state_info.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index f806f0ab6..d533eaec6 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -41,7 +41,6 @@ impl StateInfo { predicate: Option, classifier: C, ) -> DeltaResult { - let partition_columns = table_configuration.metadata().partition_columns(); let column_mapping_mode = table_configuration.column_mapping_mode(); let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); let mut metadata_field_names = HashSet::new(); @@ -69,7 +68,11 @@ impl StateInfo { { // Classifier has handled this field via a transformation, just push it and move on transform_spec.push(spec); - } else if partition_columns.contains(logical_field.name()) { + } else if table_configuration + .metadata() + .partition_columns() + .contains(logical_field.name()) + { // This is a partition column: needs to be injected via transform // First ensure we don't have a metadata column with same name as a partition column From 1e36f003e83f52700a8d9ba0f60e4c7fc3d882cd Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Thu, 16 Oct 2025 17:51:49 -0700 Subject: [PATCH 21/28] simplify schema logic --- kernel/examples/common/src/lib.rs | 71 ++++++++----------------------- 1 file changed, 18 insertions(+), 53 deletions(-) diff --git 
a/kernel/examples/common/src/lib.rs b/kernel/examples/common/src/lib.rs index a56c93db1..3a318d4f4 100644 --- a/kernel/examples/common/src/lib.rs +++ b/kernel/examples/common/src/lib.rs @@ -7,7 +7,7 @@ use delta_kernel::{ arrow::array::RecordBatch, engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, scan::Scan, - schema::{MetadataColumnSpec, Schema}, + schema::MetadataColumnSpec, DeltaResult, SnapshotRef, }; @@ -189,61 +189,26 @@ pub fn get_scan(snapshot: SnapshotRef, args: &ScanArgs) -> DeltaResult DeltaResult<_> { - let table_schema = snapshot.schema(); - let cols: Vec<&str> = cols.split(",").map(str::trim).collect(); - let selected_fields = cols.iter().map(|col| { - table_schema - .field(col) - .cloned() - .ok_or(delta_kernel::Error::Generic(format!( - "Table has no such column: {col}" - ))) - }); - let schema = Schema::try_from_results(selected_fields); - let schema = if args.with_row_index { - schema.and_then(|schema| { - schema.add_metadata_column("row_index", MetadataColumnSpec::RowIndex) - }) - } else { - schema - }; - schema.map(Arc::new) - }) - .transpose()?; + let mut table_schema = snapshot.schema(); + if let Some(cols) = args.columns.as_ref() { + let cols: Vec<&str> = cols.split(",").map(str::trim).collect(); + table_schema = table_schema.project_as_struct(&cols)?.into(); + } - let read_schema_opt = read_schema_opt.or_else(|| { - (args.with_row_index || args.with_row_id).then(|| { - let schema = snapshot.schema(); - let schema = if args.with_row_index { - Arc::new( - schema - .add_metadata_column("_metadata.row_index", MetadataColumnSpec::RowIndex) - .unwrap(), - ) - } else { - schema - }; - if args.with_row_id { - Arc::new( - schema - .add_metadata_column("_metadata.row_id", MetadataColumnSpec::RowId) - .unwrap(), - ) - } else { - schema - } - }) - }); + if args.with_row_index { + table_schema = table_schema + .add_metadata_column("_metadata.row_index", MetadataColumnSpec::RowIndex)? 
+ .into(); + } + + if args.with_row_id { + table_schema = table_schema + .add_metadata_column("_metadata.row_index", MetadataColumnSpec::RowIndex)? + .into(); + } Ok(Some( - snapshot - .scan_builder() - .with_schema_opt(read_schema_opt) - .build()?, + snapshot.scan_builder().with_schema(table_schema).build()?, )) } From 2822bb803172bf38dab5402ef061ec1d87c40fec Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Thu, 16 Oct 2025 17:53:04 -0700 Subject: [PATCH 22/28] better name --- kernel/examples/common/src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/examples/common/src/lib.rs b/kernel/examples/common/src/lib.rs index 3a318d4f4..7023f2632 100644 --- a/kernel/examples/common/src/lib.rs +++ b/kernel/examples/common/src/lib.rs @@ -189,26 +189,26 @@ pub fn get_scan(snapshot: SnapshotRef, args: &ScanArgs) -> DeltaResult = cols.split(",").map(str::trim).collect(); - table_schema = table_schema.project_as_struct(&cols)?.into(); + scan_schema = scan_schema.project_as_struct(&cols)?.into(); } if args.with_row_index { - table_schema = table_schema + scan_schema = scan_schema .add_metadata_column("_metadata.row_index", MetadataColumnSpec::RowIndex)? .into(); } if args.with_row_id { - table_schema = table_schema + scan_schema = scan_schema .add_metadata_column("_metadata.row_index", MetadataColumnSpec::RowIndex)? 
.into(); } Ok(Some( - snapshot.scan_builder().with_schema(table_schema).build()?, + snapshot.scan_builder().with_schema(scan_schema).build()?, )) } From a4cbedcfaba466e0998176efc0780f11de159d11 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 17 Oct 2025 12:59:16 -0700 Subject: [PATCH 23/28] move partition col check out of loop --- kernel/src/scan/state_info.rs | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index d533eaec6..e84a69cb3 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -41,6 +41,7 @@ impl StateInfo { predicate: Option, classifier: C, ) -> DeltaResult { + let partition_columns = table_configuration.metadata().partition_columns(); let column_mapping_mode = table_configuration.column_mapping_mode(); let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); let mut metadata_field_names = HashSet::new(); @@ -55,6 +56,13 @@ impl StateInfo { // This iteration runs in O(supported_number_of_metadata_columns) time since each metadata // column can appear at most once in the schema for metadata_column in logical_schema.metadata_columns() { + // Ensure we don't have a metadata column with same name as a partition column + if partition_columns.contains(metadata_column.name()) { + return Err(Error::Schema(format!( + "Metadata column names must not match partition columns: {}", + metadata_column.name() + ))); + } if let Some(MetadataColumnSpec::RowIndex) = metadata_column.get_metadata_column_spec() { selected_row_index_col_name = Some(metadata_column.name().to_string()); } @@ -68,20 +76,7 @@ impl StateInfo { { // Classifier has handled this field via a transformation, just push it and move on transform_spec.push(spec); - } else if table_configuration - .metadata() - .partition_columns() - .contains(logical_field.name()) - { - // This is a partition column: needs to be injected via transform - - // First ensure we 
don't have a metadata column with same name as a partition column - if logical_field.is_metadata_column() { - return Err(Error::Schema(format!( - "Metadata column names must not match partition columns: {}", - logical_field.name() - ))); - } + } else if partition_columns.contains(logical_field.name()) { // push the transform for this partition column transform_spec.push(FieldTransformSpec::MetadataDerivedColumn { field_index: index, From 8f25b5e01f16a11e3c4e14870d19284f89860c18 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 17 Oct 2025 13:35:50 -0700 Subject: [PATCH 24/28] consolidate assertions on the row id transform --- kernel/src/scan/log_replay.rs | 13 +++- kernel/src/scan/state_info.rs | 110 ++++++++++++++++++++++++---------- 2 files changed, 91 insertions(+), 32 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 12d339b97..d98033cad 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -397,7 +397,9 @@ mod tests { use crate::expressions::Scalar; use crate::log_replay::ActionsBatch; use crate::scan::state::{DvInfo, Stats}; - use crate::scan::state_info::tests::{get_simple_state_info, get_state_info}; + use crate::scan::state_info::tests::{ + assert_transform_spec, get_simple_state_info, get_state_info, + }; use crate::scan::state_info::StateInfo; use crate::scan::test_utils::{ add_batch_for_row_id, add_batch_simple, add_batch_with_partition_col, @@ -562,6 +564,15 @@ mod tests { vec![("row_id", MetadataColumnSpec::RowId)], ) .unwrap(); + + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_transform_spec( + transform_spec, + false, + "row_id_col", + "row_indexes_for_row_id_0", + ); + let batch = vec![add_batch_for_row_id(get_log_schema().clone())]; let iter = scan_action_iter( &SyncEngine::new(), diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index e84a69cb3..7dfba968f 100644 --- a/kernel/src/scan/state_info.rs +++ 
b/kernel/src/scan/state_info.rs @@ -235,6 +235,41 @@ pub(crate) mod tests { StateInfo::try_new(schema.clone(), &table_configuration, predicate, ()) } + pub(crate) fn assert_transform_spec( + transform_spec: &TransformSpec, + requested_row_indexes: bool, + expected_row_id_name: &str, + expected_row_index_name: &str, + ) { + // if we requested row indexes, there's only one transform for the row id col, otherwise the + // first transform drops the row index column, and the second one adds the row ids + let expected_transform_count = if requested_row_indexes { 1 } else { 2 }; + let generate_offset = if requested_row_indexes { 0 } else { 1 }; + + assert_eq!(transform_spec.len(), expected_transform_count); + + if !requested_row_indexes { + // ensure we have a drop transform if we didn't request row indexes + match &transform_spec[0] { + FieldTransformSpec::StaticDrop { field_name } => { + assert_eq!(field_name, expected_row_index_name); + } + _ => panic!("Expected StaticDrop transform"), + } + } + + match &transform_spec[generate_offset] { + FieldTransformSpec::GenerateRowId { + field_name, + row_index_field_name, + } => { + assert_eq!(field_name, expected_row_id_name); + assert_eq!(row_index_field_name, expected_row_index_name); + } + _ => panic!("Expected GenerateRowId transform"), + } + } + #[test] fn no_partition_columns() { // Test case: No partition columns, no column mapping @@ -420,7 +455,7 @@ pub(crate) mod tests { ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", - "some-row-id_col", + "some_row_id_col", ), ]), vec![("row_id", MetadataColumnSpec::RowId)], @@ -429,25 +464,44 @@ pub(crate) mod tests { // Should have a transform spec for the row_id column let transform_spec = state_info.transform_spec.as_ref().unwrap(); - assert_eq!(transform_spec.len(), 2); // one for rowid col, one to drop indexes + assert_transform_spec( + transform_spec, + false, // we did not request row indexes + "some_row_id_col", + 
"row_indexes_for_row_id_0", + ); + } - match &transform_spec[0] { - FieldTransformSpec::StaticDrop { field_name } => { - assert_eq!(field_name, "row_indexes_for_row_id_0"); - } - _ => panic!("Expected StaticDrop transform"), - } + #[test] + fn request_row_ids_conflicting_row_index_col_name() { + let schema = Arc::new(StructType::new_unchecked(vec![StructField::nullable( + "row_indexes_for_row_id_0", // this will conflict with the first generated name for row indexes + DataType::STRING, + )])); - match &transform_spec[1] { - FieldTransformSpec::GenerateRowId { - field_name, - row_index_field_name, - } => { - assert_eq!(field_name, "some-row-id_col"); - assert_eq!(row_index_field_name, "row_indexes_for_row_id_0"); - } - _ => panic!("Expected GenerateRowId transform"), - } + let state_info = get_state_info( + schema.clone(), + vec![], + None, + get_string_map(&[ + ("delta.enableRowTracking", "true"), + ( + "delta.rowTracking.materializedRowIdColumnName", + "some_row_id_col", + ), + ]), + vec![("row_id", MetadataColumnSpec::RowId)], + ) + .unwrap(); + + // Should have a transform spec for the row_id column + let transform_spec = state_info.transform_spec.as_ref().unwrap(); + assert_transform_spec( + transform_spec, + false, // we did not request row indexes + "some_row_id_col", + "row_indexes_for_row_id_1", // ensure we didn't conflict with the col in the schema + ); } #[test] @@ -465,7 +519,7 @@ pub(crate) mod tests { ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", - "some-row-id_col", + "some_row_id_col", ), ]), vec![ @@ -477,18 +531,12 @@ pub(crate) mod tests { // Should have a transform spec for the row_id column let transform_spec = state_info.transform_spec.as_ref().unwrap(); - assert_eq!(transform_spec.len(), 1); // just one because we don't want to drop indexes now - - match &transform_spec[0] { - FieldTransformSpec::GenerateRowId { - field_name, - row_index_field_name, - } => { - assert_eq!(field_name, 
"some-row-id_col"); - assert_eq!(row_index_field_name, "row_index"); - } - _ => panic!("Expected GenerateRowId transform"), - } + assert_transform_spec( + transform_spec, + true, // we did request row indexes + "some_row_id_col", + "row_index", + ); } #[test] From 194cb988e77a6caa42512b2c918822abbeb1e03d Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 17 Oct 2025 13:41:46 -0700 Subject: [PATCH 25/28] comments --- kernel/src/scan/log_replay.rs | 6 +++--- kernel/src/transforms.rs | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index d98033cad..66fdec709 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -394,7 +394,7 @@ mod tests { use std::{collections::HashMap, sync::Arc}; use crate::actions::get_log_schema; - use crate::expressions::Scalar; + use crate::expressions::{BinaryExpressionOp, Scalar, VariadicExpressionOp}; use crate::log_replay::ActionsBatch; use crate::scan::state::{DvInfo, Stats}; use crate::scan::state_info::tests::{ @@ -596,11 +596,11 @@ mod tests { assert_eq!(row_id_transform.exprs.len(), 1); let expr = &row_id_transform.exprs[0]; let expeceted_expr = Arc::new(Expr::variadic( - crate::expressions::VariadicExpressionOp::Coalesce, + VariadicExpressionOp::Coalesce, vec![ Expr::column(["row_id_col"]), Expr::binary( - crate::expressions::BinaryExpressionOp::Plus, + BinaryExpressionOp::Plus, Expr::literal(42i64), Expr::column(["row_indexes_for_row_id_0"]), ), diff --git a/kernel/src/transforms.rs b/kernel/src/transforms.rs index 08c991535..571c86130 100644 --- a/kernel/src/transforms.rs +++ b/kernel/src/transforms.rs @@ -572,7 +572,7 @@ mod tests { assert!(row_id_transform.is_replace); let expeceted_expr = Arc::new(Expression::variadic( - crate::expressions::VariadicExpressionOp::Coalesce, + VariadicExpressionOp::Coalesce, vec![ Expression::column(["row_id_col"]), Expression::binary( @@ -601,12 +601,14 @@ mod tests { 
]); let metadata_values = HashMap::new(); - assert!(get_transform_expr( - &transform_spec, - metadata_values, - &physical_schema, - None, /* base_row_id */ - ) - .is_err()); + assert_result_error_with_message( + get_transform_expr( + &transform_spec, + metadata_values, + &physical_schema, + None, /* base_row_id */ + ), + "Asked to generate RowIds, but no baseRowId found", + ); } } From 8f0ff20bb25b24650ab6499803f557e63592b03a Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 17 Oct 2025 14:13:45 -0700 Subject: [PATCH 26/28] factor out `validate_metadata_columns` --- kernel/src/scan/state_info.rs | 115 ++++++++++++++++++++++------------ 1 file changed, 74 insertions(+), 41 deletions(-) diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index 7dfba968f..913370517 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -27,6 +27,67 @@ pub(crate) struct StateInfo { pub(crate) transform_spec: Option>, } +/// Validating the metadata columns also extracts information needed to properly construct the full +/// `StateInfo`. We use this struct to group this information so it can be cleanly passed back from +/// `validate_metadata_columns` +#[derive(Default)] +struct MetadataInfo<'a> { + /// What are the names of the requested metadata fields + metadata_field_names: HashSet<&'a String>, + /// The name of the column that's selecting row indexes if that's been requested or None if they + /// are not requested . We remember this if it's been requested explicitly. this is so we can + /// reference this column and not re-add it as a requested column if we're _also_ requesting + /// row-ids. 
+ selected_row_index_col_name: Option<&'a String>, + /// the materializedRowIdColumnName extracted from the table config if row ids are requested, or + /// None if they are not requested + materialized_row_id_column_name: Option<&'a String>, +} + +/// This validates that we have sensible metadata columns, and that the requested metadata is +/// supported by the table. Also computes and returns any extra info needed to build the transform +/// for the requested columns. +// Runs in O(supported_number_of_metadata_columns) time since each metadata +// column can appear at most once in the schema +fn validate_metadata_columns<'a>( + logical_schema: &'a SchemaRef, + table_configuration: &'a TableConfiguration, +) -> DeltaResult> { + let mut metadata_info = MetadataInfo::default(); + let partition_columns = table_configuration.metadata().partition_columns(); + for metadata_column in logical_schema.metadata_columns() { + // Ensure we don't have a metadata column with same name as a partition column + if partition_columns.contains(metadata_column.name()) { + return Err(Error::Schema(format!( + "Metadata column names must not match partition columns: {}", + metadata_column.name() + ))); + } + match metadata_column.get_metadata_column_spec() { + Some(MetadataColumnSpec::RowIndex) => { + metadata_info.selected_row_index_col_name = Some(metadata_column.name()); + } + Some(MetadataColumnSpec::RowId) => { + if table_configuration.table_properties().enable_row_tracking != Some(true) { + return Err(Error::unsupported("Row ids are not enabled on this table")); + } + let row_id_col = table_configuration + .metadata() + .configuration() + .get("delta.rowTracking.materializedRowIdColumnName") + .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; + metadata_info.materialized_row_id_column_name = Some(row_id_col); + } + Some(MetadataColumnSpec::RowCommitVersion) => {} + None => {} + } + metadata_info + .metadata_field_names + 
.insert(metadata_column.name()); + } + Ok(metadata_info) +} + impl StateInfo { /// Create StateInfo with a custom field classifier for different scan types. /// Get the state needed to process a scan. @@ -44,30 +105,10 @@ impl StateInfo { let partition_columns = table_configuration.metadata().partition_columns(); let column_mapping_mode = table_configuration.column_mapping_mode(); let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); - let mut metadata_field_names = HashSet::new(); let mut transform_spec = Vec::new(); let mut last_physical_field: Option = None; - // We remember the name of the column that's selecting row indexes, if it's been requested - // explicitly. this is so we can reference this column and not re-add it as a requested - // column if we're _also_ requesting row-ids - let mut selected_row_index_col_name = None; - - // This iteration runs in O(supported_number_of_metadata_columns) time since each metadata - // column can appear at most once in the schema - for metadata_column in logical_schema.metadata_columns() { - // Ensure we don't have a metadata column with same name as a partition column - if partition_columns.contains(metadata_column.name()) { - return Err(Error::Schema(format!( - "Metadata column names must not match partition columns: {}", - metadata_column.name() - ))); - } - if let Some(MetadataColumnSpec::RowIndex) = metadata_column.get_metadata_column_spec() { - selected_row_index_col_name = Some(metadata_column.name().to_string()); - } - metadata_field_names.insert(metadata_column.name()); - } + let metadata_info = validate_metadata_columns(&logical_schema, table_configuration)?; // Loop over all selected fields and build both the physical schema and transform spec for (index, logical_field) in logical_schema.fields().enumerate() { @@ -86,22 +127,8 @@ impl StateInfo { // Regular field field or a metadata column, figure out which and handle it match logical_field.get_metadata_column_spec() { 
Some(MetadataColumnSpec::RowId) => { - if table_configuration.table_properties().enable_row_tracking != Some(true) - { - return Err(Error::unsupported( - "Row ids are not enabled on this table", - )); - } - - let row_id_col = table_configuration - .metadata() - .configuration() - .get("delta.rowTracking.materializedRowIdColumnName") - .ok_or(Error::generic("No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration"))?; - - // we can `take` as we should only have one RowId col - let index_column_name = match selected_row_index_col_name.take() { - Some(index_column_name) => index_column_name, + let index_column_name = match metadata_info.selected_row_index_col_name { + Some(index_column_name) => index_column_name.to_string(), None => { // the index column isn't being explicitly requested, so add it to // `read_fields` so the parquet_reader will generate it, and add a @@ -124,10 +151,16 @@ impl StateInfo { index_column_name } }; + let Some(row_id_col_name) = metadata_info.materialized_row_id_column_name + else { + return Err(Error::internal_error( + "Should always return a materialized_row_id_column_name if selecting row ids" + )); + }; - read_fields.push(StructField::nullable(row_id_col, DataType::LONG)); + read_fields.push(StructField::nullable(row_id_col_name, DataType::LONG)); transform_spec.push(FieldTransformSpec::GenerateRowId { - field_name: row_id_col.to_string(), + field_name: row_id_col_name.to_string(), row_index_field_name: index_column_name, }); } @@ -142,7 +175,7 @@ impl StateInfo { let physical_name = physical_field.name.clone(); if !logical_field.is_metadata_column() - && metadata_field_names.contains(&physical_name) + && metadata_info.metadata_field_names.contains(&physical_name) { return Err(Error::Schema(format!( "Metadata column names must not match physical columns, but logical column '{}' has physical name '{}'", @@ -614,7 +647,7 @@ pub(crate) mod tests { vec![], None, get_string_map(&[("delta.columnMapping.mode", 
"name")]), - vec![("other", MetadataColumnSpec::RowId)], + vec![("other", MetadataColumnSpec::RowIndex)], ) { Ok(_) => { panic!("Should not have succeeded generating state info with invalid config") From 502a96758defdeb73e1b8c3d8fbc80fc8aee312d Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 21 Oct 2025 17:03:19 -0700 Subject: [PATCH 27/28] update comment --- kernel/src/scan/field_classifiers.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/src/scan/field_classifiers.rs b/kernel/src/scan/field_classifiers.rs index dc7278ec6..81611a46c 100644 --- a/kernel/src/scan/field_classifiers.rs +++ b/kernel/src/scan/field_classifiers.rs @@ -6,8 +6,10 @@ use crate::table_changes::{ }; use crate::transforms::FieldTransformSpec; -/// Trait for classifying fields during StateInfo construction. -/// Allows different scan types (regular, CDF) to customize field handling. +/// Trait for classifying fields during StateInfo construction. Allows different scan types +/// (regular, CDF) to customize field handling. Note that the default set of field handling occurs +/// in [`StateInfo::try_new`](crate::scan::state_info::StateInfo::try_new). A +/// `TransformFieldClassifier` can be used to override the behavior implemented in that method. pub(crate) trait TransformFieldClassifier { /// Classify a field and return its transform spec. /// Returns None if the field is physical (should be read from parquet). 
From 6c744bc8f3957b50a5462bde20057e214304e078 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 22 Oct 2025 11:38:40 -0700 Subject: [PATCH 28/28] address final comments --- kernel/src/scan/state_info.rs | 48 +++++++------------ .../src/table_changes/physical_to_logical.rs | 3 ++ kernel/tests/read.rs | 26 ++++------ 3 files changed, 29 insertions(+), 48 deletions(-) diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index 913370517..8e5a4731e 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -35,7 +35,7 @@ struct MetadataInfo<'a> { /// What are the names of the requested metadata fields metadata_field_names: HashSet<&'a String>, /// The name of the column that's selecting row indexes if that's been requested or None if they - /// are not requested . We remember this if it's been requested explicitly. this is so we can + /// are not requested. We remember this if it's been requested explicitly. this is so we can /// reference this column and not re-add it as a requested column if we're _also_ requesting /// row-ids. 
selected_row_index_col_name: Option<&'a String>, @@ -221,6 +221,7 @@ pub(crate) mod tests { use crate::actions::{Metadata, Protocol}; use crate::expressions::{column_expr, Expression as Expr}; use crate::schema::{ColumnMetadataKey, MetadataValue}; + use crate::utils::test_utils::assert_result_error_with_message; use super::*; @@ -587,17 +588,8 @@ pub(crate) mod tests { "Generic delta kernel error: No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration", ), ] { - match get_state_info(schema.clone(), vec![], None, metadata_config, metadata_cols) { - Ok(_) => { - panic!("Should not have succeeded generating state info with invalid config") - } - Err(e) => { - assert_eq!( - e.to_string(), - expected_error, - ) - } - } + let res = get_state_info(schema.clone(), vec![], None, metadata_config, metadata_cols); + assert_result_error_with_message(res, expected_error); } } @@ -607,21 +599,17 @@ pub(crate) mod tests { "id", DataType::STRING, )])); - match get_state_info( + let res = get_state_info( schema.clone(), vec!["part_col".to_string()], None, HashMap::new(), vec![("part_col", MetadataColumnSpec::RowId)], - ) { - Ok(_) => { - panic!("Should not have succeeded generating state info with invalid config") - } - Err(e) => { - assert_eq!(e.to_string(), - "Schema error: Metadata column names must not match partition columns: part_col") - } - } + ); + assert_result_error_with_message( + res, + "Schema error: Metadata column names must not match partition columns: part_col", + ); } #[test] @@ -642,20 +630,16 @@ pub(crate) mod tests { "other".into(), ), ]))])); - match get_state_info( + let res = get_state_info( schema.clone(), vec![], None, get_string_map(&[("delta.columnMapping.mode", "name")]), vec![("other", MetadataColumnSpec::RowIndex)], - ) { - Ok(_) => { - panic!("Should not have succeeded generating state info with invalid config") - } - Err(e) => { - assert_eq!(e.to_string(), - "Schema error: Metadata column names must not match physical 
columns, but logical column 'id' has physical name 'other'"); - } - } + ); + assert_result_error_with_message( + res, + "Schema error: Metadata column names must not match physical columns, but logical column 'id' has physical name 'other'" + ); } } diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 619f83577..dd38f8015 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -79,6 +79,9 @@ pub(crate) fn scan_file_physical_schema( // added to overwrite any conflicting values. This behavior can be made more strict by changing // the parse_partition_values function to return an error for missing partition values, // and adding cdf values to the partition_values map + +// Note: Delta doesn't support row-tracking for CDF (see: +// https://docs.databricks.com/aws/en/delta/row-tracking#limitations) pub(crate) fn get_cdf_transform_expr( scan_file: &CdfScanFile, state_info: &StateInfo, diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 0edb33765..0a8c474a3 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -1519,22 +1519,16 @@ async fn test_unsupported_metadata_columns() -> Result<(), Box { - let error_msg = e.to_string(); - assert!( - error_msg.contains(error_text), - "Expected {error_msg} to contain {error_text}" - ); - } - Ok(_) => { - panic!( - "Expected error for {} metadata column, but scan succeeded", - error_text - ); - } - } + let scan_err = snapshot + .scan_builder() + .with_schema(schema) + .build() + .unwrap_err(); + let error_msg = scan_err.to_string(); + assert!( + error_msg.contains(error_text), + "Expected {error_msg} to contain {error_text}" + ); } Ok(())