-
Couldn't load subscription status.
- Fork 118
feat: Add row tracking support #1375
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
7aa1d7f
d263f2a
397af57
aa68b80
a0f11a6
4d27d6d
e1931dd
723ab1a
bffa4e8
515ee7c
be7d60d
975d2b6
6e96dc2
e46cb15
b91e50d
798abbb
2f1cba7
693ab5e
8ef9487
07b84af
d707a5f
fa9c6b1
0d57b88
6100e6f
cae7eaf
3a43190
1e36f00
2822bb8
a4cbedc
8f25b5e
194cb98
8f0ff20
e451940
502a967
6c744bc
dafe973
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,32 +16,19 @@ pub(crate) trait TransformFieldClassifier { | |
| &self, | ||
| field: &StructField, | ||
| field_index: usize, | ||
| partition_columns: &[String], | ||
| last_physical_field: &Option<String>, | ||
| ) -> Option<FieldTransformSpec>; | ||
| } | ||
|
|
||
| /// Regular scan field classifier for standard Delta table scans. | ||
| /// Handles partition columns as metadata-derived fields. | ||
| pub(crate) struct ScanTransformFieldClassifier; | ||
| impl TransformFieldClassifier for ScanTransformFieldClassifier { | ||
| // Empty classifier, always returns None | ||
| impl TransformFieldClassifier for () { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We probably want to update the TransformFieldClassifier description to describe its optional nature now. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not quite sure where you mean to update? I think the docs already say it's optional? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ^ Update it to clarify that general field handling happens in StateInfo? |
||
| fn classify_field( | ||
| &self, | ||
| field: &StructField, | ||
| field_index: usize, | ||
| partition_columns: &[String], | ||
| last_physical_field: &Option<String>, | ||
| _: &StructField, | ||
| _: usize, | ||
| _: &Option<String>, | ||
| ) -> Option<FieldTransformSpec> { | ||
| if partition_columns.contains(field.name()) { | ||
| // Partition column: needs transform to inject metadata | ||
| Some(FieldTransformSpec::MetadataDerivedColumn { | ||
| field_index, | ||
| insert_after: last_physical_field.clone(), | ||
| }) | ||
| } else { | ||
| // Regular physical field - no transform needed | ||
| None | ||
| } | ||
| None | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -53,7 +40,6 @@ impl TransformFieldClassifier for CdfTransformFieldClassifier { | |
| &self, | ||
| field: &StructField, | ||
| field_index: usize, | ||
| partition_columns: &[String], | ||
| last_physical_field: &Option<String>, | ||
| ) -> Option<FieldTransformSpec> { | ||
| match field.name().as_str() { | ||
|
|
@@ -70,13 +56,7 @@ impl TransformFieldClassifier for CdfTransformFieldClassifier { | |
| insert_after: last_physical_field.clone(), | ||
| }) | ||
| } | ||
| // Defer to default classifier for partition columns and physical fields | ||
| _ => ScanTransformFieldClassifier.classify_field( | ||
| field, | ||
| field_index, | ||
| partition_columns, | ||
| last_physical_field, | ||
| ), | ||
| _ => None, | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,7 +3,8 @@ use std::collections::{HashMap, HashSet}; | |
| use std::sync::{Arc, LazyLock}; | ||
|
|
||
| use super::data_skipping::DataSkippingFilter; | ||
| use super::{PhysicalPredicate, ScanMetadata, StateInfo}; | ||
| use super::state_info::StateInfo; | ||
| use super::{PhysicalPredicate, ScanMetadata}; | ||
| use crate::actions::deletion_vector::DeletionVectorDescriptor; | ||
| use crate::actions::get_log_add_schema; | ||
| use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; | ||
|
|
@@ -105,8 +106,9 @@ impl AddRemoveDedupVisitor<'_> { | |
| const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters | ||
| const ADD_PARTITION_VALUES_INDEX: usize = 1; // Position of "add.partitionValues" in getters | ||
| const ADD_DV_START_INDEX: usize = 2; // Start position of add deletion vector columns | ||
| const REMOVE_PATH_INDEX: usize = 5; // Position of "remove.path" in getters | ||
| const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns | ||
| const BASE_ROW_ID_INDEX: usize = 5; // Position of add.baseRowId in getters | ||
| const REMOVE_PATH_INDEX: usize = 6; // Position of "remove.path" in getters | ||
| const REMOVE_DV_START_INDEX: usize = 7; // Start position of remove deletion vector columns | ||
|
|
||
| fn new( | ||
| seen: &mut HashSet<FileActionKey>, | ||
|
|
@@ -195,10 +197,19 @@ impl AddRemoveDedupVisitor<'_> { | |
| if self.deduplicator.check_and_record_seen(file_key) || !is_add { | ||
| return Ok(false); | ||
| } | ||
| let base_row_id: Option<i64> = | ||
| getters[Self::BASE_ROW_ID_INDEX].get_opt(i, "add.baseRowId")?; | ||
| let transform = self | ||
| .transform_spec | ||
| .as_ref() | ||
| .map(|transform| get_transform_expr(transform, partition_values, &self.physical_schema)) | ||
| .map(|transform| { | ||
| get_transform_expr( | ||
| transform, | ||
| partition_values, | ||
| &self.physical_schema, | ||
| base_row_id, | ||
| ) | ||
| }) | ||
| .transpose()?; | ||
| if transform.is_some() { | ||
| // fill in any needed `None`s for previous rows | ||
|
|
@@ -215,13 +226,15 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { | |
| static NAMES_AND_TYPES: LazyLock<ColumnNamesAndTypes> = LazyLock::new(|| { | ||
| const STRING: DataType = DataType::STRING; | ||
| const INTEGER: DataType = DataType::INTEGER; | ||
| const LONG: DataType = DataType::LONG; | ||
| let ss_map: DataType = MapType::new(STRING, STRING, true).into(); | ||
| let types_and_names = vec![ | ||
| (STRING, column_name!("add.path")), | ||
| (ss_map, column_name!("add.partitionValues")), | ||
| (STRING, column_name!("add.deletionVector.storageType")), | ||
| (STRING, column_name!("add.deletionVector.pathOrInlineDv")), | ||
| (INTEGER, column_name!("add.deletionVector.offset")), | ||
| (LONG, column_name!("add.baseRowId")), | ||
| (STRING, column_name!("remove.path")), | ||
| (STRING, column_name!("remove.deletionVector.storageType")), | ||
| (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), | ||
|
|
@@ -236,13 +249,13 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { | |
| } else { | ||
| // All checkpoint actions are already reconciled and Remove actions in checkpoint files | ||
| // only serve as tombstones for vacuum jobs. So we only need to examine the adds here. | ||
| (&names[..5], &types[..5]) | ||
| (&names[..6], &types[..6]) | ||
| } | ||
| } | ||
|
|
||
| fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { | ||
| let is_log_batch = self.deduplicator.is_log_batch(); | ||
| let expected_getters = if is_log_batch { 9 } else { 5 }; | ||
| let expected_getters = if is_log_batch { 10 } else { 6 }; | ||
| require!( | ||
| getters.len() == expected_getters, | ||
| Error::InternalError(format!( | ||
|
|
@@ -266,8 +279,10 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { | |
| pub(crate) static SCAN_ROW_SCHEMA: LazyLock<Arc<StructType>> = LazyLock::new(|| { | ||
| // Note that fields projected out of a nullable struct must be nullable | ||
| let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); | ||
| let file_constant_values = | ||
| StructType::new_unchecked([StructField::nullable("partitionValues", partition_values)]); | ||
| let file_constant_values = StructType::new_unchecked([ | ||
| StructField::nullable("partitionValues", partition_values), | ||
| StructField::nullable("baseRowId", DataType::LONG), | ||
| ]); | ||
|
Comment on lines
+282
to
+285
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: my async prototype would also let us avoid a lot this static schema munging since the schema can be inferred from a plan. |
||
| Arc::new(StructType::new_unchecked([ | ||
| StructField::nullable("path", DataType::STRING), | ||
| StructField::nullable("size", DataType::LONG), | ||
|
|
@@ -290,9 +305,10 @@ fn get_add_transform_expr() -> ExpressionRef { | |
| column_expr_ref!("add.modificationTime"), | ||
| column_expr_ref!("add.stats"), | ||
| column_expr_ref!("add.deletionVector"), | ||
| Arc::new(Expression::Struct(vec![column_expr_ref!( | ||
| "add.partitionValues" | ||
| )])), | ||
| Arc::new(Expression::Struct(vec![ | ||
| column_expr_ref!("add.partitionValues"), | ||
| column_expr_ref!("add.baseRowId"), | ||
| ])), | ||
| ])) | ||
| }); | ||
| EXPR.clone() | ||
|
|
@@ -311,6 +327,7 @@ pub(crate) fn get_scan_metadata_transform_expr() -> ExpressionRef { | |
| column_expr_ref!("modificationTime"), | ||
| column_expr_ref!("stats"), | ||
| column_expr_ref!("deletionVector"), | ||
| column_expr_ref!("fileConstantValues.baseRowId"), | ||
| ], | ||
| ))])) | ||
| }); | ||
|
|
@@ -380,12 +397,14 @@ mod tests { | |
| use crate::expressions::Scalar; | ||
| use crate::log_replay::ActionsBatch; | ||
| use crate::scan::state::{DvInfo, Stats}; | ||
| use crate::scan::state_info::tests::{get_simple_state_info, get_state_info}; | ||
| use crate::scan::state_info::StateInfo; | ||
| use crate::scan::test_utils::{ | ||
| add_batch_simple, add_batch_with_partition_col, add_batch_with_remove, | ||
| run_with_validate_callback, | ||
| add_batch_for_row_id, add_batch_simple, add_batch_with_partition_col, | ||
| add_batch_with_remove, run_with_validate_callback, | ||
| }; | ||
| use crate::scan::{PhysicalPredicate, StateInfo}; | ||
| use crate::table_features::ColumnMappingMode; | ||
| use crate::scan::PhysicalPredicate; | ||
| use crate::schema::MetadataColumnSpec; | ||
| use crate::Expression as Expr; | ||
| use crate::{ | ||
| engine::sync::SyncEngine, | ||
|
|
@@ -473,15 +492,8 @@ mod tests { | |
| StructField::new("value", DataType::INTEGER, true), | ||
| StructField::new("date", DataType::DATE, true), | ||
| ])); | ||
| let partition_cols = ["date".to_string()]; | ||
| let state_info = StateInfo::try_new( | ||
| schema.clone(), | ||
| &partition_cols, | ||
| ColumnMappingMode::None, | ||
| None, | ||
| crate::scan::field_classifiers::ScanTransformFieldClassifier, | ||
| ) | ||
| .unwrap(); | ||
| let partition_cols = vec!["date".to_string()]; | ||
| let state_info = get_simple_state_info(schema, partition_cols).unwrap(); | ||
| let batch = vec![add_batch_with_partition_col()]; | ||
| let iter = scan_action_iter( | ||
| &SyncEngine::new(), | ||
|
|
@@ -525,4 +537,68 @@ mod tests { | |
| validate_transform(transforms[3].as_ref(), 17510); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_row_id_transform() { | ||
| let schema: SchemaRef = Arc::new(StructType::new_unchecked([StructField::new( | ||
| "value", | ||
| DataType::INTEGER, | ||
| true, | ||
| )])); | ||
| let state_info = get_state_info( | ||
| schema.clone(), | ||
| vec![], | ||
| None, | ||
| [ | ||
| ("delta.enableRowTracking", "true"), | ||
| ( | ||
| "delta.rowTracking.materializedRowIdColumnName", | ||
| "row_id_col", | ||
| ), | ||
| ] | ||
| .iter() | ||
| .map(|(k, v)| (k.to_string(), v.to_string())) | ||
| .collect(), | ||
| vec![("row_id", MetadataColumnSpec::RowId)], | ||
| ) | ||
| .unwrap(); | ||
OussamaSaoudi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| let batch = vec![add_batch_for_row_id(get_log_schema().clone())]; | ||
| let iter = scan_action_iter( | ||
| &SyncEngine::new(), | ||
| batch | ||
| .into_iter() | ||
| .map(|batch| Ok(ActionsBatch::new(batch as _, true))), | ||
| Arc::new(state_info), | ||
| ); | ||
|
|
||
| for res in iter { | ||
| let scan_metadata = res.unwrap(); | ||
| let transforms = scan_metadata.scan_file_transforms; | ||
| assert_eq!(transforms.len(), 1, "Should have 1 transform"); | ||
| if let Some(Expr::Transform(transform_expr)) = transforms[0].as_ref().map(Arc::as_ref) { | ||
| assert!(transform_expr.input_path.is_none()); | ||
| let row_id_transform = transform_expr | ||
| .field_transforms | ||
| .get("row_id_col") | ||
| .expect("Should have row_id_col transform"); | ||
| assert!(row_id_transform.is_replace); | ||
| assert_eq!(row_id_transform.exprs.len(), 1); | ||
| let expr = &row_id_transform.exprs[0]; | ||
| let expeceted_expr = Arc::new(Expr::variadic( | ||
| crate::expressions::VariadicExpressionOp::Coalesce, | ||
nicklan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| vec![ | ||
| Expr::column(["row_id_col"]), | ||
| Expr::binary( | ||
| crate::expressions::BinaryExpressionOp::Plus, | ||
nicklan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| Expr::literal(42i64), | ||
| Expr::column(["row_indexes_for_row_id_0"]), | ||
| ), | ||
| ], | ||
| )); | ||
| assert_eq!(expr, &expeceted_expr); | ||
| } else { | ||
| panic!("Should have been a transform expression"); | ||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This all seems like it could be simplified like so:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, good call. I should have just given up on keeping the schema optional. :)