Skip to content

Commit ca2051e

Browse files
committed
Merge branch 'main' into fix/name-validation
2 parents e4d74c1 + 7d604e5 commit ca2051e

7 files changed

Lines changed: 448 additions & 4 deletions

File tree

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ A [DataFusion](https://datafusion.apache.org/) extension for querying [DuckLake]
44

55
The goal of this project is to make DuckLake a first-class, Arrow-native lakehouse format inside DataFusion.
66

7+
[Join the Discord](https://discord.gg/FefVb3u9sH)
8+
79
---
810

911
## Currently Supported

src/metadata_writer.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,18 @@ pub struct DataFileInfo {
140140

141141
impl DataFileInfo {
142142
/// Create a new data file info with relative path.
143+
///
144+
/// # Panics
145+
///
146+
/// Panics if `record_count` is negative. Record counts originate from
147+
/// `RecordBatch::num_rows()` (always non-negative), so a negative value
148+
/// indicates a programming error.
143149
pub fn new(path: impl Into<String>, file_size_bytes: i64, record_count: i64) -> Self {
150+
assert!(
151+
record_count >= 0,
152+
"record_count must be non-negative, got {}",
153+
record_count
154+
);
144155
Self {
145156
path: path.into(),
146157
path_is_relative: true,
@@ -443,4 +454,16 @@ mod tests {
443454
other => panic!("Expected InvalidConfig, got {:?}", other),
444455
}
445456
}
457+
458+
#[test]
459+
fn test_data_file_info_zero_record_count() {
460+
let file = DataFileInfo::new("empty.parquet", 0, 0);
461+
assert_eq!(file.record_count, 0);
462+
}
463+
464+
#[test]
465+
#[should_panic(expected = "record_count must be non-negative")]
466+
fn test_data_file_info_negative_record_count_panics() {
467+
DataFileInfo::new("test.parquet", 1024, -1);
468+
}
446469
}

src/metadata_writer_sqlite.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -441,8 +441,8 @@ impl MetadataWriter for SqliteMetadataWriter {
441441
if let Some((existing_type, _existing_nullable)) =
442442
existing_map.get(new_col.name.as_str())
443443
{
444-
// Column exists - check type matches
445-
if *existing_type != new_col.ducklake_type {
444+
// Column exists - check type compatibility (normalize aliases + allow promotions)
445+
if !crate::types::types_compatible(existing_type, &new_col.ducklake_type) {
446446
return Err(crate::error::DuckLakeError::InvalidConfig(format!(
447447
"Schema evolution error: column '{}' has type '{}' in existing table but '{}' in new schema. Type changes are not allowed.",
448448
new_col.name, existing_type, new_col.ducklake_type

src/table.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,20 @@ pub(crate) fn validated_file_size(file_size_bytes: i64, file_path: &str) -> Data
6363
})
6464
}
6565

66+
/// Validate and convert record_count from i64 (as stored in DuckLake metadata) to u64.
67+
///
68+
/// DuckLake stores record counts as signed integers in SQL. A negative value indicates
69+
/// corrupt or invalid metadata. Without this check, a negative record_count would cause
70+
/// incorrect behavior (e.g., empty ranges in full-file deletes, or incorrect row filtering).
71+
pub(crate) fn validated_record_count(record_count: i64, file_path: &str) -> DataFusionResult<u64> {
72+
u64::try_from(record_count).map_err(|_| {
73+
DataFusionError::Execution(format!(
74+
"Invalid record_count ({}) for file '{}': value must be non-negative",
75+
record_count, file_path
76+
))
77+
})
78+
}
79+
6680
/// Returns the expected schema for DuckLake delete files
6781
///
6882
/// Delete files have a standard schema: (file_path: VARCHAR, pos: INT64)
@@ -731,4 +745,43 @@ mod tests {
731745
assert!(msg.contains("bad.parquet"));
732746
assert!(msg.contains(&i64::MIN.to_string()));
733747
}
748+
749+
#[test]
750+
fn test_validated_record_count_positive() {
751+
assert_eq!(validated_record_count(0, "test.parquet").unwrap(), 0);
752+
assert_eq!(validated_record_count(100, "test.parquet").unwrap(), 100);
753+
assert_eq!(
754+
validated_record_count(i64::MAX, "test.parquet").unwrap(),
755+
i64::MAX as u64
756+
);
757+
}
758+
759+
#[test]
760+
fn test_validated_record_count_negative() {
761+
let err = validated_record_count(-1, "data/test.parquet").unwrap_err();
762+
let msg = err.to_string();
763+
assert!(
764+
msg.contains("-1"),
765+
"Error should contain the negative value: {}",
766+
msg
767+
);
768+
assert!(
769+
msg.contains("data/test.parquet"),
770+
"Error should contain the file path: {}",
771+
msg
772+
);
773+
assert!(
774+
msg.contains("record_count"),
775+
"Error should mention record_count: {}",
776+
msg
777+
);
778+
}
779+
780+
#[test]
781+
fn test_validated_record_count_large_negative() {
782+
let err = validated_record_count(i64::MIN, "bad.parquet").unwrap_err();
783+
let msg = err.to_string();
784+
assert!(msg.contains("bad.parquet"));
785+
assert!(msg.contains(&i64::MIN.to_string()));
786+
}
734787
}

src/table_deletions.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ use futures::Stream;
3636

3737
use crate::metadata_provider::{DeleteFileChange, MetadataProvider};
3838
use crate::path_resolver::resolve_path;
39-
use crate::table::validated_file_size;
39+
use crate::table::{validated_file_size, validated_record_count};
4040

4141
/// Delete file schema: (file_path: VARCHAR, pos: INT64)
4242
fn delete_file_schema() -> SchemaRef {
@@ -143,6 +143,10 @@ impl TableDeletionsTable {
143143
delete_file.data_file_footer_size,
144144
)?;
145145

146+
// Validate record_count before use — a negative value from corrupt metadata
147+
// would cause incorrect behavior (e.g., empty ranges in full-file deletes).
148+
validated_record_count(delete_file.data_record_count, &delete_file.data_file_path)?;
149+
146150
Ok(Arc::new(DeletedRowsExec::new(
147151
current_delete_exec,
148152
previous_delete_exec,

0 commit comments

Comments
 (0)