From 206940ee527836be56e87e2fa6f3b0ab385fd239 Mon Sep 17 00:00:00 2001 From: Zac Farrell Date: Thu, 26 Feb 2026 20:17:11 +0100 Subject: [PATCH 1/3] fix: validate record_count metadata to reject negative values - Add validated_record_count() mirroring existing validated_file_size() pattern - Apply in table_deletions before constructing DeletedRowsExec - Add assert in DataFileInfo::new() for write-side safety Found during Feb 2026 security review Co-Authored-By: Claude Opus 4.6 --- src/metadata_writer.rs | 23 +++++++++++++++++ src/table.rs | 56 ++++++++++++++++++++++++++++++++++++++++++ src/table_deletions.rs | 9 ++++++- 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/src/metadata_writer.rs b/src/metadata_writer.rs index 15ca7b6..3339262 100644 --- a/src/metadata_writer.rs +++ b/src/metadata_writer.rs @@ -106,7 +106,18 @@ pub struct DataFileInfo { impl DataFileInfo { /// Create a new data file info with relative path. + /// + /// # Panics + /// + /// Panics if `record_count` is negative. Record counts originate from + /// `RecordBatch::num_rows()` (always non-negative), so a negative value + /// indicates a programming error. pub fn new(path: impl Into, file_size_bytes: i64, record_count: i64) -> Self { + assert!( + record_count >= 0, + "record_count must be non-negative, got {}", + record_count + ); Self { path: path.into(), path_is_relative: true, @@ -300,4 +311,16 @@ mod tests { let file = DataFileInfo::new("/absolute/path.parquet", 1024, 100).with_absolute_path(); assert!(!file.path_is_relative); } + + #[test] + fn test_data_file_info_zero_record_count() { + let file = DataFileInfo::new("empty.parquet", 0, 0); + assert_eq!(file.record_count, 0); + } + + #[test] + #[should_panic(expected = "record_count must be non-negative")] + fn test_data_file_info_negative_record_count_panics() { + DataFileInfo::new("test.parquet", 1024, -1); + } } diff --git a/src/table.rs b/src/table.rs index ed8870a..55dfb1e 100644 --- a/src/table.rs +++ b/src/table.rs @@ -63,6 +63,23 @@ pub(crate) fn validated_file_size(file_size_bytes: i64, file_path: &str) -> Data }) } +/// Validate and convert record_count from i64 (as stored in DuckLake metadata) to u64. +/// +/// DuckLake stores record counts as signed integers in SQL. A negative value indicates +/// corrupt or invalid metadata. Without this check, a negative record_count would cause +/// incorrect behavior (e.g., empty ranges in full-file deletes, or incorrect row filtering). +pub(crate) fn validated_record_count( + record_count: i64, + file_path: &str, +) -> DataFusionResult { + u64::try_from(record_count).map_err(|_| { + DataFusionError::Execution(format!( + "Invalid record_count ({}) for file '{}': value must be non-negative", + record_count, file_path + )) + }) +} + /// Returns the expected schema for DuckLake delete files /// /// Delete files have a standard schema: (file_path: VARCHAR, pos: INT64) @@ -731,4 +748,43 @@ mod tests { assert!(msg.contains("bad.parquet")); assert!(msg.contains(&i64::MIN.to_string())); } + + #[test] + fn test_validated_record_count_positive() { + assert_eq!(validated_record_count(0, "test.parquet").unwrap(), 0); + assert_eq!(validated_record_count(100, "test.parquet").unwrap(), 100); + assert_eq!( + validated_record_count(i64::MAX, "test.parquet").unwrap(), + i64::MAX as u64 + ); + } + + #[test] + fn test_validated_record_count_negative() { + let err = validated_record_count(-1, "data/test.parquet").unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("-1"), + "Error should contain the negative value: {}", + msg + ); + assert!( + msg.contains("data/test.parquet"), + "Error should contain the file path: {}", + msg + ); + assert!( + msg.contains("record_count"), + "Error should mention record_count: {}", + msg + ); + } + + #[test] + fn test_validated_record_count_large_negative() { + let err = validated_record_count(i64::MIN, "bad.parquet").unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("bad.parquet")); + assert!(msg.contains(&i64::MIN.to_string())); + } } diff --git a/src/table_deletions.rs b/src/table_deletions.rs index 9c3b6a4..88e8d49 100644 --- a/src/table_deletions.rs +++ b/src/table_deletions.rs @@ -36,7 +36,7 @@ use futures::Stream; use crate::metadata_provider::{DeleteFileChange, MetadataProvider}; use crate::path_resolver::resolve_path; -use crate::table::validated_file_size; +use crate::table::{validated_file_size, validated_record_count}; /// Delete file schema: (file_path: VARCHAR, pos: INT64) fn delete_file_schema() -> SchemaRef { @@ -143,6 +143,13 @@ impl TableDeletionsTable { delete_file.data_file_footer_size, )?; + // Validate record_count before use — a negative value from corrupt metadata + // would cause incorrect behavior (e.g., empty ranges in full-file deletes). + validated_record_count( + delete_file.data_record_count, + &delete_file.data_file_path, + )?; + Ok(Arc::new(DeletedRowsExec::new( current_delete_exec, previous_delete_exec, From 7c2cb83a6efd2a65cc6db7c23eee0daba3f2b81b Mon Sep 17 00:00:00 2001 From: Zac Farrell Date: Sun, 1 Mar 2026 03:20:24 +0100 Subject: [PATCH 2/3] fix: run rustfmt --- src/table.rs | 5 +---- src/table_deletions.rs | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/table.rs b/src/table.rs index 55dfb1e..ded6d5c 100644 --- a/src/table.rs +++ b/src/table.rs @@ -68,10 +68,7 @@ pub(crate) fn validated_file_size(file_size_bytes: i64, file_path: &str) -> Data /// DuckLake stores record counts as signed integers in SQL. A negative value indicates /// corrupt or invalid metadata. Without this check, a negative record_count would cause /// incorrect behavior (e.g., empty ranges in full-file deletes, or incorrect row filtering). -pub(crate) fn validated_record_count( - record_count: i64, - file_path: &str, -) -> DataFusionResult { +pub(crate) fn validated_record_count(record_count: i64, file_path: &str) -> DataFusionResult { u64::try_from(record_count).map_err(|_| { DataFusionError::Execution(format!( "Invalid record_count ({}) for file '{}': value must be non-negative", diff --git a/src/table_deletions.rs b/src/table_deletions.rs index 88e8d49..8ad73c7 100644 --- a/src/table_deletions.rs +++ b/src/table_deletions.rs @@ -145,10 +145,7 @@ impl TableDeletionsTable { // Validate record_count before use — a negative value from corrupt metadata // would cause incorrect behavior (e.g., empty ranges in full-file deletes). - validated_record_count( - delete_file.data_record_count, - &delete_file.data_file_path, - )?; + validated_record_count(delete_file.data_record_count, &delete_file.data_file_path)?; Ok(Arc::new(DeletedRowsExec::new( current_delete_exec, From 13b928c3f61ff19ab9a07e9793132c0b8c778041 Mon Sep 17 00:00:00 2001 From: Zac Farrell Date: Sun, 1 Mar 2026 18:42:41 +0100 Subject: [PATCH 3/3] fix: pre-install parquet extension to avoid macOS CI race condition Multiple concurrent tests triggering DuckDB's parquet auto-install simultaneously caused flaky failures on macOS CI. --- tests/common/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 8d4b770..b6081c5 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -14,12 +14,17 @@ use std::sync::Once; static INSTALL_DUCKLAKE: Once = Once::new(); -/// Install the ducklake extension exactly once (thread-safe across parallel tests). +/// Install the ducklake and parquet extensions exactly once (thread-safe across parallel tests). +/// +/// Pre-installing parquet avoids race conditions when multiple concurrent tests +/// trigger DuckDB's auto-install simultaneously (observed on macOS CI). fn ensure_ducklake_installed() { INSTALL_DUCKLAKE.call_once(|| { let conn = duckdb::Connection::open_in_memory().expect("open in-memory duckdb"); conn.execute("INSTALL ducklake;", []) .expect("install ducklake extension"); + conn.execute("INSTALL parquet;", []) + .expect("install parquet extension"); }); }