Skip to content

Commit d5d0320

Browse files
zfarrellclaude
andcommitted
fix: validate file_size_bytes and footer_size from metadata
Replace unsafe `as u64` cast for file_size_bytes with `u64::try_from()` + clear error. Skip negative footer_size values instead of wrapping to huge usize. Negative values in catalog metadata indicate corruption; now caught early with clear errors. Closes #58, closes #59 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 22ea714 commit d5d0320

2 files changed

Lines changed: 229 additions & 10 deletions

File tree

src/table.rs

Lines changed: 76 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,20 @@ use datafusion::execution::parquet_encryption::EncryptionFactory;
4949
pub const DELETE_FILE_PATH_COL: &str = "file_path";
5050
pub const DELETE_POS_COL: &str = "pos";
5151

52+
/// Validate and convert file_size_bytes from i64 (as stored in DuckLake metadata) to u64.
53+
///
54+
/// DuckLake stores file sizes as signed integers in SQL. A negative value indicates
55+
/// corrupt or invalid metadata. Without this check, a negative i64 cast to u64 would
56+
/// wrap to a huge value (e.g., -1 becomes u64::MAX), causing confusing downstream errors.
57+
pub(crate) fn validated_file_size(file_size_bytes: i64, file_path: &str) -> DataFusionResult<u64> {
58+
u64::try_from(file_size_bytes).map_err(|_| {
59+
DataFusionError::Execution(format!(
60+
"Invalid file_size_bytes ({}) for file '{}': value must be non-negative",
61+
file_size_bytes, file_path
62+
))
63+
})
64+
}
65+
5266
/// Returns the expected schema for DuckLake delete files
5367
///
5468
/// Delete files have a standard schema: (file_path: VARCHAR, pos: INT64)
@@ -280,10 +294,14 @@ impl DuckLakeTable {
280294
let resolved_delete_path = self.resolve_file_path(delete_file);
281295

282296
// Create PartitionedFile with footer size hint if available
283-
let mut pf =
284-
PartitionedFile::new(&resolved_delete_path, delete_file.file_size_bytes as u64);
297+
let mut pf = PartitionedFile::new(
298+
&resolved_delete_path,
299+
validated_file_size(delete_file.file_size_bytes, &resolved_delete_path)?,
300+
);
285301
if let Some(footer_size) = delete_file.footer_size {
286-
pf = pf.with_metadata_size_hint(footer_size as usize);
302+
if footer_size > 0 {
303+
pf = pf.with_metadata_size_hint(footer_size as usize);
304+
}
287305
}
288306

289307
// Create file scan config for the delete file
@@ -334,18 +352,22 @@ impl DuckLakeTable {
334352
.iter()
335353
.map(|table_file| {
336354
let resolved_path = self.resolve_file_path(&table_file.file);
337-
let mut pf =
338-
PartitionedFile::new(&resolved_path, table_file.file.file_size_bytes as u64);
355+
let mut pf = PartitionedFile::new(
356+
&resolved_path,
357+
validated_file_size(table_file.file.file_size_bytes, &resolved_path)?,
358+
);
339359

340360
// Apply footer size hint if available from DuckLake metadata
341361
// This reduces I/O from 2 reads to 1 read per file (especially beneficial for S3/MinIO)
342362
if let Some(footer_size) = table_file.file.footer_size {
343-
pf = pf.with_metadata_size_hint(footer_size as usize);
363+
if footer_size > 0 {
364+
pf = pf.with_metadata_size_hint(footer_size as usize);
365+
}
344366
}
345367

346-
pf
368+
Ok(pf)
347369
})
348-
.collect();
370+
.collect::<DataFusionResult<Vec<_>>>()?;
349371

350372
// Use read_schema (with original Parquet names) for reading
351373
let mut builder = FileScanConfigBuilder::new(
@@ -413,9 +435,14 @@ impl DuckLakeTable {
413435
let resolved_path = self.resolve_file_path(&table_file.file);
414436

415437
// Create PartitionedFile with footer size hint if available
416-
let mut pf = PartitionedFile::new(&resolved_path, table_file.file.file_size_bytes as u64);
438+
let mut pf = PartitionedFile::new(
439+
&resolved_path,
440+
validated_file_size(table_file.file.file_size_bytes, &resolved_path)?,
441+
);
417442
if let Some(footer_size) = table_file.file.footer_size {
418-
pf = pf.with_metadata_size_hint(footer_size as usize);
443+
if footer_size > 0 {
444+
pf = pf.with_metadata_size_hint(footer_size as usize);
445+
}
419446
}
420447

421448
// Use read_schema (with original Parquet names) for reading
@@ -637,3 +664,42 @@ fn extract_deleted_positions_from_batch(
637664

638665
Ok(())
639666
}
667+
668+
#[cfg(test)]
669+
mod tests {
670+
use super::*;
671+
672+
#[test]
673+
fn test_validated_file_size_positive() {
674+
assert_eq!(validated_file_size(0, "test.parquet").unwrap(), 0);
675+
assert_eq!(validated_file_size(1024, "test.parquet").unwrap(), 1024);
676+
assert_eq!(
677+
validated_file_size(i64::MAX, "test.parquet").unwrap(),
678+
i64::MAX as u64
679+
);
680+
}
681+
682+
#[test]
683+
fn test_validated_file_size_negative() {
684+
let err = validated_file_size(-1, "data/test.parquet").unwrap_err();
685+
let msg = err.to_string();
686+
assert!(
687+
msg.contains("-1"),
688+
"Error should contain the negative value: {}",
689+
msg
690+
);
691+
assert!(
692+
msg.contains("data/test.parquet"),
693+
"Error should contain the file path: {}",
694+
msg
695+
);
696+
}
697+
698+
#[test]
699+
fn test_validated_file_size_large_negative() {
700+
let err = validated_file_size(i64::MIN, "bad.parquet").unwrap_err();
701+
let msg = err.to_string();
702+
assert!(msg.contains("bad.parquet"));
703+
assert!(msg.contains(&i64::MIN.to_string()));
704+
}
705+
}
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#![cfg(feature = "metadata-duckdb")]
2+
//! Tests for numeric metadata validation (#58, #59)
3+
//!
4+
//! Verifies that negative file_size_bytes and footer_size values in catalog
5+
//! metadata are caught early with clear error messages instead of wrapping
6+
//! to huge values via unchecked `as u64`/`as usize` casts.
7+
8+
use std::sync::Arc;
9+
10+
use datafusion::error::Result as DataFusionResult;
11+
use datafusion::prelude::*;
12+
use datafusion_ducklake::{DuckLakeCatalog, DuckdbMetadataProvider};
13+
use tempfile::TempDir;
14+
15+
/// Creates a catalog with data, then corrupts file_size_bytes to a negative value
16+
fn create_catalog_with_negative_file_size(
17+
catalog_path: &std::path::Path,
18+
) -> anyhow::Result<()> {
19+
let conn = duckdb::Connection::open_in_memory()?;
20+
conn.execute("INSTALL ducklake;", [])?;
21+
conn.execute("LOAD ducklake;", [])?;
22+
23+
let ducklake_path = format!("ducklake:{}", catalog_path.display());
24+
conn.execute(
25+
&format!("ATTACH '{}' AS test_catalog;", ducklake_path),
26+
[],
27+
)?;
28+
29+
conn.execute(
30+
"CREATE TABLE test_catalog.items (id INT, name VARCHAR);",
31+
[],
32+
)?;
33+
conn.execute(
34+
"INSERT INTO test_catalog.items VALUES (1, 'Widget'), (2, 'Gadget');",
35+
[],
36+
)?;
37+
38+
// Detach the DuckLake catalog so we can tamper with the raw metadata
39+
conn.execute("DETACH test_catalog;", [])?;
40+
41+
// Now open the catalog DB directly and corrupt file_size_bytes
42+
let meta_conn = duckdb::Connection::open(catalog_path)?;
43+
meta_conn.execute(
44+
"UPDATE ducklake_data_file SET file_size_bytes = -1;",
45+
[],
46+
)?;
47+
48+
Ok(())
49+
}
50+
51+
/// Creates a catalog with data, then corrupts footer_size to a negative value
52+
fn create_catalog_with_negative_footer_size(
53+
catalog_path: &std::path::Path,
54+
) -> anyhow::Result<()> {
55+
let conn = duckdb::Connection::open_in_memory()?;
56+
conn.execute("INSTALL ducklake;", [])?;
57+
conn.execute("LOAD ducklake;", [])?;
58+
59+
let ducklake_path = format!("ducklake:{}", catalog_path.display());
60+
conn.execute(
61+
&format!("ATTACH '{}' AS test_catalog;", ducklake_path),
62+
[],
63+
)?;
64+
65+
conn.execute(
66+
"CREATE TABLE test_catalog.items (id INT, name VARCHAR);",
67+
[],
68+
)?;
69+
conn.execute(
70+
"INSERT INTO test_catalog.items VALUES (1, 'Widget'), (2, 'Gadget');",
71+
[],
72+
)?;
73+
74+
// Detach the DuckLake catalog so we can tamper with the raw metadata
75+
conn.execute("DETACH test_catalog;", [])?;
76+
77+
// Now open the catalog DB directly and set footer_size to negative
78+
let meta_conn = duckdb::Connection::open(catalog_path)?;
79+
meta_conn.execute(
80+
"UPDATE ducklake_data_file SET footer_size = -42;",
81+
[],
82+
)?;
83+
84+
Ok(())
85+
}
86+
87+
fn create_catalog(path: &str) -> DataFusionResult<Arc<DuckLakeCatalog>> {
88+
let provider = DuckdbMetadataProvider::new(path)
89+
.map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?;
90+
let catalog = DuckLakeCatalog::new(provider)
91+
.map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?;
92+
Ok(Arc::new(catalog))
93+
}
94+
95+
#[tokio::test]
96+
async fn test_negative_file_size_produces_clear_error() -> DataFusionResult<()> {
97+
let temp_dir =
98+
TempDir::new().map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?;
99+
let catalog_path = temp_dir.path().join("neg_size.ducklake");
100+
101+
create_catalog_with_negative_file_size(&catalog_path)
102+
.map_err(|e| datafusion::error::DataFusionError::External(e.into()))?;
103+
104+
let catalog = create_catalog(&catalog_path.to_string_lossy())?;
105+
let ctx = SessionContext::new();
106+
ctx.register_catalog("ducklake", catalog);
107+
108+
let result = ctx
109+
.sql("SELECT * FROM ducklake.main.items")
110+
.await?
111+
.collect()
112+
.await;
113+
114+
assert!(result.is_err(), "Query should fail with negative file_size_bytes");
115+
let err_msg = result.unwrap_err().to_string();
116+
assert!(
117+
err_msg.contains("Invalid file_size_bytes"),
118+
"Error should mention invalid file_size_bytes, got: {}",
119+
err_msg
120+
);
121+
assert!(
122+
err_msg.contains("-1"),
123+
"Error should contain the negative value, got: {}",
124+
err_msg
125+
);
126+
127+
Ok(())
128+
}
129+
130+
#[tokio::test]
131+
async fn test_negative_footer_size_is_gracefully_skipped() -> DataFusionResult<()> {
132+
let temp_dir =
133+
TempDir::new().map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?;
134+
let catalog_path = temp_dir.path().join("neg_footer.ducklake");
135+
136+
create_catalog_with_negative_footer_size(&catalog_path)
137+
.map_err(|e| datafusion::error::DataFusionError::External(e.into()))?;
138+
139+
let catalog = create_catalog(&catalog_path.to_string_lossy())?;
140+
let ctx = SessionContext::new();
141+
ctx.register_catalog("ducklake", catalog);
142+
143+
// Negative footer_size should be skipped (not used as hint), query should succeed
144+
let df = ctx
145+
.sql("SELECT * FROM ducklake.main.items")
146+
.await?;
147+
let batches = df.collect().await?;
148+
149+
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
150+
assert_eq!(total_rows, 2, "Should still return all rows when footer_size is negative");
151+
152+
Ok(())
153+
}

0 commit comments

Comments
 (0)