Skip to content

Commit c82ddad

Browse files
Fix: Filter data files by snapshot to handle complete row deletion (#32)
1 parent 049f340 commit c82ddad

4 files changed

Lines changed: 115 additions & 35 deletions

File tree

src/metadata_provider.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ pub const SQL_GET_DATA_FILES: &str = "
4343
AND del.table_id = ?
4444
AND ? >= del.begin_snapshot
4545
AND (? < del.end_snapshot OR del.end_snapshot IS NULL)
46-
WHERE data.table_id = ?";
46+
WHERE data.table_id = ?
47+
AND ? >= data.begin_snapshot
48+
AND (? < data.end_snapshot OR data.end_snapshot IS NULL)";
4749

4850
pub const SQL_GET_DATA_PATH: &str =
4951
"SELECT value FROM ducklake_metadata WHERE key = 'data_path' AND scope IS NULL";
@@ -155,6 +157,8 @@ pub const SQL_LIST_ALL_FILES: &str = "
155157
AND (? < s.end_snapshot OR s.end_snapshot IS NULL)
156158
AND ? >= t.begin_snapshot
157159
AND (? < t.end_snapshot OR t.end_snapshot IS NULL)
160+
AND ? >= data.begin_snapshot
161+
AND (? < data.end_snapshot OR data.end_snapshot IS NULL)
158162
ORDER BY s.schema_name, t.table_name, data.path";
159163

160164
/// Metadata for a snapshot in the DuckLake catalog

src/metadata_provider_duckdb.rs

Lines changed: 38 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -165,40 +165,43 @@ impl MetadataProvider for DuckdbMetadataProvider {
165165
let mut stmt = conn.prepare(SQL_GET_DATA_FILES)?;
166166

167167
let files = stmt
168-
.query_map([table_id, snapshot_id, snapshot_id, table_id], |row| {
169-
// Parse data file (columns 0-5)
170-
let _data_file_id: i64 = row.get(0)?;
171-
let data_file = DuckLakeFileData {
172-
path: row.get(1)?,
173-
path_is_relative: row.get(2)?,
174-
file_size_bytes: row.get(3)?,
175-
footer_size: row.get(4)?,
176-
encryption_key: row.get(5)?,
177-
};
178-
179-
// Parse delete file (columns 6-12) if exists
180-
let delete_file = if let Ok(Some(_)) = row.get::<_, Option<i64>>(6) {
181-
Some(DuckLakeFileData {
182-
path: row.get(7)?,
183-
path_is_relative: row.get(8)?,
184-
file_size_bytes: row.get(9)?,
185-
footer_size: row.get(10)?,
186-
encryption_key: row.get(11)?,
168+
.query_map(
169+
[table_id, snapshot_id, snapshot_id, table_id, snapshot_id, snapshot_id],
170+
|row| {
171+
// Parse data file (columns 0-5)
172+
let _data_file_id: i64 = row.get(0)?;
173+
let data_file = DuckLakeFileData {
174+
path: row.get(1)?,
175+
path_is_relative: row.get(2)?,
176+
file_size_bytes: row.get(3)?,
177+
footer_size: row.get(4)?,
178+
encryption_key: row.get(5)?,
179+
};
180+
181+
// Parse delete file (columns 6-12) if exists
182+
let delete_file = if let Ok(Some(_)) = row.get::<_, Option<i64>>(6) {
183+
Some(DuckLakeFileData {
184+
path: row.get(7)?,
185+
path_is_relative: row.get(8)?,
186+
file_size_bytes: row.get(9)?,
187+
footer_size: row.get(10)?,
188+
encryption_key: row.get(11)?,
189+
})
190+
} else {
191+
None
192+
};
193+
194+
let _delete_count: Option<i64> = row.get(12)?;
195+
196+
Ok(DuckLakeTableFile {
197+
file: data_file,
198+
delete_file,
199+
row_id_start: None,
200+
snapshot_id: Some(snapshot_id),
201+
max_row_count: None, // Set to None until we have actual row count from data file metadata
187202
})
188-
} else {
189-
None
190-
};
191-
192-
let _delete_count: Option<i64> = row.get(12)?;
193-
194-
Ok(DuckLakeTableFile {
195-
file: data_file,
196-
delete_file,
197-
row_id_start: None,
198-
snapshot_id: Some(snapshot_id),
199-
max_row_count: None, // Set to None until we have actual row count from data file metadata
200-
})
201-
})?
203+
},
204+
)?
202205
.collect::<Result<Vec<_>, _>>()?;
203206

204207
Ok(files)
@@ -332,6 +335,8 @@ impl MetadataProvider for DuckdbMetadataProvider {
332335
snapshot_id,
333336
snapshot_id,
334337
snapshot_id,
338+
snapshot_id,
339+
snapshot_id,
335340
snapshot_id
336341
],
337342
|row| {

src/metadata_provider_postgres.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,17 @@ macro_rules! bind_repeat {
3232
.bind($value)
3333
.bind($value)
3434
};
35+
($query:expr, $value:expr, 8) => {
36+
$query
37+
.bind($value)
38+
.bind($value)
39+
.bind($value)
40+
.bind($value)
41+
.bind($value)
42+
.bind($value)
43+
.bind($value)
44+
.bind($value)
45+
};
3546
}
3647

3748
/// PostgreSQL-based metadata provider for DuckLake catalogs.
@@ -211,12 +222,16 @@ impl MetadataProvider for PostgresMetadataProvider {
211222
AND del.table_id = $1
212223
AND $2 >= del.begin_snapshot
213224
AND ($3 < del.end_snapshot OR del.end_snapshot IS NULL)
214-
WHERE data.table_id = $4",
225+
WHERE data.table_id = $4
226+
AND $5 >= data.begin_snapshot
227+
AND ($6 < data.end_snapshot OR data.end_snapshot IS NULL)",
215228
)
216229
.bind(table_id)
217230
.bind(snapshot_id)
218231
.bind(snapshot_id)
219232
.bind(table_id)
233+
.bind(snapshot_id)
234+
.bind(snapshot_id)
220235
.fetch_all(&self.pool)
221236
.await?;
222237

@@ -442,6 +457,8 @@ impl MetadataProvider for PostgresMetadataProvider {
442457
AND ($4 < s.end_snapshot OR s.end_snapshot IS NULL)
443458
AND $5 >= t.begin_snapshot
444459
AND ($6 < t.end_snapshot OR t.end_snapshot IS NULL)
460+
AND $7 >= data.begin_snapshot
461+
AND ($8 < data.end_snapshot OR data.end_snapshot IS NULL)
445462
ORDER BY s.schema_name, t.table_name, data.path",
446463
)
447464
.bind(snapshot_id)
@@ -450,6 +467,8 @@ impl MetadataProvider for PostgresMetadataProvider {
450467
.bind(snapshot_id)
451468
.bind(snapshot_id)
452469
.bind(snapshot_id)
470+
.bind(snapshot_id)
471+
.bind(snapshot_id)
453472
.fetch_all(&self.pool)
454473
.await?;
455474

tests/delete_filter_tests.rs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,58 @@ mod integration_tests {
325325
Ok(())
326326
}
327327

328+
/// Test that a table with all rows deleted returns 0 rows
329+
///
330+
/// This is the bug scenario from https://github.com/hotdata-dev/datafusion-ducklake/issues/30
331+
/// When all rows are deleted from a table, querying should return 0 rows.
332+
#[tokio::test]
333+
async fn test_table_with_all_rows_deleted() -> DataFusionResult<()> {
334+
let temp_dir = TempDir::new()
335+
.map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?;
336+
let catalog_path = temp_dir.path().join("all_deleted.ducklake");
337+
338+
// Generate test data - inserts a row, then deletes it
339+
common::create_catalog_empty_table(&catalog_path).map_err(common::to_datafusion_error)?;
340+
341+
let catalog = create_catalog(&catalog_path.to_string_lossy())?;
342+
343+
let ctx = SessionContext::new();
344+
ctx.register_catalog("all_deleted", catalog);
345+
346+
// Query the table - should return 0 rows since all data was deleted
347+
let df = ctx.sql("SELECT * FROM all_deleted.main.tbl").await?;
348+
let results = df.collect().await?;
349+
350+
let total_rows: usize = results.iter().map(|b| b.num_rows()).sum();
351+
assert_eq!(
352+
total_rows, 0,
353+
"Table with all rows deleted should return 0 rows, but got {}",
354+
total_rows
355+
);
356+
357+
// Also verify COUNT(*) returns 0
358+
let df = ctx
359+
.sql("SELECT COUNT(*) as cnt FROM all_deleted.main.tbl")
360+
.await?;
361+
let results = df.collect().await?;
362+
363+
assert!(!results.is_empty());
364+
let batch = &results[0];
365+
let counts = batch
366+
.column(0)
367+
.as_any()
368+
.downcast_ref::<Int64Array>()
369+
.unwrap();
370+
371+
assert_eq!(
372+
counts.value(0),
373+
0,
374+
"COUNT(*) should be 0 after all rows deleted"
375+
);
376+
377+
Ok(())
378+
}
379+
328380
/// Test filter pushdown correctness with delete files
329381
///
330382
/// This test verifies that WHERE filters are applied AFTER delete filtering,

0 commit comments

Comments
 (0)