Skip to content

Commit 372d4f7

Browse files
Shefeek JinnahShefeek Jinnah
authored andcommitted
Fix: Filter data files by snapshot to handle complete row deletion
1 parent e5157f6 commit 372d4f7

4 files changed

Lines changed: 82 additions & 3 deletions

File tree

src/metadata_provider.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ pub const SQL_GET_DATA_FILES: &str = "
4343
AND del.table_id = ?
4444
AND ? >= del.begin_snapshot
4545
AND (? < del.end_snapshot OR del.end_snapshot IS NULL)
46-
WHERE data.table_id = ?";
46+
WHERE data.table_id = ?
47+
AND ? >= data.begin_snapshot
48+
AND (? < data.end_snapshot OR data.end_snapshot IS NULL)";
4749

4850
pub const SQL_GET_DATA_PATH: &str =
4951
"SELECT value FROM ducklake_metadata WHERE key = 'data_path' AND scope IS NULL";
@@ -155,6 +157,8 @@ pub const SQL_LIST_ALL_FILES: &str = "
155157
AND (? < s.end_snapshot OR s.end_snapshot IS NULL)
156158
AND ? >= t.begin_snapshot
157159
AND (? < t.end_snapshot OR t.end_snapshot IS NULL)
160+
AND ? >= data.begin_snapshot
161+
AND (? < data.end_snapshot OR data.end_snapshot IS NULL)
158162
ORDER BY s.schema_name, t.table_name, data.path";
159163

160164
/// Metadata for a snapshot in the DuckLake catalog

src/metadata_provider_duckdb.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ impl MetadataProvider for DuckdbMetadataProvider {
165165
let mut stmt = conn.prepare(SQL_GET_DATA_FILES)?;
166166

167167
let files = stmt
168-
.query_map([table_id, snapshot_id, snapshot_id, table_id], |row| {
168+
.query_map([table_id, snapshot_id, snapshot_id, table_id, snapshot_id, snapshot_id], |row| {
169169
// Parse data file (columns 0-5)
170170
let _data_file_id: i64 = row.get(0)?;
171171
let data_file = DuckLakeFileData {
@@ -332,6 +332,8 @@ impl MetadataProvider for DuckdbMetadataProvider {
332332
snapshot_id,
333333
snapshot_id,
334334
snapshot_id,
335+
snapshot_id,
336+
snapshot_id,
335337
snapshot_id
336338
],
337339
|row| {

src/metadata_provider_postgres.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,17 @@ macro_rules! bind_repeat {
3232
.bind($value)
3333
.bind($value)
3434
};
35+
($query:expr, $value:expr, 8) => {
36+
$query
37+
.bind($value)
38+
.bind($value)
39+
.bind($value)
40+
.bind($value)
41+
.bind($value)
42+
.bind($value)
43+
.bind($value)
44+
.bind($value)
45+
};
3546
}
3647

3748
/// PostgreSQL-based metadata provider for DuckLake catalogs.
@@ -211,12 +222,16 @@ impl MetadataProvider for PostgresMetadataProvider {
211222
AND del.table_id = $1
212223
AND $2 >= del.begin_snapshot
213224
AND ($3 < del.end_snapshot OR del.end_snapshot IS NULL)
214-
WHERE data.table_id = $4",
225+
WHERE data.table_id = $4
226+
AND $5 >= data.begin_snapshot
227+
AND ($6 < data.end_snapshot OR data.end_snapshot IS NULL)",
215228
)
216229
.bind(table_id)
217230
.bind(snapshot_id)
218231
.bind(snapshot_id)
219232
.bind(table_id)
233+
.bind(snapshot_id)
234+
.bind(snapshot_id)
220235
.fetch_all(&self.pool)
221236
.await?;
222237

@@ -442,6 +457,8 @@ impl MetadataProvider for PostgresMetadataProvider {
442457
AND ($4 < s.end_snapshot OR s.end_snapshot IS NULL)
443458
AND $5 >= t.begin_snapshot
444459
AND ($6 < t.end_snapshot OR t.end_snapshot IS NULL)
460+
AND $7 >= data.begin_snapshot
461+
AND ($8 < data.end_snapshot OR data.end_snapshot IS NULL)
445462
ORDER BY s.schema_name, t.table_name, data.path",
446463
)
447464
.bind(snapshot_id)
@@ -450,6 +467,8 @@ impl MetadataProvider for PostgresMetadataProvider {
450467
.bind(snapshot_id)
451468
.bind(snapshot_id)
452469
.bind(snapshot_id)
470+
.bind(snapshot_id)
471+
.bind(snapshot_id)
453472
.fetch_all(&self.pool)
454473
.await?;
455474

tests/delete_filter_tests.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,60 @@ mod integration_tests {
325325
Ok(())
326326
}
327327

328+
/// Test that a table with all rows deleted returns 0 rows
329+
///
330+
/// This is the bug scenario from https://github.com/hotdata-dev/datafusion-ducklake/issues/30
331+
/// When all rows are deleted from a table, querying should return 0 rows.
332+
#[tokio::test]
333+
async fn test_table_with_all_rows_deleted() -> DataFusionResult<()> {
334+
let temp_dir = TempDir::new()
335+
.map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?;
336+
let catalog_path = temp_dir.path().join("all_deleted.ducklake");
337+
338+
// Generate test data - inserts a row, then deletes it
339+
common::create_catalog_empty_table(&catalog_path).map_err(common::to_datafusion_error)?;
340+
341+
let catalog = create_catalog(&catalog_path.to_string_lossy())?;
342+
343+
let ctx = SessionContext::new();
344+
ctx.register_catalog("all_deleted", catalog);
345+
346+
// Query the table - should return 0 rows since all data was deleted
347+
let df = ctx
348+
.sql("SELECT * FROM all_deleted.main.tbl")
349+
.await?;
350+
let results = df.collect().await?;
351+
352+
let total_rows: usize = results.iter().map(|b| b.num_rows()).sum();
353+
assert_eq!(
354+
total_rows, 0,
355+
"Table with all rows deleted should return 0 rows, but got {}",
356+
total_rows
357+
);
358+
359+
// Also verify COUNT(*) returns 0
360+
let df = ctx
361+
.sql("SELECT COUNT(*) as cnt FROM all_deleted.main.tbl")
362+
.await?;
363+
let results = df.collect().await?;
364+
365+
assert!(!results.is_empty());
366+
let batch = &results[0];
367+
let counts = batch
368+
.column(0)
369+
.as_any()
370+
.downcast_ref::<Int64Array>()
371+
.unwrap();
372+
373+
assert_eq!(
374+
counts.value(0),
375+
0,
376+
"COUNT(*) should be 0 after all rows deleted"
377+
);
378+
379+
Ok(())
380+
}
381+
328382
/// Test filter pushdown correctness with delete files
329383
///
330384
/// This test verifies that WHERE filters are applied AFTER delete filtering,

0 commit comments

Comments
 (0)