Skip to content

Commit 3174be5

Browse files
committed
Deduplicate delete files by path in $files table
Deduplicate files by path in FilesTablePageSource since the same delete file can appear multiple times. Add a HashSet to track seen file paths and skip duplicates. Add testFilesTableDeleteFileDeduplication to BaseIcebergSystemTables that verifies the $files table shows each delete file exactly once. Follow-up to #28911 as requested by findinpath.
1 parent 04e1836 commit 3174be5

File tree

2 files changed

+51
-1
lines changed

2 files changed

+51
-1
lines changed

plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTablePageSource.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,13 @@
4949
import java.io.UncheckedIOException;
5050
import java.nio.ByteBuffer;
5151
import java.util.Comparator;
52+
import java.util.HashSet;
5253
import java.util.Iterator;
5354
import java.util.List;
5455
import java.util.Map;
5556
import java.util.Optional;
5657
import java.util.OptionalLong;
58+
import java.util.Set;
5759
import java.util.function.BiConsumer;
5860
import java.util.function.Supplier;
5961

@@ -110,6 +112,7 @@ public final class FilesTablePageSource
110112
private final Iterator<? extends ContentFile<?>> contentIterator;
111113
private final Map<String, Integer> columnNameToIndex;
112114
private final PageBuilder pageBuilder;
115+
private final Set<String> seenFilePaths;
113116
private final long completedBytes;
114117
private long completedPositions;
115118
private long readTimeNanos;
@@ -146,6 +149,7 @@ public FilesTablePageSource(
146149
this.columnNameToIndex = mapWithIndex(requiredColumns.stream(),
147150
(columnName, position) -> immutableEntry(columnName, Long.valueOf(position).intValue()))
148151
.collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue));
152+
this.seenFilePaths = new HashSet<>();
149153
this.completedBytes = split.manifestFile().length();
150154
this.completedPositions = 0L;
151155
this.readTimeNanos = 0L;
@@ -184,9 +188,13 @@ public SourcePage getNextSourcePage()
184188
}
185189

186190
while (contentIterator.hasNext() && !pageBuilder.isFull()) {
191+
ContentFile<?> contentFile = contentIterator.next();
192+
// Deduplicate files by path since the same file can appear multiple times
193+
if (!seenFilePaths.add(contentFile.location())) {
194+
continue;
195+
}
187196
pageBuilder.declarePosition();
188197
long start = System.nanoTime();
189-
ContentFile<?> contentFile = contentIterator.next();
190198

191199
// content
192200
writeValueOrNull(pageBuilder, CONTENT_COLUMN_NAME, () -> contentFile.content().id(), INTEGER::writeInt);

plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergSystemTables.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,48 @@ public void testFilesTable()
431431
}
432432
}
433433

434+
@Test
435+
public void testFilesTableDeleteFileDeduplication()
436+
throws Exception
437+
{
438+
try (TestTable testTable = newTrinoTable("test_files_delete_dedup_",
439+
"WITH (partitioning = ARRAY['regionkey']) AS SELECT * FROM tpch.tiny.nation")) {
440+
String tableName = testTable.getName();
441+
Table icebergTable = loadTable(tableName);
442+
443+
// Verify initial state: only data files, no delete files
444+
assertQuery(
445+
"SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = 0",
446+
"VALUES 5"); // one data file per regionkey partition
447+
assertQuery(
448+
"SELECT count(*) FROM \"" + tableName + "$files\" WHERE content != 0",
449+
"VALUES 0");
450+
451+
// Write a position delete via MOR path
452+
assertUpdate("DELETE FROM " + tableName + " WHERE nationkey = 7", 1);
453+
454+
// Write an equality delete file for regionkey=2
455+
writeEqualityDeleteForTable(icebergTable, fileSystemFactory,
456+
Optional.of(icebergTable.spec()),
457+
Optional.of(new PartitionData(new Long[] {2L})),
458+
ImmutableMap.of("regionkey", 2L),
459+
Optional.empty());
460+
461+
// Verify: each file path should appear exactly once (no duplicates)
462+
assertQuery(
463+
"SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = 1",
464+
"VALUES 1"); // exactly 1 position delete file
465+
assertQuery(
466+
"SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = 2",
467+
"VALUES 1"); // exactly 1 equality delete file
468+
469+
// Verify no duplicate file paths exist
470+
assertQuery(
471+
"SELECT count(file_path) - count(DISTINCT file_path) FROM \"" + tableName + "$files\"",
472+
"VALUES 0");
473+
}
474+
}
475+
434476
@Test
435477
public void testFilesPartitionTable()
436478
{

0 commit comments

Comments
 (0)