
Commit add8d42

Commit message: fix
1 parent efab033 commit add8d42

4 files changed: 146 additions & 63 deletions


paimon-core/src/main/java/org/apache/paimon/mergetree/compact/clustering/ClusteringCompactManager.java

Lines changed: 19 additions & 9 deletions
@@ -43,8 +43,10 @@

 import java.io.File;
 import java.io.IOException;
+import java.util.Collections;
 import java.util.List;
 import java.util.Optional;
+import java.util.Set;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.stream.IntStream;
@@ -204,10 +206,14 @@ private CompactResult compact(boolean fullCompaction) throws Exception {
         // Snapshot sorted files before Phase 1 to avoid including newly created files in Phase 2
         List<DataFileMeta> existingSortedFiles = fileLevels.sortedFiles();
         for (DataFileMeta file : unsortedFiles) {
+            Set<String> originalFileNames = Collections.singleton(file.fileName());
             List<DataFileMeta> sortedFiles =
                     fileRewriter.sortAndRewriteFiles(
-                            singletonList(file), kvSerializer, kvSchemaType);
-            keyIndex.updateIndex(file, sortedFiles);
+                            singletonList(file),
+                            kvSerializer,
+                            kvSchemaType,
+                            keyIndex,
+                            originalFileNames);
             result.before().add(file);
             result.after().addAll(sortedFiles);
         }
@@ -232,19 +238,23 @@ private CompactResult compact(boolean fullCompaction) throws Exception {
                     keyIndex.rebuildIndex(newFile);
                 }
                 // Remove stale deletion vectors for merged-away files
-                for (DataFileMeta file : mergeGroup) {
-                    dvMaintainer.removeDeletionVectorOf(file.fileName());
+                if (dvMaintainer != null) {
+                    for (DataFileMeta file : mergeGroup) {
+                        dvMaintainer.removeDeletionVectorOf(file.fileName());
+                    }
                 }
                 result.before().addAll(mergeGroup);
                 result.after().addAll(mergedFiles);
             }
         }

-        CompactDeletionFile deletionFile =
-                lazyGenDeletionFile
-                        ? CompactDeletionFile.lazyGeneration(dvMaintainer)
-                        : CompactDeletionFile.generateFiles(dvMaintainer);
-        result.setDeletionFile(deletionFile);
+        if (dvMaintainer != null) {
+            CompactDeletionFile deletionFile =
+                    lazyGenDeletionFile
+                            ? CompactDeletionFile.lazyGeneration(dvMaintainer)
+                            : CompactDeletionFile.generateFiles(dvMaintainer);
+            result.setDeletionFile(deletionFile);
+        }
         return result;
     }
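With index maintenance folded into the rewriter, the Phase 1 loop in the manager reduces to building the singleton set of the file name being replaced and handing it, together with the key index, to sortAndRewriteFiles; the old follow-up keyIndex.updateIndex pass is gone. A minimal sketch of that call shape follows; FileMeta, KeyIndex and FileRewriter are simplified stand-ins for illustration, not the Paimon classes.

import java.util.Collections;
import java.util.List;
import java.util.Set;

public class PhaseOneSketch {

    interface FileMeta {
        String fileName();
    }

    interface KeyIndex {}

    /** Stand-in for the rewriter with the widened sortAndRewriteFiles signature. */
    interface FileRewriter {
        List<FileMeta> sortAndRewriteFiles(
                List<FileMeta> inputFiles, KeyIndex keyIndex, Set<String> originalFileNames);
    }

    /** One rewrite call per unsorted file; index checking and updates happen inside the rewriter. */
    static void phaseOne(List<FileMeta> unsortedFiles, FileRewriter rewriter, KeyIndex keyIndex) {
        for (FileMeta file : unsortedFiles) {
            // Entries that point at the file being rewritten must not count as duplicates,
            // so its own name is passed along for the index check to ignore.
            Set<String> originalFileNames = Collections.singleton(file.fileName());
            List<FileMeta> sortedFiles =
                    rewriter.sortAndRewriteFiles(
                            Collections.singletonList(file), keyIndex, originalFileNames);
            System.out.println(file.fileName() + " -> " + sortedFiles.size() + " sorted file(s)");
        }
    }

    public static void main(String[] args) {
        FileRewriter passThrough = (inputs, index, names) -> inputs; // pretend rewrite
        FileMeta f0 = () -> "unsorted-0.orc";
        FileMeta f1 = () -> "unsorted-1.orc";
        phaseOne(List.of(f0, f1), passThrough, new KeyIndex() {});
    }
}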

paimon-core/src/main/java/org/apache/paimon/mergetree/compact/clustering/ClusteringFileRewriter.java

Lines changed: 39 additions & 4 deletions
@@ -27,6 +27,7 @@
 import org.apache.paimon.data.InternalRow;
 import org.apache.paimon.data.serializer.BinaryRowSerializer;
 import org.apache.paimon.data.serializer.InternalRowSerializer;
+import org.apache.paimon.data.serializer.RowCompactedSerializer;
 import org.apache.paimon.disk.ChannelReaderInputView;
 import org.apache.paimon.disk.ChannelReaderInputViewIterator;
 import org.apache.paimon.disk.ChannelWithMeta;
@@ -47,13 +48,16 @@
 import org.apache.paimon.utils.KeyValueWithLevelNoReusingSerializer;
 import org.apache.paimon.utils.MutableObjectIterator;

+import javax.annotation.Nullable;
+
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import java.util.PriorityQueue;
+import java.util.Set;

 /**
  * Handles file rewriting for clustering compaction, including sorting unsorted files (Phase 1) and
@@ -112,10 +116,19 @@ public ClusteringFileRewriter(

     /**
      * Sort and rewrite unsorted files by clustering columns. Reads all KeyValue records, sorts them
-     * using an external sort buffer, and writes to new level-1 files.
+     * using an external sort buffer, and writes to new level-1 files. Checks the key index inline
+     * during writing to handle deduplication (FIRST_ROW skips duplicates, DEDUPLICATE marks old
+     * positions in DV) and updates the index without re-reading the output files.
+     *
+     * @param keyIndex the key index for inline checking and batch update, or null to skip
+     * @param originalFileNames file names of the original files being replaced (for index check)
      */
     public List<DataFileMeta> sortAndRewriteFiles(
-            List<DataFileMeta> inputFiles, KeyValueSerializer kvSerializer, RowType kvSchemaType)
+            List<DataFileMeta> inputFiles,
+            KeyValueSerializer kvSerializer,
+            RowType kvSchemaType,
+            @Nullable ClusteringKeyIndex keyIndex,
+            Set<String> originalFileNames)
             throws Exception {
         int[] sortFieldsInKeyValue =
                 Arrays.stream(clusteringColumns)
@@ -145,17 +158,29 @@ public List<DataFileMeta> sortAndRewriteFiles(
            }
        }

+        RowCompactedSerializer keySerializer =
+                keyIndex != null ? new RowCompactedSerializer(keyType) : null;
+        List<byte[]> collectedKeys = keyIndex != null ? new ArrayList<>() : null;
+
         RollingFileWriter<KeyValue, DataFileMeta> writer =
                 writerFactory.createRollingClusteringFileWriter();
         try {
             MutableObjectIterator<BinaryRow> sortedIterator = sortBuffer.sortedIterator();
             BinaryRow binaryRow = new BinaryRow(kvSchemaType.getFieldCount());
             while ((binaryRow = sortedIterator.next(binaryRow)) != null) {
                 KeyValue kv = kvSerializer.fromRow(binaryRow);
-                writer.write(
+                KeyValue copied =
                         kv.copy(
                                 new InternalRowSerializer(keyType),
-                                new InternalRowSerializer(valueType)));
+                                new InternalRowSerializer(valueType));
+                if (keyIndex != null) {
+                    byte[] keyBytes = keySerializer.serializeToBytes(copied.key());
+                    if (!keyIndex.checkKey(keyBytes, originalFileNames)) {
+                        continue;
+                    }
+                    collectedKeys.add(keyBytes);
+                }
+                writer.write(copied);
             }
         } finally {
             sortBuffer.clear();
@@ -170,6 +195,16 @@ public List<DataFileMeta> sortAndRewriteFiles(
             fileLevels.addNewFile(newFile);
         }

+        // Batch update index using collected keys, split by file rowCount
+        if (keyIndex != null) {
+            int offset = 0;
+            for (DataFileMeta newFile : newFiles) {
+                int count = (int) newFile.rowCount();
+                keyIndex.batchPutIndex(newFile, collectedKeys.subList(offset, offset + count));
+                offset += count;
+            }
+        }
+
         return newFiles;
     }
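One detail of the batch update is easy to miss: collectedKeys holds, in write order, exactly one serialized key per row that survived the inline check, so after the rolling writer closes, each output file's keys are recovered as a contiguous subList sized by its rowCount. The self-contained sketch below models only that partitioning; OutFile and the string keys are stand-ins for DataFileMeta and the serialized key bytes, not the Paimon types.

import java.util.ArrayList;
import java.util.List;

public class BatchPutPartitionSketch {

    /** Stand-in for DataFileMeta: only the name and the row count matter for the split. */
    static class OutFile {
        final String name;
        final long rowCount;

        OutFile(String name, long rowCount) {
            this.name = name;
            this.rowCount = rowCount;
        }
    }

    /** Assign each output file the contiguous run of collected keys that it contains. */
    static void batchPut(List<OutFile> newFiles, List<String> collectedKeys) {
        int offset = 0;
        for (OutFile newFile : newFiles) {
            int count = (int) newFile.rowCount;
            // Keys were appended in the same order rows were written, so slicing by
            // rowCount reproduces the (file, position) mapping without re-reading files.
            List<String> keysForFile = collectedKeys.subList(offset, offset + count);
            for (int position = 0; position < keysForFile.size(); position++) {
                System.out.println(keysForFile.get(position) + " -> " + newFile.name + ":" + position);
            }
            offset += count;
        }
    }

    public static void main(String[] args) {
        // Suppose the rolling writer rolled after 3 rows: 5 surviving keys span two files.
        List<String> collectedKeys = new ArrayList<>(List.of("k1", "k2", "k3", "k4", "k5"));
        batchPut(List.of(new OutFile("sorted-0", 3), new OutFile("sorted-1", 2)), collectedKeys);
    }
}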

paimon-core/src/main/java/org/apache/paimon/mergetree/compact/clustering/ClusteringKeyIndex.java

Lines changed: 34 additions & 50 deletions
@@ -45,8 +45,6 @@
 import java.io.IOException;
 import java.util.AbstractMap;
 import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -209,61 +207,47 @@ public Map.Entry<byte[], byte[]> next() {
     }

     /**
-     * Update the key index after a single original file is replaced by new sorted files.
+     * Check a key against the index during sort-and-rewrite writing.
      *
-     * <p>For DEDUPLICATE mode: mark the old position in deletion vectors, keep the new position.
+     * <p>For FIRST_ROW mode: if key exists pointing to a non-original file, return false (skip
+     * writing this record — it's a duplicate).
      *
-     * <p>For FIRST_ROW mode: if key exists, mark the new position in deletion vectors (keep the
-     * first/old one); if key is new, store the new position.
+     * <p>For DEDUPLICATE mode: if key exists pointing to a non-original file, mark the old position
+     * in deletion vectors, return true (write the new record).
+     *
+     * @param keyBytes serialized key bytes
+     * @param originalFileNames file names of the original unsorted files being replaced
+     * @return true if the record should be written, false to skip (FIRST_ROW dedup)
      */
-    public void updateIndex(DataFileMeta originalFile, List<DataFileMeta> newSortedFiles)
-            throws Exception {
-        updateIndex(Collections.singletonList(originalFile), newSortedFiles);
+    public boolean checkKey(byte[] keyBytes, Set<String> originalFileNames) throws Exception {
+        byte[] oldValue = kvDb.get(keyBytes);
+        if (oldValue != null) {
+            ByteArrayInputStream valueIn = new ByteArrayInputStream(oldValue);
+            int oldFileId = decodeInt(valueIn);
+            int oldPosition = decodeInt(valueIn);
+            DataFileMeta oldFile = fileLevels.getFileById(oldFileId);
+            if (oldFile != null && !originalFileNames.contains(oldFile.fileName())) {
+                if (firstRow) {
+                    return false;
+                } else {
+                    dvMaintainer.notifyNewDeletion(oldFile.fileName(), oldPosition);
+                }
+            }
+        }
+        return true;
     }

     /**
-     * Update the key index after multiple original files are replaced by new sorted files.
-     *
-     * @see #updateIndex(DataFileMeta, List)
+     * Batch update the key index for a new sorted file using pre-collected key bytes. Avoids
+     * re-reading the file.
      */
-    public void updateIndex(List<DataFileMeta> originalFiles, List<DataFileMeta> newSortedFiles)
-            throws Exception {
-        RowCompactedSerializer keySerializer = new RowCompactedSerializer(keyType);
-
-        Set<String> originalFileNames = new HashSet<>();
-        for (DataFileMeta file : originalFiles) {
-            originalFileNames.add(file.fileName());
-        }
-
-        for (DataFileMeta sortedFile : newSortedFiles) {
-            int fileId = fileLevels.getFileIdByName(sortedFile.fileName());
-            int position = 0;
-            try (CloseableIterator<InternalRow> iterator = readKeyIterator(sortedFile)) {
-                while (iterator.hasNext()) {
-                    byte[] key = keySerializer.serializeToBytes(iterator.next());
-                    byte[] oldValue = kvDb.get(key);
-                    if (oldValue != null) {
-                        ByteArrayInputStream valueIn = new ByteArrayInputStream(oldValue);
-                        int oldFileId = decodeInt(valueIn);
-                        int oldPosition = decodeInt(valueIn);
-                        DataFileMeta oldFile = fileLevels.getFileById(oldFileId);
-                        if (oldFile != null && !originalFileNames.contains(oldFile.fileName())) {
-                            if (firstRow) {
-                                dvMaintainer.notifyNewDeletion(sortedFile.fileName(), position);
-                                position++;
-                                continue;
-                            } else {
-                                dvMaintainer.notifyNewDeletion(oldFile.fileName(), oldPosition);
-                            }
-                        }
-                    }
-                    ByteArrayOutputStream value = new ByteArrayOutputStream(8);
-                    encodeInt(value, fileId);
-                    encodeInt(value, position);
-                    kvDb.put(key, value.toByteArray());
-                    position++;
-                }
-            }
+    public void batchPutIndex(DataFileMeta sortedFile, List<byte[]> keyBytesList) throws Exception {
+        int fileId = fileLevels.getFileIdByName(sortedFile.fileName());
+        for (int position = 0; position < keyBytesList.size(); position++) {
+            ByteArrayOutputStream value = new ByteArrayOutputStream(8);
+            encodeInt(value, fileId);
+            encodeInt(value, position);
+            kvDb.put(keyBytesList.get(position), value.toByteArray());
         }
     }
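Taken together, checkKey and batchPutIndex split the old updateIndex into a per-record decision plus a bulk write. The decision itself can be modeled with an in-memory map in place of kvDb; the sketch below is a simplified model of the semantics described in the javadoc above (firstRow and originalFileNames keep the same meanings), not the real ClusteringKeyIndex.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class CheckKeyModel {

    /** Index entry: which file and row position currently own a key. */
    static class Entry {
        final String fileName;
        final int position;

        Entry(String fileName, int position) {
            this.fileName = fileName;
            this.position = position;
        }
    }

    final Map<String, Entry> index = new HashMap<>();        // stand-in for kvDb
    final Set<String> deletions = new HashSet<>();           // stand-in for the DV maintainer
    final boolean firstRow;

    CheckKeyModel(boolean firstRow) {
        this.firstRow = firstRow;
    }

    /** Mirrors checkKey: true = write the record, false = drop it (FIRST_ROW duplicate). */
    boolean checkKey(String key, Set<String> originalFileNames) {
        Entry old = index.get(key);
        if (old != null && !originalFileNames.contains(old.fileName)) {
            if (firstRow) {
                return false;                                // keep the existing first row
            }
            deletions.add(old.fileName + ":" + old.position); // DEDUPLICATE: shadow the old row
        }
        return true;
    }

    public static void main(String[] args) {
        CheckKeyModel model = new CheckKeyModel(true);       // FIRST_ROW mode
        model.index.put("k1", new Entry("sorted-0", 7));     // k1 already lives in another file
        Set<String> originals = Set.of("unsorted-3");        // the file being rewritten
        System.out.println(model.checkKey("k1", originals)); // false: duplicate is dropped
        System.out.println(model.checkKey("k2", originals)); // true: new key is written
    }
}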

paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java

Lines changed: 54 additions & 0 deletions
@@ -28,6 +28,7 @@
 import org.apache.paimon.data.InternalRow;
 import org.apache.paimon.disk.IOManager;
 import org.apache.paimon.fs.Path;
+import org.apache.paimon.io.DataFileMeta;
 import org.apache.paimon.manifest.IndexManifestEntry;
 import org.apache.paimon.predicate.Predicate;
 import org.apache.paimon.predicate.PredicateBuilder;
@@ -633,6 +634,59 @@ public void testFirstRowDeletionVectorCorrectness() throws Exception {
                 .containsExactlyInAnyOrder(GenericRow.of(1, 10), GenericRow.of(2, 20));
     }

+    /**
+     * Test that FIRST_ROW inline dedup actually reduces the number of records written. Duplicate
+     * keys should be dropped during sort-and-rewrite, resulting in fewer total rows across data
+     * files compared to the number of rows written.
+     */
+    @Test
+    public void testFirstRowInlineDedupReducesFileRows() throws Exception {
+        Table firstRowTable = createFirstRowTable();
+
+        // Commit 1: write 5 unique keys
+        writeRows(
+                firstRowTable,
+                Arrays.asList(
+                        GenericRow.of(1, 10),
+                        GenericRow.of(2, 20),
+                        GenericRow.of(3, 30),
+                        GenericRow.of(4, 40),
+                        GenericRow.of(5, 50)));
+
+        // Commit 2: write 5 duplicate keys (all should be dropped inline)
+        writeRows(
+                firstRowTable,
+                Arrays.asList(
+                        GenericRow.of(1, 99),
+                        GenericRow.of(2, 99),
+                        GenericRow.of(3, 99),
+                        GenericRow.of(4, 99),
+                        GenericRow.of(5, 99)));
+
+        // Verify correctness: still see first values
+        assertThat(readRows(firstRowTable))
+                .containsExactlyInAnyOrder(
+                        GenericRow.of(1, 10),
+                        GenericRow.of(2, 20),
+                        GenericRow.of(3, 30),
+                        GenericRow.of(4, 40),
+                        GenericRow.of(5, 50));
+
+        // Verify optimization: total row count across all data files should be exactly 5
+        // (duplicates dropped during writing, not just DV-marked)
+        List<Split> splits = firstRowTable.newReadBuilder().newScan().plan().splits();
+        long totalRows =
+                splits.stream()
+                        .mapToLong(
+                                split ->
+                                        ((DataSplit) split)
+                                                .dataFiles().stream()
+                                                .mapToLong(DataFileMeta::rowCount)
+                                                .sum())
+                        .sum();
+        assertThat(totalRows).isEqualTo(5);
+    }
+
     /** Test first-row mode with many writes to trigger compaction. */
     @Test
     public void testFirstRowManyWrites() throws Exception {
