Skip to content

Commit 6f666f2

Browse files
committed
[iceberg] Fix duplicate records when schema change splits writes within a checkpoint
When a schema-change event arrives mid-checkpoint, the writer flushes the affected table before applying the new schema, producing two batches for the same table. Previously these were merged into one RowDelta and committed as a single Iceberg snapshot. Because Iceberg equality-delete files only suppress data with a strictly lower sequence number, same-snapshot deletes were ineffective and both versions of a row appeared on read.

- flush(boolean) is now a no-op to prevent unrelated tables from being split into multiple batches on non-schema-change flushes
- Schema-change events call flushTableWriter(tableId) to flush only the affected table; a per-table batchIndex increments on each flush
- Each batch is committed as a separate Iceberg snapshot so equality-deletes in batch N have a strictly higher sequence number than data in batch M (M < N)
- flink.batch-index and flink.checkpoint-id snapshot properties enable retry-safe idempotency: on failure, the committer resumes from the last uncommitted batch without re-committing already-persisted files

Tests added for: same-PK dedup across batches, schema-change split correctness, retry after partial batch commit, multiple schema changes in one checkpoint, and multi-table isolation.
1 parent 209c0c6 commit 6f666f2

File tree

4 files changed

+668
-43
lines changed

4 files changed

+668
-43
lines changed

flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-iceberg/src/main/java/org/apache/flink/cdc/connectors/iceberg/sink/v2/IcebergCommitter.java

Lines changed: 124 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import java.util.ArrayList;
4343
import java.util.Arrays;
4444
import java.util.Collection;
45+
import java.util.Comparator;
4546
import java.util.HashMap;
4647
import java.util.List;
4748
import java.util.Map;
@@ -61,6 +62,12 @@ public class IcebergCommitter implements Committer<WriteResultWrapper> {
6162

6263
public static final String TABLE_GROUP_KEY = "table";
6364

65+
/** Snapshot summary key for the batch index; used to resume partial commits on retry. */
66+
static final String FLINK_BATCH_INDEX = "flink.batch-index";
67+
68+
/** Snapshot summary key for the checkpoint ID on intermediate batch commits. */
69+
static final String FLINK_CHECKPOINT_ID_PROP = "flink.checkpoint-id";
70+
6471
private final Catalog catalog;
6572

6673
private final SinkCommitterMetricGroup metricGroup;
@@ -91,74 +98,133 @@ private void commit(List<WriteResultWrapper> writeResultWrappers) {
9198
if (writeResultWrappers.isEmpty()) {
9299
return;
93100
}
94-
// all commits a same checkpoint-id
95101
long checkpointId = writeResultWrappers.get(0).getCheckpointId();
96102
String newFlinkJobId = writeResultWrappers.get(0).getJobId();
97103
String operatorId = writeResultWrappers.get(0).getOperatorId();
98104

99-
Map<TableId, List<WriteResult>> tableMap = new HashMap<>();
100-
for (WriteResultWrapper writeResultWrapper : writeResultWrappers) {
101-
List<WriteResult> writeResult =
102-
tableMap.getOrDefault(writeResultWrapper.getTableId(), new ArrayList<>());
103-
writeResult.add(writeResultWrapper.getWriteResult());
104-
tableMap.put(writeResultWrapper.getTableId(), writeResult);
105-
LOGGER.info(writeResultWrapper.buildDescription());
105+
Map<TableId, List<WriteResultWrapper>> tableMap = new HashMap<>();
106+
for (WriteResultWrapper w : writeResultWrappers) {
107+
tableMap.computeIfAbsent(w.getTableId(), k -> new ArrayList<>()).add(w);
108+
LOGGER.info(w.buildDescription());
106109
}
107-
for (Map.Entry<TableId, List<WriteResult>> entry : tableMap.entrySet()) {
110+
111+
for (Map.Entry<TableId, List<WriteResultWrapper>> entry : tableMap.entrySet()) {
108112
TableId tableId = entry.getKey();
109113

114+
// Sort ascending by batch index to guarantee correct Iceberg sequence number ordering.
115+
// Equality-delete files in batch N will have sequence number > batch M (M < N), so
116+
// they correctly supersede stale data written by earlier same-checkpoint batches.
117+
List<WriteResultWrapper> batches = entry.getValue();
118+
batches.sort(Comparator.comparingInt(WriteResultWrapper::getBatchIndex));
119+
110120
Table table =
111121
catalog.loadTable(
112122
TableIdentifier.of(tableId.getSchemaName(), tableId.getTableName()));
113123

124+
int startBatchIndex = 0;
114125
Snapshot snapshot = table.currentSnapshot();
115126
if (snapshot != null) {
116127
Iterable<Snapshot> ancestors =
117128
SnapshotUtil.ancestorsOf(snapshot.snapshotId(), table::snapshot);
118-
long lastCheckpointId =
129+
long lastCommittedCheckpointId =
119130
getMaxCommittedCheckpointId(ancestors, newFlinkJobId, operatorId);
120-
if (lastCheckpointId == checkpointId) {
131+
if (lastCommittedCheckpointId >= checkpointId) {
121132
LOGGER.warn(
122133
"Checkpoint id {} has been committed to table {}, skipping",
123134
checkpointId,
124135
tableId.identifier());
125136
continue;
126137
}
138+
ancestors = SnapshotUtil.ancestorsOf(snapshot.snapshotId(), table::snapshot);
139+
startBatchIndex =
140+
getLastCommittedBatchIndex(
141+
ancestors, newFlinkJobId, operatorId, checkpointId)
142+
+ 1;
127143
}
128144

129145
Optional<TableMetric> tableMetric = getTableMetric(tableId);
130146
tableMetric.ifPresent(TableMetric::increaseCommitTimes);
131147

132-
List<WriteResult> results = entry.getValue();
133-
List<DataFile> dataFiles =
134-
results.stream()
135-
.filter(payload -> payload.dataFiles() != null)
136-
.flatMap(payload -> Arrays.stream(payload.dataFiles()))
137-
.filter(dataFile -> dataFile.recordCount() > 0)
138-
.collect(toList());
139-
List<DeleteFile> deleteFiles =
140-
results.stream()
141-
.filter(payload -> payload.deleteFiles() != null)
142-
.flatMap(payload -> Arrays.stream(payload.deleteFiles()))
143-
.filter(deleteFile -> deleteFile.recordCount() > 0)
144-
.collect(toList());
145-
if (dataFiles.isEmpty() && deleteFiles.isEmpty()) {
146-
LOGGER.info(String.format("Nothing to commit to table %s, skipping", table.name()));
147-
} else {
148+
// Find the last non-empty batch so we know where to write MAX_COMMITTED_CHECKPOINT_ID.
149+
int lastNonEmptyBatchPos = -1;
150+
for (int i = batches.size() - 1; i >= startBatchIndex; i--) {
151+
if (!isBatchEmpty(batches.get(i))) {
152+
lastNonEmptyBatchPos = i;
153+
break;
154+
}
155+
}
156+
157+
// Commit each batch as a separate Iceberg snapshot to get distinct sequence numbers.
158+
for (int i = startBatchIndex; i < batches.size(); i++) {
159+
WriteResultWrapper batch = batches.get(i);
160+
List<DataFile> dataFiles = collectDataFiles(batch.getWriteResult());
161+
List<DeleteFile> deleteFiles = collectDeleteFiles(batch.getWriteResult());
162+
163+
if (dataFiles.isEmpty() && deleteFiles.isEmpty()) {
164+
LOGGER.info(
165+
"Batch {} for checkpoint {} of table {} has nothing to commit, skipping",
166+
batch.getBatchIndex(),
167+
checkpointId,
168+
tableId.identifier());
169+
continue;
170+
}
171+
172+
boolean isLastNonEmptyBatch = (i == lastNonEmptyBatchPos);
173+
174+
SnapshotUpdate<?> operation;
148175
if (deleteFiles.isEmpty()) {
149176
AppendFiles append = table.newAppend();
150177
dataFiles.forEach(append::appendFile);
151-
commitOperation(append, newFlinkJobId, operatorId, checkpointId);
178+
operation = append;
152179
} else {
153180
RowDelta delta = table.newRowDelta();
154181
dataFiles.forEach(delta::addRows);
155182
deleteFiles.forEach(delta::addDeletes);
156-
commitOperation(delta, newFlinkJobId, operatorId, checkpointId);
183+
operation = delta;
184+
}
185+
186+
operation.set(SinkUtil.FLINK_JOB_ID, newFlinkJobId);
187+
operation.set(SinkUtil.OPERATOR_ID, operatorId);
188+
operation.set(FLINK_BATCH_INDEX, String.valueOf(batch.getBatchIndex()));
189+
operation.set(FLINK_CHECKPOINT_ID_PROP, String.valueOf(checkpointId));
190+
if (isLastNonEmptyBatch) {
191+
operation.set(
192+
SinkUtil.MAX_COMMITTED_CHECKPOINT_ID, String.valueOf(checkpointId));
157193
}
194+
operation.commit();
158195
}
159196
}
160197
}
161198

199+
private static boolean isBatchEmpty(WriteResultWrapper batch) {
200+
WriteResult r = batch.getWriteResult();
201+
long dataCount =
202+
r.dataFiles() == null
203+
? 0
204+
: Arrays.stream(r.dataFiles()).filter(f -> f.recordCount() > 0).count();
205+
long deleteCount =
206+
r.deleteFiles() == null
207+
? 0
208+
: Arrays.stream(r.deleteFiles()).filter(f -> f.recordCount() > 0).count();
209+
return dataCount == 0 && deleteCount == 0;
210+
}
211+
212+
private static List<DataFile> collectDataFiles(WriteResult result) {
213+
if (result.dataFiles() == null) {
214+
return new ArrayList<>();
215+
}
216+
return Arrays.stream(result.dataFiles()).filter(f -> f.recordCount() > 0).collect(toList());
217+
}
218+
219+
private static List<DeleteFile> collectDeleteFiles(WriteResult result) {
220+
if (result.deleteFiles() == null) {
221+
return new ArrayList<>();
222+
}
223+
return Arrays.stream(result.deleteFiles())
224+
.filter(f -> f.recordCount() > 0)
225+
.collect(toList());
226+
}
227+
162228
private static long getMaxCommittedCheckpointId(
163229
Iterable<Snapshot> ancestors, String flinkJobId, String operatorId) {
164230
long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID - 1;
@@ -180,15 +246,35 @@ private static long getMaxCommittedCheckpointId(
180246
return lastCommittedCheckpointId;
181247
}
182248

183-
private static void commitOperation(
184-
SnapshotUpdate<?> operation,
185-
String newFlinkJobId,
186-
String operatorId,
187-
long checkpointId) {
188-
operation.set(SinkUtil.MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId));
189-
operation.set(SinkUtil.FLINK_JOB_ID, newFlinkJobId);
190-
operation.set(SinkUtil.OPERATOR_ID, operatorId);
191-
operation.commit();
249+
/**
250+
* Returns the highest batch index already committed for the given checkpoint, or -1 if none.
251+
* Used to skip already-persisted batches on retry.
252+
*/
253+
private static int getLastCommittedBatchIndex(
254+
Iterable<Snapshot> ancestors, String flinkJobId, String operatorId, long checkpointId) {
255+
for (Snapshot ancestor : ancestors) {
256+
Map<String, String> summary = ancestor.summary();
257+
if (!flinkJobId.equals(summary.get(SinkUtil.FLINK_JOB_ID))) {
258+
continue;
259+
}
260+
String snapshotOperatorId = summary.get(SinkUtil.OPERATOR_ID);
261+
if (snapshotOperatorId != null && !snapshotOperatorId.equals(operatorId)) {
262+
continue;
263+
}
264+
// Stop once we pass a fully-committed earlier checkpoint; intermediate batch
265+
// snapshots for the current checkpoint lie between it and the current tip.
266+
String maxCommittedStr = summary.get(SinkUtil.MAX_COMMITTED_CHECKPOINT_ID);
267+
if (maxCommittedStr != null && Long.parseLong(maxCommittedStr) < checkpointId) {
268+
break;
269+
}
270+
String snapshotCheckpointId = summary.get(FLINK_CHECKPOINT_ID_PROP);
271+
if (snapshotCheckpointId != null
272+
&& Long.parseLong(snapshotCheckpointId) == checkpointId) {
273+
String batchIndexStr = summary.get(FLINK_BATCH_INDEX);
274+
return batchIndexStr != null ? Integer.parseInt(batchIndexStr) : 0;
275+
}
276+
}
277+
return -1;
192278
}
193279

194280
private Optional<TableMetric> getTableMetric(TableId tableId) {

flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-iceberg/src/main/java/org/apache/flink/cdc/connectors/iceberg/sink/v2/IcebergWriter.java

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ public class IcebergWriter
7171

7272
private final List<WriteResultWrapper> temporaryWriteResult;
7373

74+
/** Per-table batch index within the current checkpoint; incremented on each schema-change flush. */
75+
private Map<TableId, Integer> tableBatchIndexMap;
76+
7477
private Catalog catalog;
7578

7679
private final int taskId;
@@ -99,6 +102,7 @@ public IcebergWriter(
99102
writerFactoryMap = new HashMap<>();
100103
writerMap = new HashMap<>();
101104
schemaMap = new HashMap<>();
105+
tableBatchIndexMap = new HashMap<>();
102106
temporaryWriteResult = new ArrayList<>();
103107
this.taskId = taskId;
104108
this.attemptId = attemptId;
@@ -126,6 +130,7 @@ public Collection<WriteResultWrapper> prepareCommit() throws IOException {
126130
list.addAll(temporaryWriteResult);
127131
list.addAll(getWriteResult());
128132
temporaryWriteResult.clear();
133+
tableBatchIndexMap.clear();
129134
lastCheckpointId++;
130135
return list;
131136
}
@@ -163,6 +168,8 @@ public void write(Event event, Context context) throws IOException {
163168
} else {
164169
SchemaChangeEvent schemaChangeEvent = (SchemaChangeEvent) event;
165170
TableId tableId = schemaChangeEvent.tableId();
171+
// Flush only this table before applying schema change to avoid global writer rotation.
172+
flushTableWriter(tableId);
166173
TableSchemaWrapper tableSchemaWrapper = schemaMap.get(tableId);
167174

168175
Schema newSchema =
@@ -176,21 +183,45 @@ public void write(Event event, Context context) throws IOException {
176183

177184
@Override
178185
public void flush(boolean flush) throws IOException {
179-
// Notice: flush method may be called many times during one checkpoint.
180-
temporaryWriteResult.addAll(getWriteResult());
186+
// Flush may be called many times during one checkpoint by non-data events.
187+
// Avoid rotating all task writers here, which can split same-PK updates into multiple
188+
// batches within one checkpoint and break dedup semantics in downstream reads.
189+
}
190+
191+
private void flushTableWriter(TableId tableId) throws IOException {
192+
TaskWriter<RowData> writer = writerMap.remove(tableId);
193+
if (writer == null) {
194+
return;
195+
}
196+
int batchIndex = tableBatchIndexMap.getOrDefault(tableId, 0);
197+
tableBatchIndexMap.put(tableId, batchIndex + 1);
198+
WriteResultWrapper writeResultWrapper =
199+
new WriteResultWrapper(
200+
writer.complete(),
201+
tableId,
202+
lastCheckpointId + 1,
203+
jobId,
204+
operatorId,
205+
batchIndex);
206+
temporaryWriteResult.add(writeResultWrapper);
207+
LOGGER.info(writeResultWrapper.buildDescription());
208+
writerFactoryMap.remove(tableId);
181209
}
182210

183211
private List<WriteResultWrapper> getWriteResult() throws IOException {
184212
long currentCheckpointId = lastCheckpointId + 1;
185213
List<WriteResultWrapper> writeResults = new ArrayList<>();
186214
for (Map.Entry<TableId, TaskWriter<RowData>> entry : writerMap.entrySet()) {
215+
TableId tableId = entry.getKey();
216+
int batchIndex = tableBatchIndexMap.getOrDefault(tableId, 0);
187217
WriteResultWrapper writeResultWrapper =
188218
new WriteResultWrapper(
189219
entry.getValue().complete(),
190-
entry.getKey(),
220+
tableId,
191221
currentCheckpointId,
192222
jobId,
193-
operatorId);
223+
operatorId,
224+
batchIndex);
194225
writeResults.add(writeResultWrapper);
195226
LOGGER.info(writeResultWrapper.buildDescription());
196227
}
@@ -222,6 +253,11 @@ public void close() throws Exception {
222253
writerFactoryMap = null;
223254
}
224255

256+
if (tableBatchIndexMap != null) {
257+
tableBatchIndexMap.clear();
258+
tableBatchIndexMap = null;
259+
}
260+
225261
catalog = null;
226262
}
227263
}

flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-iceberg/src/main/java/org/apache/flink/cdc/connectors/iceberg/sink/v2/WriteResultWrapper.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,31 @@ public class WriteResultWrapper implements Serializable {
4040

4141
private final String operatorId;
4242

43+
/** Batch index within the checkpoint for this table; increments on each schema-change flush. */
44+
private final int batchIndex;
45+
4346
public WriteResultWrapper(
4447
WriteResult writeResult,
4548
TableId tableId,
4649
long checkpointId,
4750
String jobId,
48-
String operatorId) {
51+
String operatorId,
52+
int batchIndex) {
4953
this.writeResult = writeResult;
5054
this.tableId = tableId;
5155
this.checkpointId = checkpointId;
5256
this.jobId = jobId;
5357
this.operatorId = operatorId;
58+
this.batchIndex = batchIndex;
59+
}
60+
61+
public WriteResultWrapper(
62+
WriteResult writeResult,
63+
TableId tableId,
64+
long checkpointId,
65+
String jobId,
66+
String operatorId) {
67+
this(writeResult, tableId, checkpointId, jobId, operatorId, 0);
5468
}
5569

5670
public WriteResult getWriteResult() {
@@ -73,6 +87,10 @@ public String getOperatorId() {
7387
return operatorId;
7488
}
7589

90+
public int getBatchIndex() {
91+
return batchIndex;
92+
}
93+
7694
/** Build a simple description for the write result. */
7795
public String buildDescription() {
7896
long addCount = 0;
@@ -95,6 +113,8 @@ public String buildDescription() {
95113
+ jobId
96114
+ ", OperatorId: "
97115
+ operatorId
116+
+ ", BatchIndex: "
117+
+ batchIndex
98118
+ ", AddCount: "
99119
+ addCount
100120
+ ", DeleteCount: "

0 commit comments

Comments
 (0)