Skip to content

Commit 4eba0f0

Browse files
committed
[iceberg] Fix batchIndex sync and parallel subtask grouping in Iceberg sink
Address parallelism issues identified during review:
- Writer: Advance tableBatchIndexMap before the writer == null guard so all subtasks stay in sync when a subtask has no data for the table at schema-change time
- Writer: Skip flushTableWriter on initial CreateTableEvent since no data has been written yet and there is nothing to split
- Committer: Group WriteResultWrappers by batchIndex using a TreeMap, so wrappers from different subtasks with the same batchIndex are merged into a single Iceberg snapshot instead of being committed separately

Tests added:
- testBatchIndexInSyncWhenSubtaskHasNoWriterAtSchemaChange
- testNoDuplicateWithParallelSubtasksMissingPreSchemaChangeData
- testSameBatchIndexFromTwoSubtasksMergedIntoOneSnapshot
1 parent 6f666f2 commit 4eba0f0

File tree

3 files changed

+455
-40
lines changed

3 files changed

+455
-40
lines changed

flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-iceberg/src/main/java/org/apache/flink/cdc/connectors/iceberg/sink/v2/IcebergCommitter.java

Lines changed: 41 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@
4242
import java.util.ArrayList;
4343
import java.util.Arrays;
4444
import java.util.Collection;
45-
import java.util.Comparator;
4645
import java.util.HashMap;
4746
import java.util.List;
4847
import java.util.Map;
4948
import java.util.Optional;
49+
import java.util.TreeMap;
5050

5151
import static java.util.stream.Collectors.toList;
5252
import static org.apache.flink.runtime.checkpoint.CheckpointIDCounter.INITIAL_CHECKPOINT_ID;
@@ -105,17 +105,18 @@ private void commit(List<WriteResultWrapper> writeResultWrappers) {
105105
Map<TableId, List<WriteResultWrapper>> tableMap = new HashMap<>();
106106
for (WriteResultWrapper w : writeResultWrappers) {
107107
tableMap.computeIfAbsent(w.getTableId(), k -> new ArrayList<>()).add(w);
108-
LOGGER.info(w.buildDescription());
109108
}
110109

111110
for (Map.Entry<TableId, List<WriteResultWrapper>> entry : tableMap.entrySet()) {
112111
TableId tableId = entry.getKey();
113112

114-
// Sort ascending by batch index to guarantee correct Iceberg sequence number ordering.
115-
// Equality-delete files in batch N will have sequence number > batch M (M < N), so
116-
// they correctly supersede stale data written by earlier same-checkpoint batches.
117-
List<WriteResultWrapper> batches = entry.getValue();
118-
batches.sort(Comparator.comparingInt(WriteResultWrapper::getBatchIndex));
113+
// Group by batchIndex so wrappers from different subtasks for the same batch
114+
// are merged into one snapshot, not committed separately.
115+
TreeMap<Integer, List<WriteResultWrapper>> batchGroups = new TreeMap<>();
116+
for (WriteResultWrapper w : entry.getValue()) {
117+
batchGroups.computeIfAbsent(w.getBatchIndex(), k -> new ArrayList<>()).add(w);
118+
LOGGER.info(w.buildDescription());
119+
}
119120

120121
Table table =
121122
catalog.loadTable(
@@ -145,32 +146,39 @@ private void commit(List<WriteResultWrapper> writeResultWrappers) {
145146
Optional<TableMetric> tableMetric = getTableMetric(tableId);
146147
tableMetric.ifPresent(TableMetric::increaseCommitTimes);
147148

148-
// Find the last non-empty batch so we know where to write MAX_COMMITTED_CHECKPOINT_ID.
149-
int lastNonEmptyBatchPos = -1;
150-
for (int i = batches.size() - 1; i >= startBatchIndex; i--) {
151-
if (!isBatchEmpty(batches.get(i))) {
152-
lastNonEmptyBatchPos = i;
153-
break;
149+
int lastNonEmptyBatchIndex = -1;
150+
for (Map.Entry<Integer, List<WriteResultWrapper>> g : batchGroups.entrySet()) {
151+
List<DataFile> df = collectDataFilesFromGroup(g.getValue());
152+
List<DeleteFile> del = collectDeleteFilesFromGroup(g.getValue());
153+
if (!df.isEmpty() || !del.isEmpty()) {
154+
lastNonEmptyBatchIndex = g.getKey();
154155
}
155156
}
156157

157-
// Commit each batch as a separate Iceberg snapshot to get distinct sequence numbers.
158-
for (int i = startBatchIndex; i < batches.size(); i++) {
159-
WriteResultWrapper batch = batches.get(i);
160-
List<DataFile> dataFiles = collectDataFiles(batch.getWriteResult());
161-
List<DeleteFile> deleteFiles = collectDeleteFiles(batch.getWriteResult());
158+
// Commit each batch as a separate snapshot so sequence numbers increase per batch.
159+
for (Map.Entry<Integer, List<WriteResultWrapper>> g : batchGroups.entrySet()) {
160+
int batchIdx = g.getKey();
161+
if (batchIdx < startBatchIndex) {
162+
LOGGER.info(
163+
"Batch {} for checkpoint {} of table {} already committed, skipping",
164+
batchIdx,
165+
checkpointId,
166+
tableId.identifier());
167+
continue;
168+
}
169+
170+
List<DataFile> dataFiles = collectDataFilesFromGroup(g.getValue());
171+
List<DeleteFile> deleteFiles = collectDeleteFilesFromGroup(g.getValue());
162172

163173
if (dataFiles.isEmpty() && deleteFiles.isEmpty()) {
164174
LOGGER.info(
165175
"Batch {} for checkpoint {} of table {} has nothing to commit, skipping",
166-
batch.getBatchIndex(),
176+
batchIdx,
167177
checkpointId,
168178
tableId.identifier());
169179
continue;
170180
}
171181

172-
boolean isLastNonEmptyBatch = (i == lastNonEmptyBatchPos);
173-
174182
SnapshotUpdate<?> operation;
175183
if (deleteFiles.isEmpty()) {
176184
AppendFiles append = table.newAppend();
@@ -185,9 +193,9 @@ private void commit(List<WriteResultWrapper> writeResultWrappers) {
185193

186194
operation.set(SinkUtil.FLINK_JOB_ID, newFlinkJobId);
187195
operation.set(SinkUtil.OPERATOR_ID, operatorId);
188-
operation.set(FLINK_BATCH_INDEX, String.valueOf(batch.getBatchIndex()));
196+
operation.set(FLINK_BATCH_INDEX, String.valueOf(batchIdx));
189197
operation.set(FLINK_CHECKPOINT_ID_PROP, String.valueOf(checkpointId));
190-
if (isLastNonEmptyBatch) {
198+
if (batchIdx == lastNonEmptyBatchIndex) {
191199
operation.set(
192200
SinkUtil.MAX_COMMITTED_CHECKPOINT_ID, String.valueOf(checkpointId));
193201
}
@@ -196,17 +204,16 @@ private void commit(List<WriteResultWrapper> writeResultWrappers) {
196204
}
197205
}
198206

199-
private static boolean isBatchEmpty(WriteResultWrapper batch) {
200-
WriteResult r = batch.getWriteResult();
201-
long dataCount =
202-
r.dataFiles() == null
203-
? 0
204-
: Arrays.stream(r.dataFiles()).filter(f -> f.recordCount() > 0).count();
205-
long deleteCount =
206-
r.deleteFiles() == null
207-
? 0
208-
: Arrays.stream(r.deleteFiles()).filter(f -> f.recordCount() > 0).count();
209-
return dataCount == 0 && deleteCount == 0;
207+
private static List<DataFile> collectDataFilesFromGroup(List<WriteResultWrapper> group) {
208+
return group.stream()
209+
.flatMap(w -> collectDataFiles(w.getWriteResult()).stream())
210+
.collect(toList());
211+
}
212+
213+
private static List<DeleteFile> collectDeleteFilesFromGroup(List<WriteResultWrapper> group) {
214+
return group.stream()
215+
.flatMap(w -> collectDeleteFiles(w.getWriteResult()).stream())
216+
.collect(toList());
210217
}
211218

212219
private static List<DataFile> collectDataFiles(WriteResult result) {

flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-iceberg/src/main/java/org/apache/flink/cdc/connectors/iceberg/sink/v2/IcebergWriter.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ public class IcebergWriter
7171

7272
private final List<WriteResultWrapper> temporaryWriteResult;
7373

74-
/** Per-table batch index within the current checkpoint; incremented on each schema-change flush. */
74+
/** Per-table batch index; incremented on each schema-change flush, even when no writer exists. */
7575
private Map<TableId, Integer> tableBatchIndexMap;
7676

7777
private Catalog catalog;
@@ -168,8 +168,11 @@ public void write(Event event, Context context) throws IOException {
168168
} else {
169169
SchemaChangeEvent schemaChangeEvent = (SchemaChangeEvent) event;
170170
TableId tableId = schemaChangeEvent.tableId();
171-
// Flush only this table before applying schema change to avoid global writer rotation.
172-
flushTableWriter(tableId);
171+
// Flush only when the table is already known; skip on initial CreateTableEvent since
172+
// no data has been written yet and there is nothing to split.
173+
if (schemaMap.containsKey(tableId)) {
174+
flushTableWriter(tableId);
175+
}
173176
TableSchemaWrapper tableSchemaWrapper = schemaMap.get(tableId);
174177

175178
Schema newSchema =
@@ -182,19 +185,20 @@ public void write(Event event, Context context) throws IOException {
182185
}
183186

184187
@Override
185-
public void flush(boolean flush) throws IOException {
188+
public void flush(boolean flush) {
186189
// Flush may be called many times during one checkpoint by non-data events.
187190
// Avoid rotating all task writers here, which can split same-PK updates into multiple
188191
// batches within one checkpoint and break dedup semantics in downstream reads.
189192
}
190193

191194
private void flushTableWriter(TableId tableId) throws IOException {
192195
TaskWriter<RowData> writer = writerMap.remove(tableId);
196+
// Advance even when no writer exists, to keep batchIndex in sync across subtasks.
197+
int batchIndex = tableBatchIndexMap.getOrDefault(tableId, 0);
198+
tableBatchIndexMap.put(tableId, batchIndex + 1);
193199
if (writer == null) {
194200
return;
195201
}
196-
int batchIndex = tableBatchIndexMap.getOrDefault(tableId, 0);
197-
tableBatchIndexMap.put(tableId, batchIndex + 1);
198202
WriteResultWrapper writeResultWrapper =
199203
new WriteResultWrapper(
200204
writer.complete(),

0 commit comments

Comments (0)