polyzos
diff --git a/‎fluss-common/src/test/java/org/apache/fluss/lake/source/TestingLakeSource.java‎
Lines changed: 142 additions & 0 deletions b/‎fluss-common/src/test/java/org/apache/fluss/lake/source/TestingLakeSource.java‎
Lines changed: 142 additions & 0 deletions
diff --git a/‎fluss-common/src/test/java/org/apache/fluss/lake/source/TestingLakeSplit.java‎
Lines changed: 47 additions & 0 deletions b/‎fluss-common/src/test/java/org/apache/fluss/lake/source/TestingLakeSplit.java‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎fluss-common/src/test/java/org/apache/fluss/lake/source/TestingPlanner.java‎
Lines changed: 51 additions & 0 deletions b/‎fluss-common/src/test/java/org/apache/fluss/lake/source/TestingPlanner.java‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/source/enumerator/FlinkSourceEnumerator.java‎
Lines changed: 45 additions & 7 deletions b/‎fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/source/enumerator/FlinkSourceEnumerator.java‎
Lines changed: 45 additions & 7 deletions
diff --git a/‎fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/source/event/PartitionsRemovedEvent.java‎
Lines changed: 15 additions & 0 deletions b/‎fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/source/event/PartitionsRemovedEvent.java‎
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.lake.source;
+
+import org.apache.fluss.lake.serializer.SimpleVersionedSerializer;
+import org.apache.fluss.metadata.PartitionInfo;
+import org.apache.fluss.predicate.Predicate;
+import org.apache.fluss.utils.CloseableIterator;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A testing implementation of {@link LakeSource}.
+ *
+ * <p>Planning is delegated to {@link TestingPlanner}; projection and limit push-down are no-ops
+ * and the record reader is backed by {@link CloseableIterator#emptyIterator()}.
+ */
+public class TestingLakeSource implements LakeSource<LakeSplit> {
+
+    /** Number of buckets of the source table; handed to the planner. */
+    private final int bucketNum;
+
+    /** Partitions for which the planner generates lake splits. */
+    private final List<PartitionInfo> partitionInfos;
+
+    /**
+     * Creates a source with zero buckets and no partitions.
+     *
+     * <p>An empty list is used instead of {@code null} because {@link TestingPlanner#plan()}
+     * iterates over the partition infos and would otherwise fail with a NullPointerException.
+     */
+    public TestingLakeSource() {
+        this(0, new ArrayList<>());
+    }
+
+    /**
+     * Creates a source whose planner is built from the given bucket number and partitions.
+     *
+     * @param bucketNum the bucket number passed to {@link TestingPlanner}
+     * @param partitionInfos the partition infos passed, as-is, to {@link TestingPlanner}
+     */
+    public TestingLakeSource(int bucketNum, List<PartitionInfo> partitionInfos) {
+        this.bucketNum = bucketNum;
+        this.partitionInfos = partitionInfos;
+    }
+
+    /** No-op: the projection is ignored. */
+    @Override
+    public void withProject(int[][] project) {}
+
+    /** No-op: the limit is ignored. */
+    @Override
+    public void withLimit(int limit) {}
+
+    /**
+     * Ignores the given predicates.
+     *
+     * @return always {@code null}
+     */
+    @Override
+    public FilterPushDownResult withFilters(List<Predicate> predicates) {
+        // NOTE(review): returns null rather than a FilterPushDownResult, so a caller that
+        // dereferences the result will fail - confirm no test exercises filter push-down.
+        return null;
+    }
+
+    /** Returns a new {@link TestingPlanner}; the given context is not used. */
+    @Override
+    public Planner<LakeSplit> createPlanner(PlannerContext context) throws IOException {
+        return new TestingPlanner(bucketNum, partitionInfos);
+    }
+
+    /** Returns a reader backed by {@link CloseableIterator#emptyIterator()}. */
+    @Override
+    public RecordReader createRecordReader(ReaderContext<LakeSplit> context) throws IOException {
+        return CloseableIterator::emptyIterator;
+    }
+
+    /**
+     * Returns a version-0 serializer for {@link TestingLakeSplit}.
+     *
+     * <p>Wire format: the bucket as an int, then the partition list size as an int ({@code -1}
+     * for a {@code null} list), then for each element a boolean "is not null" flag followed by
+     * the UTF string when the flag is set.
+     */
+    @Override
+    public SimpleVersionedSerializer<LakeSplit> getSplitSerializer() {
+        return new SimpleVersionedSerializer<LakeSplit>() {
+
+            @Override
+            public int getVersion() {
+                return 0;
+            }
+
+            @Override
+            public byte[] serialize(LakeSplit split) throws IOException {
+                if (!(split instanceof TestingLakeSplit)) {
+                    // Handle null explicitly: split.getClass() would otherwise throw a
+                    // NullPointerException instead of the intended IOException.
+                    throw new IOException(
+                            "Unsupported split type: "
+                                    + (split == null ? "null" : split.getClass().getName()));
+                }
+
+                TestingLakeSplit testingSplit = (TestingLakeSplit) split;
+                ByteArrayOutputStream baos = new ByteArrayOutputStream();
+                try (DataOutputStream dos = new DataOutputStream(baos)) {
+                    // Serialize bucket
+                    dos.writeInt(testingSplit.bucket());
+
+                    // Serialize partition list; a negative size marks a null list
+                    List<String> partition = testingSplit.partition();
+                    if (partition == null) {
+                        dos.writeInt(-1);
+                    } else {
+                        dos.writeInt(partition.size());
+                        for (String part : partition) {
+                            // Write a boolean flag to indicate if the string is null
+                            dos.writeBoolean(part != null);
+                            if (part != null) {
+                                dos.writeUTF(part);
+                            }
+                        }
+                    }
+                }
+                // The stream is closed (and therefore flushed) before the bytes are taken.
+                return baos.toByteArray();
+            }
+
+            @Override
+            public LakeSplit deserialize(int version, byte[] serialized) throws IOException {
+                if (version != 0) {
+                    throw new IOException("Unsupported version: " + version);
+                }
+
+                try (DataInputStream dis =
+                        new DataInputStream(new ByteArrayInputStream(serialized))) {
+                    // Deserialize bucket
+                    int bucket = dis.readInt();
+
+                    // Deserialize partition list; a negative size marks a null list
+                    int partitionSize = dis.readInt();
+                    List<String> partition;
+                    if (partitionSize < 0) {
+                        partition = null;
+                    } else {
+                        partition = new ArrayList<>(partitionSize);
+                        for (int i = 0; i < partitionSize; i++) {
+                            // Read boolean flag to determine if the string is null
+                            boolean isNotNull = dis.readBoolean();
+                            String part = isNotNull ? dis.readUTF() : null;
+                            partition.add(part);
+                        }
+                    }
+
+                    return new TestingLakeSplit(bucket, partition);
+                }
+            }
+        };
+    }
+}
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.lake.source;
+
+import java.util.List;
+
+/** A testing implementation of {@link LakeSplit}. */
+public class TestingLakeSplit implements LakeSplit {
+
+    private final int bucket;
+    private final List<String> partition;
+
+    public TestingLakeSplit(int bucket, List<String> partition) {
+        this.bucket = bucket;
+        this.partition = partition;
+    }
+
+    @Override
+    public String toString() {
+        return "TestingLakeSplit{" + "bucket=" + bucket + ", partition=" + partition + '}';
+    }
+
+    @Override
+    public int bucket() {
+        return bucket;
+    }
+
+    @Override
+    public List<String> partition() {
+        return partition;
+    }
+}
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.lake.source;
+
+import org.apache.fluss.metadata.PartitionInfo;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/** A testing implementation of {@link Planner}. */
+public class TestingPlanner implements Planner<LakeSplit> {
+
+    private final int bucketNum;
+    private final List<PartitionInfo> partitionInfos;
+
+    public TestingPlanner(int bucketNum, List<PartitionInfo> partitionInfos) {
+        this.bucketNum = bucketNum;
+        this.partitionInfos = partitionInfos;
+    }
+
+    @Override
+    public List<LakeSplit> plan() throws IOException {
+        List<LakeSplit> splits = new ArrayList<>();
+
+        for (PartitionInfo partitionInfo : partitionInfos) {
+            for (int i = 0; i < bucketNum; i++) {
+                splits.add(
+                        new TestingLakeSplit(
+                                i, partitionInfo.getResolvedPartitionSpec().getPartitionValues()));
+            }
+        }
+
+        return splits;
+    }
+}
@@ -24,6 +24,8 @@
 import org.apache.fluss.config.ConfigOptions;
 import org.apache.fluss.config.Configuration;
 import org.apache.fluss.flink.lake.LakeSplitGenerator;
+import org.apache.fluss.flink.lake.split.LakeSnapshotAndFlussLogSplit;
+import org.apache.fluss.flink.lake.split.LakeSnapshotSplit;
 import org.apache.fluss.flink.source.enumerator.initializer.BucketOffsetsRetrieverImpl;
 import org.apache.fluss.flink.source.enumerator.initializer.NoStoppingOffsetsInitializer;
 import org.apache.fluss.flink.source.enumerator.initializer.OffsetsInitializer;
@@ -83,7 +85,8 @@
  * <p>The enumerator is responsible for:
  *
  * <ul>
- *   <li>Get the all splits(snapshot split + log split) for a table of Fluss to be read.
+ *   <li>Get all the splits (lake split + kv snapshot split + log split) for a Fluss table to be
+ *       read.
  *   <li>Assign the splits to readers with the guarantee that the splits belong to the same bucket
  *       will be assigned to same reader.
  * </ul>
@@ -110,10 +113,15 @@ public class FlinkSourceEnumerator
      *
      * <p>It's mainly used to help enumerator to broadcast the partition removed event to the
      * readers when partitions is dropped.
+     *
+     * <p>If an assigned partition exists only in the lake and has already expired in Fluss, it will
+     * remain here indefinitely and will not be removed. However, considering that only a small
+     * number of such lake-only partitions might exist during the initial startup, and they consume
+     * minimal memory, this issue is being ignored for now.
      */
     private final Map<Long, String> assignedPartitions;
 
-    /** buckets that have been assigned to readers. */
+    /** Buckets that have been assigned to readers. */
     private final Set<TableBucket> assignedTableBuckets;
 
     @Nullable private List<SourceSplitBase> pendingHybridLakeFlussSplits;
@@ -222,12 +230,12 @@ public FlinkSourceEnumerator(
         this.context = checkNotNull(context);
         this.pendingSplitAssignment = new HashMap<>();
         this.assignedTableBuckets = new HashSet<>(assignedTableBuckets);
-        this.startingOffsetsInitializer = startingOffsetsInitializer;
         this.assignedPartitions = new HashMap<>(assignedPartitions);
         this.pendingHybridLakeFlussSplits =
                 pendingHybridLakeFlussSplits == null
                         ? null
                         : new LinkedList<>(pendingHybridLakeFlussSplits);
+        this.startingOffsetsInitializer = startingOffsetsInitializer;
         this.scanPartitionDiscoveryIntervalMs = scanPartitionDiscoveryIntervalMs;
         this.streaming = streaming;
         this.partitionFilters = partitionFilters;
@@ -258,6 +266,10 @@ public void start() {
                     // we'll need to consider lake splits
                     List<SourceSplitBase> hybridLakeFlussSplits = generateHybridLakeFlussSplits();
                     if (hybridLakeFlussSplits != null) {
+                        LOG.info(
+                                "Generated {} hybrid lake splits for table {}.",
+                                hybridLakeFlussSplits.size(),
+                                tablePath);
                         // handle hybrid lake fluss splits firstly
                         handleSplitsAdd(hybridLakeFlussSplits, null);
                     }
@@ -554,7 +566,8 @@ private List<SourceSplitBase> getSnapshotAndLogSplits(
                 // hybrid snapshot log split;
                 OptionalLong logOffset = snapshots.getLogOffset(bucketId);
                 checkState(
-                        logOffset.isPresent(), "Log offset should be present if snapshot id is.");
+                        logOffset.isPresent(),
+                        "Log offset should be present if snapshot id is present.");
                 splits.add(
                         new HybridSnapshotLogSplit(
                                 tb, partitionName, snapshotId.getAsLong(), logOffset.getAsLong()));
@@ -616,6 +629,7 @@ private List<SourceSplitBase> generateHybridLakeFlussSplits() {
         // should be restored from checkpoint, shouldn't
         // list splits again
         if (pendingHybridLakeFlussSplits != null) {
+            LOG.info("Still have pending lake fluss splits, shouldn't list splits again.");
             return pendingHybridLakeFlussSplits;
         }
         try {
@@ -664,9 +678,28 @@ private void handlePartitionsRemoved(Collection<Partition> removedPartitionInfo)
         pendingSplitAssignment.forEach(
                 (reader, splits) ->
                         splits.removeIf(
-                                split ->
-                                        removedPartitionsMap.containsKey(
-                                                split.getTableBucket().getPartitionId())));
+                                split -> {
+                                    // Never remove LakeSnapshotSplit, because during union reads,
+                                    // data from the lake must still be read even if the partition
+                                    // has already expired in Fluss.
+                                    if (split instanceof LakeSnapshotSplit) {
+                                        return false;
+                                    }
+
+                                    // Similar to LakeSnapshotSplit, if it contains any lake split,
+                                    // never remove it; otherwise, it can be removed when the Fluss
+                                    // partition expires.
+                                    if (split instanceof LakeSnapshotAndFlussLogSplit) {
+                                        LakeSnapshotAndFlussLogSplit hybridSplit =
+                                                (LakeSnapshotAndFlussLogSplit) split;
+                                        if (!hybridSplit.isLakeSplitFinished()) {
+                                            return false;
+                                        }
+                                    }
+
+                                    return removedPartitionsMap.containsKey(
+                                            split.getTableBucket().getPartitionId());
+                                }));
 
         // send partition removed event to all readers
         PartitionsRemovedEvent event = new PartitionsRemovedEvent(removedPartitionsMap);
@@ -863,6 +896,11 @@ Map<Long, String> getAssignedPartitions() {
         return assignedPartitions;
     }
 
+    /**
+     * Exposes the {@code pendingSplitAssignment} map for tests.
+     *
+     * <p>The returned map is the enumerator's own instance, not a copy, so changes made by the
+     * enumerator are visible through it and vice versa.
+     */
+    @VisibleForTesting
+    Map<Integer, List<SourceSplitBase>> getPendingSplitAssignment() {
+        return pendingSplitAssignment;
+    }
+
     @Override
     public void addSplitsBack(List<SourceSplitBase> splits, int subtaskId) {
         LOG.debug("Flink Source Enumerator adds splits back: {}", splits);
 
@@ -20,6 +20,7 @@
 import org.apache.flink.api.connector.source.SourceEvent;
 
 import java.util.Map;
+import java.util.Objects;
 
 /**
  * A source event to represent partitions is removed to send from enumerator to reader.
@@ -41,6 +42,20 @@ public Map<Long, String> getRemovedPartitions() {
         return removedPartitions;
     }
 
+    /**
+     * Two events are equal when they are of exactly the same class and their removed-partition
+     * maps are equal.
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            // Same outcome as the field comparison below, just without doing it.
+            return true;
+        }
+        if (o == null) {
+            return false;
+        }
+        if (o.getClass() != getClass()) {
+            return false;
+        }
+        PartitionsRemovedEvent other = (PartitionsRemovedEvent) o;
+        return Objects.equals(this.removedPartitions, other.removedPartitions);
+    }
+
+    /** Returns the hash of the removed-partition map, or {@code 0} when the map is null. */
+    @Override
+    public int hashCode() {
+        if (removedPartitions == null) {
+            return 0;
+        }
+        return removedPartitions.hashCode();
+    }
+
     @Override
     public String toString() {
         return "PartitionsRemovedEvent{" + "removedPartitions=" + removedPartitions + '}';