Commit dfaeb3e

[lake/flink] Enforce keeping data consistent when the commit to the lake succeeds but the commit to Fluss fails (#1067)
1 parent 37c3866 commit dfaeb3e

File tree

18 files changed: +895 −43 lines changed

fluss-common/src/main/java/com/alibaba/fluss/lake/committer/CommittedLakeSnapshot.java

Lines changed: 80 additions & 0 deletions

@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2025 Alibaba Group Holding Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.alibaba.fluss.lake.committer;
+
+import com.alibaba.fluss.utils.types.Tuple2;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * A lake snapshot that has already been committed to the lake, containing the lake snapshot id
+ * and the bucket end offsets in this snapshot.
+ */
+public class CommittedLakeSnapshot {
+
+    private final long lakeSnapshotId;
+    // <partition_name, bucket> -> log offset; partition_name will be null if it's not a
+    // partition bucket
+    private final Map<Tuple2<String, Integer>, Long> logEndOffsets = new HashMap<>();
+
+    public CommittedLakeSnapshot(long lakeSnapshotId) {
+        this.lakeSnapshotId = lakeSnapshotId;
+    }
+
+    public long getLakeSnapshotId() {
+        return lakeSnapshotId;
+    }
+
+    public void addBucket(int bucketId, long offset) {
+        logEndOffsets.put(Tuple2.of(null, bucketId), offset);
+    }
+
+    public void addPartitionBucket(String partitionName, int bucketId, long offset) {
+        logEndOffsets.put(Tuple2.of(partitionName, bucketId), offset);
+    }
+
+    public Map<Tuple2<String, Integer>, Long> getLogEndOffsets() {
+        return logEndOffsets;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        CommittedLakeSnapshot that = (CommittedLakeSnapshot) o;
+        return lakeSnapshotId == that.lakeSnapshotId
+                && Objects.equals(logEndOffsets, that.logEndOffsets);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(lakeSnapshotId, logEndOffsets);
+    }
+
+    @Override
+    public String toString() {
+        return "CommittedLakeSnapshot{"
+                + "lakeSnapshotId="
+                + lakeSnapshotId
+                + ", logEndOffsets="
+                + logEndOffsets
+                + '}';
+    }
+}
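
For orientation (not part of the diff): a lake committer implementation fills this class in before handing it back to the tiering job. The ids, offsets, and partition name below are made up.

    CommittedLakeSnapshot missing = new CommittedLakeSnapshot(42L);
    // non-partitioned table: keys are (null, bucketId)
    missing.addBucket(0, 1024L);
    missing.addBucket(1, 2048L);
    // partitioned table: keys are (partitionName, bucketId)
    missing.addPartitionBucket("dt=2025-01-01", 0, 512L);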

fluss-common/src/main/java/com/alibaba/fluss/lake/committer/LakeCommitter.java

Lines changed: 22 additions & 0 deletions

@@ -18,6 +18,8 @@
 
 import com.alibaba.fluss.annotation.PublicEvolving;
 
+import javax.annotation.Nullable;
+
 import java.io.IOException;
 import java.util.List;
 
@@ -49,4 +51,24 @@ public interface LakeCommitter<WriteResult, CommittableT> extends AutoCloseable
      * @throws IOException if an I/O error occurs
      */
     long commit(CommittableT committable) throws IOException;
+
+    /**
+     * Aborts the given committable object.
+     *
+     * @param committable the committable object
+     * @throws IOException if an I/O error occurs
+     */
+    void abort(CommittableT committable) throws IOException;
+
+    /**
+     * Gets the missing lake snapshot that has been committed to the lake but not committed to
+     * Fluss.
+     *
+     * @param latestLakeSnapshotIdOfFluss the latest lake snapshot id known to Fluss, used to
+     *     determine which lake snapshot is missing
+     * @return the missing lake snapshot, or null if no missing snapshot is found
+     * @throws IOException if an I/O error occurs
+     */
+    @Nullable
+    CommittedLakeSnapshot getMissingLakeSnapshot(@Nullable Long latestLakeSnapshotIdOfFluss)
+            throws IOException;
 }
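
To make the new contract concrete, here is a minimal sketch of how an implementation might answer getMissingLakeSnapshot. SnapshotStore, LakeSnapshot, and BucketOffset are hypothetical stand-ins for a concrete lake format's snapshot API, not types from this commit.

    @Nullable
    @Override
    public CommittedLakeSnapshot getMissingLakeSnapshot(@Nullable Long latestLakeSnapshotIdOfFluss)
            throws IOException {
        // newest lake snapshot written by the Fluss tiering job (hypothetical API)
        LakeSnapshot latest = snapshotStore.latestSnapshotCommittedBy("fluss");
        if (latest == null
                || (latestLakeSnapshotIdOfFluss != null
                        && latest.id() <= latestLakeSnapshotIdOfFluss)) {
            // Fluss already knows the newest lake snapshot: nothing is missing
            return null;
        }
        // Fluss is behind: rebuild the bucket end offsets recorded with the lake snapshot
        CommittedLakeSnapshot missing = new CommittedLakeSnapshot(latest.id());
        for (BucketOffset b : latest.bucketEndOffsets()) {
            if (b.partitionName() == null) {
                missing.addBucket(b.bucket(), b.offset());
            } else {
                missing.addPartitionBucket(b.partitionName(), b.bucket(), b.offset());
            }
        }
        return missing;
    }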

fluss-flink/fluss-flink-common/src/main/java/com/alibaba/fluss/flink/tiering/committer/FlussTableLakeSnapshot.java

Lines changed: 9 additions & 0 deletions

@@ -18,6 +18,7 @@
 
 import com.alibaba.fluss.metadata.TableBucket;
 
+import java.util.HashMap;
 import java.util.Map;
 
 /** A lake snapshot for a Fluss table. */
@@ -29,6 +30,10 @@ public class FlussTableLakeSnapshot {
 
     private final Map<TableBucket, Long> logEndOffsets;
 
+    public FlussTableLakeSnapshot(long tableId, long lakeSnapshotId) {
+        this(tableId, lakeSnapshotId, new HashMap<>());
+    }
+
     public FlussTableLakeSnapshot(
             long tableId, long lakeSnapshotId, Map<TableBucket, Long> logEndOffsets) {
         this.tableId = tableId;
@@ -48,6 +53,10 @@ public Map<TableBucket, Long> logEndOffsets() {
         return logEndOffsets;
     }
 
+    public void addBucketOffset(TableBucket bucket, long offset) {
+        logEndOffsets.put(bucket, offset);
+    }
+
     @Override
     public String toString() {
         return "FlussTableLakeSnapshot{"

fluss-flink/fluss-flink-common/src/main/java/com/alibaba/fluss/flink/tiering/committer/FlussTableLakeSnapshotCommitter.java

Lines changed: 36 additions & 0 deletions

@@ -19,6 +19,7 @@
 import com.alibaba.fluss.client.metadata.MetadataUpdater;
 import com.alibaba.fluss.config.ConfigOptions;
 import com.alibaba.fluss.config.Configuration;
+import com.alibaba.fluss.lake.committer.CommittedLakeSnapshot;
 import com.alibaba.fluss.metadata.TableBucket;
 import com.alibaba.fluss.metrics.registry.MetricRegistry;
 import com.alibaba.fluss.rpc.GatewayClientProxy;
@@ -29,6 +30,9 @@
 import com.alibaba.fluss.rpc.messages.PbLakeTableSnapshotInfo;
 import com.alibaba.fluss.rpc.metrics.ClientMetricGroup;
 import com.alibaba.fluss.utils.ExceptionUtils;
+import com.alibaba.fluss.utils.types.Tuple2;
+
+import javax.annotation.Nullable;
 
 import java.io.IOException;
 import java.util.Map;
@@ -71,6 +75,38 @@ public void commit(FlussTableLakeSnapshot flussTableLakeSnapshot) throws IOExcep
         }
     }
 
+    public void commit(
+            long tableId,
+            @Nullable Map<String, Long> partitionIdByName,
+            CommittedLakeSnapshot committedLakeSnapshot)
+            throws IOException {
+        // construct the lake snapshot to commit to Fluss
+        FlussTableLakeSnapshot flussTableLakeSnapshot =
+                new FlussTableLakeSnapshot(tableId, committedLakeSnapshot.getLakeSnapshotId());
+        for (Map.Entry<Tuple2<String, Integer>, Long> entry :
+                committedLakeSnapshot.getLogEndOffsets().entrySet()) {
+            Tuple2<String, Integer> partitionBucket = entry.getKey();
+            TableBucket tableBucket;
+            if (partitionBucket.f0 == null) {
+                tableBucket = new TableBucket(tableId, partitionBucket.f1);
+            } else {
+                String partitionName = partitionBucket.f0;
+                // todo: remove this
+                // in Paimon 1.12 we can store these offsets (including partitionId) in the
+                // snapshot properties; then we won't need to resolve partitionId from the name
+                Long partitionId = partitionIdByName.get(partitionName);
+                if (partitionId != null) {
+                    tableBucket = new TableBucket(tableId, partitionId, partitionBucket.f1);
+                } else {
+                    // let's skip the bucket
+                    continue;
+                }
+            }
+            flussTableLakeSnapshot.addBucketOffset(tableBucket, entry.getValue());
+        }
+        commit(flussTableLakeSnapshot);
+    }
+
     private CommitLakeTableSnapshotRequest toCommitLakeTableSnapshotRequest(
             FlussTableLakeSnapshot flussTableLakeSnapshot) {
         CommitLakeTableSnapshotRequest commitLakeTableSnapshotRequest =
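
Assuming made-up ids (tableId 1001, partition "dt=2025-01-01" with id 7), the back-fill path added above would be driven like this:

    Map<String, Long> partitionIdByName = new HashMap<>();
    partitionIdByName.put("dt=2025-01-01", 7L);

    CommittedLakeSnapshot missing = new CommittedLakeSnapshot(42L);
    missing.addBucket(0, 1024L);                           // maps to TableBucket(1001, 0)
    missing.addPartitionBucket("dt=2025-01-01", 1, 2048L); // maps to TableBucket(1001, 7, 1)

    flussTableLakeSnapshotCommitter.commit(1001L, partitionIdByName, missing);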

fluss-flink/fluss-flink-common/src/main/java/com/alibaba/fluss/flink/tiering/committer/TieringCommitOperator.java

Lines changed: 100 additions & 11 deletions

@@ -16,15 +16,24 @@
 
 package com.alibaba.fluss.flink.tiering.committer;
 
+import com.alibaba.fluss.client.Connection;
+import com.alibaba.fluss.client.ConnectionFactory;
+import com.alibaba.fluss.client.admin.Admin;
 import com.alibaba.fluss.config.Configuration;
-import com.alibaba.fluss.flink.tiering.event.FinishTieringEvent;
+import com.alibaba.fluss.exception.LakeTableSnapshotNotExistException;
+import com.alibaba.fluss.flink.tiering.event.FailedTieringEvent;
+import com.alibaba.fluss.flink.tiering.event.FinishedTieringEvent;
 import com.alibaba.fluss.flink.tiering.source.TableBucketWriteResult;
 import com.alibaba.fluss.flink.tiering.source.TieringSource;
+import com.alibaba.fluss.lake.committer.CommittedLakeSnapshot;
 import com.alibaba.fluss.lake.committer.LakeCommitter;
 import com.alibaba.fluss.lake.writer.LakeTieringFactory;
 import com.alibaba.fluss.lake.writer.LakeWriter;
+import com.alibaba.fluss.metadata.PartitionInfo;
 import com.alibaba.fluss.metadata.TableBucket;
+import com.alibaba.fluss.metadata.TableInfo;
 import com.alibaba.fluss.metadata.TablePath;
+import com.alibaba.fluss.utils.ExceptionUtils;
 
 import org.apache.flink.runtime.operators.coordination.OperatorEventGateway;
 import org.apache.flink.runtime.source.event.SourceEventWrapper;
@@ -67,8 +76,11 @@ public class TieringCommitOperator<WriteResult, Committable>
 
     private static final long serialVersionUID = 1L;
 
+    private final Configuration flussConfig;
     private final LakeTieringFactory<WriteResult, Committable> lakeTieringFactory;
     private final FlussTableLakeSnapshotCommitter flussTableLakeSnapshotCommitter;
+    private Connection connection;
+    private Admin admin;
 
     // gateway to send event to flink source coordinator
     private final OperatorEventGateway operatorEventGateway;
@@ -84,6 +96,7 @@ public TieringCommitOperator(
         this.lakeTieringFactory = lakeTieringFactory;
         this.flussTableLakeSnapshotCommitter = new FlussTableLakeSnapshotCommitter(flussConf);
         this.collectedTableBucketWriteResults = new HashMap<>();
+        this.flussConfig = flussConf;
         this.setup(
                 parameters.getContainingTask(),
                 parameters.getStreamConfig(),
@@ -97,6 +110,8 @@
     @Override
     public void open() {
         flussTableLakeSnapshotCommitter.open();
+        connection = ConnectionFactory.createConnection(flussConfig);
+        admin = connection.getAdmin();
     }
 
     @Override
@@ -112,16 +127,27 @@ public void processElement(StreamRecord<TableBucketWriteResult<WriteResult>> str
         collectTableAllBucketWriteResult(tableId);
 
         if (committableWriteResults != null) {
-            Committable committable =
-                    commitWriteResults(
-                            tableId, tableBucketWriteResult.tablePath(), committableWriteResults);
-            collectedTableBucketWriteResults.remove(tableId);
-            // notify that the table id has been finished tier
-            operatorEventGateway.sendEventToCoordinator(
-                    new SourceEventWrapper(new FinishTieringEvent(tableId)));
-            // only emit when committable is not-null
-            if (committable != null) {
-                output.collect(new StreamRecord<>(new CommittableMessage<>(committable)));
+            try {
+                Committable committable =
+                        commitWriteResults(
+                                tableId,
+                                tableBucketWriteResult.tablePath(),
+                                committableWriteResults);
+                // only emit when the committable is non-null
+                if (committable != null) {
+                    output.collect(new StreamRecord<>(new CommittableMessage<>(committable)));
+                }
+                // notify that the table id has finished tiering
+                operatorEventGateway.sendEventToCoordinator(
+                        new SourceEventWrapper(new FinishedTieringEvent(tableId)));
+            } catch (Exception e) {
+                // if any exception happens, notify the source coordinator to mark the table as failed
+                operatorEventGateway.sendEventToCoordinator(
+                        new SourceEventWrapper(
                                new FailedTieringEvent(
                                        tableId, ExceptionUtils.stringifyException(e))));
+            } finally {
+                collectedTableBucketWriteResults.remove(tableId);
            }
        }
    }
@@ -154,6 +180,8 @@ private Committable commitWriteResults(
                        .collect(Collectors.toList());
        // to committable
        Committable committable = lakeCommitter.toCommitable(writeResults);
+        // before committing to the lake, check that Fluss is not missing any lake snapshot committed by Fluss
+        checkFlussNotMissingLakeSnapshot(tablePath, lakeCommitter, committable);
        long commitedSnapshotId = lakeCommitter.commit(committable);
        // commit to fluss
        Map<TableBucket, Long> logEndOffsets = new HashMap<>();
@@ -166,6 +194,61 @@
        }
    }
 
+    private void checkFlussNotMissingLakeSnapshot(
+            TablePath tablePath,
+            LakeCommitter<WriteResult, Committable> lakeCommitter,
+            Committable committable)
+            throws Exception {
+        Long flussCurrentLakeSnapshot;
+        try {
+            flussCurrentLakeSnapshot = admin.getLatestLakeSnapshot(tablePath).get().getSnapshotId();
+        } catch (Exception e) {
+            Throwable throwable = e.getCause();
+            if (throwable instanceof LakeTableSnapshotNotExistException) {
+                // do nothing: Fluss doesn't know any lake snapshot yet
+                flussCurrentLakeSnapshot = null;
+            } else {
+                throw e;
+            }
+        }
+
+        // get the lake snapshot committed to the lake that is missing in Fluss
+        CommittedLakeSnapshot missingCommittedSnapshot =
+                lakeCommitter.getMissingLakeSnapshot(flussCurrentLakeSnapshot);
+
+        // Fluss's known snapshot is behind the lake snapshot actually committed by Fluss;
+        // fail this commit since the data was read from the log end offset of a stale
+        // Fluss-known lake snapshot, i.e. the data has already been committed to the lake,
+        // so it must not be committed again to avoid duplicated data
+        if (missingCommittedSnapshot != null) {
+            // commit this missing snapshot to fluss
+            TableInfo tableInfo = admin.getTableInfo(tablePath).get();
+            Map<String, Long> partitionIdByName = null;
+            if (tableInfo.isPartitioned()) {
+                partitionIdByName =
+                        admin.listPartitionInfos(tablePath).get().stream()
+                                .collect(
+                                        Collectors.toMap(
+                                                PartitionInfo::getPartitionName,
+                                                PartitionInfo::getPartitionId));
+            }
+            flussTableLakeSnapshotCommitter.commit(
+                    tableInfo.getTableId(), partitionIdByName, missingCommittedSnapshot);
+            // abort this committable to delete the written files
+            lakeCommitter.abort(committable);
+            throw new IllegalStateException(
+                    String.format(
+                            "The current lake snapshot %d known to Fluss is less than"
+                                    + " the actual lake snapshot %d committed by Fluss for table: {tablePath=%s, tableId=%d},"
+                                    + " missing snapshot: %s.",
+                            flussCurrentLakeSnapshot,
+                            missingCommittedSnapshot.getLakeSnapshotId(),
+                            tableInfo.getTablePath(),
+                            tableInfo.getTableId(),
+                            missingCommittedSnapshot));
+        }
+    }
+
     private void registerTableBucketWriteResult(
             long tableId, TableBucketWriteResult<WriteResult> tableBucketWriteResult) {
         collectedTableBucketWriteResults
@@ -214,5 +297,11 @@ private List<TableBucketWriteResult<WriteResult>> collectTableAllBucketWriteResu
     @Override
     public void close() throws Exception {
         flussTableLakeSnapshotCommitter.close();
+        if (admin != null) {
+            admin.close();
+        }
+        if (connection != null) {
+            connection.close();
+        }
     }
 }
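
To make the recovery contract concrete, the failure mode this commit guards against plays out as follows (snapshot ids are illustrative): the tiering job commits lake snapshot 42 but crashes before reporting it to Fluss, so Fluss still believes snapshot 41 is current. On the next round, write results are produced from the log end offsets of the stale snapshot 41, so committing them would duplicate data already in snapshot 42. checkFlussNotMissingLakeSnapshot detects snapshot 42 via getMissingLakeSnapshot, back-fills it into Fluss, aborts the new committable so its files are deleted, and throws; the subsequent attempt then starts from the correct offsets.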
