Commit afc990d

[Fix] [Mongo-cdc] Fallback to timestamp startup mode when resume token has expired (#8754)
Parent: cbcce7a

11 files changed: +556 −27 lines


Diff for: seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/MongodbIncrementalSource.java

+11
@@ -19,6 +19,7 @@
 
 import org.apache.seatunnel.api.configuration.Option;
 import org.apache.seatunnel.api.configuration.ReadonlyConfig;
+import org.apache.seatunnel.api.source.SourceReader;
 import org.apache.seatunnel.api.source.SupportParallelism;
 import org.apache.seatunnel.api.table.catalog.CatalogTable;
 import org.apache.seatunnel.connectors.cdc.base.config.SourceConfig;
@@ -28,15 +29,19 @@
 import org.apache.seatunnel.connectors.cdc.base.option.StopMode;
 import org.apache.seatunnel.connectors.cdc.base.source.IncrementalSource;
 import org.apache.seatunnel.connectors.cdc.base.source.offset.OffsetFactory;
+import org.apache.seatunnel.connectors.cdc.base.source.split.SourceRecords;
+import org.apache.seatunnel.connectors.cdc.base.source.split.state.SourceSplitStateBase;
 import org.apache.seatunnel.connectors.cdc.debezium.DebeziumDeserializationSchema;
 import org.apache.seatunnel.connectors.cdc.debezium.DeserializeFormat;
 import org.apache.seatunnel.connectors.cdc.debezium.row.DebeziumJsonDeserializeSchema;
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceConfig;
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceConfigProvider;
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions;
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.sender.MongoDBConnectorDeserializationSchema;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.MongoDBRecordEmitter;
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.dialect.MongodbDialect;
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.offset.ChangeStreamOffsetFactory;
+import org.apache.seatunnel.connectors.seatunnel.common.source.reader.RecordEmitter;
 
 import javax.annotation.Nonnull;
 
@@ -123,4 +128,10 @@ public DataSourceDialect<MongodbSourceConfig> createDataSourceDialect(ReadonlyCo
     public OffsetFactory createOffsetFactory(ReadonlyConfig config) {
         return new ChangeStreamOffsetFactory();
     }
+
+    @Override
+    protected RecordEmitter<SourceRecords, T, SourceSplitStateBase> createRecordEmitter(
+            SourceConfig sourceConfig, SourceReader.Context context) {
+        return new MongoDBRecordEmitter<>(deserializationSchema, offsetFactory, context);
+    }
 }

Diff for: seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceOptions.java

+27
@@ -31,8 +31,12 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+
+import static java.util.Arrays.asList;
 
 public class MongodbSourceOptions extends SourceOptions {
 
@@ -54,6 +58,10 @@ public class MongodbSourceOptions extends SourceOptions {
 
     public static final String ID_FIELD = "_id";
 
+    public static final String HEARTBEAT_KEY_FIELD = "HEARTBEAT";
+
+    public static final String COPY_KEY_FIELD = "copy";
+
     public static final String DOCUMENT_KEY = "documentKey";
 
     public static final String NS_FIELD = "ns";
@@ -76,6 +84,25 @@
 
     public static final int ILLEGAL_OPERATION_ERROR = 20;
 
+    public static final int INVALIDATED_RESUME_TOKEN_ERROR = 260;
+    public static final int CHANGE_STREAM_FATAL_ERROR = 280;
+    public static final int CHANGE_STREAM_HISTORY_LOST = 286;
+    public static final int BSON_OBJECT_TOO_LARGE = 10334;
+
+    public static final Set<Integer> INVALID_CHANGE_STREAM_ERRORS =
+            new HashSet<>(
+                    asList(
+                            INVALIDATED_RESUME_TOKEN_ERROR,
+                            CHANGE_STREAM_FATAL_ERROR,
+                            CHANGE_STREAM_HISTORY_LOST,
+                            BSON_OBJECT_TOO_LARGE));
+
+    public static final String RESUME_TOKEN = "resume token";
+    public static final String NOT_FOUND = "not found";
+    public static final String DOES_NOT_EXIST = "does not exist";
+    public static final String INVALID_RESUME_TOKEN = "invalid resume token";
+    public static final String NO_LONGER_IN_THE_OPLOG = "no longer be in the oplog";
+
     public static final int UNKNOWN_FIELD_ERROR = 40415;
 
     public static final String DROPPED_FIELD = "dropped";
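
Note: the error codes and message fragments added above are consumed by MongodbUtils (also modified in this commit, not shown in this excerpt) to decide whether a failed change stream cursor can be recreated and whether the resume token itself has expired. The following is only a minimal sketch of that kind of classification, assuming the checks amount to matching the error code against INVALID_CHANGE_STREAM_ERRORS and scanning the error message for the resume-token phrases; the class and method names are illustrative, and the real checkIfChangeStreamCursorExpires / checkIfResumeTokenExpires implementations may differ.

```java
import com.mongodb.MongoCommandException;

import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.DOES_NOT_EXIST;
import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.INVALID_CHANGE_STREAM_ERRORS;
import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.INVALID_RESUME_TOKEN;
import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.NOT_FOUND;
import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.NO_LONGER_IN_THE_OPLOG;
import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.RESUME_TOKEN;

/** Illustrative sketch only; the actual checks live in MongodbUtils. */
final class ChangeStreamErrorSketch {

    /** Treat the cursor as expired when the server reports one of the invalid-change-stream codes. */
    static boolean cursorExpired(MongoCommandException e) {
        return INVALID_CHANGE_STREAM_ERRORS.contains(e.getErrorCode());
    }

    /** Treat the resume token as expired when the message says it is unknown or gone from the oplog. */
    static boolean resumeTokenExpired(MongoCommandException e) {
        String msg = e.getErrorMessage() == null ? "" : e.getErrorMessage().toLowerCase();
        return msg.contains(RESUME_TOKEN)
                && (msg.contains(NOT_FOUND)
                        || msg.contains(DOES_NOT_EXIST)
                        || msg.contains(INVALID_RESUME_TOKEN)
                        || msg.contains(NO_LONGER_IN_THE_OPLOG));
    }

    private ChangeStreamErrorSketch() {}
}
```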
Diff for: seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/source/MongoDBRecordEmitter.java

+95

@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source;
+
+import org.apache.seatunnel.api.source.Collector;
+import org.apache.seatunnel.api.source.SourceReader;
+import org.apache.seatunnel.connectors.cdc.base.source.offset.Offset;
+import org.apache.seatunnel.connectors.cdc.base.source.offset.OffsetFactory;
+import org.apache.seatunnel.connectors.cdc.base.source.reader.IncrementalSourceReader;
+import org.apache.seatunnel.connectors.cdc.base.source.reader.IncrementalSourceRecordEmitter;
+import org.apache.seatunnel.connectors.cdc.base.source.split.state.IncrementalSplitState;
+import org.apache.seatunnel.connectors.cdc.base.source.split.state.SourceSplitStateBase;
+import org.apache.seatunnel.connectors.cdc.debezium.DebeziumDeserializationSchema;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.offset.ChangeStreamOffset;
+import org.apache.seatunnel.connectors.seatunnel.common.source.reader.RecordEmitter;
+
+import org.apache.kafka.connect.source.SourceRecord;
+
+import org.bson.BsonDocument;
+
+import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isHighWatermarkEvent;
+import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isLowWatermarkEvent;
+import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isSchemaChangeAfterWatermarkEvent;
+import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isSchemaChangeBeforeWatermarkEvent;
+import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isWatermarkEvent;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils.getResumeToken;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils.isDataChangeRecord;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils.isHeartbeatEvent;
+
+/**
+ * The {@link RecordEmitter} implementation for {@link IncrementalSourceReader}.
+ *
+ * <p>The {@link RecordEmitter} buffers the snapshot records of split and call the stream reader to
+ * emit records rather than emit the records directly.
+ */
+public final class MongoDBRecordEmitter<T> extends IncrementalSourceRecordEmitter<T> {
+
+    public MongoDBRecordEmitter(
+            DebeziumDeserializationSchema<T> deserializationSchema,
+            OffsetFactory offsetFactory,
+            SourceReader.Context context) {
+        super(deserializationSchema, offsetFactory, context);
+    }
+
+    @Override
+    protected void processElement(
+            SourceRecord element, Collector<T> output, SourceSplitStateBase splitState)
+            throws Exception {
+        if (isWatermarkEvent(element)) {
+            Offset watermark = getOffsetPosition(element);
+            if (isLowWatermarkEvent(element) && splitState.isSnapshotSplitState()) {
+                splitState.asSnapshotSplitState().setLowWatermark(watermark);
+            } else if (isHighWatermarkEvent(element) && splitState.isSnapshotSplitState()) {
+                splitState.asSnapshotSplitState().setHighWatermark(watermark);
+            } else if ((isSchemaChangeBeforeWatermarkEvent(element)
+                            || isSchemaChangeAfterWatermarkEvent(element))
+                    && splitState.isIncrementalSplitState()) {
+                emitElement(element, output);
+            }
+        } else if (isDataChangeRecord(element) || isHeartbeatEvent(element)) {
+            if (splitState.isIncrementalSplitState()) {
+                updatePositionForStreamSplit(element, splitState);
+            }
+            emitElement(element, output);
+        } else {
+            emitElement(element, output);
+        }
+    }
+
+    private void updatePositionForStreamSplit(
+            SourceRecord element, SourceSplitStateBase splitState) {
+        BsonDocument resumeToken = getResumeToken(element);
+        IncrementalSplitState streamSplitState = splitState.asIncrementalSplitState();
+        ChangeStreamOffset offset = (ChangeStreamOffset) streamSplitState.getStartupOffset();
+        if (offset != null) {
+            offset.updatePosition(resumeToken);
+        }
+        splitState.asIncrementalSplitState().setStartupOffset(offset);
+    }
+}

Diff for: seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/source/fetch/MongodbStreamFetchTask.java

+74 −21

@@ -27,6 +27,7 @@
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.offset.ChangeStreamDescriptor;
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.offset.ChangeStreamOffset;
 import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbUtils;
 
 import org.apache.kafka.common.utils.SystemTime;
 import org.apache.kafka.common.utils.Time;
@@ -46,6 +47,7 @@
 import com.mongodb.client.ChangeStreamIterable;
 import com.mongodb.client.MongoChangeStreamCursor;
 import com.mongodb.client.MongoClient;
+import com.mongodb.client.model.changestream.OperationType;
 import com.mongodb.kafka.connect.source.heartbeat.HeartbeatManager;
 import io.debezium.connector.base.ChangeEventQueue;
 import io.debezium.pipeline.DataChangeEvent;
@@ -68,6 +70,7 @@
 import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.ID_FIELD;
 import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.ILLEGAL_OPERATION_ERROR;
 import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.NS_FIELD;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.OPERATION_TYPE;
 import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.SNAPSHOT_FIELD;
 import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.SOURCE_FIELD;
 import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.TS_MS_FIELD;
@@ -117,7 +120,23 @@ public void execute(Context context) {
         this.taskRunning = true;
         try {
             while (taskRunning) {
-                Optional<BsonDocument> next = Optional.ofNullable(changeStreamCursor.tryNext());
+                Optional<BsonDocument> next;
+                try {
+                    next = Optional.ofNullable(changeStreamCursor.tryNext());
+                } catch (MongoCommandException e) {
+                    if (MongodbUtils.checkIfChangeStreamCursorExpires(e)) {
+                        log.warn("Change stream cursor has expired, trying to recreate cursor");
+                        boolean resumeTokenExpires = MongodbUtils.checkIfResumeTokenExpires(e);
+                        if (resumeTokenExpires) {
+                            log.warn(
+                                    "Resume token has expired, fallback to timestamp restart mode");
+                        }
+                        changeStreamCursor = openChangeStreamCursor(descriptor, resumeTokenExpires);
+                        next = Optional.ofNullable(changeStreamCursor.tryNext());
+                    } else {
+                        throw e;
+                    }
+                }
                 SourceRecord changeRecord = null;
                 if (!next.isPresent()) {
                     long untilNext = nextUpdate - time.milliseconds();
@@ -138,34 +157,51 @@ public void execute(Context context) {
                     nextUpdate = time.milliseconds() + sourceConfig.getPollAwaitTimeMillis();
                 } else {
                     BsonDocument changeStreamDocument = next.get();
-                    MongoNamespace namespace = getMongoNamespace(changeStreamDocument);
-
-                    BsonDocument resumeToken = changeStreamDocument.getDocument(ID_FIELD);
-                    BsonDocument valueDocument =
-                            normalizeChangeStreamDocument(changeStreamDocument);
-
-                    log.trace("Adding {} to {}", valueDocument, namespace.getFullName());
-
-                    changeRecord =
-                            MongodbRecordUtils.buildSourceRecord(
-                                    createPartitionMap(
-                                            sourceConfig.getHosts(),
-                                            namespace.getDatabaseName(),
-                                            namespace.getCollectionName()),
-                                    createSourceOffsetMap(resumeToken, false),
-                                    namespace.getFullName(),
-                                    changeStreamDocument.getDocument(ID_FIELD),
-                                    valueDocument);
+                    OperationType operationType = getOperationType(changeStreamDocument);
+
+                    switch (operationType) {
+                        case INSERT:
+                        case UPDATE:
+                        case REPLACE:
+                        case DELETE:
+                            MongoNamespace namespace = getMongoNamespace(changeStreamDocument);
+
+                            BsonDocument resumeToken = changeStreamDocument.getDocument(ID_FIELD);
+                            BsonDocument valueDocument =
+                                    normalizeChangeStreamDocument(changeStreamDocument);
+
+                            log.trace("Adding {} to {}", valueDocument, namespace.getFullName());
+
+                            changeRecord =
+                                    MongodbRecordUtils.buildSourceRecord(
+                                            createPartitionMap(
+                                                    sourceConfig.getHosts(),
+                                                    namespace.getDatabaseName(),
+                                                    namespace.getCollectionName()),
+                                            createSourceOffsetMap(resumeToken, false),
+                                            namespace.getFullName(),
+                                            changeStreamDocument.getDocument(ID_FIELD),
+                                            valueDocument);
+                            break;
+                        default:
+                            // Ignore drop、drop_database、rename and other record to prevent
+                            // documentKey from being empty.
+                            log.info("Ignored {} record: {}", operationType, changeStreamDocument);
+                    }
                 }
 
-                if (changeRecord != null) {
+                if (changeRecord != null && !isBoundedRead()) {
                     queue.enqueue(new DataChangeEvent(changeRecord));
                 }
 
                 if (isBoundedRead()) {
                     ChangeStreamOffset currentOffset;
                     if (changeRecord != null) {
                         currentOffset = new ChangeStreamOffset(getResumeToken(changeRecord));
+                        // The log after the high watermark won't emit.
+                        if (currentOffset.isAtOrBefore(streamSplit.getStopOffset())) {
+                            queue.enqueue(new DataChangeEvent(changeRecord));
+                        }
                     } else {
                         // Heartbeat is not turned on or there is no update event
                         currentOffset = new ChangeStreamOffset(getCurrentClusterTime(mongoClient));
@@ -215,6 +251,11 @@ public IncrementalSplit getSplit() {
 
     private MongoChangeStreamCursor<BsonDocument> openChangeStreamCursor(
            ChangeStreamDescriptor changeStreamDescriptor) {
+        return openChangeStreamCursor(changeStreamDescriptor, false);
+    }
+
+    private MongoChangeStreamCursor<BsonDocument> openChangeStreamCursor(
+            ChangeStreamDescriptor changeStreamDescriptor, boolean forceTimestampStartup) {
         ChangeStreamOffset offset =
                 new ChangeStreamOffset(streamSplit.getStartupOffset().getOffset());
 
@@ -224,7 +265,7 @@ private MongoChangeStreamCursor<BsonDocument> openChangeStreamCursor(
         BsonDocument resumeToken = offset.getResumeToken();
         BsonTimestamp timestamp = offset.getTimestamp();
 
-        if (resumeToken != null) {
+        if (resumeToken != null && !forceTimestampStartup) {
             if (supportsStartAfter) {
                 log.info("Open the change stream after the previous offset: {}", resumeToken);
                 changeStreamIterable.startAfter(resumeToken);
@@ -238,6 +279,11 @@ private MongoChangeStreamCursor<BsonDocument> openChangeStreamCursor(
             if (supportsStartAtOperationTime) {
                 log.info("Open the change stream at the timestamp: {}", timestamp);
                 changeStreamIterable.startAtOperationTime(timestamp);
+            } else if (forceTimestampStartup) {
+                log.error("Open change stream failed. Unable to resume from timestamp");
+                throw new MongodbConnectorException(
+                        ILLEGAL_ARGUMENT,
+                        "Open change stream failed. Unable to resume from timestamp");
             } else {
                 log.warn("Open the change stream of the latest offset");
             }
@@ -273,6 +319,9 @@ private MongoChangeStreamCursor<BsonDocument> openChangeStreamCursor(
                             "Unauthorized $changeStream operation: %s %s",
                             e.getErrorMessage(), e.getErrorCode()));
 
+        } else if (!forceTimestampStartup && MongodbUtils.checkIfResumeTokenExpires(e)) {
+            log.info("Failed to open cursor with resume token, fallback to timestamp startup");
+            return openChangeStreamCursor(changeStreamDescriptor, true);
         } else {
             throw new MongodbConnectorException(ILLEGAL_ARGUMENT, "Open change stream failed");
         }
@@ -353,6 +402,10 @@ private MongoNamespace getMongoNamespace(@Nonnull BsonDocument changeStreamDocum
                 ns.getString(DB_FIELD).getValue(), ns.getString(COLL_FIELD).getValue());
     }
 
+    private OperationType getOperationType(BsonDocument changeStreamDocument) {
+        return OperationType.fromString(changeStreamDocument.getString(OPERATION_TYPE).getValue());
+    }
+
     private boolean isBoundedRead() {
         return !NO_STOPPING_OFFSET.equals(streamSplit.getStopOffset());
    }
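
Note: the new switch above forwards only insert/update/replace/delete events and skips drop, rename, and similar events that carry no documentKey. A standalone sketch of that filtering rule (not part of the commit) using the driver's OperationType is shown below; the class name, the shouldEmit helper, and the inline "operationType" literal (the connector uses its OPERATION_TYPE constant) are illustrative only.

```java
import com.mongodb.client.model.changestream.OperationType;

import org.bson.BsonDocument;
import org.bson.BsonString;

import java.util.Arrays;
import java.util.EnumSet;
import java.util.Set;

/** Illustrative sketch of the operation-type filter introduced in MongodbStreamFetchTask. */
public final class OperationTypeFilterSketch {

    // Only data-change events are turned into source records; everything else is ignored.
    private static final Set<OperationType> EMITTED =
            EnumSet.of(
                    OperationType.INSERT,
                    OperationType.UPDATE,
                    OperationType.REPLACE,
                    OperationType.DELETE);

    static boolean shouldEmit(BsonDocument changeStreamDocument) {
        OperationType type =
                OperationType.fromString(
                        changeStreamDocument.getString("operationType").getValue());
        return EMITTED.contains(type);
    }

    public static void main(String[] args) {
        for (String op :
                Arrays.asList("insert", "update", "replace", "delete", "drop", "rename", "invalidate")) {
            BsonDocument doc = new BsonDocument("operationType", new BsonString(op));
            System.out.println(op + " -> emitted=" + shouldEmit(doc));
        }
    }

    private OperationTypeFilterSketch() {}
}
```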

Diff for: seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/source/offset/ChangeStreamOffset.java

+6
@@ -61,6 +61,12 @@ public ChangeStreamOffset(BsonTimestamp timestamp) {
         this.offset = offsetMap;
     }
 
+    public void updatePosition(BsonDocument resumeToken) {
+        Objects.requireNonNull(resumeToken);
+        offset.put(TIMESTAMP_FIELD, String.valueOf(decodeTimestamp(resumeToken).getValue()));
+        offset.put(RESUME_TOKEN_FIELD, resumeToken.toJson());
+    }
+
     @Nullable public BsonDocument getResumeToken() {
         String resumeTokenJson = offset.get(RESUME_TOKEN_FIELD);
         return Optional.ofNullable(resumeTokenJson).map(BsonDocument::parse).orElse(null);
