Skip to content

Commit 9954d1e

Browse files
Hisoka-Xhailin0
andauthored
[Fix][Zeta] Fix the thread stuck problem caused by savepoint checking mechanism (#6568)
Co-authored-by: hailin0 <[email protected]>
1 parent 24441c8 commit 9954d1e

File tree

8 files changed

+268
-28
lines changed

8 files changed

+268
-28
lines changed

Diff for: seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/sink/inmemory/InMemorySink.java

+9-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.seatunnel.e2e.sink.inmemory;
1919

20+
import org.apache.seatunnel.api.configuration.ReadonlyConfig;
2021
import org.apache.seatunnel.api.serialization.DefaultSerializer;
2122
import org.apache.seatunnel.api.serialization.Serializer;
2223
import org.apache.seatunnel.api.sink.SeaTunnelSink;
@@ -35,6 +36,13 @@ public class InMemorySink
3536
InMemoryCommitInfo,
3637
InMemoryAggregatedCommitInfo>,
3738
SupportMultiTableSink {
39+
40+
private ReadonlyConfig config;
41+
42+
public InMemorySink(ReadonlyConfig config) {
43+
this.config = config;
44+
}
45+
3846
@Override
3947
public String getPluginName() {
4048
return "InMemorySink";
@@ -43,7 +51,7 @@ public String getPluginName() {
4351
@Override
4452
public SinkWriter<SeaTunnelRow, InMemoryCommitInfo, InMemoryState> createWriter(
4553
SinkWriter.Context context) throws IOException {
46-
return new InMemorySinkWriter();
54+
return new InMemorySinkWriter(config);
4755
}
4856

4957
@Override

Diff for: seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/sink/inmemory/InMemorySinkFactory.java

+9-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
package org.apache.seatunnel.e2e.sink.inmemory;
1919

20+
import org.apache.seatunnel.api.configuration.Option;
21+
import org.apache.seatunnel.api.configuration.Options;
2022
import org.apache.seatunnel.api.configuration.util.OptionRule;
2123
import org.apache.seatunnel.api.table.connector.TableSink;
2224
import org.apache.seatunnel.api.table.factory.Factory;
@@ -31,19 +33,24 @@ public class InMemorySinkFactory
3133
implements TableSinkFactory<
3234
SeaTunnelRow, InMemoryState, InMemoryCommitInfo, InMemoryAggregatedCommitInfo> {
3335

36+
public static final Option<Boolean> THROW_EXCEPTION =
37+
Options.key("throw_exception").booleanType().defaultValue(false);
38+
public static final Option<Boolean> CHECKPOINT_SLEEP =
39+
Options.key("checkpoint_sleep").booleanType().defaultValue(false);
40+
3441
@Override
3542
public String factoryIdentifier() {
3643
return "InMemory";
3744
}
3845

3946
@Override
4047
public OptionRule optionRule() {
41-
return OptionRule.builder().build();
48+
return OptionRule.builder().optional(THROW_EXCEPTION, CHECKPOINT_SLEEP).build();
4249
}
4350

4451
@Override
4552
public TableSink<SeaTunnelRow, InMemoryState, InMemoryCommitInfo, InMemoryAggregatedCommitInfo>
4653
createSink(TableSinkFactoryContext context) {
47-
return InMemorySink::new;
54+
return () -> new InMemorySink(context.getOptions());
4855
}
4956
}

Diff for: seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/sink/inmemory/InMemorySinkWriter.java

+18
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.seatunnel.e2e.sink.inmemory;
1919

20+
import org.apache.seatunnel.api.configuration.ReadonlyConfig;
2021
import org.apache.seatunnel.api.sink.MultiTableResourceManager;
2122
import org.apache.seatunnel.api.sink.SinkWriter;
2223
import org.apache.seatunnel.api.sink.SupportMultiTableSinkWriter;
@@ -66,13 +67,30 @@ public static List<InMemoryMultiTableResourceManager> getResourceManagers() {
6667
return resourceManagers;
6768
}
6869

70+
private ReadonlyConfig config;
71+
72+
public InMemorySinkWriter(ReadonlyConfig config) {
73+
this.config = config;
74+
}
75+
6976
private InMemoryMultiTableResourceManager resourceManager;
7077

7178
@Override
7279
public void write(SeaTunnelRow element) throws IOException {}
7380

7481
@Override
7582
public Optional<InMemoryCommitInfo> prepareCommit() throws IOException {
83+
try {
84+
if (config.get(InMemorySinkFactory.THROW_EXCEPTION)) {
85+
Thread.sleep(4000L);
86+
throw new IOException("write failed");
87+
}
88+
if (config.get(InMemorySinkFactory.CHECKPOINT_SLEEP)) {
89+
Thread.sleep(5000L);
90+
}
91+
} catch (InterruptedException e) {
92+
throw new RuntimeException(e);
93+
}
7694
return Optional.of(new InMemoryCommitInfo());
7795
}
7896

Diff for: seatunnel-engine/seatunnel-engine-server/pom.xml

+7
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@
8888
<classifier>optional</classifier>
8989
<scope>provided</scope>
9090
</dependency>
91+
<dependency>
92+
<groupId>org.apache.seatunnel</groupId>
93+
<artifactId>seatunnel-e2e-common</artifactId>
94+
<version>${project.version}</version>
95+
<type>test-jar</type>
96+
<scope>test</scope>
97+
</dependency>
9198
<dependency>
9299
<groupId>com.squareup.okhttp</groupId>
93100
<artifactId>mockwebserver</artifactId>

Diff for: seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointCoordinator.java

+37-11
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,10 @@ public class CheckpointCoordinator {
142142

143143
private final IMap<Object, Object> runningJobStateIMap;
144144

145+
// save pending checkpoint for savepoint, to make sure the different savepoint request can be
146+
// processed with one savepoint operation in the same time.
147+
private PendingCheckpoint savepointPendingCheckpoint;
148+
145149
private final String checkpointStateImapKey;
146150

147151
@SneakyThrows
@@ -449,15 +453,14 @@ protected void tryTriggerPendingCheckpoint(CheckpointType checkpointType) {
449453
CompletableFuture<PendingCheckpoint> pendingCheckpoint =
450454
createPendingCheckpoint(currentTimestamp, checkpointType);
451455
startTriggerPendingCheckpoint(pendingCheckpoint);
452-
pendingCounter.incrementAndGet();
453456
// if checkpoint type are final type, we don't need to trigger next checkpoint
454457
if (checkpointType.notFinalCheckpoint() && checkpointType.notSchemaChangeCheckpoint()) {
455458
scheduleTriggerPendingCheckpoint(coordinatorConfig.getCheckpointInterval());
456459
}
457460
}
458461
}
459462

460-
public boolean isShutdown() {
463+
private boolean isShutdown() {
461464
return shutdown;
462465
}
463466

@@ -472,29 +475,45 @@ public static Map<Long, Integer> getPipelineTasks(Set<TaskLocation> pipelineSubt
472475
@SneakyThrows
473476
public PassiveCompletableFuture<CompletedCheckpoint> startSavepoint() {
474477
LOG.info(String.format("Start save point for Job (%s)", jobId));
478+
if (shutdown || isCompleted()) {
479+
return completableFutureWithError(
480+
CheckpointCloseReason.CHECKPOINT_COORDINATOR_SHUTDOWN);
481+
}
475482
if (!isAllTaskReady) {
476-
CompletableFuture<CompletedCheckpoint> savepointFuture = new CompletableFuture<>();
477-
savepointFuture.completeExceptionally(
478-
new CheckpointException(
479-
CheckpointCloseReason.TASK_NOT_ALL_READY_WHEN_SAVEPOINT));
480-
return new PassiveCompletableFuture<>(savepointFuture);
483+
return completableFutureWithError(
484+
CheckpointCloseReason.TASK_NOT_ALL_READY_WHEN_SAVEPOINT);
485+
}
486+
if (savepointPendingCheckpoint != null
487+
&& !savepointPendingCheckpoint.getCompletableFuture().isDone()) {
488+
return savepointPendingCheckpoint.getCompletableFuture();
481489
}
482490
CompletableFuture<PendingCheckpoint> savepoint;
483491
synchronized (lock) {
484-
while (pendingCounter.get() > 0) {
492+
while (pendingCounter.get() > 0 && !shutdown) {
485493
Thread.sleep(500);
486494
}
495+
if (shutdown || isCompleted()) {
496+
return completableFutureWithError(
497+
CheckpointCloseReason.CHECKPOINT_COORDINATOR_SHUTDOWN);
498+
}
487499
savepoint = createPendingCheckpoint(Instant.now().toEpochMilli(), SAVEPOINT_TYPE);
488500
startTriggerPendingCheckpoint(savepoint);
489501
}
490-
PendingCheckpoint savepointPendingCheckpoint = savepoint.join();
502+
savepointPendingCheckpoint = savepoint.join();
491503
LOG.info(
492504
String.format(
493505
"The save point checkpointId is %s",
494506
savepointPendingCheckpoint.getCheckpointId()));
495507
return savepointPendingCheckpoint.getCompletableFuture();
496508
}
497509

510+
private PassiveCompletableFuture<CompletedCheckpoint> completableFutureWithError(
511+
CheckpointCloseReason closeReason) {
512+
CompletableFuture<CompletedCheckpoint> future = new CompletableFuture<>();
513+
future.completeExceptionally(new CheckpointException(closeReason));
514+
return new PassiveCompletableFuture<>(future);
515+
}
516+
498517
private void startTriggerPendingCheckpoint(
499518
CompletableFuture<PendingCheckpoint> pendingCompletableFuture) {
500519
pendingCompletableFuture.thenAccept(
@@ -577,9 +596,10 @@ private void startTriggerPendingCheckpoint(
577596
TimeUnit.MILLISECONDS));
578597
}
579598
});
599+
pendingCounter.incrementAndGet();
580600
}
581601

582-
CompletableFuture<PendingCheckpoint> createPendingCheckpoint(
602+
private CompletableFuture<PendingCheckpoint> createPendingCheckpoint(
583603
long triggerTimestamp, CheckpointType checkpointType) {
584604
synchronized (lock) {
585605
CompletableFuture<Long> idFuture;
@@ -610,7 +630,7 @@ CompletableFuture<PendingCheckpoint> createPendingCheckpoint(
610630
}
611631
}
612632

613-
CompletableFuture<PendingCheckpoint> triggerPendingCheckpoint(
633+
private CompletableFuture<PendingCheckpoint> triggerPendingCheckpoint(
614634
long triggerTimestamp,
615635
CompletableFuture<Long> idFuture,
616636
CheckpointType checkpointType) {
@@ -944,4 +964,10 @@ protected void completeSchemaChangeAfterCheckpoint(CompletedCheckpoint checkpoin
944964
checkpoint.getCheckpointId(), pipelineId, jobId));
945965
}
946966
}
967+
968+
/** Only for test */
969+
@VisibleForTesting
970+
public PendingCheckpoint getSavepointPendingCheckpoint() {
971+
return savepointPendingCheckpoint;
972+
}
947973
}

0 commit comments

Comments
 (0)