Skip to content

Commit 1098421

Browse files
authored
Give up partition when there is no Ack (#5499)
* Give up partition when there is no Ack Signed-off-by: Dinu John <[email protected]> * Update No Ack parition timeout to 15 mins Signed-off-by: Dinu John <[email protected]> --------- Signed-off-by: Dinu John <[email protected]>
1 parent 76ba065 commit 1098421

File tree

2 files changed

+31
-17
lines changed

2 files changed

+31
-17
lines changed

data-prepper-plugins/mongodb/src/main/java/org/opensearch/dataprepper/plugins/mongo/stream/StreamAcknowledgementManager.java

+27-12
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
public class StreamAcknowledgementManager {
2222
private static final Logger LOG = LoggerFactory.getLogger(StreamAcknowledgementManager.class);
2323
private static final int CHECKPOINT_RECORD_INTERVAL = 50;
24+
private static final int NO_ACK_PARTITION_TIME_OUT_SECONDS = 900; // 15 minutes
2425
private final ConcurrentLinkedQueue<CheckpointStatus> checkpoints = new ConcurrentLinkedQueue<>();
2526
private final ConcurrentHashMap<String, CheckpointStatus> ackStatus = new ConcurrentHashMap<>();
2627
private final AcknowledgementSetManager acknowledgementSetManager;
@@ -42,7 +43,7 @@ public class StreamAcknowledgementManager {
4243
public static final String NEGATIVE_ACKNOWLEDGEMENT_SET_METRIC_NAME = "negativeAcknowledgementSets";
4344
public static final String RECORDS_CHECKPOINTED = "recordsCheckpointed";
4445
public static final String NO_DATA_EXTEND_LEASE_COUNT = "noDataExtendLeaseCount";
45-
public static final String GIVE_UP_PARTITION_COUNT = "giveupPartitionCount";
46+
public static final String GIVE_UP_PARTITION_COUNT = "giveUpPartitionCount";
4647

4748

4849
public StreamAcknowledgementManager(final AcknowledgementSetManager acknowledgementSetManager,
@@ -85,29 +86,34 @@ private void monitorAcknowledgment(final ExecutorService executorService, final
8586
checkpointStatus = checkpoints.peek();
8687
ackCount++;
8788
// at high TPS each ack contains 100 records. This should checkpoint every 100*50 = 5000 records.
88-
if (ackCount % CHECKPOINT_RECORD_INTERVAL == 0) {
89+
if ((ackCount % CHECKPOINT_RECORD_INTERVAL == 0) ||
90+
(System.currentTimeMillis() - lastCheckpointTime >= checkPointIntervalInMs)){
8991
checkpoint(lastCheckpointStatus.getResumeToken(), lastCheckpointStatus.getRecordCount());
92+
lastCheckpointTime = System.currentTimeMillis();
9093
}
9194
} while (checkpointStatus != null && checkpointStatus.isPositiveAcknowledgement());
9295
checkpoint(lastCheckpointStatus.getResumeToken(), lastCheckpointStatus.getRecordCount());
9396
lastCheckpointTime = System.currentTimeMillis();
9497
}
9598
} else {
9699
LOG.debug("Checkpoint not complete for resume token {}", checkpointStatus.getResumeToken());
97-
final Duration ackWaitDuration = Duration.between(Instant.ofEpochMilli(checkpointStatus.getCreateTimestamp()), Instant.now());
100+
// negative ack
98101
if (checkpointStatus.isNegativeAcknowledgement()) {
99-
// Give up partition and should interrupt parent thread to stop processing stream
100-
if (lastCheckpointStatus != null && lastCheckpointStatus.isPositiveAcknowledgement()) {
101-
checkpoint(lastCheckpointStatus.getResumeToken(), lastCheckpointStatus.getRecordCount());
102-
}
103-
LOG.warn("Acknowledgement not received for the checkpoint {} past wait time. Giving up partition.", checkpointStatus.getResumeToken());
104-
partitionCheckpoint.giveUpPartition();
105-
this.giveupPartitionCount.increment();
102+
LOG.warn("Negative Acknowledgement received for the checkpoint {}. Giving up partition.", checkpointStatus.getResumeToken());
103+
giveUpPartition(lastCheckpointStatus);
106104
break;
105+
} else {
106+
final Duration ackWaitDuration = Duration.between(Instant.ofEpochMilli(checkpointStatus.getCreateTimestamp()), Instant.now());
107+
// no ack received within timeout period
108+
if (!ackWaitDuration.minusSeconds(NO_ACK_PARTITION_TIME_OUT_SECONDS).isNegative()) {
109+
LOG.warn("Acknowledgement not received for the checkpoint {} past wait time. Giving up partition.", checkpointStatus.getResumeToken());
110+
giveUpPartition(lastCheckpointStatus);
111+
break;
112+
}
107113
}
108114
}
109115
} else {
110-
if (System.currentTimeMillis() - lastCheckpointTime >= checkPointIntervalInMs) {
116+
if (System.currentTimeMillis() - lastCheckpointTime >= checkPointIntervalInMs) { // 1 min
111117
partitionCheckpoint.extendLease();
112118
this.noDataExtendLeaseCount.increment();
113119
lastCheckpointTime = System.currentTimeMillis();
@@ -128,10 +134,19 @@ private void monitorAcknowledgment(final ExecutorService executorService, final
128134
executorService.shutdown();
129135
}
130136

137+
private void giveUpPartition(final CheckpointStatus lastCheckpointStatus) {
138+
// Give up partition and should interrupt parent thread to stop processing stream
139+
if (lastCheckpointStatus != null && lastCheckpointStatus.isPositiveAcknowledgement()) {
140+
checkpoint(lastCheckpointStatus.getResumeToken(), lastCheckpointStatus.getRecordCount());
141+
}
142+
partitionCheckpoint.giveUpPartition();
143+
this.giveupPartitionCount.increment();
144+
}
145+
131146
private void checkpoint(final String resumeToken, final long recordCount) {
132147
LOG.debug("Perform regular checkpointing for resume token {} at record count {}", resumeToken, recordCount);
133148
partitionCheckpoint.checkpoint(resumeToken, recordCount);
134-
this.recordsCheckpointed.increment(recordCount);
149+
this.recordsCheckpointed.increment();
135150
}
136151

137152
Optional<AcknowledgementSet> createAcknowledgementSet(final String resumeToken, final long recordNumber) {

data-prepper-plugins/mongodb/src/test/java/org/opensearch/dataprepper/plugins/mongo/stream/StreamAcknowledgementManagerTest.java

+4-5
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import static org.hamcrest.MatcherAssert.assertThat;
2525
import static org.hamcrest.CoreMatchers.is;
2626
import static org.mockito.ArgumentMatchers.any;
27-
import static org.mockito.ArgumentMatchers.anyDouble;
2827
import static org.mockito.ArgumentMatchers.eq;
2928
import static org.mockito.Mockito.atLeastOnce;
3029
import static org.mockito.Mockito.lenient;
@@ -107,11 +106,11 @@ public void createAcknowledgementSet_enabled_ackSetWithAck() {
107106
assertThat(ackCheckpointStatus.isPositiveAcknowledgement(), is(true));
108107
await()
109108
.atMost(Duration.ofSeconds(10)).untilAsserted(() ->
110-
verify(partitionCheckpoint).checkpoint(resumeToken, recordCount));
109+
verify(partitionCheckpoint, times(2)).checkpoint(resumeToken, recordCount));
111110
assertThat(streamAckManager.getCheckpoints().peek(), is(nullValue()));
112111
verify(positiveAcknowledgementSets).increment();
113112
verifyNoInteractions(negativeAcknowledgementSets);
114-
verify(recordsCheckpointed, atLeastOnce()).increment(anyDouble());
113+
verify(recordsCheckpointed, atLeastOnce()).increment();
115114
}
116115

117116
@Test
@@ -146,12 +145,12 @@ public void createAcknowledgementSet_enabled_multipleAckSetWithAck() {
146145
assertThat(ackCheckpointStatus.isPositiveAcknowledgement(), is(true));
147146
await()
148147
.atMost(Duration.ofSeconds(10)).untilAsserted(() ->
149-
verify(partitionCheckpoint).checkpoint(resumeToken2, recordCount2));
148+
verify(partitionCheckpoint, times(2)).checkpoint(resumeToken2, recordCount2));
150149
assertThat(streamAckManager.getCheckpoints().peek(), is(nullValue()));
151150

152151
verify(positiveAcknowledgementSets, atLeastOnce()).increment();
153152
verifyNoInteractions(negativeAcknowledgementSets);
154-
verify(recordsCheckpointed, atLeastOnce()).increment(anyDouble());
153+
verify(recordsCheckpointed, atLeastOnce()).increment();
155154
}
156155

157156
@Test

0 commit comments

Comments
 (0)