-
Notifications
You must be signed in to change notification settings - Fork 40
Fix RFS Shutdown logic during exception cases and set kafka tests as isolated #1385
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -429,37 +429,48 @@ private static void exitOnLeaseTimeout( | |
log.atWarn().setMessage("Terminating RfsMigrateDocuments because the lease has expired for {}") | ||
.addArgument(workItemId) | ||
.log(); | ||
if (progressCursorRef.get() != null) { | ||
log.atWarn().setMessage("Progress cursor set, cancelling active doc migration").log(); | ||
cancellationRunnable.run(); | ||
// Get a new progressCursor after cancellation for most up-to-date checkpoint | ||
var progressCursor = progressCursorRef.get(); | ||
log.atWarn().setMessage("Progress cursor: {}") | ||
.addArgument(progressCursor).log(); | ||
var workItemAndDuration = workItemRef.get(); | ||
if (workItemAndDuration == null) { | ||
throw new IllegalStateException("Unexpected state with progressCursor set without a" + | ||
"work item"); | ||
try { | ||
if (progressCursorRef.get() != null) { | ||
log.atWarn().setMessage("Progress cursor set, cancelling active doc migration").log(); | ||
cancellationRunnable.run(); | ||
// Get a new progressCursor after cancellation for most up-to-date checkpoint | ||
var progressCursor = progressCursorRef.get(); | ||
log.atWarn().setMessage("Progress cursor: {}") | ||
.addArgument(progressCursor).log(); | ||
var workItemAndDuration = workItemRef.get(); | ||
if (workItemAndDuration == null) { | ||
throw new IllegalStateException("Unexpected state with progressCursor set without a" + | ||
"work item"); | ||
} | ||
log.atWarn().setMessage("Work Item and Duration: {}").addArgument(workItemAndDuration) | ||
.log(); | ||
log.atWarn().setMessage("Work Item: {}").addArgument(workItemAndDuration.getWorkItem()) | ||
.log(); | ||
var successorWorkItemIds = getSuccessorWorkItemIds(workItemAndDuration, progressCursor); | ||
if (successorWorkItemIds.size() == 1 && workItemId.equals(successorWorkItemIds.get(0))) { | ||
log.atWarn().setMessage("No real progress was made for work item: {}. Will retry with larger timeout").addArgument(workItemId).log(); | ||
} else { | ||
log.atWarn().setMessage("Successor Work Ids: {}").addArgument(String.join(", ", successorWorkItemIds)) | ||
.log(); | ||
var successorNextAcquisitionLeaseExponent = getSuccessorNextAcquisitionLeaseExponent(workItemTimeProvider, initialLeaseDuration, workItemAndDuration.getLeaseExpirationTime()); | ||
coordinator.createSuccessorWorkItemsAndMarkComplete( | ||
workItemId, | ||
successorWorkItemIds, | ||
successorNextAcquisitionLeaseExponent, | ||
contextSupplier | ||
); | ||
} | ||
} else { | ||
log.atWarn().setMessage("No progress cursor to create successor work items from. This can happen when" + | ||
"downloading and unpacking shard takes longer than the lease").log(); | ||
log.atWarn().setMessage("Skipping creation of successor work item to retry the existing one with more time") | ||
.log(); | ||
} | ||
Comment on lines
+463
to
468
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: Can we invert the flow of control and return right away if the precondition fails? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ideally we'd have only one 'level' of if/else-if/else blocks in each function; that makes it much cleaner to read. |
||
log.atWarn().setMessage("Work Item and Duration: {}").addArgument(workItemAndDuration) | ||
.log(); | ||
log.atWarn().setMessage("Work Item: {}").addArgument(workItemAndDuration.getWorkItem()) | ||
.log(); | ||
var successorWorkItemIds = getSuccessorWorkItemIds(workItemAndDuration, progressCursor); | ||
log.atWarn().setMessage("Successor Work Ids: {}").addArgument(String.join(", ", successorWorkItemIds)) | ||
.log(); | ||
var successorNextAcquisitionLeaseExponent = getSuccessorNextAcquisitionLeaseExponent(workItemTimeProvider, initialLeaseDuration, workItemAndDuration.getLeaseExpirationTime()); | ||
coordinator.createSuccessorWorkItemsAndMarkComplete( | ||
workItemId, | ||
successorWorkItemIds, | ||
successorNextAcquisitionLeaseExponent, | ||
contextSupplier | ||
); | ||
} else { | ||
log.atWarn().setMessage("No progress cursor to create successor work items from. This can happen when" + | ||
"downloading and unpacking shard takes longer than the lease").log(); | ||
log.atWarn().setMessage("Skipping creation of successor work item to retry the existing one with more time") | ||
.log(); | ||
} catch (Exception e) { | ||
log.atError().setMessage("Exception during exit on lease timeout, clean shutdown failed") | ||
.setCause(e).log(); | ||
cleanShutdownCompleted.set(false); | ||
System.exit(PROCESS_TIMED_OUT_EXIT_CODE); | ||
} | ||
cleanShutdownCompleted.set(true); | ||
System.exit(PROCESS_TIMED_OUT_EXIT_CODE); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a strange case; it seems like
getSuccessorWorkItemIds
should error out internally before returning up to this level. Can we rework this? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getSuccessorWorkItemIds does error out, but this should really be a warning instead of an error.
The case here is when the lease is just long enough to send one request to the target cluster successfully; in that case, getSuccessorWorkItemIds does throw.
With the new try/catch, this would be caught, but this isn't an "Error" case; it is more of a "Warn" case, which is why we shouldn't rely on that exception from getSuccessorWorkItemIds.