You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
compactor: allow grace period for final status update (#15113)
#### What this PR does
This adds a grace period for compactor workers to send final status
updates when the parent context of the scheduler executor is canceled
(e.g. during shutdown). The purpose is for compactors to be able to
request job reassignment before shutting down. This avoids having to
wait for lease expiration if a restart occurs mid-job and, during a
rollout, helps avoid the confusing situation of having more active jobs
than running compactors.
<img width="2420" height="800" alt="image"
src="https://github.com/user-attachments/assets/26a68ffd-0b33-4dba-80c4-40fd90d96bb7"
/>
My original attempt at fixing this was to check `if ctx.Err() != nil`
and then pass a different context, but that subtly failed to provide the
grace period if the shutdown happened during the final update itself.
This approach instead watches the context and starts a timer only once
the context is canceled. That way the deadline is imposed only after
cancellation and does not preempt the normal retry behavior.
#### Which issue(s) this PR fixes or relates to
Fixes N/A
#### Checklist
- [x] Tests updated.
- [ ] Documentation added.
- [ ] `CHANGELOG.md` updated - the order of entries should be
`[CHANGE]`, `[FEATURE]`, `[ENHANCEMENT]`, `[BUGFIX]`. If changelog entry
is not needed, please add the `changelog-not-needed` label to the PR.
- [ ]
[`about-versioning.md`](https://github.com/grafana/mimir/blob/main/docs/sources/mimir/configure/about-versioning.md)
updated with experimental features.
errJobCanceledByScheduler=errors.New("job canceled by scheduler")
46
+
errFinalStatusGracePeriodTimeout=errors.New("final status grace period timed out")
46
47
)
47
48
48
49
// compactionExecutor defines how compaction work is executed.
@@ -82,25 +83,27 @@ func (e *standaloneExecutor) stop() error {
82
83
}
83
84
84
85
var (
85
-
errInvalidSchedulerEndpoint=fmt.Errorf("invalid compactor.scheduler-client.scheduler-endpoint, required when compactor.scheduler-client.enabled is true")
86
-
errInvalidSchedulerUpdateInterval=fmt.Errorf("invalid compactor.scheduler-client.update-interval, interval must be positive")
87
-
errInvalidSchedulerLeasingMinBackoff=fmt.Errorf("invalid compactor.scheduler-client.leasing-min-backoff, must be positive")
88
-
errInvalidSchedulerLeasingMaxBackoff=fmt.Errorf("invalid compactor.scheduler-client.leasing-max-backoff, must be greater than min backoff")
89
-
errInvalidSchedulerUpdateMinBackoff=fmt.Errorf("invalid compactor.scheduler-client.update-min-backoff, must be positive")
90
-
errInvalidSchedulerUpdateMaxBackoff=fmt.Errorf("invalid compactor.scheduler-client.update-max-backoff, must be greater than min backoff")
86
+
errInvalidSchedulerEndpoint=fmt.Errorf("invalid compactor.scheduler-client.scheduler-endpoint, required when compactor.scheduler-client.enabled is true")
87
+
errInvalidSchedulerUpdateInterval=fmt.Errorf("invalid compactor.scheduler-client.update-interval, interval must be positive")
88
+
errInvalidSchedulerLeasingMinBackoff=fmt.Errorf("invalid compactor.scheduler-client.leasing-min-backoff, must be positive")
89
+
errInvalidSchedulerLeasingMaxBackoff=fmt.Errorf("invalid compactor.scheduler-client.leasing-max-backoff, must be greater than min backoff")
90
+
errInvalidSchedulerUpdateMinBackoff=fmt.Errorf("invalid compactor.scheduler-client.update-min-backoff, must be positive")
91
+
errInvalidSchedulerUpdateMaxBackoff=fmt.Errorf("invalid compactor.scheduler-client.update-max-backoff, must be greater than min backoff")
92
+
errInvalidSchedulerTerminatingFinalStatusTimeout=fmt.Errorf("invalid compactor.scheduler-client.terminating-final-status-timeout, must be positive")
f.DurationVar(&cfg.UpdateMinBackoff, flagPrefix+"update-min-backoff", 1*time.Second, "Minimum backoff time for compaction executor retries when sending scheduler status updates.")
114
117
f.DurationVar(&cfg.UpdateMaxBackoff, flagPrefix+"update-max-backoff", 32*time.Second, "Maximum backoff time for compaction executor retries when sending scheduler status updates.")
115
118
f.DurationVar(&cfg.CompactionDirCleanupInterval, flagPrefix+"compaction-dir-cleanup-interval", 30*time.Minute, "Defines how frequently to clean up the compaction working directory. The directory is cleaned on startup and then only when this interval has elapsed since the last cleanup. Set to 0 to disable periodic cleanup.")
119
+
f.DurationVar(&cfg.TerminatingFinalStatusTimeout, flagPrefix+"terminating-final-status-timeout", 30*time.Second, "Timeout for sending a final job status update to the scheduler when the parent context is canceled (e.g. during shutdown).")
0 commit comments