Skip to content

Commit b19e140

Browse files
feat(RELEASE-2120): add managed pipeline retry with mitigations
Retry managed PipelineRuns that fail with OOM or timeout. Generic errors are not retried.

* Add EnsureManagedPipelineProcessingIsCompleted operation for retry or mark release as failed
* Keep managed condition as Progressing during retries
* Apply memory and timeout mitigations on each retry using the failed PipelineRun specs as the base
* Bump pipeline timeouts when task timeout grows
* Add attempt label for PipelineRun lookup
* Handle cleanup across multiple PipelineRun attempts
* Rename ManagedPipelineAttempt to PipelineAttempt
* Add GetRoleBindingFromPipelineAttempt and GetReleasePipelineRunAttempt to loader

Assisted-by: Claude
Signed-off-by: Sean Conroy <sconroy@redhat.com>
1 parent 3e0d009 commit b19e140

20 files changed

Lines changed: 1421 additions & 157 deletions

api/v1alpha1/release_types.go

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ type ReleaseStatus struct {
8787

8888
// ManagedPipelineAttempts contains information about each attempt of the release managed pipeline processing
8989
// +optional
90-
ManagedPipelineAttempts []ManagedPipelineAttempt `json:"managedPipelineAttempts,omitempty"`
90+
ManagedPipelineAttempts []PipelineAttempt `json:"managedPipelineAttempts,omitempty"`
9191

9292
// TenantProcessing contains information about the release tenant processing
9393
// +optional
@@ -178,8 +178,8 @@ type PipelineInfo struct {
178178
StartTime *metav1.Time `json:"startTime,omitempty"`
179179
}
180180

181-
// ManagedPipelineAttempt defines the observed state of a managed pipeline processing attempt
182-
type ManagedPipelineAttempt struct {
181+
// PipelineAttempt defines the observed state of a pipeline processing attempt
182+
type PipelineAttempt struct {
183183
// PipelineRun contains the namespaced name of the managed Release PipelineRun executed as part of this attempt
184184
// +kubebuilder:validation:Pattern=^[a-z0-9]([-a-z0-9]*[a-z0-9])?\/[a-z0-9]([-a-z0-9]*[a-z0-9])?$
185185
// +optional
@@ -260,14 +260,19 @@ func (r *Release) HasManagedPipelineProcessingFinished() bool {
260260
return r.hasPhaseFinished(managedProcessedConditionType)
261261
}
262262

263-
// GetCurrentManagedPipelineAttempt returns a pointer to the last ManagedPipelineAttempt or nil if none exist.
264-
func (r *Release) GetCurrentManagedPipelineAttempt() *ManagedPipelineAttempt {
263+
// GetCurrentManagedPipelineAttempt returns a pointer to the last PipelineAttempt or nil if none exist.
264+
func (r *Release) GetCurrentManagedPipelineAttempt() *PipelineAttempt {
265265
if len(r.Status.ManagedPipelineAttempts) <= 0 {
266266
return nil
267267
}
268268
return &r.Status.ManagedPipelineAttempts[len(r.Status.ManagedPipelineAttempts)-1]
269269
}
270270

271+
// GetManagedPipelineRetryCount returns the number of retries performed so far, i.e. the number of attempts beyond the first one (0 when there are no attempts or only one).
272+
func (r *Release) GetManagedPipelineRetryCount() int {
273+
return max(0, len(r.Status.ManagedPipelineAttempts)-1)
274+
}
275+
271276
// HasTenantCollectorsPipelineProcessingFinished checks whether the Release Tenant Collectors Pipeline processing has finished, regardless of the result.
272277
func (r *Release) HasTenantCollectorsPipelineProcessingFinished() bool {
273278
return r.hasPhaseFinished(tenantCollectorsProcessedConditionType)
@@ -320,6 +325,28 @@ func (r *Release) IsManagedPipelineProcessedSuccessfully() bool {
320325
return meta.IsStatusConditionTrue(r.Status.Conditions, managedProcessedConditionType.String())
321326
}
322327

328+
// IsCurrentManagedPipelineAttemptDone checks whether the current managed pipeline attempt has completed.
329+
func (r *Release) IsCurrentManagedPipelineAttemptDone() bool {
330+
attempt := r.GetCurrentManagedPipelineAttempt()
331+
return attempt != nil && (attempt.Status == AttemptFailedReason || attempt.Status == AttemptSucceededReason)
332+
}
333+
334+
// IsCurrentManagedPipelineAttemptFailed checks whether the current managed pipeline attempt has failed.
335+
func (r *Release) IsCurrentManagedPipelineAttemptFailed() bool {
336+
attempt := r.GetCurrentManagedPipelineAttempt()
337+
return attempt != nil && attempt.Status == AttemptFailedReason
338+
}
339+
340+
// IsCurrentManagedPipelineAttemptRetriable checks whether the current managed pipeline attempt
341+
// failed with a retriable reason.
342+
func (r *Release) IsCurrentManagedPipelineAttemptRetriable() bool {
343+
attempt := r.GetCurrentManagedPipelineAttempt()
344+
return attempt != nil && attempt.Status == AttemptFailedReason &&
345+
(attempt.FailureReason == AttemptFailureOOMKillReason ||
346+
attempt.FailureReason == AttemptFailureTaskRunTimeoutReason ||
347+
attempt.FailureReason == AttemptFailurePipelineRunTimeoutReason)
348+
}
349+
323350
// IsTenantCollectorsPipelineProcessedSuccessfully checks whether the Release Tenant Collectors Pipeline was successfully processed.
324351
func (r *Release) IsTenantCollectorsPipelineProcessedSuccessfully() bool {
325352
return meta.IsStatusConditionTrue(r.Status.Conditions, tenantCollectorsProcessedConditionType.String())
@@ -443,6 +470,10 @@ func (r *Release) MarkCurrentManagedPipelineAttemptProcessed() {
443470
return
444471
}
445472

473+
if r.IsCurrentManagedPipelineAttemptDone() {
474+
return
475+
}
476+
446477
attempt := r.GetCurrentManagedPipelineAttempt()
447478
if attempt == nil {
448479
return
@@ -590,10 +621,12 @@ func (r *Release) MarkCurrentManagedPipelineAttemptProcessing() {
590621
return
591622
}
592623

593-
if !r.IsManagedPipelineProcessing() {
624+
if attempt.StartTime == nil {
594625
attempt.StartTime = &metav1.Time{Time: time.Now()}
595626
// Deprecated: mirror to ManagedProcessing for backward compatibility
596-
r.Status.ManagedProcessing.StartTime = attempt.StartTime
627+
if !r.IsManagedPipelineProcessing() {
628+
r.Status.ManagedProcessing.StartTime = attempt.StartTime
629+
}
597630
}
598631

599632
attempt.Status = AttemptProgressingReason
@@ -694,6 +727,10 @@ func (r *Release) MarkCurrentManagedPipelineAttemptFailed(message, failureReason
694727
return
695728
}
696729

730+
if r.IsCurrentManagedPipelineAttemptDone() {
731+
return
732+
}
733+
697734
attempt := r.GetCurrentManagedPipelineAttempt()
698735
if attempt == nil {
699736
return
@@ -709,8 +746,6 @@ func (r *Release) MarkCurrentManagedPipelineAttemptFailed(message, failureReason
709746
// Deprecated: mirror to ManagedProcessing for backward compatibility
710747
r.Status.ManagedProcessing.CompletionTime = attempt.CompletionTime
711748

712-
conditions.SetConditionWithMessage(&r.Status.Conditions, managedProcessedConditionType, metav1.ConditionFalse, FailedReason, message)
713-
714749
go metrics.RegisterCompletedReleasePipelineProcessing(
715750
attempt.StartTime,
716751
attempt.CompletionTime,

0 commit comments

Comments
 (0)