@@ -7,12 +7,12 @@ import (
7
7
"strconv"
8
8
"time"
9
9
10
- "github.com/rancher/system-upgrade-controller/pkg/apis/condition"
11
10
upgradeapi "github.com/rancher/system-upgrade-controller/pkg/apis/upgrade.cattle.io"
12
11
upgradejob "github.com/rancher/system-upgrade-controller/pkg/upgrade/job"
13
12
batchctlv1 "github.com/rancher/wrangler/v3/pkg/generated/controllers/batch/v1"
14
13
"github.com/sirupsen/logrus"
15
14
batchv1 "k8s.io/api/batch/v1"
15
+ corev1 "k8s.io/api/core/v1"
16
16
"k8s.io/apimachinery/pkg/api/errors"
17
17
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
18
18
"k8s.io/apimachinery/pkg/labels"
@@ -81,21 +81,47 @@ func (ctl *Controller) handleJobs(ctx context.Context) error {
81
81
}
82
82
// if the job has failed enqueue-or-delete it depending on the TTL window
83
83
if upgradejob .ConditionFailed .IsTrue (obj ) {
84
- return obj , enqueueOrDelete (jobs , obj , upgradejob .ConditionFailed )
84
+ failedTime := upgradejob .ConditionFailed .GetLastTransitionTime (obj )
85
+ if failedTime .IsZero () {
86
+ return obj , fmt .Errorf ("condition %q missing field %q" , upgradejob .ConditionFailed , "LastTransitionTime" )
87
+ }
88
+ ctl .recorder .Eventf (plan , corev1 .EventTypeWarning , "JobFailed" , "Job failed on Node %s" , node .Name )
89
+ return obj , enqueueOrDelete (jobs , obj , failedTime )
85
90
}
86
91
// if the job has completed tag the node then enqueue-or-delete depending on the TTL window
87
92
if upgradejob .ConditionComplete .IsTrue (obj ) {
93
+ completeTime := upgradejob .ConditionComplete .GetLastTransitionTime (obj )
94
+ if completeTime .IsZero () {
95
+ return obj , fmt .Errorf ("condition %q missing field %q" , upgradejob .ConditionComplete , "LastTransitionTime" )
96
+ }
88
97
planLabel := upgradeapi .LabelPlanName (planName )
89
98
if planHash , ok := obj .Labels [planLabel ]; ok {
90
- node .Labels [planLabel ] = planHash
99
+ var delay time.Duration
100
+ if plan .Spec .PostCompleteDelay != nil {
101
+ delay = plan .Spec .PostCompleteDelay .Duration
102
+ }
103
+ // if the job has not been completed for the configured delay, re-enqueue
104
+ // it for processing once the delay has elapsed.
105
+ // the job's TTLSecondsAfterFinished is guaranteed to be set to a larger value
106
+ // than the plan's requested delay.
107
+ if interval := time .Now ().Sub (completeTime ); interval < delay {
108
+ logrus .Debugf ("Enqueing sync of Job %s/%s in %v" , obj .Namespace , obj .Name , delay - interval )
109
+ ctl .recorder .Eventf (plan , corev1 .EventTypeNormal , "JobCompleteWaiting" , "Job completed on Node %s, waiting %s PostCompleteDelay" , node .Name , delay )
110
+ jobs .EnqueueAfter (obj .Namespace , obj .Name , delay - interval )
111
+ } else {
112
+ ctl .recorder .Eventf (plan , corev1 .EventTypeNormal , "JobComplete" , "Job completed on Node %s" , node .Name )
113
+ node .Labels [planLabel ] = planHash
114
+ }
115
+ // mark the node as schedulable even if the delay has not elapsed, so that
116
+ // workloads can resume scheduling.
91
117
if node .Spec .Unschedulable && (plan .Spec .Cordon || plan .Spec .Drain != nil ) {
92
118
node .Spec .Unschedulable = false
93
119
}
94
120
if node , err = nodes .Update (node ); err != nil {
95
121
return obj , err
96
122
}
97
123
}
98
- return obj , enqueueOrDelete (jobs , obj , upgradejob . ConditionComplete )
124
+ return obj , enqueueOrDelete (jobs , obj , completeTime )
99
125
}
100
126
// if the job hasn't failed or completed but the job Node is not on the applying list, consider it running out-of-turn and delete it
101
127
if i := sort .SearchStrings (plan .Status .Applying , nodeName ); i == len (plan .Status .Applying ) ||
@@ -108,12 +134,7 @@ func (ctl *Controller) handleJobs(ctx context.Context) error {
108
134
return nil
109
135
}
110
136
111
- func enqueueOrDelete (jobController batchctlv1.JobController , job * batchv1.Job , done condition.Cond ) error {
112
- lastTransitionTime := done .GetLastTransitionTime (job )
113
- if lastTransitionTime .IsZero () {
114
- return fmt .Errorf ("condition %q missing field %q" , done , "LastTransitionTime" )
115
- }
116
-
137
+ func enqueueOrDelete (jobController batchctlv1.JobController , job * batchv1.Job , lastTransitionTime time.Time ) error {
117
138
var ttlSecondsAfterFinished time.Duration
118
139
119
140
if job .Spec .TTLSecondsAfterFinished == nil {
0 commit comments