@@ -5,10 +5,8 @@ import (
5
5
"fmt"
6
6
"reflect"
7
7
8
- "github.com/buildkite/agent-stack-k8s/v2/internal/controller/config"
9
8
"github.com/buildkite/agent-stack-k8s/v2/internal/controller/model"
10
9
11
- "github.com/google/uuid"
12
10
"go.uber.org/zap"
13
11
batchv1 "k8s.io/api/batch/v1"
14
12
"k8s.io/client-go/informers"
@@ -59,12 +57,13 @@ func New(logger *zap.Logger, scheduler model.JobHandler, maxInFlight int) *MaxIn
59
57
func (l * MaxInFlight ) RegisterInformer (ctx context.Context , factory informers.SharedInformerFactory ) error {
60
58
informer := factory .Batch ().V1 ().Jobs ()
61
59
jobInformer := informer .Informer ()
62
- if _ , err := jobInformer .AddEventHandler (l ); err != nil {
60
+ reg , err := jobInformer .AddEventHandler (l )
61
+ if err != nil {
63
62
return err
64
63
}
65
64
go factory .Start (ctx .Done ())
66
65
67
- if ! cache .WaitForCacheSync (ctx .Done (), jobInformer .HasSynced ) {
66
+ if ! cache .WaitForCacheSync (ctx .Done (), reg .HasSynced ) {
68
67
return fmt .Errorf ("failed to sync informer cache" )
69
68
}
70
69
@@ -98,7 +97,7 @@ func (l *MaxInFlight) Handle(ctx context.Context, job model.Job) error {
98
97
zap .String ("uuid" , job .Uuid ),
99
98
)
100
99
if err := l .handler .Handle (ctx , job ); err != nil {
101
- // Oh well. Return the token and un-record the job .
100
+ // Oh well. Return the token.
102
101
l .tryReturnToken ()
103
102
104
103
l .logger .Debug ("next handler failed" ,
@@ -111,54 +110,54 @@ func (l *MaxInFlight) Handle(ctx context.Context, job model.Job) error {
111
110
}
112
111
113
112
// OnAdd is called by k8s to inform us a resource is added.
114
- func (l * MaxInFlight ) OnAdd (obj any , _ bool ) {
113
+ func (l * MaxInFlight ) OnAdd (obj any , inInitialList bool ) {
115
114
job , _ := obj .(* batchv1.Job )
116
115
if job == nil {
117
116
return
118
117
}
119
- l .trackJob (job )
120
- l .logger .Debug ("at end of OnAdd" , zap .Int ("tokens-available" , len (l .tokenBucket )))
121
- }
122
-
123
- // OnUpdate is called by k8s to inform us a resource is updated.
124
- func (l * MaxInFlight ) OnUpdate (_ , obj any ) {
125
- job , _ := obj .(* batchv1.Job )
126
- if job == nil {
118
+ if ! inInitialList {
119
+ // After sync is finished, the limiter handler takes tokens directly.
127
120
return
128
121
}
129
- l .trackJob (job )
130
- l .logger .Debug ("at end of OnUpdate" , zap .Int ("tokens-available" , len (l .tokenBucket )))
122
+ // Before sync is finished: we're learning about existing jobs, so we should
123
+ // (try to) take tokens for unfinished jobs started by a previous controller.
124
+ // If it's added as already finished, no need to take a token for it.
125
+ // Otherwise, try to take one, but don't block (in case the stack was
126
+ // restarted with a different limit).
127
+ if ! model .JobFinished (job ) {
128
+ l .tryTakeToken ()
129
+ l .logger .Debug ("existing not-finished job discovered" , zap .Int ("tokens-available" , len (l .tokenBucket )))
130
+ }
131
131
}
132
132
133
- // OnDelete is called by k8s to inform us a resource is deleted.
134
- func (l * MaxInFlight ) OnDelete (obj any ) {
135
- // The job condition at the point of deletion could be non-terminal, but
136
- // it is being deleted, so ignore it and skip to marking complete.
137
- // If buildkite.com/job-uuid label is missing or malformed, don't track it.
138
- job , _ := obj .(* batchv1.Job )
139
- if job == nil {
133
+ // OnUpdate is called by k8s to inform us a resource is updated.
134
+ func (l * MaxInFlight ) OnUpdate (prev , curr any ) {
135
+ prevState , _ := prev .(* batchv1.Job )
136
+ currState , _ := curr .(* batchv1.Job )
137
+ if prevState == nil || currState == nil {
140
138
return
141
139
}
142
- l .trackJob (job )
143
- if _ , err := uuid .Parse (job .Labels [config .UUIDLabel ]); err != nil {
144
- return
140
+ // Only take or return a token if the job state has *changed*.
141
+ // The only valid change is from not-finished to finished.
142
+ if ! model .JobFinished (prevState ) && model .JobFinished (currState ) {
143
+ l .tryReturnToken ()
144
+ l .logger .Debug ("job state changed from not-finished to finished" , zap .Int ("tokens-available" , len (l .tokenBucket )))
145
145
}
146
- l .tryReturnToken ()
147
- l .logger .Debug ("at end of OnDelete" , zap .Int ("tokens-available" , len (l .tokenBucket )))
148
146
}
149
147
150
- // trackJob is called by the k8s informer callbacks to update job state and
151
- // take/return tokens. It does the same thing for all three callbacks.
152
- func (l * MaxInFlight ) trackJob (job * batchv1.Job ) {
153
- // If buildkite.com/job-uuid label is missing or malformed, don't track it.
154
- if _ , err := uuid .Parse (job .Labels [config .UUIDLabel ]); err != nil {
148
+ // OnDelete is called by k8s to inform us a resource is deleted.
149
+ func (l * MaxInFlight ) OnDelete (obj any ) {
150
+ prevState , _ := obj .(* batchv1.Job )
151
+ if prevState == nil {
155
152
return
156
153
}
157
154
158
- if model .JobFinished (job ) {
155
+ // OnDelete gives us the last-known state prior to deletion.
156
+ // If that state was finished, we've already returned a token.
157
+ // If that state was not-finished, we need to return a token now.
158
+ if ! model .JobFinished (prevState ) {
159
159
l .tryReturnToken ()
160
- } else {
161
- l .tryTakeToken ()
160
+ l .logger .Debug ("not-finished job was deleted" , zap .Int ("tokens-available" , len (l .tokenBucket )))
162
161
}
163
162
}
164
163
0 commit comments