Skip to content

Commit eb355cc

Browse files
authored
Merge pull request #432 from buildkite/fix-limiter-token-tracking-again
Fix limiter token tracking (again)
2 parents 0fa2554 + 878ddf3 commit eb355cc

File tree

2 files changed

+51
-43
lines changed

2 files changed

+51
-43
lines changed

internal/controller/limiter/limiter.go

+35-36
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@ import (
55
"fmt"
66
"reflect"
77

8-
"github.com/buildkite/agent-stack-k8s/v2/internal/controller/config"
98
"github.com/buildkite/agent-stack-k8s/v2/internal/controller/model"
109

11-
"github.com/google/uuid"
1210
"go.uber.org/zap"
1311
batchv1 "k8s.io/api/batch/v1"
1412
"k8s.io/client-go/informers"
@@ -59,12 +57,13 @@ func New(logger *zap.Logger, scheduler model.JobHandler, maxInFlight int) *MaxIn
5957
func (l *MaxInFlight) RegisterInformer(ctx context.Context, factory informers.SharedInformerFactory) error {
6058
informer := factory.Batch().V1().Jobs()
6159
jobInformer := informer.Informer()
62-
if _, err := jobInformer.AddEventHandler(l); err != nil {
60+
reg, err := jobInformer.AddEventHandler(l)
61+
if err != nil {
6362
return err
6463
}
6564
go factory.Start(ctx.Done())
6665

67-
if !cache.WaitForCacheSync(ctx.Done(), jobInformer.HasSynced) {
66+
if !cache.WaitForCacheSync(ctx.Done(), reg.HasSynced) {
6867
return fmt.Errorf("failed to sync informer cache")
6968
}
7069

@@ -98,7 +97,7 @@ func (l *MaxInFlight) Handle(ctx context.Context, job model.Job) error {
9897
zap.String("uuid", job.Uuid),
9998
)
10099
if err := l.handler.Handle(ctx, job); err != nil {
101-
// Oh well. Return the token and un-record the job.
100+
// Oh well. Return the token.
102101
l.tryReturnToken()
103102

104103
l.logger.Debug("next handler failed",
@@ -111,54 +110,54 @@ func (l *MaxInFlight) Handle(ctx context.Context, job model.Job) error {
111110
}
112111

113112
// OnAdd is called by k8s to inform us a resource is added.
114-
func (l *MaxInFlight) OnAdd(obj any, _ bool) {
113+
func (l *MaxInFlight) OnAdd(obj any, inInitialList bool) {
115114
job, _ := obj.(*batchv1.Job)
116115
if job == nil {
117116
return
118117
}
119-
l.trackJob(job)
120-
l.logger.Debug("at end of OnAdd", zap.Int("tokens-available", len(l.tokenBucket)))
121-
}
122-
123-
// OnUpdate is called by k8s to inform us a resource is updated.
124-
func (l *MaxInFlight) OnUpdate(_, obj any) {
125-
job, _ := obj.(*batchv1.Job)
126-
if job == nil {
118+
if !inInitialList {
119+
// After sync is finished, the limiter handler takes tokens directly.
127120
return
128121
}
129-
l.trackJob(job)
130-
l.logger.Debug("at end of OnUpdate", zap.Int("tokens-available", len(l.tokenBucket)))
122+
// Before sync is finished: we're learning about existing jobs, so we should
123+
// (try to) take tokens for unfinished jobs started by a previous controller.
124+
// If it's added as already finished, no need to take a token for it.
125+
// Otherwise, try to take one, but don't block (in case the stack was
126+
// restarted with a different limit).
127+
if !model.JobFinished(job) {
128+
l.tryTakeToken()
129+
l.logger.Debug("existing not-finished job discovered", zap.Int("tokens-available", len(l.tokenBucket)))
130+
}
131131
}
132132

133-
// OnDelete is called by k8s to inform us a resource is deleted.
134-
func (l *MaxInFlight) OnDelete(obj any) {
135-
// The job condition at the point of deletion could be non-terminal, but
136-
// it is being deleted, so ignore it and skip to marking complete.
137-
// If buildkite.com/job-uuid label is missing or malformed, don't track it.
138-
job, _ := obj.(*batchv1.Job)
139-
if job == nil {
133+
// OnUpdate is called by k8s to inform us a resource is updated.
134+
func (l *MaxInFlight) OnUpdate(prev, curr any) {
135+
prevState, _ := prev.(*batchv1.Job)
136+
currState, _ := curr.(*batchv1.Job)
137+
if prevState == nil || currState == nil {
140138
return
141139
}
142-
l.trackJob(job)
143-
if _, err := uuid.Parse(job.Labels[config.UUIDLabel]); err != nil {
144-
return
140+
// Only take or return a token if the job state has *changed*.
141+
// The only valid change is from not-finished to finished.
142+
if !model.JobFinished(prevState) && model.JobFinished(currState) {
143+
l.tryReturnToken()
144+
l.logger.Debug("job state changed from not-finished to finished", zap.Int("tokens-available", len(l.tokenBucket)))
145145
}
146-
l.tryReturnToken()
147-
l.logger.Debug("at end of OnDelete", zap.Int("tokens-available", len(l.tokenBucket)))
148146
}
149147

150-
// trackJob is called by the k8s informer callbacks to update job state and
151-
// take/return tokens. It does the same thing for all three callbacks.
152-
func (l *MaxInFlight) trackJob(job *batchv1.Job) {
153-
// If buildkite.com/job-uuid label is missing or malformed, don't track it.
154-
if _, err := uuid.Parse(job.Labels[config.UUIDLabel]); err != nil {
148+
// OnDelete is called by k8s to inform us a resource is deleted.
149+
func (l *MaxInFlight) OnDelete(obj any) {
150+
prevState, _ := obj.(*batchv1.Job)
151+
if prevState == nil {
155152
return
156153
}
157154

158-
if model.JobFinished(job) {
155+
// OnDelete gives us the last-known state prior to deletion.
156+
// If that state was finished, we've already returned a token.
157+
// If that state was not-finished, we need to return a token now.
158+
if !model.JobFinished(prevState) {
159159
l.tryReturnToken()
160-
} else {
161-
l.tryTakeToken()
160+
l.logger.Debug("not-finished job was deleted", zap.Int("tokens-available", len(l.tokenBucket)))
162161
}
163162
}
164163

internal/controller/model/fake_scheduler.go

+16-7
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,23 @@ func (f *FakeScheduler) complete(uuid string) {
7171
f.Finished = append(f.Finished, uuid)
7272
f.mu.Unlock()
7373

74-
f.EventHandler.OnUpdate(nil, &batchv1.Job{
75-
ObjectMeta: metav1.ObjectMeta{
76-
Labels: map[string]string{config.UUIDLabel: uuid},
74+
f.EventHandler.OnUpdate(
75+
// Previous state
76+
&batchv1.Job{
77+
ObjectMeta: metav1.ObjectMeta{
78+
Labels: map[string]string{config.UUIDLabel: uuid},
79+
},
80+
// No status conditions
7781
},
78-
Status: batchv1.JobStatus{
79-
Conditions: []batchv1.JobCondition{{Type: batchv1.JobComplete}},
80-
},
81-
})
82+
// New state
83+
&batchv1.Job{
84+
ObjectMeta: metav1.ObjectMeta{
85+
Labels: map[string]string{config.UUIDLabel: uuid},
86+
},
87+
Status: batchv1.JobStatus{
88+
Conditions: []batchv1.JobCondition{{Type: batchv1.JobComplete}},
89+
},
90+
})
8291
f.wg.Done()
8392
}
8493

0 commit comments

Comments
 (0)