Skip to content

Commit dbe2cd8

Browse files
authored
Merge pull request #549 from buildkite/chore/informer_debug_logs
chore: add additional logging and metrics for limiter
2 parents 1769fe1 + d7fd91e commit dbe2cd8

File tree

3 files changed

+63
-0
lines changed

3 files changed

+63
-0
lines changed

internal/controller/limiter/limiter.go

+42
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,18 @@ func (l *MaxInFlight) RegisterInformer(ctx context.Context, factory informers.Sh
 		return fmt.Errorf("failed to sync informer cache")
 	}

+	informerStore := jobInformer.GetStore()
+	informerCacheKeys := informerStore.ListKeys()
+	l.logger.Debug("informer cache sync complete, dump informer cache keys...",
+		zap.Int("cache-keys-found", len(informerCacheKeys)),
+	)
+
+	for _, cacheKey := range informerCacheKeys {
+		l.logger.Debug("informer cache key found",
+			zap.String("informer-cache-key", fmt.Sprint(cacheKey)),
+		)
+	}
+
 	return nil
 }

@@ -83,6 +95,17 @@ func (l *MaxInFlight) Handle(ctx context.Context, job model.Job) error {
 	// Block until there's a token in the bucket, or cancel if the job
 	// information becomes too stale.
 	start := time.Now()
+
+	l.logger.Debug("Waiting for token",
+		zap.String("job-uuid", job.Uuid),
+		zap.Int("max-in-flight", l.MaxInFlight),
+		zap.Int("available-tokens", len(l.tokenBucket)),
+		zap.Int("running-jobs", l.MaxInFlight-len(l.tokenBucket)),
+	)
+
+	waitingForTokenGauge.Inc()
+	defer waitingForTokenGauge.Dec()
+
 	select {
 	case <-ctx.Done():
 		return context.Cause(ctx)
@@ -92,8 +115,11 @@
 		l.logger.Debug("Unable to create job before stale-job-data-timeout",
 			zap.String("job-uuid", job.Uuid),
 			zap.Int("max-in-flight", l.MaxInFlight),
+			zap.Int("available-tokens", len(l.tokenBucket)),
 			zap.Int("running-jobs", l.MaxInFlight-len(l.tokenBucket)),
+			zap.Duration("wait-time", time.Since(start)),
 		)
+		jobStaleWhileWaitingCounter.Inc()
 		return model.ErrStaleJob

 	case <-l.tokenBucket:
@@ -198,7 +224,15 @@ func (l *MaxInFlight) tryTakeToken(source string) {
 	select {
 	case <-l.tokenBucket:
 		// Success.
+		l.logger.Debug("Token taken successfully",
+			zap.String("source", source),
+			zap.Int("remaining-tokens", len(l.tokenBucket)),
+		)
 	default:
+		l.logger.Debug("Failed to take token - bucket empty",
+			zap.String("source", source),
+			zap.Int("max-in-flight", l.MaxInFlight),
+		)
 		tokenUnderflowCounter.WithLabelValues(source).Inc()
 	}
 }
@@ -208,7 +242,15 @@ func (l *MaxInFlight) tryReturnToken(source string) {
 	select {
 	case l.tokenBucket <- struct{}{}:
 		// Success.
+		l.logger.Debug("Token returned successfully",
+			zap.String("source", source),
+			zap.Int("available-tokens", len(l.tokenBucket)),
+		)
 	default:
+		l.logger.Warn("Failed to return token - bucket full",
+			zap.String("source", source),
+			zap.Int("max-in-flight", l.MaxInFlight),
+		)
 		tokenOverflowCounter.WithLabelValues(source).Inc()
 	}
 }

internal/controller/limiter/metrics.go

+14
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,20 @@ var (
 		NativeHistogramZeroThreshold: 0.01,
 	})

+	waitingForTokenGauge = promauto.NewGauge(prometheus.GaugeOpts{
+		Namespace: promNamespace,
+		Subsystem: promSubsystem,
+		Name:      "waiting_for_token",
+		Help:      "Number of jobs currently waiting for a token",
+	})
+
+	jobStaleWhileWaitingCounter = promauto.NewCounter(prometheus.CounterOpts{
+		Namespace: promNamespace,
+		Subsystem: promSubsystem,
+		Name:      "jobs_stale_while_waiting_total",
+		Help:      "Count of jobs that became stale while waiting for a token",
+	})
+
 	jobHandlerCallsCounter = promauto.NewCounter(prometheus.CounterOpts{
 		Namespace: promNamespace,
 		Subsystem: promSubsystem,

internal/controller/monitor/monitor.go

+7
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ func (r clusteredJobResp) CommandJobs() []*api.JobJobTypeCommand {
 // getScheduledCommandJobs calls either the clustered or unclustered GraphQL API
 // methods, depending on if a cluster uuid was provided in the config
 func (m *Monitor) getScheduledCommandJobs(ctx context.Context, queue string) (jobResp jobResp, err error) {
+	m.logger.Debug("getting scheduled jobs via GQL query...",
+		zap.Duration("poll-interval", m.cfg.PollInterval),
+	)
+
 	jobQueryCounter.Inc()
 	start := time.Now()
 	defer func() {
@@ -274,6 +278,9 @@ func (m *Monitor) Start(ctx context.Context, handler model.JobHandler) <-chan er
 		}

 		jobs := resp.CommandJobs()
+		m.logger.Debug("jobs in GQL query response(s)...",
+			zap.Int("jobs-processed", len(jobs)),
+		)
 		if len(jobs) == 0 {
 			continue
 		}

0 commit comments

Comments (0)