Skip to content

Commit 637da43

Browse files
authored
Merge pull request #406 from buildkite/clean-up-pending-cancelled
Clean up pending pods for cancelled jobs
2 parents 54a1888 + 4a41f29 commit 637da43

File tree

10 files changed

+326
-80
lines changed

10 files changed

+326
-80
lines changed

charts/agent-stack-k8s/values.schema.json

+13-1
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@
174174
"image": {
175175
"type": "string",
176176
"default": "",
177-
"title": "The image Schema",
177+
"title": "The container image used to obtain buildkite-agent, and for running commands without any k8s-specific configuration. The default for each release of agent-stack-k8s is set to a version tag of ghcr.io/buildkite/agent matching agent-stack-k8s's go.mod file",
178178
"examples": [""]
179179
},
180180
"debug": {
@@ -232,6 +232,18 @@
232232
},
233233
"examples": [["SECRET_RECIPE"]]
234234
},
235+
"image-pull-backoff-grace-period": {
236+
"type": "string",
237+
"default": "30s",
238+
"title": "Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled). Must be a Go duration string",
239+
"examples": ["60s"]
240+
},
241+
"job-cancel-checker-poll-interval": {
242+
"type": "string",
243+
"default": "5s",
244+
"title": "Controls the interval between job state queries while a pod is still Pending. Must be a Go duration string",
245+
"examples": ["10s"]
246+
},
235247
"prohibit-kubernetes-plugin": {
236248
"type": "boolean",
237249
"default": false,

cmd/controller/controller.go

+5
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ func AddConfigFlags(cmd *cobra.Command) {
9090
config.DefaultImagePullBackOffGracePeriod,
9191
"Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled)",
9292
)
93+
cmd.Flags().Duration(
94+
"job-cancel-checker-poll-interval",
95+
config.DefaultJobCancelCheckerPollInterval,
96+
"Controls the interval between job state queries while a pod is still Pending",
97+
)
9398
cmd.Flags().Bool(
9499
"prohibit-kubernetes-plugin",
95100
false,

cmd/controller/controller_test.go

+15-14
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,21 @@ func ptr[T any](v T) *T {
1919

2020
func TestReadAndParseConfig(t *testing.T) {
2121
expected := config.Config{
22-
Debug: true,
23-
AgentTokenSecret: "my-kubernetes-secret",
24-
BuildkiteToken: "my-graphql-enabled-token",
25-
Image: "my.registry.dev/buildkite-agent:latest",
26-
JobTTL: 300 * time.Second,
27-
ImagePullBackOffGradePeriod: 60 * time.Second,
28-
PollInterval: 5 * time.Second,
29-
MaxInFlight: 100,
30-
Namespace: "my-buildkite-ns",
31-
Org: "my-buildkite-org",
32-
Tags: []string{"queue=my-queue", "priority=high"},
33-
ClusterUUID: "beefcafe-abbe-baba-abba-deedcedecade",
34-
ProhibitKubernetesPlugin: true,
35-
GraphQLEndpoint: "http://graphql.buildkite.localhost/v1",
22+
Debug: true,
23+
AgentTokenSecret: "my-kubernetes-secret",
24+
BuildkiteToken: "my-graphql-enabled-token",
25+
Image: "my.registry.dev/buildkite-agent:latest",
26+
JobTTL: 300 * time.Second,
27+
ImagePullBackOffGracePeriod: 60 * time.Second,
28+
JobCancelCheckerPollInterval: 10 * time.Second,
29+
PollInterval: 5 * time.Second,
30+
MaxInFlight: 100,
31+
Namespace: "my-buildkite-ns",
32+
Org: "my-buildkite-org",
33+
Tags: []string{"queue=my-queue", "priority=high"},
34+
ClusterUUID: "beefcafe-abbe-baba-abba-deedcedecade",
35+
ProhibitKubernetesPlugin: true,
36+
GraphQLEndpoint: "http://graphql.buildkite.localhost/v1",
3637
AgentConfig: &config.AgentConfig{
3738
Endpoint: ptr("http://agent.buildkite.localhost/v3"),
3839
},

examples/config.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ debug: true
33
image: my.registry.dev/buildkite-agent:latest
44
job-ttl: 5m
55
image-pull-backoff-grace-period: 60s
6+
job-cancel-checker-poll-interval: 10s
67
poll-interval: 5s
78
max-in-flight: 100
89
namespace: my-buildkite-ns

internal/controller/config/config.go

+13-10
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ import (
1111
)
1212

1313
const (
14-
UUIDLabel = "buildkite.com/job-uuid"
15-
BuildURLAnnotation = "buildkite.com/build-url"
16-
JobURLAnnotation = "buildkite.com/job-url"
17-
DefaultNamespace = "default"
18-
DefaultImagePullBackOffGracePeriod = 30 * time.Second
14+
UUIDLabel = "buildkite.com/job-uuid"
15+
BuildURLAnnotation = "buildkite.com/build-url"
16+
JobURLAnnotation = "buildkite.com/job-url"
17+
DefaultNamespace = "default"
18+
DefaultImagePullBackOffGracePeriod = 30 * time.Second
19+
DefaultJobCancelCheckerPollInterval = 5 * time.Second
1920
)
2021

2122
var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version()
@@ -40,10 +41,11 @@ type Config struct {
4041

4142
// ClusterUUID field is mandatory for most new orgs.
4243
// Some old orgs allow an unclustered setup.
43-
ClusterUUID string `json:"cluster-uuid" validate:"omitempty"`
44-
AdditionalRedactedVars stringSlice `json:"additional-redacted-vars" validate:"omitempty"`
45-
PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"`
46-
ImagePullBackOffGradePeriod time.Duration `json:"image-pull-backoff-grace-period" validate:"omitempty"`
44+
ClusterUUID string `json:"cluster-uuid" validate:"omitempty"`
45+
AdditionalRedactedVars stringSlice `json:"additional-redacted-vars" validate:"omitempty"`
46+
PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"`
47+
ImagePullBackOffGracePeriod time.Duration `json:"image-pull-backoff-grace-period" validate:"omitempty"`
48+
JobCancelCheckerPollInterval time.Duration `json:"job-cancel-checker-poll-interval" validate:"omitempty"`
4749

4850
AgentConfig *AgentConfig `json:"agent-config" validate:"omitempty"`
4951
DefaultCheckoutParams *CheckoutParams `json:"default-checkout-params" validate:"omitempty"`
@@ -83,7 +85,8 @@ func (c Config) MarshalLogObject(enc zapcore.ObjectEncoder) error {
8385
if err := enc.AddReflected("pod-spec-patch", c.PodSpecPatch); err != nil {
8486
return err
8587
}
86-
enc.AddDuration("image-pull-backoff-grace-period", c.ImagePullBackOffGradePeriod)
88+
enc.AddDuration("image-pull-backoff-grace-period", c.ImagePullBackOffGracePeriod)
89+
enc.AddDuration("job-cancel-checker-poll-interval", c.JobCancelCheckerPollInterval)
8790
if err := enc.AddReflected("agent-config", c.AgentConfig); err != nil {
8891
return err
8992
}

internal/controller/controller.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,13 @@ func Run(
8080
logger.Fatal("failed to register completions informer", zap.Error(err))
8181
}
8282

83-
imagePullBackOffWatcher := scheduler.NewImagePullBackOffWatcher(
84-
logger.Named("imagePullBackoffWatcher"),
83+
podWatcher := scheduler.NewPodWatcher(
84+
logger.Named("podWatcher"),
8585
k8sClient,
8686
cfg,
8787
)
88-
if err := imagePullBackOffWatcher.RegisterInformer(ctx, informerFactory); err != nil {
89-
logger.Fatal("failed to register imagePullBackoffWatcher informer", zap.Error(err))
88+
if err := podWatcher.RegisterInformer(ctx, informerFactory); err != nil {
89+
logger.Fatal("failed to register podWatcher informer", zap.Error(err))
9090
}
9191

9292
select {

0 commit comments

Comments
 (0)