Skip to content

Commit 7d4f057

Browse files
authored
Merge pull request #748 from buildkite/ps-1220-set-signal-cancel-grace-period
Set {signal,cancel} grace period agent config
2 parents e0dfb3e + 885759f commit 7d4f057

File tree

1 file changed

+27
-9
lines changed

1 file changed

+27
-9
lines changed

internal/controller/scheduler/scheduler.go

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -630,15 +630,33 @@ func (w *worker) Build(podSpec *corev1.PodSpec, skipCheckout bool, inputs buildI
630630
// Set the agent's signal and cancel grace periods based on the pod's termination grace period.
631631
// This allows the agent to coordinate log collection timing with pod termination.
632632
if podSpec.TerminationGracePeriodSeconds != nil {
633-
// Calculate log collection grace period: termination grace period minus buffer for cleanup
634-
logCollectionGracePeriod := *podSpec.TerminationGracePeriodSeconds - 10
635-
if logCollectionGracePeriod < 1 {
636-
logCollectionGracePeriod = 1 // Minimum 1 second
637-
}
638-
agentContainer.Env = append(agentContainer.Env, corev1.EnvVar{
639-
Name: "BUILDKITE_KUBERNETES_LOG_COLLECTION_GRACE_PERIOD",
640-
Value: fmt.Sprintf("%ds", logCollectionGracePeriod),
641-
})
633+
termGraceSecs := int(*podSpec.TerminationGracePeriodSeconds)
634+
// When the agent cancels the job (e.g. when it receives SIGTERM), it
635+
// first interrupts the job, waits for signal-grace-period, then
636+
// terminates the job before waiting the remainder of the cancel-grace-
637+
// period.
638+
// When the pod is deleted, Kubernetes first sends SIGTERM to all
639+
// containers. From Agent v3.110.0, kubernetes-bootstrap absorbs that
640+
// signal and instead waits for the agent container to send an interrupt
641+
// over the socket.
642+
// Kubernetes will then take care of killing the job after
643+
// TerminationGracePeriodSeconds is up. But we should still configure
644+
// signal-grace-period so that the agent has time to upload logs and
645+
// mark the job as finished, and also cancel-grace-period in case it
646+
// needs to know how long it has left.
647+
signalGracePeriod := max(0, termGraceSecs-10)
648+
// Note that the agent requires cancelGracePeriod > signalGracePeriod, so
// clamp to at least 1 to keep that true when the termination grace period is 0.
649+
cancelGracePeriod := max(termGraceSecs, 1)
650+
agentContainer.Env = append(agentContainer.Env,
651+
corev1.EnvVar{
652+
Name: "BUILDKITE_CANCEL_GRACE_PERIOD",
653+
Value: strconv.Itoa(cancelGracePeriod),
654+
},
655+
corev1.EnvVar{
656+
Name: "BUILDKITE_SIGNAL_GRACE_PERIOD_SECONDS",
657+
Value: strconv.Itoa(signalGracePeriod),
658+
},
659+
)
642660
}
643661

644662
// Append some agent config and checkout config to the agent container.

0 commit comments

Comments
 (0)