Skip to content

Commit f751db7

Browse files
authored
Merge pull request #3200 from buildkite/feat_force_shutdown
Change the signal handler to ensure the agent quits after the grace period
2 parents b24a168 + 861af80 commit f751db7

File tree

3 files changed

+36
-8
lines changed

3 files changed

+36
-8
lines changed

clicommand/agent_start.go

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/buildkite/agent/v3/internal/awslib"
3030
awssigner "github.com/buildkite/agent/v3/internal/cryptosigner/aws"
3131
"github.com/buildkite/agent/v3/internal/experiments"
32+
"github.com/buildkite/agent/v3/internal/job"
3233
"github.com/buildkite/agent/v3/internal/job/hook"
3334
"github.com/buildkite/agent/v3/internal/osutil"
3435
"github.com/buildkite/agent/v3/internal/shell"
@@ -62,6 +63,8 @@ Example:
6263
$ buildkite-agent start --token xxx`
6364

6465
var (
66+
minGracePeriod = 10
67+
6568
verificationFailureBehaviors = []string{agent.VerificationBehaviourBlock, agent.VerificationBehaviourWarn}
6669

6770
buildkiteSetEnvironmentVariables = []*regexp.Regexp{
@@ -785,6 +788,9 @@ var AgentStartCommand = cli.Command{
785788
))
786789
defer done()
787790

791+
// used later to force the shutdown of the agent
792+
ctx, cancel := context.WithCancel(ctx)
793+
788794
// Verify that the bootstrap buildkite-agent executable matches the current host agent
789795
// executable. In development builds, it exits on mismatch; otherwise it only logs a warning.
790796
// This is to avoid confusion when making changes to the buildkite-agent but forgetting to
@@ -1274,7 +1280,7 @@ var AgentStartCommand = cli.Command{
12741280
}
12751281

12761282
// Handle process signals
1277-
signals := handlePoolSignals(ctx, l, pool)
1283+
signals := handlePoolSignals(ctx, l, pool, cancel, cfg.CancelGracePeriod)
12781284
defer close(signals)
12791285

12801286
l.Info("Starting %d Agent(s)", cfg.Spawn)
@@ -1362,7 +1368,7 @@ func parseAndValidateJWKS(ctx context.Context, keysetType, path string) (jwk.Set
13621368
return jwks, nil
13631369
}
13641370

1365-
func handlePoolSignals(ctx context.Context, l logger.Logger, pool *agent.AgentPool) chan os.Signal {
1371+
func handlePoolSignals(ctx context.Context, l logger.Logger, pool *agent.AgentPool, cancel context.CancelFunc, cancelGracePeriod int) chan os.Signal {
13661372
signals := make(chan os.Signal, 1)
13671373
signal.Notify(signals, os.Interrupt,
13681374
syscall.SIGHUP,
@@ -1392,8 +1398,30 @@ func handlePoolSignals(ctx context.Context, l logger.Logger, pool *agent.AgentPo
13921398
l.Info("Received CTRL-C, send again to forcefully kill the agent(s)")
13931399
pool.Stop(true)
13941400
} else {
1395-
l.Info("Forcefully stopping running jobs and stopping the agent(s)")
1396-
pool.Stop(false)
1401+
l.Info("Forcefully stopping running jobs and stopping the agent(s) in %d seconds", cancelGracePeriod)
1402+
1403+
gracefulContext, _ := job.WithGracePeriod(ctx, time.Duration(max(cancelGracePeriod, minGracePeriod))*time.Second)
1404+
1405+
go func() {
1406+
l.Info("Forced agent(s) to stop")
1407+
pool.Stop(false) // one last chance to stop
1408+
1409+
// Wait half the grace period before cancelling the context
1410+
time.Sleep(time.Duration(max(cancelGracePeriod/2, minGracePeriod/2)) * time.Second)
1411+
1412+
l.Info("Cancelling all internal tasks and API requests")
1413+
cancel() // cancel the context to stop all network operations
1414+
}()
1415+
1416+
// Once pending retries and requests are cancelled,
1417+
// the main goroutine should exit before this grace period expires, ending the program.
1418+
// If that doesn't happen, exit 1 below.
1419+
<-gracefulContext.Done()
1420+
l.Info("exiting with status 1")
1421+
1422+
// this should only be called if the context cancellation and forceful stop fails
1423+
os.Exit(1)
1424+
13971425
}
13981426
default:
13991427
l.Debug("Ignoring signal `%s`", sig.String())

internal/job/executor.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {
145145

146146
// Listen for cancellation. Once ctx is cancelled, some tasks can run
147147
// afterwards during the signal grace period. These use graceCtx.
148-
graceCtx, graceCancel := withGracePeriod(ctx, e.SignalGracePeriod)
148+
graceCtx, graceCancel := WithGracePeriod(ctx, e.SignalGracePeriod)
149149
defer graceCancel()
150150
go func() {
151151
<-e.cancelCh
@@ -948,7 +948,7 @@ func (e *Executor) CommandPhase(ctx context.Context) (hookErr error, commandErr
948948
}
949949
// Because post-command hooks are often used for post-job cleanup, they
950950
// can run during the grace period.
951-
graceCtx, cancel := withGracePeriod(ctx, e.SignalGracePeriod)
951+
graceCtx, cancel := WithGracePeriod(ctx, e.SignalGracePeriod)
952952
defer cancel()
953953
hookErr = e.runPostCommandHooks(graceCtx)
954954
}()

internal/job/grace.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ import (
55
"time"
66
)
77

8-
// withGracePeriod returns a context that is cancelled some time *after* the
8+
// WithGracePeriod returns a context that is cancelled some time *after* the
99
// parent context is cancelled. In general this is not a good pattern, since it
1010
// breaks the usual connection between context cancellations and requires an
1111
// extra goroutine. However, we need to enforce the signal grace period from
1212
// within the job executor for use-cases where the executor is _not_ forked from
1313
// something else that will enforce the grace period (with SIGKILL).
14-
func withGracePeriod(ctx context.Context, graceTimeout time.Duration) (context.Context, context.CancelFunc) {
14+
func WithGracePeriod(ctx context.Context, graceTimeout time.Duration) (context.Context, context.CancelFunc) {
1515
gctx, cancel := context.WithCancelCause(context.WithoutCancel(ctx))
1616
go func() {
1717
<-ctx.Done()

0 commit comments

Comments
 (0)