@@ -29,6 +29,7 @@ import (
2929 "github.com/buildkite/agent/v3/internal/awslib"
3030 awssigner "github.com/buildkite/agent/v3/internal/cryptosigner/aws"
3131 "github.com/buildkite/agent/v3/internal/experiments"
32+ "github.com/buildkite/agent/v3/internal/job"
3233 "github.com/buildkite/agent/v3/internal/job/hook"
3334 "github.com/buildkite/agent/v3/internal/osutil"
3435 "github.com/buildkite/agent/v3/internal/shell"
@@ -62,6 +63,8 @@ Example:
6263 $ buildkite-agent start --token xxx`
6364
6465var (
66+ minGracePeriod = 10
67+
6568 verificationFailureBehaviors = []string {agent .VerificationBehaviourBlock , agent .VerificationBehaviourWarn }
6669
6770 buildkiteSetEnvironmentVariables = []* regexp.Regexp {
@@ -785,6 +788,9 @@ var AgentStartCommand = cli.Command{
785788 ))
786789 defer done ()
787790
791+ // used later to force the shutdown of the agent
792+ ctx , cancel := context .WithCancel (ctx )
793+
788794 // Verify that the bootstrap buildkite-agent executable matches the current host agent
789795 // executable. In development builds, it exits on mismatch; otherwise it only logs a warning.
790796 // This is to avoid confusion when making changes to the buildkite-agent but forgetting to
@@ -1274,7 +1280,7 @@ var AgentStartCommand = cli.Command{
12741280 }
12751281
12761282 // Handle process signals
1277- signals := handlePoolSignals (ctx , l , pool )
1283+ signals := handlePoolSignals (ctx , l , pool , cancel , cfg . CancelGracePeriod )
12781284 defer close (signals )
12791285
12801286 l .Info ("Starting %d Agent(s)" , cfg .Spawn )
@@ -1362,7 +1368,7 @@ func parseAndValidateJWKS(ctx context.Context, keysetType, path string) (jwk.Set
13621368 return jwks , nil
13631369}
13641370
1365- func handlePoolSignals (ctx context.Context , l logger.Logger , pool * agent.AgentPool ) chan os.Signal {
1371+ func handlePoolSignals (ctx context.Context , l logger.Logger , pool * agent.AgentPool , cancel context. CancelFunc , cancelGracePeriod int ) chan os.Signal {
13661372 signals := make (chan os.Signal , 1 )
13671373 signal .Notify (signals , os .Interrupt ,
13681374 syscall .SIGHUP ,
@@ -1392,8 +1398,30 @@ func handlePoolSignals(ctx context.Context, l logger.Logger, pool *agent.AgentPo
13921398 l .Info ("Received CTRL-C, send again to forcefully kill the agent(s)" )
13931399 pool .Stop (true )
13941400 } else {
1395- l .Info ("Forcefully stopping running jobs and stopping the agent(s)" )
1396- pool .Stop (false )
1401+ l .Info ("Forcefully stopping running jobs and stopping the agent(s) in %d seconds" , cancelGracePeriod )
1402+
1403+ gracefulContext , _ := job .WithGracePeriod (ctx , time .Duration (max (cancelGracePeriod , minGracePeriod ))* time .Second )
1404+
1405+ go func () {
1406+ l .Info ("Forced agent(s) to stop" )
1407+ pool .Stop (false ) // one last chance to stop
1408+
1409+ // Wait half the grace period before cancelling the context
1410+ time .Sleep (time .Duration (max (cancelGracePeriod / 2 , minGracePeriod / 2 )) * time .Second )
1411+
1412+ l .Info ("Cancelling all internal tasks and API requests" )
1413+ cancel () // cancel the context to stop all network operations
1414+ }()
1415+
1416+ // Once pending retries and requests are cancelled,
1417+ // the main goroutine should exit before this grace period expires, ending the program.
1418+ // If that doesn't happen, exit 1 below.
1419+ <- gracefulContext .Done ()
1420+ l .Info ("exiting with status 1" )
1421+
1422+ // this should only be called if the context cancellation and forceful stop fails
1423+ os .Exit (1 )
1424+
13971425 }
13981426 default :
13991427 l .Debug ("Ignoring signal `%s`" , sig .String ())
0 commit comments