From d26b14fa156d70823d546e6297dfce785886b4da Mon Sep 17 00:00:00 2001 From: James Rasell Date: Fri, 28 Nov 2025 08:29:43 +0000 Subject: [PATCH 1/2] server: Fix potential race condition in TLS reload and RPC handler The Nomad server agent can perform configuration reloads of certain items, notably the TLS configuration when it is sent a SIGHUP signal. If the TLS configuration has changed, the server will overwrite the existing in-memory config object and reload its TLS connections. The RPC connection handler reads two values from the in-memory TLS configuration in order to understand how to correctly handle incoming TLS connections. This can cause a data race condition with the config reload where the config is written at the same time RPC handler reads the boolean values. One potential fix would be to wrap the entire server config object in a mutex and have all read/writers use this. This, however, seems inefficient as only two TLS config options are concurrently read and the server makes liberal use of reading other config options that are only written at server initialization. The change introduces two new server object atomics which hold the boolean TLS values used by the RPC connection handler. These atomics give thread safe read/write access while being tightly scoped to the servers current requirements. --- nomad/rpc.go | 4 ++-- nomad/server.go | 41 +++++++++++++++++++++++++++++++++++++---- nomad/server_test.go | 30 +++++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/nomad/rpc.go b/nomad/rpc.go index d2e7a359f13..4a997aa2b32 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -301,8 +301,8 @@ func (r *rpcHandler) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCC } // Enforce TLS if EnableRPC is set - if r.srv.config.TLSConfig.EnableRPC && !rpcCtx.TLS && pool.RPCType(buf[0]) != pool.RpcTLS { - if !r.srv.config.TLSConfig.RPCUpgradeMode { + if r.srv.rpcTLSEnabled.Load() && !rpcCtx.TLS && pool.RPCType(buf[0]) != pool.RpcTLS { + if !r.srv.rpcTLSUpgradeMode.Load() { r.logger.Warn("non-TLS connection attempted with RequireTLS set", "remote_addr", conn.RemoteAddr()) conn.Close() return diff --git a/nomad/server.go b/nomad/server.go index 912bc2c1b22..5ac3b7af8b4 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -105,8 +105,22 @@ const ( // Server is Nomad server which manages the job queues, // schedulers, and notification bus for agents. type Server struct { + + // config holds the server configuration. No fields are not mutated after + // initialization except for the TLS config, so it is safe to access them + // without a lock. Any fields that become reloadable, must be carefully + // assessed for concurrent access. config *Config + // rpcTLSEnabled and rpcTLSUpgradeMode are used by the RPC connection + // handler to determine TLS enforcement. The values can change at runtime + // due to config reloading. + // + // The setRPCTLSAtomics helper function should be used to set these values + // as it correctly handles nil configs. + rpcTLSEnabled atomic.Bool + rpcTLSUpgradeMode atomic.Bool + logger log.InterceptLogger // Connection pool to other Nomad servers @@ -350,6 +364,8 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, consulConfigFunc // Create the server s := &Server{ config: config, + rpcTLSEnabled: atomic.Bool{}, + rpcTLSUpgradeMode: atomic.Bool{}, consulCatalog: consulCatalog, connPool: pool.NewPool(logger, serverRPCCache, serverMaxStreams, tlsWrap, config.RPCSessionConfig), logger: logger, @@ -372,6 +388,10 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, consulConfigFunc lockDelayTimer: lock.NewDelayTimer(), } + // Store the server TLS RPC settings that will be used by the connection + // handler. + s.setRPCTLSAtomics(config.TLSConfig) + s.shutdownCtx, s.shutdownCancel = context.WithCancel(context.Background()) s.shutdownCh = s.shutdownCtx.Done() @@ -690,16 +710,19 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error { s.tlsWrap = tlsWrap s.tlsWrapLock.Unlock() - // Keeping configuration in sync is important for other places that require - // access to config information, such as rpc.go, where we decide on what kind - // of network connections to accept depending on the server configuration + // Ensure the config object is updated. This is done to keep things in sync + // only; all access to TLS enforcement should be done via the atomics which + // are updated below. s.config.TLSConfig = newTLSConfig + // Update the RPC TLS atomics used for connection handling. + s.setRPCTLSAtomics(newTLSConfig) + // Kill any old listeners s.rpcCancel() // Update the authenticator, so any changes in TLS verification are applied. - s.auth.SetVerifyTLS(s.config.TLSConfig != nil && s.config.TLSConfig.EnableRPC && s.config.TLSConfig.VerifyServerHostname) + s.auth.SetVerifyTLS(s.config.TLSConfig != nil && s.rpcTLSEnabled.Load() && s.config.TLSConfig.VerifyServerHostname) s.rpcTLS = incomingTLS s.connPool.ReloadTLS(tlsWrap) @@ -2088,6 +2111,16 @@ func (s *Server) setReplyQueryMeta(stateStore *state.StateStore, table string, r return nil } +// setRPCTLSAtomics is a helper function to set the atomic values for RPC TLS +// handling based on the provided config. +// +// This should be used whenever the RPC TLS config is changed to ensure as it +// correctly handles nil configs which might otherwise be missed. +func (s *Server) setRPCTLSAtomics(cfg *config.TLSConfig) { + s.rpcTLSEnabled.Store(cfg != nil && cfg.EnableRPC) + s.rpcTLSUpgradeMode.Store(cfg != nil && cfg.RPCUpgradeMode) +} + // Region returns the region of the server func (s *Server) Region() string { return s.config.Region diff --git a/nomad/server_test.go b/nomad/server_test.go index ea490c6e48b..2a933b6fcfa 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -9,6 +9,7 @@ import ( "path" "runtime" "strings" + "sync/atomic" "testing" "time" @@ -441,7 +442,7 @@ func TestServer_Reload_TLSConnections_PlaintextToTLS_OnlyRPC(t *testing.T) { err := s1.reloadTLSConnections(newTLSConfig) assert.Nil(err) - assert.True(s1.config.TLSConfig.EnableRPC) + must.True(t, s1.rpcTLSEnabled.Load()) assert.True(s1.config.TLSConfig.CertificateInfoIsEqual(newTLSConfig)) codec := rpcClient(t, s1) @@ -702,3 +703,30 @@ func TestServer_PreventRaftDowngrade(t *testing.T) { // Downgrading Raft should prevent the server from starting. require.Error(t, err) } + +func TestServer_setRPCTLSAtomics(t *testing.T) { + ci.Parallel(t) + + testServer := &Server{ + rpcTLSEnabled: atomic.Bool{}, + rpcTLSUpgradeMode: atomic.Bool{}, + } + + testServer.setRPCTLSAtomics(nil) + must.False(t, testServer.rpcTLSEnabled.Load()) + must.False(t, testServer.rpcTLSUpgradeMode.Load()) + + testServer.setRPCTLSAtomics(&config.TLSConfig{ + EnableRPC: true, + RPCUpgradeMode: false, + }) + must.True(t, testServer.rpcTLSEnabled.Load()) + must.False(t, testServer.rpcTLSUpgradeMode.Load()) + + testServer.setRPCTLSAtomics(&config.TLSConfig{ + EnableRPC: true, + RPCUpgradeMode: true, + }) + must.True(t, testServer.rpcTLSEnabled.Load()) + must.True(t, testServer.rpcTLSUpgradeMode.Load()) +} From aba9df734b0eaa27fd723a8c5f237ccd2cc251c5 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Fri, 28 Nov 2025 14:03:49 +0000 Subject: [PATCH 2/2] changelog: add entry for #27167 --- .changelog/27167.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/27167.txt diff --git a/.changelog/27167.txt b/.changelog/27167.txt new file mode 100644 index 00000000000..9a005b5da1c --- /dev/null +++ b/.changelog/27167.txt @@ -0,0 +1,3 @@ +```release-note:bug +server: Fix a potential race condition when reloading the server agent when changing TLS configuration +```