Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/27167.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
server: Fix a potential race condition when reloading the server agent when changing TLS configuration
```
4 changes: 2 additions & 2 deletions nomad/rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,8 +301,8 @@ func (r *rpcHandler) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCC
}

// Enforce TLS if EnableRPC is set
if r.srv.config.TLSConfig.EnableRPC && !rpcCtx.TLS && pool.RPCType(buf[0]) != pool.RpcTLS {
if !r.srv.config.TLSConfig.RPCUpgradeMode {
if r.srv.rpcTLSEnabled.Load() && !rpcCtx.TLS && pool.RPCType(buf[0]) != pool.RpcTLS {
if !r.srv.rpcTLSUpgradeMode.Load() {
r.logger.Warn("non-TLS connection attempted with RequireTLS set", "remote_addr", conn.RemoteAddr())
conn.Close()
return
Expand Down
41 changes: 37 additions & 4 deletions nomad/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,22 @@ const (
// Server is Nomad server which manages the job queues,
// schedulers, and notification bus for agents.
type Server struct {

// config holds the server configuration. No fields are not mutated after
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// config holds the server configuration. No fields are not mutated after
// config holds the server configuration. No fields are mutated after

// initialization except for the TLS config, so it is safe to access them
// without a lock. Any fields that become reloadable, must be carefully
// assessed for concurrent access.
config *Config

// rpcTLSEnabled and rpcTLSUpgradeMode are used by the RPC connection
// handler to determine TLS enforcement. The values can change at runtime
// due to config reloading.
//
// The setRPCTLSAtomics helper function should be used to set these values
// as it correctly handles nil configs.
rpcTLSEnabled atomic.Bool
rpcTLSUpgradeMode atomic.Bool

logger log.InterceptLogger

// Connection pool to other Nomad servers
Expand Down Expand Up @@ -350,6 +364,8 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, consulConfigFunc
// Create the server
s := &Server{
config: config,
rpcTLSEnabled: atomic.Bool{},
rpcTLSUpgradeMode: atomic.Bool{},
Comment on lines +367 to +368
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't these actually be pointer values to account for unset fields?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The zero value stored is false which is right for these two configuration parameters and they are always set by the setRPCTLSAtomics functions no matter the contents of the TLS configuration or if it is nil. The concrete bool means we can perform the Load() atomic call in the RPC hot path without needing additional checks for nils, which feels safer but also easier to understand.

It is possible to turn them into pointers, but the benefit seems that it would save us the space of 2 x uint32 (8 bytes) when they are not set while requiring additional logic to ensure we correctly handle the pointer objects.

consulCatalog: consulCatalog,
connPool: pool.NewPool(logger, serverRPCCache, serverMaxStreams, tlsWrap, config.RPCSessionConfig),
logger: logger,
Expand All @@ -372,6 +388,10 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, consulConfigFunc
lockDelayTimer: lock.NewDelayTimer(),
}

// Store the server TLS RPC settings that will be used by the connection
// handler.
s.setRPCTLSAtomics(config.TLSConfig)

s.shutdownCtx, s.shutdownCancel = context.WithCancel(context.Background())
s.shutdownCh = s.shutdownCtx.Done()

Expand Down Expand Up @@ -690,16 +710,19 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error {
s.tlsWrap = tlsWrap
s.tlsWrapLock.Unlock()

// Keeping configuration in sync is important for other places that require
// access to config information, such as rpc.go, where we decide on what kind
// of network connections to accept depending on the server configuration
// Ensure the config object is updated. This is done to keep things in sync
// only; all access to TLS enforcement should be done via the atomics which
// are updated below.
s.config.TLSConfig = newTLSConfig
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we still have a race condition here if we read from this configuration? Even if nothing else in the server reads this config object, it's going to get read during serialization for the Query Self API

Not having a RW mutex on the (*Server) GetConfig() method smells fishy to me, but it also seems like we have a lot of places currently where we read the config without going through that method. So I'm also open to fixing this definitely-is-a-bug and coming back to a more comprehensive look at how we access/update the config struct later.


// Update the RPC TLS atomics used for connection handling.
s.setRPCTLSAtomics(newTLSConfig)

// Kill any old listeners
s.rpcCancel()

// Update the authenticator, so any changes in TLS verification are applied.
s.auth.SetVerifyTLS(s.config.TLSConfig != nil && s.config.TLSConfig.EnableRPC && s.config.TLSConfig.VerifyServerHostname)
s.auth.SetVerifyTLS(s.config.TLSConfig != nil && s.rpcTLSEnabled.Load() && s.config.TLSConfig.VerifyServerHostname)

s.rpcTLS = incomingTLS
s.connPool.ReloadTLS(tlsWrap)
Expand Down Expand Up @@ -2088,6 +2111,16 @@ func (s *Server) setReplyQueryMeta(stateStore *state.StateStore, table string, r
return nil
}

// setRPCTLSAtomics is a helper function to set the atomic values for RPC TLS
// handling based on the provided config.
//
// This should be used whenever the RPC TLS config is changed to ensure as it
// correctly handles nil configs which might otherwise be missed.
func (s *Server) setRPCTLSAtomics(cfg *config.TLSConfig) {
s.rpcTLSEnabled.Store(cfg != nil && cfg.EnableRPC)
s.rpcTLSUpgradeMode.Store(cfg != nil && cfg.RPCUpgradeMode)
}

// Region returns the region of the server
func (s *Server) Region() string {
return s.config.Region
Expand Down
30 changes: 29 additions & 1 deletion nomad/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"path"
"runtime"
"strings"
"sync/atomic"
"testing"
"time"

Expand Down Expand Up @@ -441,7 +442,7 @@ func TestServer_Reload_TLSConnections_PlaintextToTLS_OnlyRPC(t *testing.T) {

err := s1.reloadTLSConnections(newTLSConfig)
assert.Nil(err)
assert.True(s1.config.TLSConfig.EnableRPC)
must.True(t, s1.rpcTLSEnabled.Load())
assert.True(s1.config.TLSConfig.CertificateInfoIsEqual(newTLSConfig))

codec := rpcClient(t, s1)
Expand Down Expand Up @@ -702,3 +703,30 @@ func TestServer_PreventRaftDowngrade(t *testing.T) {
// Downgrading Raft should prevent the server from starting.
require.Error(t, err)
}

func TestServer_setRPCTLSAtomics(t *testing.T) {
ci.Parallel(t)

testServer := &Server{
rpcTLSEnabled: atomic.Bool{},
rpcTLSUpgradeMode: atomic.Bool{},
}

testServer.setRPCTLSAtomics(nil)
must.False(t, testServer.rpcTLSEnabled.Load())
must.False(t, testServer.rpcTLSUpgradeMode.Load())

testServer.setRPCTLSAtomics(&config.TLSConfig{
EnableRPC: true,
RPCUpgradeMode: false,
})
must.True(t, testServer.rpcTLSEnabled.Load())
must.False(t, testServer.rpcTLSUpgradeMode.Load())

testServer.setRPCTLSAtomics(&config.TLSConfig{
EnableRPC: true,
RPCUpgradeMode: true,
})
must.True(t, testServer.rpcTLSEnabled.Load())
must.True(t, testServer.rpcTLSUpgradeMode.Load())
}