Skip to content

Backport of host volumes: add configuration to GC on node GC into release/1.10.x #25935

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/25903.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
client: Add gc_volumes_on_node_gc configuration to delete host volumes when nodes are garbage collected
```
1 change: 1 addition & 0 deletions api/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ type Node struct {
Events []*NodeEvent
Drivers map[string]*DriverInfo
HostVolumes map[string]*HostVolumeInfo
GCVolumesOnNodeGC bool
HostNetworks map[string]*HostNetworkInfo
CSIControllerPlugins map[string]*CSIInfo
CSINodePlugins map[string]*CSIInfo
Expand Down
2 changes: 2 additions & 0 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1595,6 +1595,8 @@ func (c *Client) setupNode() error {
node.HostVolumes[k] = v.Copy()
}
}
node.GCVolumesOnNodeGC = newConfig.GCVolumesOnNodeGC

if node.HostNetworks == nil {
if l := len(newConfig.HostNetworks); l != 0 {
node.HostNetworks = make(map[string]*structs.ClientHostNetworkConfig, l)
Expand Down
5 changes: 5 additions & 0 deletions client/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,11 @@ type Config struct {
// before garbage collection is triggered.
GCMaxAllocs int

// GCVolumesOnNodeGC indicates that the server should GC any dynamic host
// volumes on this node when the node is GC'd. This should only be set if
// you know that a GC'd node can never come back
GCVolumesOnNodeGC bool

// NoHostUUID disables using the host's UUID and will force generation of a
// random UUID.
NoHostUUID bool
Expand Down
2 changes: 2 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -946,6 +946,8 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) {
conf.GCDiskUsageThreshold = agentConfig.Client.GCDiskUsageThreshold
conf.GCInodeUsageThreshold = agentConfig.Client.GCInodeUsageThreshold
conf.GCMaxAllocs = agentConfig.Client.GCMaxAllocs
conf.GCVolumesOnNodeGC = agentConfig.Client.GCVolumesOnNodeGC

if agentConfig.Client.NoHostUUID != nil {
conf.NoHostUUID = *agentConfig.Client.NoHostUUID
} else {
Expand Down
8 changes: 8 additions & 0 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,11 @@ type ClientConfig struct {
// before garbage collection is triggered.
GCMaxAllocs int `hcl:"gc_max_allocs"`

// GCVolumesOnNodeGC indicates that the server should GC any dynamic host
// volumes on this node when the node is GC'd. This should only be set if
// you know that a GC'd node can never come back
GCVolumesOnNodeGC bool `hcl:"gc_volumes_on_node_gc"`

// NoHostUUID disables using the host's UUID and will force generation of a
// random UUID.
NoHostUUID *bool `hcl:"no_host_uuid"`
Expand Down Expand Up @@ -2543,6 +2548,9 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig {
if b.GCMaxAllocs != 0 {
result.GCMaxAllocs = b.GCMaxAllocs
}
if b.GCVolumesOnNodeGC {
result.GCVolumesOnNodeGC = b.GCVolumesOnNodeGC
}
// NoHostUUID defaults to true, merge if false
if b.NoHostUUID != nil {
result.NoHostUUID = b.NoHostUUID
Expand Down
1 change: 1 addition & 0 deletions command/agent/config_parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ var basicConfig = &Config{
GCDiskUsageThreshold: 82,
GCInodeUsageThreshold: 91,
GCMaxAllocs: 50,
GCVolumesOnNodeGC: true,
NoHostUUID: pointer.Of(false),
DisableRemoteExec: true,
HostVolumes: []*structs.ClientHostVolumeConfig{
Expand Down
12 changes: 12 additions & 0 deletions command/agent/host_volume_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package agent

import (
"net/http"
"strconv"
"strings"

"github.com/hashicorp/nomad/nomad/structs"
Expand Down Expand Up @@ -132,6 +133,17 @@ func (s *HTTPServer) hostVolumeDelete(id string, resp http.ResponseWriter, req *
args := structs.HostVolumeDeleteRequest{VolumeID: id}
s.parseWriteRequest(req, &args.WriteRequest)

raw := req.URL.Query().Get("force")
var force bool
if raw != "" {
var err error
force, err = strconv.ParseBool(raw)
if err != nil {
return nil, CodedError(400, "invalid force value")
}
}
args.Force = force

var out structs.HostVolumeDeleteResponse
if err := s.agent.RPC("HostVolume.Delete", &args, &out); err != nil {
return nil, err
Expand Down
1 change: 1 addition & 0 deletions command/agent/testdata/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ client {
gc_disk_usage_threshold = 82
gc_inode_usage_threshold = 91
gc_max_allocs = 50
gc_volumes_on_node_gc = true
no_host_uuid = false
disable_remote_exec = true

Expand Down
1 change: 1 addition & 0 deletions command/agent/testdata/basic.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
"gc_interval": "6s",
"gc_max_allocs": 50,
"gc_parallel_destroys": 6,
"gc_volumes_on_node_gc": true,
"host_volume": [
{
"tmp": [
Expand Down
2 changes: 1 addition & 1 deletion nomad/host_volume_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ func (v *HostVolume) Delete(args *structs.HostVolumeDeleteRequest, reply *struct
// serialize client RPC and raft write per volume ID
index, err := v.serializeCall(vol.ID, "delete", func() (uint64, error) {
if err := v.deleteVolume(vol); err != nil {
if structs.IsErrUnknownNode(err) {
if structs.IsErrUnknownNode(err) || structs.IsErrNoNodeConn(err) {
if !args.Force {
return 0, fmt.Errorf(
"volume cannot be removed from unknown node without force=true")
Expand Down
4 changes: 2 additions & 2 deletions nomad/state/events_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ func TestNodeEventsFromChanges(t *testing.T) {
return upsertNodeTxn(tx, tx.Index, testNode())
},
Mutate: func(s *StateStore, tx *txn) error {
return deleteNodeTxn(tx, tx.Index, []string{testNodeID()})
return s.deleteNodeTxn(tx, tx.Index, []string{testNodeID()})
},
WantEvents: []structs.Event{{
Topic: structs.TopicNode,
Expand All @@ -841,7 +841,7 @@ func TestNodeEventsFromChanges(t *testing.T) {
return upsertNodeTxn(tx, tx.Index, testNode(nodeIDTwo))
},
Mutate: func(s *StateStore, tx *txn) error {
return deleteNodeTxn(tx, tx.Index, []string{testNodeID(), testNodeIDTwo()})
return s.deleteNodeTxn(tx, tx.Index, []string{testNodeID(), testNodeIDTwo()})
},
WantEvents: []structs.Event{
{
Expand Down
9 changes: 7 additions & 2 deletions nomad/state/state_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -1052,14 +1052,14 @@ func (s *StateStore) DeleteNode(msgType structs.MessageType, index uint64, nodes
txn := s.db.WriteTxn(index)
defer txn.Abort()

err := deleteNodeTxn(txn, index, nodes)
err := s.deleteNodeTxn(txn, index, nodes)
if err != nil {
return nil
}
return txn.Commit()
}

func deleteNodeTxn(txn *txn, index uint64, nodes []string) error {
func (s *StateStore) deleteNodeTxn(txn *txn, index uint64, nodes []string) error {
if len(nodes) == 0 {
return fmt.Errorf("node ids missing")
}
Expand All @@ -1082,6 +1082,11 @@ func deleteNodeTxn(txn *txn, index uint64, nodes []string) error {
if err := deleteNodeCSIPlugins(txn, node, index); err != nil {
return fmt.Errorf("csi plugin delete failed: %v", err)
}
if node.GCVolumesOnNodeGC {
if err := s.deleteHostVolumesOnNode(txn, index, node.ID); err != nil {
return fmt.Errorf("dynamic host volume delete failed: %v", err)
}
}
}

if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
Expand Down
43 changes: 40 additions & 3 deletions nomad/state/state_store_host_volumes.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,20 @@ func (s *StateStore) DeleteHostVolume(index uint64, ns string, id string) error
txn := s.db.WriteTxnMsgT(structs.HostVolumeDeleteRequestType, index)
defer txn.Abort()

if err := s.deleteHostVolumeTxn(txn, index, ns, id); err != nil {
return err
}

if err := txn.Insert(tableIndex, &IndexEntry{TableHostVolumes, index}); err != nil {
return fmt.Errorf("index update failed: %w", err)
}

return txn.Commit()
}

// deleteHostVolumeTxn implements a single dynamic host volume delete. The
// caller is responsible for updating the index and committing the transaction.
func (s *StateStore) deleteHostVolumeTxn(txn *txn, index uint64, ns string, id string) error {
obj, err := txn.First(TableHostVolumes, indexID, ns, id)
if err != nil {
return err
Expand Down Expand Up @@ -153,12 +167,35 @@ func (s *StateStore) DeleteHostVolume(index uint64, ns string, id string) error
}
}

if err := txn.Insert(tableIndex, &IndexEntry{TableHostVolumes, index}); err != nil {
return fmt.Errorf("index update failed: %w", err)
return nil
}

// deleteHostVolumesOnNode removes every dynamic host volume registered
// against the given node, as part of garbage-collecting that node. The host
// volumes table index is bumped only when at least one volume was removed.
// The caller is responsible for committing the transaction.
func (s *StateStore) deleteHostVolumesOnNode(txn *txn, index uint64, nodeID string) error {
	iter, err := s.HostVolumesByNodeID(nil, nodeID, SortDefault)
	if err != nil {
		return err
	}

	removedAny := false // only bump the table index if we deleted something
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		vol := raw.(*structs.HostVolume)
		if err := s.deleteHostVolumeTxn(txn, index, vol.Namespace, vol.ID); err != nil {
			return err
		}
		removedAny = true
	}

	if !removedAny {
		return nil
	}
	if err := txn.Insert(tableIndex, &IndexEntry{TableHostVolumes, index}); err != nil {
		return fmt.Errorf("index update failed: %w", err)
	}
	return nil
}

// HostVolumes queries all the host volumes and is mostly used for
Expand Down
8 changes: 8 additions & 0 deletions nomad/state/state_store_host_volumes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ func TestStateStore_HostVolumes_CRUD(t *testing.T) {

// simulate a node registering one of the volumes
nodes[2] = nodes[2].Copy()
nodes[2].GCVolumesOnNodeGC = true
nodes[2].HostVolumes = map[string]*structs.ClientHostVolumeConfig{"example": {
Name: vols[2].Name,
Path: vols[2].HostPath,
Expand Down Expand Up @@ -180,6 +181,13 @@ func TestStateStore_HostVolumes_CRUD(t *testing.T) {
index++
must.NoError(t, store.UpdateAllocsFromClient(structs.MsgTypeTestSetup,
index, []*structs.Allocation{alloc}))

index++
must.NoError(t, store.DeleteNode(structs.MsgTypeTestSetup, index, []string{vol2.NodeID}))
iter, err = store.HostVolumesByNodeID(nil, vol2.NodeID, SortDefault)
must.NoError(t, err)
must.Nil(t, iter.Next(), must.Sprint("expected volume to be GC'd with node"))

for _, v := range vols {
index++
must.NoError(t, store.DeleteHostVolume(index, v.Namespace, v.ID))
Expand Down
5 changes: 5 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2157,6 +2157,11 @@ type Node struct {
// HostVolumes is a map of host volume names to their configuration
HostVolumes map[string]*ClientHostVolumeConfig

// GCVolumesOnNodeGC indicates that the server should GC any dynamic host
// volumes on this node when the node is GC'd. This should only be set if
// you know that a GC'd node can never come back
GCVolumesOnNodeGC bool

// HostNetworks is a map of host host_network names to their configuration
HostNetworks map[string]*ClientHostNetworkConfig

Expand Down
6 changes: 6 additions & 0 deletions website/content/docs/configuration/client.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,12 @@ client {
parallel destroys allowed by the garbage collector. This value should be
relatively low to avoid high resource usage during garbage collections.

- `gc_volumes_on_node_gc` `(bool: false)` - Specifies that the server should
delete any dynamic host volumes on this node when the node is garbage
collected. You should only set this to `true` if you know that
garbage-collected nodes will never rejoin the cluster, such as with ephemeral
cloud hosts.

- `no_host_uuid` `(bool: true)` - By default a random node UUID will be
generated, but setting this to `false` will use the system's UUID.

Expand Down
Loading