From 0e9cce54f439496f2b3ea00a52d436094900dee4 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Wed, 16 Jul 2025 16:34:18 +0000 Subject: [PATCH 1/3] Add support for using DNS names instead of raw IPs for imex daemons Signed-off-by: Kevin Klues --- cmd/compute-domain-daemon/dnsnames.go | 229 ++++++++++++++++++++++++++ cmd/compute-domain-daemon/main.go | 64 ++++++- cmd/compute-domain-daemon/process.go | 10 +- pkg/featuregates/featuregates.go | 10 ++ 4 files changed, 303 insertions(+), 10 deletions(-) create mode 100644 cmd/compute-domain-daemon/dnsnames.go diff --git a/cmd/compute-domain-daemon/dnsnames.go b/cmd/compute-domain-daemon/dnsnames.go new file mode 100644 index 000000000..c0de78c7a --- /dev/null +++ b/cmd/compute-domain-daemon/dnsnames.go @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "fmt" + "maps" + "os" + "path/filepath" + "strings" + "sync" + + "k8s.io/klog/v2" + + nvapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1" +) + +const ( + maxDNSNames = 18 + hostsFilePath = "/etc/hosts" + dnsNamePrefix = "compute-domain-daemon-" + dnsNameFormat = dnsNamePrefix + "%d" +) + +// IPToDNSNameMap holds a map of IP Addresses to DNS names. +type IPToDNSNameMap map[string]string + +// DNSNameManager manages the allocation of static DNS names to IP addresses. 
+type DNSNameManager struct { + sync.Mutex + ipToDNSName IPToDNSNameMap + cliqueID string + nodesConfigPath string +} + +// NewDNSNameManager creates a new DNS name manager. +func NewDNSNameManager(cliqueID string, nodesConfigPath string) *DNSNameManager { + return &DNSNameManager{ + ipToDNSName: make(IPToDNSNameMap), + cliqueID: cliqueID, + nodesConfigPath: nodesConfigPath, + } +} + +// UpdateDNSNameMappings updates the /etc/hosts file with any new IP to DNS name mappings. +func (m *DNSNameManager) UpdateDNSNameMappings(nodes []*nvapi.ComputeDomainNode) error { + m.Lock() + defer m.Unlock() + + // Make a local copy of the current ipToDNSName mappings + ipToDNSName := maps.Clone(m.ipToDNSName) + + // Prefilter nodes to only consider those with the matching cliqueID + var cliqueNodes []*nvapi.ComputeDomainNode + for _, node := range nodes { + if node.CliqueID == m.cliqueID { + cliqueNodes = append(cliqueNodes, node) + } + } + + // Find and remove stale IPs from map + currentIPs := make(map[string]bool) + for _, node := range cliqueNodes { + currentIPs[node.IPAddress] = true + } + for ip := range ipToDNSName { + if !currentIPs[ip] { + delete(ipToDNSName, ip) + } + } + + // Add new IPs to map (filling in holes where others were removed) + for _, node := range cliqueNodes { + // If IP already has a DNS name, skip it + if _, exists := ipToDNSName[node.IPAddress]; exists { + continue + } + + dnsName, err := m.allocateDNSName(node.IPAddress) + if err != nil { + return fmt.Errorf("failed to allocate DNS name for IP %s: %w", node.IPAddress, err) + } + + // Assign the IP -> DNS name mapping + ipToDNSName[node.IPAddress] = dnsName + } + + // If the existing ipToDNSName mappings are unchanged, exit early + if maps.Equal(ipToDNSName, m.ipToDNSName) { + return nil + } + + // Otherwise, update the cached ipToDNSName mapping + m.ipToDNSName = ipToDNSName + + // And update the hosts file with new mappings + return m.updateHostsFile() +} + +// LogDNSNameMappings logs the current 
compute-domain-daemon mappings from memory. +func (m *DNSNameManager) LogDNSNameMappings() { + m.Lock() + defer m.Unlock() + + if len(m.ipToDNSName) == 0 { + klog.Infof("Current compute-domain-daemon mappings: empty") + return + } + + klog.Infof("Current compute-domain-daemon mappings:") + for ip, dnsName := range m.ipToDNSName { + klog.Infof(" %s -> %s", ip, dnsName) + } +} + +// allocateDNSName allocates a DNS name for an IP address, reusing existing DNS names if possible. +func (m *DNSNameManager) allocateDNSName(ip string) (string, error) { + // If IP already has a DNS name, return it + if dnsName, exists := m.ipToDNSName[ip]; exists { + return dnsName, nil + } + + // Find the next available DNS name + for i := 0; i < maxDNSNames; i++ { + dnsName := fmt.Sprintf(dnsNameFormat, i) + // Check if this DNS name is already in use + inUse := false + for _, existingDNSName := range m.ipToDNSName { + if existingDNSName == dnsName { + inUse = true + break + } + } + if !inUse { + m.ipToDNSName[ip] = dnsName + return dnsName, nil + } + } + + // If all DNS names are used, return an error + return "", fmt.Errorf("no DNS names available (max: %d)", maxDNSNames) +} + +// updateHostsFile updates the /etc/hosts file with current IP to DNS name mappings. 
+func (m *DNSNameManager) updateHostsFile() error { + // Read hosts file + hostsContent, err := os.ReadFile(hostsFilePath) + if err != nil { + return fmt.Errorf("failed to read %s: %w", hostsFilePath, err) + } + + // Grab any lines to preserve, skipping existing DNS name mappings + var preservedLines []string + for _, line := range strings.Split(string(hostsContent), "\n") { + line = strings.TrimSpace(line) + + // Skip existing compute-domain-daemon mappings + if strings.Contains(line, dnsNamePrefix) { + continue + } + + // Keep all other lines + preservedLines = append(preservedLines, line) + } + + // Add preserved lines + var newHostsContent strings.Builder + for _, line := range preservedLines { + newHostsContent.WriteString(line) + newHostsContent.WriteString("\n") + } + + // Add a separator comment + newHostsContent.WriteString("# Compute Domain Daemon mappings\n") + + // Add new DNS name mappings + for ip, dnsName := range m.ipToDNSName { + newHostsContent.WriteString(fmt.Sprintf("%s\t%s\n", ip, dnsName)) + } + + // Write the updated hosts file + if err := os.WriteFile(hostsFilePath, []byte(newHostsContent.String()), 0644); err != nil { + return fmt.Errorf("failed to write %s: %w", hostsFilePath, err) + } + + return nil +} + +// WriteNodesConfig creates a static nodes config file with DNS names. 
+func (m *DNSNameManager) WriteNodesConfig() error { + // Ensure the directory exists + dir := filepath.Dir(m.nodesConfigPath) + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create directory %s: %w", dir, err) + } + + // Create or overwrite the nodesConfig file + f, err := os.Create(m.nodesConfigPath) + if err != nil { + return fmt.Errorf("failed to create nodes config file: %w", err) + } + defer f.Close() + + // Write static DNS names + for i := 0; i < maxDNSNames; i++ { + dnsName := fmt.Sprintf(dnsNameFormat, i) + if _, err := fmt.Fprintf(f, "%s\n", dnsName); err != nil { + return fmt.Errorf("failed to write to nodes config file: %w", err) + } + } + + klog.Infof("Created static nodes config file with %d DNS names using format %s", maxDNSNames, dnsNameFormat) + + return nil +} diff --git a/cmd/compute-domain-daemon/main.go b/cmd/compute-domain-daemon/main.go index aa11110c4..cbd64acdf 100644 --- a/cmd/compute-domain-daemon/main.go +++ b/cmd/compute-domain-daemon/main.go @@ -33,6 +33,7 @@ import ( "github.com/urfave/cli/v2" nvapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1" + "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates" "github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags" ) @@ -163,7 +164,6 @@ func newApp() *cli.App { // Run invokes the IMEX daemon and manages its lifecycle. func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { - // Support heterogeneous compute domain if flags.cliqueID == "" { fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.") @@ -189,6 +189,18 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { } // Prepare IMEX daemon process manager (not invoking the process yet). 
+ var dnsNameManager *DNSNameManager + if featuregates.Enabled(featuregates.IMEXDaemonsWithDNSNames) { + // Prepare DNS name manager + dnsNameManager = NewDNSNameManager(flags.cliqueID, nodesConfigPath) + + // Create static nodes config file with DNS names + if err := dnsNameManager.WriteNodesConfig(); err != nil { + return fmt.Errorf("failed to create static nodes config: %w", err) + } + } + + // Prepare IMEX daemon process manager. daemonCommandLine := []string{imexBinaryPath, "-c", imexConfigPath} processManager := NewProcessManager(daemonCommandLine) @@ -210,14 +222,23 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { } }() - // Start IMEXDaemonUpdateLoop() in goroutine (watches for CD status - // changes, and restarts the IMEX daemon as needed). + // Start IMEX daemon update loop in goroutine (watches for CD status + // changes and manages IMEX daemon updates). wg.Add(1) go func() { defer wg.Done() - if err := IMEXDaemonUpdateLoop(ctx, controller, flags.cliqueID, processManager); err != nil { - klog.Errorf("IMEXDaemonUpdateLoop failed, initiate shutdown: %s", err) - cancel() + if featuregates.Enabled(featuregates.IMEXDaemonsWithDNSNames) { + // Use new DNS name-based functionality + if err := IMEXDaemonUpdateLoopWithDNSNames(ctx, controller, processManager, dnsNameManager); err != nil { + klog.Errorf("IMEXDaemonUpdateLoop failed, initiate shutdown: %s", err) + cancel() + } + } else { + // Use original IP-based functionality + if err := IMEXDaemonUpdateLoopWithIPs(ctx, controller, flags.cliqueID, processManager); err != nil { + klog.Errorf("IMEXDaemonUpdateLoop failed, initiate shutdown: %s", err) + cancel() + } } }() @@ -239,14 +260,14 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { return nil } -// IMEXDaemonUpdateLoop() reacts to ComputeDomain status changes by updating the +// IMEXDaemonUpdateLoopWithIPs reacts to ComputeDomain status changes by updating the // IMEX daemon nodes config file 
and (re)starting the IMEX daemon process. -func IMEXDaemonUpdateLoop(ctx context.Context, controller *Controller, cliqueID string, pm *ProcessManager) error { +func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cliqueID string, pm *ProcessManager) error { for { klog.Infof("wait for nodes update") select { case <-ctx.Done(): - klog.Infof("shutdown: stop IMEXDaemonUpdateLoop") + klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithIPs") return nil case nodes := <-controller.GetNodesUpdateChan(): if err := writeNodesConfig(cliqueID, nodes); err != nil { @@ -263,6 +284,31 @@ func IMEXDaemonUpdateLoop(ctx context.Context, controller *Controller, cliqueID } } +// IMEXDaemonUpdateLoopWithDNSNames reacts to ComputeDomain status changes by +// updating the /etc/hosts file with IP to DNS name mappings. This relies on +// the IMEX daemon to pick up these changes automatically (and quickly) -- +// which it seems to do via grpc-based health-checking of individual +// connections. We only restart the IMEX daemon if it crashes (both +// unexpectedly and expectedly). +func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controller, processManager *ProcessManager, dnsNameManager *DNSNameManager) error { + for { + klog.Infof("wait for nodes update") + select { + case <-ctx.Done(): + klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithDNSNames") + return nil + case nodes := <-controller.GetNodesUpdateChan(): + if err := dnsNameManager.UpdateDNSNameMappings(nodes); err != nil { + return fmt.Errorf("failed to update DNS name => IP mappings: %w", err) + } + if err := processManager.EnsureStarted(); err != nil { + return fmt.Errorf("failed to ensure IMEX daemon is started: %w", err) + } + dnsNameManager.LogDNSNameMappings() + } + } +} + // check verifies if the node is IMEX capable and if so, checks if the IMEX daemon is ready. // It returns an error if any step fails. 
func check(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { diff --git a/cmd/compute-domain-daemon/process.go b/cmd/compute-domain-daemon/process.go index 7704d1698..4a832087e 100644 --- a/cmd/compute-domain-daemon/process.go +++ b/cmd/compute-domain-daemon/process.go @@ -46,7 +46,7 @@ func NewProcessManager(cmd []string) *ProcessManager { return m } -// Restart() starts or restarts the process. +// Restart starts or restarts the process. func (m *ProcessManager) Restart() error { if m.handle != nil { if err := m.stop(); err != nil { @@ -56,6 +56,14 @@ func (m *ProcessManager) Restart() error { return m.start() } +// EnsureStarted starts the process if it is not already running. If the process is already started, this is a no-op. +func (m *ProcessManager) EnsureStarted() error { + if m.handle != nil { + return nil + } + return m.start() +} + func (m *ProcessManager) start() error { m.Lock() defer m.Unlock() diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go index aa4777e7b..ce113fb87 100644 --- a/pkg/featuregates/featuregates.go +++ b/pkg/featuregates/featuregates.go @@ -33,6 +33,9 @@ const ( // MPSSupport allows MPS (Multi-Process Service) settings to be specified. MPSSupport featuregate.Feature = "MPSSupport" + + // IMEXDaemonsWithDNSNames allows using DNS names instead of raw IPs for IMEX daemons. + IMEXDaemonsWithDNSNames featuregate.Feature = "IMEXDaemonsWithDNSNames" ) // FeatureGates is a singleton representing the set of all feature gates and their values. @@ -56,6 +59,13 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.VersionedSpecs{ Version: version.MajorMinor(25, 8), }, }, + IMEXDaemonsWithDNSNames: { + { + Default: false, + PreRelease: featuregate.Alpha, + Version: version.MajorMinor(25, 8), + }, + }, } // init instantiates and sets the singleton 'FeatureGates' variable with newFeatureGates(). 
From 27b537ab7781b2a0f2dda417ead0566546cc2856 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Wed, 16 Jul 2025 20:37:40 +0000 Subject: [PATCH 2/3] Make maxNodesPerIMEXDomain configurable (default at 18) Signed-off-by: Kevin Klues --- cmd/compute-domain-controller/controller.go | 16 ++++++++----- cmd/compute-domain-controller/daemonset.go | 2 ++ cmd/compute-domain-controller/main.go | 20 ++++++++++++++--- cmd/compute-domain-daemon/dnsnames.go | 25 +++++++++++---------- cmd/compute-domain-daemon/main.go | 9 +++++++- templates/compute-domain-daemon.tmpl.yaml | 2 ++ 6 files changed, 52 insertions(+), 22 deletions(-) diff --git a/cmd/compute-domain-controller/controller.go b/cmd/compute-domain-controller/controller.go index 3f5893dc7..b507c9ff4 100644 --- a/cmd/compute-domain-controller/controller.go +++ b/cmd/compute-domain-controller/controller.go @@ -38,6 +38,9 @@ type ManagerConfig struct { // imageName is the full image name to use when rendering templates imageName string + // maxNodesPerIMEXDomain is the maximum number of nodes per IMEX domain to allocate + maxNodesPerIMEXDomain int + // clientsets provides access to various Kubernetes API client interfaces clientsets flags.ClientSets @@ -67,12 +70,13 @@ func (c *Controller) Run(ctx context.Context) error { workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter()) managerConfig := &ManagerConfig{ - driverName: c.config.driverName, - driverNamespace: c.config.flags.namespace, - additionalNamespaces: c.config.flags.additionalNamespaces.Value(), - imageName: c.config.flags.imageName, - clientsets: c.config.clientsets, - workQueue: workQueue, + driverName: c.config.driverName, + driverNamespace: c.config.flags.namespace, + additionalNamespaces: c.config.flags.additionalNamespaces.Value(), + imageName: c.config.flags.imageName, + maxNodesPerIMEXDomain: c.config.flags.maxNodesPerIMEXDomain, + clientsets: c.config.clientsets, + workQueue: workQueue, } cdManager := NewComputeDomainManager(managerConfig) diff 
--git a/cmd/compute-domain-controller/daemonset.go b/cmd/compute-domain-controller/daemonset.go index 29e43aa44..600ac20cb 100644 --- a/cmd/compute-domain-controller/daemonset.go +++ b/cmd/compute-domain-controller/daemonset.go @@ -50,6 +50,7 @@ type DaemonSetTemplateData struct { ComputeDomainLabelValue types.UID ResourceClaimTemplateName string ImageName string + MaxNodesPerIMEXDomain int FeatureGates map[string]bool } @@ -200,6 +201,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, cd *nvapi.ComputeDomain) ComputeDomainLabelValue: cd.UID, ResourceClaimTemplateName: rct.Name, ImageName: m.config.imageName, + MaxNodesPerIMEXDomain: m.config.maxNodesPerIMEXDomain, FeatureGates: featuregates.ToMap(), } diff --git a/cmd/compute-domain-controller/main.go b/cmd/compute-domain-controller/main.go index fb30eff33..411cb3b3f 100644 --- a/cmd/compute-domain-controller/main.go +++ b/cmd/compute-domain-controller/main.go @@ -44,6 +44,12 @@ import ( const ( DriverName = "compute-domain.nvidia.com" + + // This constant provides a reasonable default for the maximum size of + // a given IMEX Domain. On GB200 and GB300 the limit is 18, so we pick + // this for now. It can be overridden as an environment variable or + // command line argument as required. 
+ defaultMaxNodesPerIMEXDomain = 18 ) type Flags struct { @@ -51,9 +57,10 @@ type Flags struct { loggingConfig *flags.LoggingConfig featureGateConfig *flags.FeatureGateConfig - podName string - namespace string - imageName string + podName string + namespace string + imageName string + maxNodesPerIMEXDomain int httpEndpoint string metricsPath string @@ -103,6 +110,13 @@ func newApp() *cli.App { Destination: &flags.imageName, EnvVars: []string{"IMAGE_NAME"}, }, + &cli.IntFlag{ + Name: "max-nodes-per-imex-domain", + Usage: "The maximum number of possible nodes per IMEX domain", + Value: defaultMaxNodesPerIMEXDomain, + EnvVars: []string{"MAX_NODES_PER_IMEX_DOMAIN"}, + Destination: &flags.maxNodesPerIMEXDomain, + }, &cli.StringFlag{ Category: "HTTP server:", Name: "http-endpoint", diff --git a/cmd/compute-domain-daemon/dnsnames.go b/cmd/compute-domain-daemon/dnsnames.go index c0de78c7a..77a816861 100644 --- a/cmd/compute-domain-daemon/dnsnames.go +++ b/cmd/compute-domain-daemon/dnsnames.go @@ -30,7 +30,6 @@ import ( ) const ( - maxDNSNames = 18 hostsFilePath = "/etc/hosts" dnsNamePrefix = "compute-domain-daemon-" dnsNameFormat = dnsNamePrefix + "%d" @@ -42,17 +41,19 @@ type IPToDNSNameMap map[string]string // DNSNameManager manages the allocation of static DNS names to IP addresses. type DNSNameManager struct { sync.Mutex - ipToDNSName IPToDNSNameMap - cliqueID string - nodesConfigPath string + ipToDNSName IPToDNSNameMap + cliqueID string + maxNodesPerIMEXDomain int + nodesConfigPath string } // NewDNSNameManager creates a new DNS name manager. 
-func NewDNSNameManager(cliqueID string, nodesConfigPath string) *DNSNameManager { +func NewDNSNameManager(cliqueID string, maxNodesPerIMEXDomain int, nodesConfigPath string) *DNSNameManager { return &DNSNameManager{ - ipToDNSName: make(IPToDNSNameMap), - cliqueID: cliqueID, - nodesConfigPath: nodesConfigPath, + ipToDNSName: make(IPToDNSNameMap), + cliqueID: cliqueID, + maxNodesPerIMEXDomain: maxNodesPerIMEXDomain, + nodesConfigPath: nodesConfigPath, } } @@ -135,7 +136,7 @@ func (m *DNSNameManager) allocateDNSName(ip string) (string, error) { } // Find the next available DNS name - for i := 0; i < maxDNSNames; i++ { + for i := 0; i < m.maxNodesPerIMEXDomain; i++ { dnsName := fmt.Sprintf(dnsNameFormat, i) // Check if this DNS name is already in use inUse := false @@ -152,7 +153,7 @@ func (m *DNSNameManager) allocateDNSName(ip string) (string, error) { } // If all DNS names are used, return an error - return "", fmt.Errorf("no DNS names available (max: %d)", maxDNSNames) + return "", fmt.Errorf("no DNS names available (max: %d)", m.maxNodesPerIMEXDomain) } // updateHostsFile updates the /etc/hosts file with current IP to DNS name mappings. 
@@ -216,14 +217,14 @@ func (m *DNSNameManager) WriteNodesConfig() error { defer f.Close() // Write static DNS names - for i := 0; i < maxDNSNames; i++ { + for i := 0; i < m.maxNodesPerIMEXDomain; i++ { dnsName := fmt.Sprintf(dnsNameFormat, i) if _, err := fmt.Fprintf(f, "%s\n", dnsName); err != nil { return fmt.Errorf("failed to write to nodes config file: %w", err) } } - klog.Infof("Created static nodes config file with %d DNS names using format %s", maxDNSNames, dnsNameFormat) + klog.Infof("Created static nodes config file with %d DNS names using format %s", m.maxNodesPerIMEXDomain, dnsNameFormat) return nil } diff --git a/cmd/compute-domain-daemon/main.go b/cmd/compute-domain-daemon/main.go index cbd64acdf..311f7d7b6 100644 --- a/cmd/compute-domain-daemon/main.go +++ b/cmd/compute-domain-daemon/main.go @@ -52,6 +52,7 @@ type Flags struct { computeDomainNamespace string nodeName string podIP string + maxNodesPerIMEXDomain int loggingConfig *flags.LoggingConfig featureGateConfig *flags.FeatureGateConfig } @@ -129,6 +130,12 @@ func newApp() *cli.App { EnvVars: []string{"POD_IP"}, Destination: &flags.podIP, }, + &cli.IntFlag{ + Name: "max-nodes-per-imex-domain", + Usage: "The maximum number of possible nodes per IMEX domain", + EnvVars: []string{"MAX_NODES_PER_IMEX_DOMAIN"}, + Destination: &flags.maxNodesPerIMEXDomain, + }, } cliFlags = append(cliFlags, flags.featureGateConfig.Flags()...) cliFlags = append(cliFlags, flags.loggingConfig.Flags()...) 
@@ -192,7 +199,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { var dnsNameManager *DNSNameManager if featuregates.Enabled(featuregates.IMEXDaemonsWithDNSNames) { // Prepare DNS name manager - dnsNameManager = NewDNSNameManager(flags.cliqueID, nodesConfigPath) + dnsNameManager = NewDNSNameManager(flags.cliqueID, flags.maxNodesPerIMEXDomain, nodesConfigPath) // Create static nodes config file with DNS names if err := dnsNameManager.WriteNodesConfig(); err != nil { diff --git a/templates/compute-domain-daemon.tmpl.yaml b/templates/compute-domain-daemon.tmpl.yaml index 4d2a602d2..143de3326 100644 --- a/templates/compute-domain-daemon.tmpl.yaml +++ b/templates/compute-domain-daemon.tmpl.yaml @@ -26,6 +26,8 @@ spec: image: {{ .ImageName }} command: ["compute-domain-daemon", "-v", "6", "run"] env: + - name: MAX_NODES_PER_IMEX_DOMAIN + value: "{{ .MaxNodesPerIMEXDomain }}" - name: NODE_NAME valueFrom: fieldRef: From 9715887fe8fc4613e66b419b6140907e58f1f070 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Sat, 30 Aug 2025 07:37:59 +0000 Subject: [PATCH 3/3] Ensure consistent dnsname --> IMEXDaemonIP mapping on all nodes Without this the IMEX daemons were getting confused if the same DNS name was used by different nodes to point to different IMEX daemons in the ensemble. 
Signed-off-by: Kevin Klues --- .../resource/v1beta1/computedomain.go | 7 +++ cmd/compute-domain-daemon/computedomain.go | 47 +++++++++++++++++++ cmd/compute-domain-daemon/controller.go | 3 ++ cmd/compute-domain-daemon/dnsnames.go | 38 +++++---------- cmd/compute-domain-daemon/main.go | 1 + .../resource.nvidia.com_computedomains.yaml | 7 +++ 6 files changed, 77 insertions(+), 26 deletions(-) diff --git a/api/nvidia.com/resource/v1beta1/computedomain.go b/api/nvidia.com/resource/v1beta1/computedomain.go index ea48caa43..a05807f4e 100644 --- a/api/nvidia.com/resource/v1beta1/computedomain.go +++ b/api/nvidia.com/resource/v1beta1/computedomain.go @@ -91,4 +91,11 @@ type ComputeDomainNode struct { Name string `json:"name"` IPAddress string `json:"ipAddress"` CliqueID string `json:"cliqueID"` + // The Index field is used to ensure a consistent IP-to-DNS name + // mapping across all machines within an IMEX domain. Each node's index + // directly determines its DNS name. It is marked as optional (but not + // omitempty) in order to support downgrades and avoid an API bump. 
+ // +optional + // +kubebuilder:validation:Optional + Index int `json:"index"` } diff --git a/cmd/compute-domain-daemon/computedomain.go b/cmd/compute-domain-daemon/computedomain.go index 8c847f5d0..643342912 100644 --- a/cmd/compute-domain-daemon/computedomain.go +++ b/cmd/compute-domain-daemon/computedomain.go @@ -218,9 +218,16 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context, // If there isn't one, create one and append it to the list if nodeInfo == nil { + // Get the next available index for this new node + nextIndex, err := getNextAvailableIndex(newCD.Status.Nodes, m.config.maxNodesPerIMEXDomain) + if err != nil { + return fmt.Errorf("error getting next available index: %w", err) + } + nodeInfo = &nvapi.ComputeDomainNode{ Name: m.config.nodeName, CliqueID: m.config.cliqueID, + Index: nextIndex, } newCD.Status.Nodes = append(newCD.Status.Nodes, nodeInfo) } @@ -243,6 +250,46 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context, return nil } +// The Index field in the Nodes section of the ComputeDomain status ensures a +// consistent IP-to-DNS name mapping across all machines within a given IMEX +// domain. Each node's index directly determines its DNS name using the format +// "compute-domain-daemon-{index}". +// +// getNextAvailableIndex finds the next available index for the current node by +// seeing which ones are already taken by other nodes in the ComputeDomain +// status. It fills in gaps where it can, and returns an error if no index is +// available within maxNodesPerIMEXDomain. +// +// By filling gaps in the index sequence (rather than always appending), we +// maintain stable DNS names for existing nodes even when intermediate nodes +// are removed from the compute domain and new ones are added. 
+func getNextAvailableIndex(nodes []*nvapi.ComputeDomainNode, maxNodesPerIMEXDomain int) (int, error) { + if len(nodes) >= maxNodesPerIMEXDomain { + return -1, fmt.Errorf("cannot add more nodes, already at maximum (%d)", maxNodesPerIMEXDomain) + } + + // Create a map to track used indices + usedIndices := make(map[int]bool) + + // Collect all currently used indices + for _, node := range nodes { + usedIndices[node.Index] = true + } + + // Find the next available index, starting from 0 and filling gaps + nextIndex := 0 + for usedIndices[nextIndex] { + nextIndex++ + } + + // Ensure nextIndex is within the range 0..maxNodesPerIMEXDomain + if nextIndex < 0 || nextIndex >= maxNodesPerIMEXDomain { + return -1, fmt.Errorf("no available indices within maxNodesPerIMEXDomain (%d)", maxNodesPerIMEXDomain) + } + + return nextIndex, nil +} + // If we've reached the expected number of nodes and if there was actually a // change compared to the previously known set of nodes: pass info to IMEX // daemon controller. diff --git a/cmd/compute-domain-daemon/controller.go b/cmd/compute-domain-daemon/controller.go index ec4a0c57b..aa4e5775e 100644 --- a/cmd/compute-domain-daemon/controller.go +++ b/cmd/compute-domain-daemon/controller.go @@ -35,6 +35,7 @@ type ManagerConfig struct { computeDomainNamespace string cliqueID string podIP string + maxNodesPerIMEXDomain int } // ControllerConfig holds the configuration for the controller. @@ -45,6 +46,7 @@ type ControllerConfig struct { computeDomainNamespace string cliqueID string podIP string + maxNodesPerIMEXDomain int } // Controller manages the lifecycle of compute domain operations. 
@@ -73,6 +75,7 @@ func NewController(config *ControllerConfig) (*Controller, error) { computeDomainNamespace: config.computeDomainNamespace, cliqueID: config.cliqueID, podIP: config.podIP, + maxNodesPerIMEXDomain: config.maxNodesPerIMEXDomain, } controller := &Controller{ diff --git a/cmd/compute-domain-daemon/dnsnames.go b/cmd/compute-domain-daemon/dnsnames.go index 77a816861..027178033 100644 --- a/cmd/compute-domain-daemon/dnsnames.go +++ b/cmd/compute-domain-daemon/dnsnames.go @@ -84,14 +84,15 @@ func (m *DNSNameManager) UpdateDNSNameMappings(nodes []*nvapi.ComputeDomainNode) } } - // Add new IPs to map (filling in holes where others were removed) + // Add new IPs to map for _, node := range cliqueNodes { // If IP already has a DNS name, skip it if _, exists := ipToDNSName[node.IPAddress]; exists { continue } - dnsName, err := m.allocateDNSName(node.IPAddress) + // Construct the DNS name from the node index + dnsName, err := m.constructDNSName(node) if err != nil { return fmt.Errorf("failed to allocate DNS name for IP %s: %w", node.IPAddress, err) } @@ -128,32 +129,17 @@ func (m *DNSNameManager) LogDNSNameMappings() { } } -// allocateDNSName allocates a DNS name for an IP address, reusing existing DNS names if possible. -func (m *DNSNameManager) allocateDNSName(ip string) (string, error) { - // If IP already has a DNS name, return it - if dnsName, exists := m.ipToDNSName[ip]; exists { - return dnsName, nil +// constructDNSName constructs a DNS name for a node based on its index field. +// Returns an error if the index is negative or not less than maxNodesPerIMEXDomain. 
+func (m *DNSNameManager) constructDNSName(node *nvapi.ComputeDomainNode) (string, error) { + if node.Index < 0 { + return "", fmt.Errorf("node %s has invalid index %d", node.Name, node.Index) } - - // Find the next available DNS name - for i := 0; i < m.maxNodesPerIMEXDomain; i++ { - dnsName := fmt.Sprintf(dnsNameFormat, i) - // Check if this DNS name is already in use - inUse := false - for _, existingDNSName := range m.ipToDNSName { - if existingDNSName == dnsName { - inUse = true - break - } - } - if !inUse { - m.ipToDNSName[ip] = dnsName - return dnsName, nil - } + if node.Index >= m.maxNodesPerIMEXDomain { + return "", fmt.Errorf("node %s has invalid index %d, must be less than %d", node.Name, node.Index, m.maxNodesPerIMEXDomain) } - - // If all DNS names are used, return an error - return "", fmt.Errorf("no DNS names available (max: %d)", m.maxNodesPerIMEXDomain) + dnsName := fmt.Sprintf(dnsNameFormat, node.Index) + return dnsName, nil } // updateHostsFile updates the /etc/hosts file with current IP to DNS name mappings. 
diff --git a/cmd/compute-domain-daemon/main.go b/cmd/compute-domain-daemon/main.go index 311f7d7b6..eb37178c1 100644 --- a/cmd/compute-domain-daemon/main.go +++ b/cmd/compute-domain-daemon/main.go @@ -187,6 +187,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { computeDomainNamespace: flags.computeDomainNamespace, nodeName: flags.nodeName, podIP: flags.podIP, + maxNodesPerIMEXDomain: flags.maxNodesPerIMEXDomain, } klog.Infof("config: %v", config) diff --git a/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml b/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml index a0b7f3c00..edbc4ce98 100644 --- a/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml +++ b/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml @@ -85,6 +85,13 @@ spec: properties: cliqueID: type: string + index: + description: |- + The Index field is used to ensure a consistent IP-to-DNS name + mapping across all machines within an IMEX domain. Each node's index + directly determines its DNS name. It is marked as optional (but not + omitempty) in order to support downgrades and avoid an API bump. + type: integer ipAddress: type: string name: