Skip to content

Commit eb75a1c

Browse files
committed
Move logic to run / check status of IMEX daemons into go binary
Signed-off-by: Kevin Klues <kklues@nvidia.com>
1 parent 024f379 commit eb75a1c

3 files changed

Lines changed: 189 additions & 34 deletions

File tree

cmd/compute-domain-daemon/main.go

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
/*
2+
* Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"os/signal"
	"strings"
	"syscall"

	"github.com/urfave/cli/v2"
)
29+
30+
// Fixed filesystem locations of the IMEX executables, configuration files and log.
const (
	// imexBinary is the IMEX daemon executable launched by runIMEXDaemon.
	imexBinary = "/usr/bin/nvidia-imex"
	// imexCtl is the control utility queried by check for the daemon status.
	imexCtl = "/usr/bin/nvidia-imex-ctl"

	// imexConfig is the configuration file handed to the daemon via "-c".
	imexConfig = "/etc/nvidia-imex/config.cfg"
	// nodesConfig is the nodes configuration file echoed to stdout at startup.
	nodesConfig = "/etc/nvidia-imex/nodes_config.cfg"

	// imexLog is the log file that is followed after the daemon has been launched.
	imexLog = "/var/log/nvidia-imex.log"
)
37+
38+
// Flags holds the command-line options that are shared by every subcommand.
type Flags struct {
	// cliqueID is the clique ID for this node. It is populated from the
	// "cliqueid" flag and stays empty when no value was supplied.
	cliqueID string
}
41+
42+
func main() {
43+
if err := newApp().Run(os.Args); err != nil {
44+
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
45+
os.Exit(1)
46+
}
47+
}
48+
49+
func newApp() *cli.App {
50+
// Create local flags variable
51+
var flags Flags
52+
53+
// Create a wrapper that will be used to gracefully shut down all subcommands
54+
wrapper := func(ctx context.Context, f func(ctx context.Context, flags *Flags) error) error {
55+
// Create a cancelable context from the one passed in
56+
ctx, cancel := context.WithCancel(ctx)
57+
defer cancel()
58+
59+
// Handle SIGTERM
60+
sigChan := make(chan os.Signal, 1)
61+
signal.Notify(sigChan, syscall.SIGTERM)
62+
go func() {
63+
<-sigChan
64+
cancel()
65+
}()
66+
67+
// Call the wrapped function
68+
return f(ctx, &flags)
69+
}
70+
71+
// Create the app
72+
app := &cli.App{
73+
Name: "compute-domain-daemon",
74+
Usage: "compute-domain-daemon manages the IMEX daemon for NVIDIA compute domains.",
75+
Flags: []cli.Flag{
76+
&cli.StringFlag{
77+
Name: "cliqueid",
78+
Usage: "The clique ID for this node.",
79+
EnvVars: []string{"CLIQUE_ID"},
80+
Destination: &flags.cliqueID,
81+
},
82+
},
83+
Commands: []*cli.Command{
84+
{
85+
Name: "run",
86+
Usage: "Run the compute domain daemon",
87+
Action: func(c *cli.Context) error {
88+
return wrapper(c.Context, run)
89+
},
90+
},
91+
{
92+
Name: "check",
93+
Usage: "Check if the node is IMEX capable and if the IMEX daemon is ready",
94+
Action: func(c *cli.Context) error {
95+
return wrapper(c.Context, check)
96+
},
97+
},
98+
},
99+
}
100+
101+
return app
102+
}
103+
104+
// run runs the compute domain daemon, checking IMEX capability and managing the IMEX daemon lifecycle.
105+
// It returns an error if any step fails.
106+
func run(ctx context.Context, flags *Flags) error {
107+
if flags.cliqueID == "" {
108+
fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.")
109+
fmt.Println("The IMEX daemon will not be started.")
110+
fmt.Println("Sleeping forever...")
111+
<-ctx.Done()
112+
return nil
113+
}
114+
115+
// Print nodes config
116+
if err := printNodesConfig(ctx); err != nil {
117+
return fmt.Errorf("error printing nodes config: %w", err)
118+
}
119+
120+
// Run IMEX daemon
121+
if err := runIMEXDaemon(ctx, imexConfig); err != nil {
122+
return fmt.Errorf("error running IMEX daemon: %w", err)
123+
}
124+
125+
// Tail the log file
126+
if err := tail(ctx, imexLog); err != nil {
127+
return fmt.Errorf("error tailing log file: %w", err)
128+
}
129+
130+
return nil
131+
}
132+
133+
// check verifies if the node is IMEX capable and if so, checks if the IMEX daemon is ready.
134+
// It returns an error if any step fails.
135+
func check(ctx context.Context, flags *Flags) error {
136+
if flags.cliqueID == "" {
137+
fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.")
138+
return nil
139+
}
140+
141+
// Check if IMEX daemon is ready
142+
cmd := exec.CommandContext(ctx, imexCtl, "-q", "-i", "127.0.0.1", "50005")
143+
output, err := cmd.Output()
144+
if err != nil {
145+
return fmt.Errorf("error checking IMEX daemon status: %w", err)
146+
}
147+
148+
if string(output) != "READY\n" {
149+
return fmt.Errorf("IMEX daemon not ready: %s", string(output))
150+
}
151+
152+
return nil
153+
}
154+
155+
// printNodesConfig reads and prints the contents of the nodes configuration file.
156+
// It returns an error if the file cannot be read.
157+
func printNodesConfig(ctx context.Context) error {
158+
fmt.Printf("%s:\n", nodesConfig)
159+
content, err := os.ReadFile(nodesConfig)
160+
if err != nil {
161+
return fmt.Errorf("failed to read nodes config: %w", err)
162+
}
163+
fmt.Println(string(content))
164+
return nil
165+
}
166+
167+
// runIMEXDaemon starts the IMEX daemon with the specified configuration file.
168+
// It returns an error if the daemon fails to start or exits unexpectedly.
169+
func runIMEXDaemon(ctx context.Context, config string) error {
170+
cmd := exec.CommandContext(ctx, imexBinary, "-c", config)
171+
cmd.Stdout = os.Stdout
172+
cmd.Stderr = os.Stderr
173+
return cmd.Run()
174+
}
175+
176+
// tail continuously reads and prints new lines from the specified file using the system's tail command.
// It starts from the beginning of the file (-n +1) and follows new lines (-f).
// It blocks until the context is cancelled or an error occurs.
//
// Cancellation of ctx is the expected way to stop following the file, so it is
// treated as a clean shutdown and nil is returned. Any failure that happens
// while ctx is still active is returned to the caller.
func tail(ctx context.Context, path string) error {
	cmd := exec.CommandContext(ctx, "tail", "-n", "+1", "-f", path)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	err := cmd.Run()
	if ctx.Err() != nil {
		// The command was stopped (or never started) because ctx was
		// cancelled; the resulting error only reflects the requested
		// shutdown and must not be reported as a failure.
		return nil
	}
	return err
}

deployments/container/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,5 +84,6 @@ RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-L
8484

8585
COPY --from=build /artifacts/compute-domain-controller /usr/bin/compute-domain-controller
8686
COPY --from=build /artifacts/compute-domain-kubelet-plugin /usr/bin/compute-domain-kubelet-plugin
87+
COPY --from=build /artifacts/compute-domain-daemon /usr/bin/compute-domain-daemon
8788
COPY --from=build /artifacts/gpu-kubelet-plugin /usr/bin/gpu-kubelet-plugin
8889
COPY --from=build /build/templates /templates

templates/compute-domain-daemon.tmpl.yaml

Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,51 +21,21 @@ spec:
2121
nodeSelector:
2222
{{ .ComputeDomainLabelKey }}: {{ .ComputeDomainLabelValue }}
2323
containers:
24+
# Run the compute domain daemon
2425
- name: compute-domain-daemon
2526
image: {{ .ImageName }}
26-
command: [sh, -c]
27-
args:
28-
- |-
29-
trap 'exit 0' TERM
30-
set -e
31-
if nvidia-smi -q | grep -E "ClusterUUID|CliqueId" | grep -q "N/A" || \
32-
nvidia-smi -q | grep -E "ClusterUUID" | grep -q "00000000-0000-0000-0000-000000000000"; then
33-
echo "ClusterUUID and CliqueId are NOT set for GPUs on this node."
34-
echo "The IMEX daemon will not be started."
35-
echo "Sleeping forever..."
36-
touch /etc/nvidia-imex-null
37-
tail -f /dev/null & wait
38-
fi
39-
# Emit nodes config for facilitating debug.
40-
echo "/etc/nvidia-imex/nodes_config.cfg:"
41-
cat /etc/nvidia-imex/nodes_config.cfg
42-
/usr/bin/nvidia-imex -c /etc/nvidia-imex/config.cfg
43-
tail -n +1 -f /var/log/nvidia-imex.log & wait
27+
command: ["compute-domain-daemon", "run"]
4428
resources:
4529
claims:
4630
- name: compute-domain-daemon
4731
startupProbe:
4832
exec:
49-
command:
50-
- "sh"
51-
- "-c"
52-
- |-
53-
if [ -f /etc/nvidia-imex-null ]; then
54-
exit 0
55-
fi
56-
test "$(nvidia-imex-ctl -q -i 127.0.0.1 50005)" = "READY"
33+
command: ["compute-domain-daemon", "check"]
5734
initialDelaySeconds: 1
5835
periodSeconds: 1
5936
livenessProbe:
6037
exec:
61-
command:
62-
- "sh"
63-
- "-c"
64-
- |
65-
if [ -f /etc/nvidia-imex-null ]; then
66-
exit 0
67-
fi
68-
test "$(nvidia-imex-ctl -q -i 127.0.0.1 50005)" = "READY"
38+
command: ["compute-domain-daemon", "check"]
6939
initialDelaySeconds: 10
7040
periodSeconds: 5
7141
# Repel all node taints.

0 commit comments

Comments
 (0)