File tree Expand file tree Collapse file tree 3 files changed +19
-0
lines changed
compute-domain-kubelet-plugin Expand file tree Collapse file tree 3 files changed +19
-0
lines changed Original file line number Diff line number Diff line change @@ -204,6 +204,13 @@ func newApp() *cli.App {
204204
205205// run invokes the IMEX daemon and manages its lifecycle.
206206func run (ctx context.Context , cancel context.CancelFunc , flags * Flags ) error {
207+ // Verify that CDI container edits were applied by the container runtime.
208+ // If the env var is not set to "true", CDI is likely disabled and the daemon
209+ // cannot function correctly (e.g. the /imexd mount will be missing).
210+ if os .Getenv ("NVIDIA_CDI_EDITS_APPLIED" ) != "true" {
211+ return fmt .Errorf ("CDI container edits did not apply -- is CDI enabled in your container runtime?" )
212+ }
213+
207214 common .StartDebugSignalHandlers ()
208215
209216 // Validate feature gate dependencies
Original file line number Diff line number Diff line change @@ -162,6 +162,9 @@ func (s *ComputeDomainDaemonSettings) GetCDIContainerEditsCommon(ctx context.Con
162162 edits := & cdiapi.ContainerEdits {
163163 ContainerEdits : & cdispec.ContainerEdits {
164164 Env : []string {
165+ // This is a value that the CD daemon checks at startup to verify that CDI edits were applied by the container runtime.
166+ // If the value is not present, CDI is likely disabled and the daemon cannot function correctly (e.g. the /imexd mount will be missing).
167+ "NVIDIA_CDI_EDITS_APPLIED=true" ,
165168 fmt .Sprintf ("CLIQUE_ID=%s" , s .manager .cliqueID ),
166169 fmt .Sprintf ("COMPUTE_DOMAIN_UUID=%s" , cd .UID ),
167170 fmt .Sprintf ("COMPUTE_DOMAIN_NAME=%s" , cd .Name ),
Original file line number Diff line number Diff line change @@ -29,6 +29,15 @@ import (
2929
3030const dumpPath = "/tmp/goroutine-stacks.dump"
3131
32+ // CDIEditsAppliedEnvKey and CDIEditsAppliedEnvValue form the key/value
33+ // pair that is injected via CDI container edits into the CD daemon container.
34+ // The CD daemon checks for this pair at startup to verify that CDI edits were
35+ // applied by the container runtime. If it is absent, CDI is likely disabled.
36+ const (
37+ CDIEditsAppliedEnvKey = "NVIDIA_CDI_EDITS_APPLIED"
38+ CDIEditsAppliedEnvValue = "true"
39+ )
40+
3241// Set up SIGUSR2 handler: if triggered, acquire stack traces for all goroutines
3342// in this process. Dump to file, and fall back to emitting to stderr if file
3443// output didn't work.
You can’t perform that action at this time.
0 commit comments