Commit 088234b

[no-relnote] Rework nested runner construction
This change makes the logic for mounting the host toolkit into a nested runner more explicit and adds a flag to indicate whether the runner needs GPU access.

Signed-off-by: Evan Lezar <[email protected]>
1 parent 2e13ab7 commit 088234b
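
For context, the constructor signature before and after this change (taken from the tests/e2e/runner.go diff below):

    // Before: a single installCTK flag implicitly controlled whether the host
    // toolkit was mounted into the nested container.
    func NewNestedContainerRunner(runner Runner, baseImage string, installCTK bool, containerName string, cacheDir string) (Runner, error)

    // After: mounting the host toolkit and requiring GPU access are separate,
    // explicit flags.
    func NewNestedContainerRunner(runner Runner, baseImage string, mountToolkitFromHost bool, containerName string, cacheDir string, requiresGPUs bool) (Runner, error)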

3 files changed (+24, -17 lines)
tests/e2e/nvidia-cdi-refresh_test.go

Lines changed: 1 addition & 3 deletions

@@ -132,9 +132,7 @@ var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("system

     BeforeAll(func(ctx context.Context) {
         var err error
-        // TODO: We set installCTK to true here to SKIP the mounting of the files from the host.
-        // The test here does NOT require the host toolkit.
-        systemdRunner, err = NewNestedContainerRunner(runner, outerContainerImage, true, containerName, localCacheDir)
+        systemdRunner, err = NewNestedContainerRunner(runner, outerContainerImage, false, containerName, localCacheDir, true)
         Expect(err).ToNot(HaveOccurred())
         for range 10 {
             state, _, err := systemdRunner.Run(getSystemStateScript)
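
Restating the updated call with the boolean arguments labelled (parameter names taken from the tests/e2e/runner.go diff below):

    systemdRunner, err = NewNestedContainerRunner(
        runner,
        outerContainerImage,
        false, // mountToolkitFromHost: this test does not need the host toolkit
        containerName,
        localCacheDir,
        true, // requiresGPUs: the nested systemd container still needs GPU access
    )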

tests/e2e/nvidia-container-cli_test.go

Lines changed: 1 addition & 1 deletion

@@ -78,7 +78,7 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn

     BeforeAll(func(ctx context.Context) {
         var err error
-        nestedContainerRunner, err = NewNestedContainerRunner(runner, "ubuntu", installCTK, containerName, localCacheDir)
+        nestedContainerRunner, err = NewNestedContainerRunner(runner, "ubuntu", !installCTK, containerName, localCacheDir, true)
         Expect(err).ToNot(HaveOccurred())

         if installCTK {
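
Here the first boolean is inverted rather than replaced: the host toolkit is mounted only when the suite does not install the toolkit inside the container. Restated with the new parameter names:

    nestedContainerRunner, err = NewNestedContainerRunner(
        runner,
        "ubuntu",
        !installCTK, // mountToolkitFromHost: mount the host toolkit only if it is not installed in the container
        containerName,
        localCacheDir,
        true, // requiresGPUs: the nvidia-container-cli tests need GPU access
    )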

tests/e2e/runner.go

Lines changed: 22 additions & 13 deletions

@@ -30,9 +30,12 @@ import (

 const (
     installPrerequisitesScript = `
-export DEBIAN_FRONTEND=noninteractive
-apt-get update && apt-get install -y curl gnupg2
-`
+set -e
+export DEBIAN_FRONTEND=noninteractive
+# Install prerequisites
+apt-get update
+apt-get install -y curl gnupg2
+`
 )

 type localRunner struct{}
@@ -96,7 +99,7 @@ func NewRunner(opts ...runnerOption) Runner {
 // NewNestedContainerRunner creates a new nested container runner.
 // A nested container runs a container inside another container based on a
 // given runner (remote or local).
-func NewNestedContainerRunner(runner Runner, baseImage string, installCTK bool, containerName string, cacheDir string) (Runner, error) {
+func NewNestedContainerRunner(runner Runner, baseImage string, mountToolkitFromHost bool, containerName string, cacheDir string, requiresGPUs bool) (Runner, error) {
     // If a container with the same name exists from a previous test run, remove it first.
     // Ignore errors as container might not exist
     _, _, err := runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName))
@@ -106,13 +109,24 @@ func NewNestedContainerRunner(runner Runner, baseImage string, installCTK bool,

     var additionalContainerArguments []string

+    if requiresGPUs {
+        // If the container requires access to GPUs we explicitly add the nvidia
+        // runtime and set `NVIDIA_VISIBLE_DEVICES` to trigger jit-cdi spec
+        // generation.
+        additionalContainerArguments = append(additionalContainerArguments,
+            "--runtime=nvidia",
+            "-e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all",
+        )
+    }
+
     if cacheDir != "" {
         additionalContainerArguments = append(additionalContainerArguments,
             "-v "+cacheDir+":"+cacheDir+":ro",
         )
     }

-    if !installCTK {
+    if mountToolkitFromHost {
+        // TODO: This is actually ONLY needed for the CLI tests.
         // If installCTK is false, we use the preinstalled toolkit.
         // This means we need to add toolkit libraries and binaries from the "host"

@@ -179,6 +193,7 @@ func NewNestedContainerRunner(runner Runner, baseImage string, installCTK bool,
     if err != nil {
         return nil, err
     }
+
     _, _, err = runner.Run(script)
     if err != nil {
         return nil, fmt.Errorf("failed to run start container script: %w", err)
@@ -191,7 +206,7 @@ func NewNestedContainerRunner(runner Runner, baseImage string, installCTK bool,

     _, _, err = inContainer.Run(installPrerequisitesScript)
     if err != nil {
-        return nil, fmt.Errorf("failed to install docker: %w", err)
+        return nil, fmt.Errorf("failed to install prerequisites: %w", err)
     }

     return inContainer, nil
@@ -296,20 +311,14 @@ func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {

 // outerContainerTemplate represents a template to start a container with
 // a name specified.
-// The container is given access to all NVIDIA gpus by explicitly using the
-// nvidia runtime and the `runtime.nvidia.com/gpu=all` device to trigger JIT
-// CDI spec generation.
-// The template also allows for additional arguments to be specified.
 type outerContainer struct {
     Name                string
     BaseImage           string
     AdditionalArguments []string
 }

 func (o *outerContainer) Render() (string, error) {
-    tmpl, err := template.New("startContainer").Parse(`docker run -d --name {{.Name}} --privileged --runtime=nvidia \
-    -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
-    -e NVIDIA_DRIVER_CAPABILITIES=all \
+    tmpl, err := template.New("startContainer").Parse(`docker run -d --name {{.Name}} --privileged \
     {{ range $i, $a := .AdditionalArguments -}}
     {{ $a }} \
     {{ end -}}
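
With the GPU flags removed from the template, GPU access is now opt-in: a runner created with requiresGPUs=true receives `--runtime=nvidia` and `NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all` via AdditionalArguments, alongside any cache mount. A sketch of the rendered command for such a runner (the container name and cache path are placeholders; the rest of the template is not shown in this hunk):

    docker run -d --name outer-container --privileged \
        --runtime=nvidia \
        -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
        -v /path/to/cache:/path/to/cache:ro \
        ...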
