Skip to content

Commit 7a89214

Browse files
committed
Add Dynamic MIG tests to Lambda CI with functional nvmm replacement
Enable the existing DynMIG test suite on Lambda by creating a functional nvmm replacement and wiring up GPU-type-aware filtering.

Lambda nvmm (tests/bats/lib/lambda/nvmm):
- Drop-in replacement for the original nvmm (which requires GPU Operator's nvidia-mig-manager pod). Runs nvidia-smi commands on the host via a privileged ephemeral pod with host /usr, /dev, and /proc/driver/nvidia mounted. Same interface as the original.
- NVMM_PATH env var selects which nvmm to use. Defaults to the original (tests/bats/lib). Lambda CI sets it to the lambda version (tests/bats/lib/lambda).

Dynamic MIG on Lambda:
- test_gpu_dynmig.bats tagged 'dynmig' for auto-filtering
- e2e-test.sh: skip dynmig on non-MIG GPUs (V100, A10); run on A100, H100, GH200, B200
- test_gpu_dynmig.bats added to tests-gpu-single Makefile target
- cleanup-from-previous-run.sh: make nvmm MIG cleanup non-fatal

Also removes SKIP_CLEANUP by handling MIG pre-cleanup on the host via SSH and using the lambda nvmm for in-test MIG operations.

Signed-off-by: Davanum Srinivas <davanum@gmail.com>
1 parent 21bf957 commit 7a89214

File tree

6 files changed

+107
-14
lines changed

6 files changed

+107
-14
lines changed

hack/ci/lambda/e2e-test.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,19 @@ fi
8484
case "${LAMBDA_GPU_TYPE}" in
8585
*v100*|*a10) FILTER="${FILTER},!gpu-busgrind" ;;
8686
esac
87+
# Dynamic MIG requires MIG-capable GPUs (A100/H100/GH200/B200).
88+
case "${LAMBDA_GPU_TYPE}" in
89+
*a100*|*h100*|*gh200*|*b200*) ;;
90+
*) FILTER="${FILTER},!dynmig" ;;
91+
esac
8792
echo "Test filter: ${FILTER}"
8893

94+
# --- Pre-cleanup: MIG teardown on host ---
95+
# Run MIG cleanup directly on the host where nvidia-smi is available.
96+
# The BATS Docker container uses the lambda nvmm stub (privileged pod)
97+
# for nvmm calls within tests, but pre-cleanup is faster via SSH.
98+
lambda_remote sh -c 'nvidia-smi mig -dci 2>/dev/null; nvidia-smi mig -dgi 2>/dev/null; nvidia-smi -mig 0 2>/dev/null; echo "MIG cleanup done"' || true
99+
89100
# --- Run BATS tests ---
90101
# Tests local artifacts: local chart + local image built from the PR.
91102
# This is a real presubmit -- it validates the repo's code, chart, and specs.
@@ -103,10 +114,11 @@ export KUBECONFIG=\$HOME/.kube/config
103114
export CI=true
104115
export TEST_NVIDIA_DRIVER_ROOT=/
105116
export TEST_CHART_LOCAL=true
106-
export SKIP_CLEANUP=true
107117
export DISABLE_COMPUTE_DOMAINS=true
108118
export TEST_FILTER_TAGS='${FILTER}'
109119
export GIT_COMMIT_SHORT=${GIT_COMMIT_SHORT}
120+
# Use lambda nvmm (privileged pod, no GPU Operator).
121+
export NVMM_PATH=/cwd/tests/bats/lib/lambda
110122
111123
make -f tests/bats/Makefile tests-gpu-single GIT_COMMIT_SHORT=${GIT_COMMIT_SHORT}
112124
EOF

tests/bats/Makefile

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ DOCKER_ENVS = \
105105
--env TEST_NVIDIA_DRIVER_ROOT=$(TEST_NVIDIA_DRIVER_ROOT) \
106106
--env TEST_EXPECTED_IMAGE_SPEC_SUBSTRING=$(TEST_EXPECTED_IMAGE_SPEC_SUBSTRING) \
107107
--env SKIP_CLEANUP=$(SKIP_CLEANUP) \
108-
--env DISABLE_COMPUTE_DOMAINS=$(DISABLE_COMPUTE_DOMAINS)
108+
--env DISABLE_COMPUTE_DOMAINS=$(DISABLE_COMPUTE_DOMAINS) \
109+
--env NVMM_PATH=$(NVMM_PATH)
109110

110111
DOCKER_UID := $(shell id -u)
111112
DOCKER_GID := $(shell id -g)
@@ -169,13 +170,14 @@ runner-image:
169170
# cmdline args).
170171
.PHONY: tests-gpu tests-gpu-single tests-cd
171172

172-
# Single-GPU-safe subset. Suitable for CI environments with one GPU and no
173-
# GPU Operator (e.g., Lambda Cloud). Excludes test_basics.bats (expects GPU
174-
# Operator + clean state), MIG, stress, and upgrade tests.
173+
# Lambda CI subset. Excludes test_basics.bats (expects GPU Operator + clean
174+
# state), static MIG, stress, and upgrade tests. DynMIG tests are included
175+
# but tagged 'dynmig' — auto-skipped on non-MIG GPUs via TEST_FILTER_TAGS.
175176
tests-gpu-single: runner-image
176177
$(call RUN_BATS, \
177178
tests/bats/test_gpu_basic.bats \
178-
tests/bats/test_gpu_cuda_workloads.bats)
179+
tests/bats/test_gpu_cuda_workloads.bats \
180+
tests/bats/test_gpu_dynmig.bats)
179181

180182
# Run a subset covering mainly the GPU plugin
181183
tests-gpu: runner-image

tests/bats/cleanup-from-previous-run.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,8 @@ set -e
104104
bash tests/bats/clean-state-dirs-all-nodes.sh
105105

106106
# Remove any stray MIG devices and disable MIG mode on all nodes.
107-
nvmm all sh -c 'nvidia-smi mig -dci; nvidia-smi mig -dgi; nvidia-smi -mig 0'
107+
# Non-fatal: some GPU types (e.g., V100, A10) do not support MIG.
108+
nvmm all sh -c 'nvidia-smi mig -dci; nvidia-smi mig -dgi; nvidia-smi -mig 0' || echo "nvmm MIG cleanup skipped (non-fatal)"
108109

109110
set +x
110111
echo "cleanup: done"

tests/bats/helpers.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818
export TEST_HELM_RELEASE_NAME="dra-driver-nvidia-gpu-batssuite"
1919

2020

21-
# Extend PATH, for example for the `nvmm` utility.
21+
# Extend PATH for the `nvmm` utility. NVMM_PATH allows overriding the
22+
# directory containing `nvmm` — set it to `tests/bats/lib/lambda` on
23+
# Lambda CI (no GPU Operator) so the privileged-pod stub is used instead.
2224
SELF_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
23-
export PATH="${SELF_DIR}/lib:${PATH}"
25+
export PATH="${NVMM_PATH:-${SELF_DIR}/lib}:${PATH}"
2426

2527

2628
_common_setup() {

tests/bats/lib/lambda/nvmm

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright The Kubernetes Authors
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# https://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# nvmm replacement for Lambda CI (single-node, no GPU Operator).
18+
#
19+
# The original nvmm execs into GPU Operator's nvidia-mig-manager pod.
20+
# This replacement runs commands on the host via a privileged pod that
21+
# mounts the host's /usr (for nvidia-smi), /dev (for GPU devices), and /proc/driver/nvidia.
22+
#
23+
# Usage (same as nvmm):
24+
# nvmm <node-hint> [command...]
25+
# nvmm all sh -c 'nvidia-smi mig -dci; nvidia-smi mig -dgi; nvidia-smi -mig 0'
26+
27+
set -o errexit
28+
set -o nounset
29+
30+
if [ -z "${1:-}" ]; then
31+
echo "Usage: nvmm <node-hint> [command...]"
32+
exit 1
33+
fi
34+
35+
# Discard node-hint — Lambda is single-node.
36+
shift
37+
38+
# If no command, nothing to do.
39+
if [ $# -eq 0 ]; then
40+
exit 0
41+
fi
42+
43+
# Build the command string. All remaining args are joined with spaces into the command to run (argument boundaries/quoting are not preserved).
44+
CMD="$*"
45+
46+
# Run via a privileged ephemeral pod with host's /usr, /dev, and /proc/driver/nvidia mounted.
47+
# This gives us access to nvidia-smi and GPU devices.
48+
POD_NAME="nvmm-lambda-$(date +%s)"
49+
kubectl run "${POD_NAME}" \
50+
--rm \
51+
--attach \
52+
--wait \
53+
--restart=Never \
54+
--quiet \
55+
--image=ubuntu:24.04 \
56+
--overrides='{
57+
"spec": {
58+
"hostPID": true,
59+
"containers": [{
60+
"name": "nvmm",
61+
"image": "ubuntu:24.04",
62+
"securityContext": {"privileged": true},
63+
"command": ["/bin/sh", "-c", "export PATH=$PATH:/host/usr/bin:/host/usr/local/bin && '"$(echo "${CMD}" | sed "s/'/'\\\\''/g")"'"],
64+
"volumeMounts": [
65+
{"mountPath": "/dev", "name": "dev"},
66+
{"mountPath": "/host/usr", "name": "host-usr", "readOnly": true},
67+
{"mountPath": "/proc/driver/nvidia", "name": "proc-nvidia"}
68+
]
69+
}],
70+
"volumes": [
71+
{"name": "dev", "hostPath": {"path": "/dev"}},
72+
{"name": "host-usr", "hostPath": {"path": "/usr"}},
73+
{"name": "proc-nvidia", "hostPath": {"path": "/proc/driver/nvidia"}}
74+
]
75+
}
76+
}' 2>/dev/null

tests/bats/test_gpu_dynmig.bats

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ confirm_mig_mode_disabled_all_nodes() {
5757
}
5858

5959

60-
# bats test_tags=fastfeedback
60+
# bats test_tags=fastfeedback,dynmig
6161
@test "DynMIG: inspect device attributes in resource slice (gpu)" {
6262
local reference=(
6363
"architecture"
@@ -80,7 +80,7 @@ confirm_mig_mode_disabled_all_nodes() {
8080
}
8181

8282

83-
# bats test_tags=fastfeedback
83+
# bats test_tags=fastfeedback,dynmig
8484
@test "DynMIG: inspect device attributes in resource slice (mig)" {
8585
local reference=(
8686
"architecture"
@@ -104,7 +104,7 @@ confirm_mig_mode_disabled_all_nodes() {
104104
}
105105

106106

107-
# bats test_tags=fastfeedback
107+
# bats test_tags=fastfeedback,dynmig
108108
@test "DynMIG: 1 pod, 1 MIG" {
109109
confirm_mig_mode_disabled_all_nodes
110110
kubectl apply -f tests/bats/specs/gpu-simple-mig.yaml
@@ -127,7 +127,7 @@ confirm_mig_mode_disabled_all_nodes() {
127127
}
128128

129129

130-
# bats test_tags=fastfeedback
130+
# bats test_tags=fastfeedback,dynmig
131131
@test "DynMIG: 1 pod, 2 containers (1 MIG each)" {
132132
confirm_mig_mode_disabled_all_nodes
133133

@@ -159,7 +159,7 @@ confirm_mig_mode_disabled_all_nodes() {
159159
}
160160

161161

162-
# bats test_tags=fastfeedback
162+
# bats test_tags=fastfeedback,dynmig
163163
@test "DynMIG: 1 pod, 1 MIG + TimeSlicing config" {
164164
local _iargs=("--set" "logVerbosity=6" "--set" "featureGates.DynamicMIG=true" "--set" "featureGates.TimeSlicingSettings=true")
165165
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs

0 commit comments

Comments
 (0)