From a64b97e6b08142ec3451c281b5899927af0d35d6 Mon Sep 17 00:00:00 2001 From: "Dr. Jan-Philip Gehrcke" Date: Sat, 7 Mar 2026 18:44:13 +0000 Subject: [PATCH 1/3] tests: bump last stable Signed-off-by: Dr. Jan-Philip Gehrcke --- tests/bats/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/bats/Makefile b/tests/bats/Makefile index bea3b78a2..8ca3e725e 100644 --- a/tests/bats/Makefile +++ b/tests/bats/Makefile @@ -22,7 +22,7 @@ TEST_CHART_VERSION ?= "$(VERSION_GHCR_CHART)" # The baseline Helm chart to test upgrades from and downgrades to. TEST_CHART_LASTSTABLE_REPO ?= "oci://ghcr.io/nvidia/k8s-dra-driver-gpu" -TEST_CHART_LASTSTABLE_VERSION ?= "25.8.1-fc69d985-chart" +TEST_CHART_LASTSTABLE_VERSION ?= "25.12.0-0882da87-chart" # If not "false": the to-be-tested Helm chart is installed from the local # filesystem (from `deployments/helm/nvidia-dra-driver-gpu`). Make sure From 3ff4db36663e82aee4cc5d06535e30f4430c2502 Mon Sep 17 00:00:00 2001 From: "Dr. Jan-Philip Gehrcke" Date: Sat, 7 Mar 2026 19:13:15 +0000 Subject: [PATCH 2/3] tests: allow for concurrent image build Signed-off-by: Dr. Jan-Philip Gehrcke --- tests/bats/Makefile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/bats/Makefile b/tests/bats/Makefile index 8ca3e725e..1b6171d09 100644 --- a/tests/bats/Makefile +++ b/tests/bats/Makefile @@ -136,9 +136,18 @@ endef default: tests +# Temporary image tag, see +# https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/908. The `$$$$` below +# evaluates to $$ in the recipe-executing shell, which resolves to its process +# ID. With that, concurrent recipe executions use different image tags, +# preventing rare export errors. Note that `docker tag` is atomic (never fails +# with "already exists"). .PHONY: runner-image runner-image: - docker buildx build . -t $(BATS_IMAGE) -f tests/bats/Dockerfile + TMP_TAG="$(BATS_IMAGE)-tmp-$$$$" && \ + docker buildx build . 
-t $$TMP_TAG -f tests/bats/Dockerfile && \ + docker tag $$TMP_TAG $(BATS_IMAGE) && \ + docker rmi $$TMP_TAG # Warning: destructive against currently configured k8s cluster. # From 5579d16d7973781cf1c30a4480276c5298d18f89 Mon Sep 17 00:00:00 2001 From: "Dr. Jan-Philip Gehrcke" Date: Sat, 7 Mar 2026 20:35:51 +0000 Subject: [PATCH 3/3] tests: filter resource slices (fix #902) Signed-off-by: Dr. Jan-Philip Gehrcke --- tests/bats/helpers.sh | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/tests/bats/helpers.sh b/tests/bats/helpers.sh index 120834591..237560b9f 100644 --- a/tests/bats/helpers.sh +++ b/tests/bats/helpers.sh @@ -217,21 +217,33 @@ get_device_attrs_from_any_gpu_slice() { local device_type="$1" local node_name="$2" local spath="${BATS_TEST_TMPDIR}/gpu_resource_slice_content" - local slicename + local slicenames - # Get contents of first listed GPU plugin resource slice; dump into file. If a - # node_name was provided, filter for the GPU plugin resource slice on that - # node. + # For debugging, show current set of slices. + kubectl get resourceslice >&2 + + # Get names of resource slices: either for all gpu plugin resource slices, or + # for all gpu slices announced by a specific node. if [ -n "$node_name" ]; then - slicename="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | grep "$node_name" | head -n1 | awk '{print $1}')" + slicenames="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | grep "$node_name" | awk '{print $1}')" else - slicename="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | head -n1 | awk '{print $1}')" + slicenames="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | awk '{print $1}')" fi - log "resource slice name: $slicename" - # For debugging, show current set of slices. 
- kubectl get resourceslice >&2 - kubectl get resourceslices.resource.k8s.io -o yaml "${slicename}" > "${spath}" + # Identify a suitable (device-announcing) resource slice. This is relevant + # because on Kubernetes 1.35+ there is one special resource slice per node + # that only contains counters, and no devices. We must make sure that we + # do not use that for device attribute extraction. + local slicename + for slicename in $slicenames; do + kubectl get resourceslices.resource.k8s.io -o yaml "${slicename}" > "${spath}" + if grep -q "devices:" "${spath}"; then + log "identified suitable resource slice: $slicename" + break + fi + log "do not use slice $slicename (does not define devices)" + done + log "wrote resource slice content to: ${spath}" # Log contents, for https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/902