Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions tests/bats/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ TEST_CHART_VERSION ?= "$(VERSION_GHCR_CHART)"

# The baseline Helm chart to test upgrades from and downgrades to.
TEST_CHART_LASTSTABLE_REPO ?= "oci://ghcr.io/nvidia/k8s-dra-driver-gpu"
TEST_CHART_LASTSTABLE_VERSION ?= "25.8.1-fc69d985-chart"
TEST_CHART_LASTSTABLE_VERSION ?= "25.12.0-0882da87-chart"

# If not "false": the to-be-tested Helm chart is installed from the local
# filesystem (from `deployments/helm/nvidia-dra-driver-gpu`). Make sure
Expand Down Expand Up @@ -136,9 +136,18 @@ endef

default: tests

# Temporary image tag, see
# https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/908. The `$$$$` below
# evaluates to $$ in the recipe-executing shell, which resolves to its process
# ID. With that, concurrent recipe executions use different image tags,
# preventing rare export errors. Note that `docker tag` is atomic (never fails
# with "already exists").
.PHONY: runner-image
runner-image:
docker buildx build . -t $(BATS_IMAGE) -f tests/bats/Dockerfile
TMP_TAG="$(BATS_IMAGE)-tmp-$$$$" && \
docker buildx build . -t $$TMP_TAG -f tests/bats/Dockerfile && \
docker tag $$TMP_TAG $(BATS_IMAGE) && \
docker rmi $$TMP_TAG

# Warning: destructive against currently configured k8s cluster.
#
Expand Down
32 changes: 22 additions & 10 deletions tests/bats/helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -217,21 +217,33 @@ get_device_attrs_from_any_gpu_slice() {
local device_type="$1"
local node_name="$2"
local spath="${BATS_TEST_TMPDIR}/gpu_resource_slice_content"
local slicename
local slicenames

# Get contents of first listed GPU plugin resource slice; dump into file. If a
# node_name was provided, filter for the GPU plugin resource slice on that
# node.
# For debugging, show current set of slices.
kubectl get resourceslice >&2

# Get names of resource slices: either for all gpu plugin resource slices, or
# for all gpu slices announced by a specific node.
if [ -n "$node_name" ]; then
slicename="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | grep "$node_name" | head -n1 | awk '{print $1}')"
slicenames="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | grep "$node_name" | awk '{print $1}')"
else
slicename="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | head -n1 | awk '{print $1}')"
slicenames="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | awk '{print $1}')"
fi
log "resource slice name: $slicename"

# For debugging, show current set of slices.
kubectl get resourceslice >&2
kubectl get resourceslices.resource.k8s.io -o yaml "${slicename}" > "${spath}"
# Identify a suitable (device-announcing) resource slice. This is relevant
# because on Kubernetes 1.35+ there is one special resource slice per node
# that only contains counters, and no devices. We must make sure that we
# do not use that for device attribute extraction.
local slicename
for slicename in $slicenames; do
kubectl get resourceslices.resource.k8s.io -o yaml "${slicename}" > "${spath}"
if grep -q "devices:" "${spath}"; then
log "identified suitable resource slice: $slicename"
break
fi
log "do not use slice $slicename (does not define devices)"
done

log "wrote resource slice content to: ${spath}"

# Log contents, for https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/902
Expand Down