diff --git a/tests/bats/helpers.sh b/tests/bats/helpers.sh
index a590e6060..120834591 100644
--- a/tests/bats/helpers.sh
+++ b/tests/bats/helpers.sh
@@ -95,10 +95,12 @@ log_objects() {
   # Never fail, but show output in case a test fails, to facilitate debugging.
   # Could this be part of setup()? If setup succeeds and when a test fails:
   # does this show the output of setup? Then we could do this.
-  kubectl get resourceclaims || true
-  kubectl get computedomain || true
-  kubectl get pods -o wide || true
+  log "LOG_OBJECTS START"
+  echo "claims:"; kubectl get resourceclaims --ignore-not-found || true
+  echo "CDs: "; kubectl get computedomain --ignore-not-found || true
+  echo "pods: "; kubectl get pods -o wide --ignore-not-found || true
   kubectl get pods -o wide -n nvidia-dra-driver-gpu || true
+  log "LOG_OBJECTS END"
 }
 
@@ -126,6 +128,40 @@ wait_for_pod_event() {
   done
 }
 
+# Wait up to $1 seconds until at least $2 unique node(s) (optional; defaults to
+# get_node_count) each announce at least one GPU DRA driver ResourceSlice.
+wait_for_all_gpu_resource_slices() {
+  local timeout=$1
+  local start_time=$SECONDS
+  local driver_name="gpu.nvidia.com"
+  local expected_count current_node_count
+  expected_count="${2:-$(get_node_count)}" || return 1
+
+  log "Waiting up to ${timeout} s for at least ${expected_count} unique node(s) to show at least one '${driver_name}' ResourceSlice"
+  while (( SECONDS - start_time < timeout )); do
+    # 1. Fetch DRIVER and NODE columns for all slices.
+    # 2. filter for GPU DRA driver and print the nodeName ($2)
+    # 3. sort -u removes duplicate node names (since one node can have multiple GPU slices).
+    # 4. wc -l counts the unique nodes.
+    # 5. awk strips any extra whitespace from wc output.
+    current_node_count=$(kubectl get resourceslices.resource.k8s.io -A \
+      -o custom-columns=DRIVER:.spec.driver,NODE:.spec.nodeName --no-headers 2>/dev/null \
+      | awk -v d="${driver_name}" '$1 == d { print $2 }' \
+      | sort -u \
+      | wc -l \
+      | awk '{print $1}')
+
+    if [[ "$current_node_count" -ge "$expected_count" ]]; then
+      log "Success: Found ${current_node_count} unique node(s) with ResourceSlices for '${driver_name}'"
+      return 0
+    fi
+
+    sleep 0.5
+  done
+
+  log "wait for resource slices: deadline reached"
+  return 1
+}
 
 get_all_cd_daemon_logs_for_cd_name() {
   CD_NAME="$1"
@@ -169,17 +205,23 @@ assert_attrs_equal() {
 }
 
-# Helper function to get device attributes from a GPU resource slice
+# Get device attributes for a device in a GPU resource slice.
+# Emit attributes to stdout.
+# Log to stderr.
+# If node name is provided, query first slice from that node.
+# If no node name is provided, query first slice overall.
 # Usage: get_device_attrs_from_any_gpu_slice "gpu"
 #        get_device_attrs_from_any_gpu_slice "mig"
+#        get_device_attrs_from_any_gpu_slice "mig" "<node-name>"
 get_device_attrs_from_any_gpu_slice() {
   local device_type="$1"
   local node_name="$2"
   local spath="${BATS_TEST_TMPDIR}/gpu_resource_slice_content"
   local slicename
-  # Get contents of first listed GPU plugin resource slice, and dump it into a
-  # file.
+  # Get contents of first listed GPU plugin resource slice; dump into file. If a
+  # node_name was provided, filter for the GPU plugin resource slice on that
+  # node.
if [ -n "$node_name" ]; then slicename="$(kubectl get resourceslices.resource.k8s.io | grep 'gpu.nvidia.com' | grep "$node_name" | head -n1 | awk '{print $1}')" else @@ -192,14 +234,33 @@ get_device_attrs_from_any_gpu_slice() { kubectl get resourceslices.resource.k8s.io -o yaml "${slicename}" > "${spath}" log "wrote resource slice content to: ${spath}" + # Log contents, for https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/902 + cat "${spath}" >&2 + # For the first device in that slice (of given type), extract the set of - # device attribute _keys_. Emit those keys, one per line. If a node_name was - # provided, filter for the GPU plugin resource slice on that node. Using - # --raw-output strips quotes. + # device attribute _keys_. Emit those keys, one per line. --raw-output strips + # quotes. yq --raw-output "[.spec.devices[] | select(.attributes.type.string == \"${device_type}\")] | .[0] | .attributes | keys | .[]" "${spath}" } +# Get Kubernetes node count, emit to stdout. Fail function +# if kubectl fails. +get_node_count() { + local nodes + + if ! 
nodes=$(kubectl get nodes --no-headers 2>/dev/null); then + return 1 + fi + + if [[ -z "$nodes" ]]; then + echo 0 + else + echo "$nodes" | wc -l | awk '{print $1}' + fi +} + + show_kubelet_plugin_error_logs() { echo -e "\nKUBELET PLUGIN ERROR LOGS START" ( diff --git a/tests/bats/test_gpu_basic.bats b/tests/bats/test_gpu_basic.bats index 2d9000d3d..36f82aad2 100644 --- a/tests/bats/test_gpu_basic.bats +++ b/tests/bats/test_gpu_basic.bats @@ -119,6 +119,7 @@ bats::on_failure() { } + # bats test_tags=fastfeedback @test "GPUs: inspect device attributes in resource slice (gpu)" { local reference=( @@ -135,6 +136,7 @@ bats::on_failure() { "addressingMode" ) - local attrs=$(get_device_attrs_from_any_gpu_slice "gpu") + local attrs + attrs=$(get_device_attrs_from_any_gpu_slice "gpu") assert_attrs_equal "$attrs" "${reference[@]}" } diff --git a/tests/bats/test_gpu_dynmig.bats b/tests/bats/test_gpu_dynmig.bats index adcdfb70a..a0e6cc19e 100644 --- a/tests/bats/test_gpu_dynmig.bats +++ b/tests/bats/test_gpu_dynmig.bats @@ -4,6 +4,14 @@ setup_file () { load 'helpers.sh' _common_setup + + # Try to establish well-known state before entering tests. Any previous + # partial cleanup or partial test might leave an exotic resource slice state + # behind. Delete slices, and to do that reliably: delete DRA driver before, if + # it exists. + helm uninstall -n nvidia-dra-driver-gpu nvidia-dra-driver-gpu-batssuite --wait || true + kubectl delete resourceslices.resource.k8s.io --all + local _iargs=("--set" "logVerbosity=6" "--set" "featureGates.DynamicMIG=true") iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs run kubectl logs \ @@ -13,10 +21,15 @@ setup_file () { --prefix --tail=-1 assert_output --partial "About to announce device gpu-0-mig-1g" - # Give a bit of time for the kubelet plugins to update - # resource slices. See - # https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/902 - sleep 2 + # Wait until resource slices are announced. 
See + # https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/902 -- Maybe we should + # fail the liveness probe until the first resource slice update is known to + # have been performed? That may be too invasive. In any case, the resource + # slice update controller running in the DRA plugin helper might enable us to + # to something smart here. For the purpose of testing, deleting all resource + # slices above and then waiting until resource slices pop up again: that seems + # to be robust. + wait_for_all_gpu_resource_slices 15 } # Executed before entering each test in this file. @@ -60,7 +73,9 @@ confirm_mig_mode_disabled_all_nodes() { "addressingMode" ) - local attrs=$(get_device_attrs_from_any_gpu_slice "gpu") + local attrs + attrs=$(get_device_attrs_from_any_gpu_slice "gpu") + log "attributes seen: $attrs" assert_attrs_equal "$attrs" "${reference[@]}" } @@ -82,7 +97,9 @@ confirm_mig_mode_disabled_all_nodes() { "profile" ) - local attrs=$(get_device_attrs_from_any_gpu_slice "mig") + local attrs + attrs=$(get_device_attrs_from_any_gpu_slice "mig") + log "attributes seen: $attrs" assert_attrs_equal "$attrs" "${reference[@]}" } diff --git a/tests/bats/test_gpu_mig.bats b/tests/bats/test_gpu_mig.bats index a941e04b3..085c03d52 100644 --- a/tests/bats/test_gpu_mig.bats +++ b/tests/bats/test_gpu_mig.bats @@ -96,7 +96,8 @@ bats::on_failure() { "addressingMode" ) - local attrs=$(get_device_attrs_from_any_gpu_slice "mig" "$node") + local attrs + attrs=$(get_device_attrs_from_any_gpu_slice "mig" "$node") assert_attrs_equal "$attrs" "${reference[@]}" }