From 44ce09c773fd9f32445f954caaefd4d2af84568d Mon Sep 17 00:00:00 2001 From: akri3i Date: Mon, 6 Apr 2026 16:23:59 +0530 Subject: [PATCH] test VM with MIG VGPU Signed-off-by: akri3i --- tests/install_upgrade_operators/conftest.py | 2 +- .../crypto_policy/utils.py | 2 +- .../test_default_featuregates.py | 2 +- tests/install_upgrade_operators/utils.py | 13 -- tests/utils.py | 15 +- tests/virt/conftest.py | 29 +-- tests/virt/node/gpu/constants.py | 14 ++ tests/virt/node/gpu/utils.py | 39 +++- tests/virt/node/gpu/vgpu/conftest.py | 179 +++++++++++++++++- .../gpu/vgpu/test_rhel_vm_with_mig_vgpu.py | 90 +++++++++ 10 files changed, 338 insertions(+), 47 deletions(-) create mode 100644 tests/virt/node/gpu/vgpu/test_rhel_vm_with_mig_vgpu.py diff --git a/tests/install_upgrade_operators/conftest.py b/tests/install_upgrade_operators/conftest.py index 3a4379548b..0af458862d 100644 --- a/tests/install_upgrade_operators/conftest.py +++ b/tests/install_upgrade_operators/conftest.py @@ -20,9 +20,9 @@ ) from tests.install_upgrade_operators.utils import ( get_network_addon_config, - get_resource_by_name, get_resource_from_module_name, ) +from tests.utils import get_resource_by_name from utilities.constants import HOSTPATH_PROVISIONER_CSI, HPP_POOL from utilities.hco import ResourceEditorValidateHCOReconcile, get_hco_version from utilities.infra import ( diff --git a/tests/install_upgrade_operators/crypto_policy/utils.py b/tests/install_upgrade_operators/crypto_policy/utils.py index b34da6c941..14155a6eea 100644 --- a/tests/install_upgrade_operators/crypto_policy/utils.py +++ b/tests/install_upgrade_operators/crypto_policy/utils.py @@ -21,9 +21,9 @@ TLS_INTERMEDIATE_CIPHERS_IANA_OPENSSL_SYNTAX, ) from tests.install_upgrade_operators.utils import ( - get_resource_by_name, get_resource_key_value, ) +from tests.utils import get_resource_by_name from utilities.constants import ( CLUSTER, TIMEOUT_2MIN, diff --git a/tests/install_upgrade_operators/feature_gates/test_default_featuregates.py 
b/tests/install_upgrade_operators/feature_gates/test_default_featuregates.py index 919147ea46..df87b736bd 100644 --- a/tests/install_upgrade_operators/feature_gates/test_default_featuregates.py +++ b/tests/install_upgrade_operators/feature_gates/test_default_featuregates.py @@ -19,9 +19,9 @@ RESOURCE_TYPE_STR, ) from tests.install_upgrade_operators.utils import ( - get_resource_by_name, get_resource_key_value, ) +from tests.utils import get_resource_by_name from utilities.constants import CDI_KUBEVIRT_HYPERCONVERGED, KUBEVIRT_HCO_NAME pytestmark = [pytest.mark.post_upgrade, pytest.mark.sno, pytest.mark.s390x, pytest.mark.skip_must_gather_collection] diff --git a/tests/install_upgrade_operators/utils.py b/tests/install_upgrade_operators/utils.py index 207e1ecae8..3713678580 100644 --- a/tests/install_upgrade_operators/utils.py +++ b/tests/install_upgrade_operators/utils.py @@ -270,19 +270,6 @@ def get_resource_from_module_name(related_obj, ocp_resources_submodule_list, adm ) -def get_resource_by_name( - resource_kind: Resource, name: str, admin_client: DynamicClient, namespace: str | None = None -) -> Resource: - kwargs = {"name": name} - if namespace: - kwargs["namespace"] = namespace - kwargs["client"] = admin_client - resource = resource_kind(**kwargs) - if resource.exists: - return resource - raise ResourceNotFoundError(f"{resource_kind} {name} not found.") - - def get_resource_key_value(resource: Resource, key_name: str) -> Any: return benedict( resource.instance.to_dict()["spec"], diff --git a/tests/utils.py b/tests/utils.py index 9aa760750b..99dc979527 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -16,7 +16,7 @@ from ocp_resources.datavolume import DataVolume from ocp_resources.kubevirt import KubeVirt from ocp_resources.node import Node -from ocp_resources.resource import ResourceEditor +from ocp_resources.resource import Resource, ResourceEditor from ocp_resources.storage_profile import StorageProfile from ocp_resources.virtual_machine import 
def get_resource_by_name(
    resource_kind: type[Resource], name: str, admin_client: DynamicClient, namespace: str | None = None
) -> Resource:
    """Return an existing resource of the given kind, looked up by name.

    Args:
        resource_kind: Resource class to instantiate (the class itself, not an instance).
        name: Name of the resource.
        admin_client: Client handed to the resource constructor as ``client``.
        namespace: Namespace of the resource; left out of the constructor call when falsy.

    Returns:
        The instantiated resource, when its ``exists`` attribute is truthy.

    Raises:
        ResourceNotFoundError: If the resource does not exist.
    """
    # NOTE(review): ResourceNotFoundError must be importable in this module; the visible
    # hunk only adds `Resource` to the imports - confirm it is already imported here.
    kwargs: dict[str, object] = {"name": name, "client": admin_client}
    if namespace:
        kwargs["namespace"] = namespace
    resource = resource_kind(**kwargs)
    if resource.exists:
        return resource
    raise ResourceNotFoundError(f"{resource_kind} {name} not found.")
nvidia-vgpu-manager-daemonset Pod might not be in running state in the nvidia-gpu-operator namespace. """ - desired_bus = "mdev_bus" - non_existent_mdev_bus_nodes = [] - for node in vgpu_ready_nodes: - pod_exec = ExecCommandOnPod(utility_pods=workers_utility_pods, node=node) - try: - for sample in TimeoutSampler( - wait_timeout=TIMEOUT_1MIN, - sleep=TIMEOUT_5SEC, - func=pod_exec.exec, - command=f"ls /sys/class | grep {desired_bus} || true", - ): - if sample: - return - except TimeoutExpiredError: - non_existent_mdev_bus_nodes.append(node.name) - if non_existent_mdev_bus_nodes: - pytest.fail( - reason=( - f"On these nodes: {non_existent_mdev_bus_nodes} {desired_bus} is not available." - "Ensure that in 'nvidia-gpu-operator' namespace nvidia-vgpu-manager-daemonset Pod is Running." - ) - ) + assert_mdev_bus_exists_on_nodes(workers_utility_pods=workers_utility_pods, nodes=vgpu_ready_nodes) @pytest.fixture(scope="session") diff --git a/tests/virt/node/gpu/constants.py b/tests/virt/node/gpu/constants.py index 0611dccb57..b06a106ac0 100644 --- a/tests/virt/node/gpu/constants.py +++ b/tests/virt/node/gpu/constants.py @@ -62,4 +62,18 @@ MDEV_GRID_AVAILABLE_INSTANCES_STR: "4", MDEV_GRID_TYPE_STR: "nvidia-746", }, + "10de:20b7": { + DEVICE_ID_STR: "10de:20b7", + GPU_DEVICE_NAME_STR: f"{GPU_DEVICE_MANUFACTURER}/NVIDIA_A30-1-6C", + VGPU_DEVICE_NAME_STR: f"{GPU_DEVICE_MANUFACTURER}/NVIDIA_A30-1-6C", + GPU_PRETTY_NAME_STR: "NVIDIA A30-1-6C", + VGPU_PRETTY_NAME_STR: "NVIDIA A30-1-6C", + MDEV_NAME_STR: "NVIDIA A30-1-6C", + MDEV_AVAILABLE_INSTANCES_STR: "4", + MDEV_TYPE_STR: "nvidia-689", + VGPU_GRID_NAME_STR: f"{GPU_DEVICE_MANUFACTURER}/A30_1_6C", + MDEV_GRID_NAME_STR: "NVIDIA A30-1-6C", + MDEV_GRID_AVAILABLE_INSTANCES_STR: "4", + MDEV_GRID_TYPE_STR: "nvidia-689", + }, } diff --git a/tests/virt/node/gpu/utils.py b/tests/virt/node/gpu/utils.py index 60f1f905ba..49131979cf 100644 --- a/tests/virt/node/gpu/utils.py +++ b/tests/virt/node/gpu/utils.py @@ -1,9 +1,10 @@ import logging 
def assert_mdev_bus_exists_on_nodes(workers_utility_pods, nodes):
    """Fail the test run if mdev_bus is absent under /sys/class on any given node.

    Each node is polled through its utility pod every TIMEOUT_5SEC for up to
    TIMEOUT_1MIN; polling of a node stops as soon as the command returns
    non-empty output.

    Args:
        workers_utility_pods: Utility pods used to run commands on nodes.
        nodes: Nodes to check.

    Raises:
        The exception raised by ``pytest.fail`` when at least one node still has
        no mdev_bus after the timeout; the message lists all such nodes.
    """
    desired_bus = "mdev_bus"
    missing_nodes = []
    for node in nodes:
        pod_exec = ExecCommandOnPod(utility_pods=workers_utility_pods, node=node)
        try:
            for sample in TimeoutSampler(
                wait_timeout=TIMEOUT_1MIN,
                sleep=TIMEOUT_5SEC,
                func=pod_exec.exec,
                # "|| true" keeps the command successful when grep finds nothing,
                # so an absent bus shows up as empty output rather than a command error.
                command=f"ls /sys/class | grep {desired_bus} || true",
            ):
                if sample:
                    # Bus found on this node; continue with the next node.
                    break
        except TimeoutExpiredError:
            missing_nodes.append(node.name)
    if missing_nodes:
        pytest.fail(
            reason=(
                # Trailing space keeps the two sentences apart after implicit concatenation.
                f"On these nodes: {missing_nodes} {desired_bus} is not available. "
                "Ensure that in 'nvidia-gpu-operator' namespace nvidia-vgpu-manager-daemonset Pod is Running."
            )
        )
@pytest.fixture(scope="class")
def update_daemon_set_to_enable_mig_vgpu(update_cluster_policy_to_enable_mig_vgpu, admin_client):
    """Set ``imagePullPolicy: Always`` on the vGPU manager daemonset(s) while the class runs.

    Every daemonset in the "nvidia-gpu-operator" namespace whose name starts with
    "nvidia-vgpu-manager-daemonset" is patched through a single ResourceEditor
    context, so the fixture yields exactly once regardless of how many daemonsets
    match. Yielding from inside the search loop would yield zero times when nothing
    matches and more than once when several match, and pytest rejects both.

    Args:
        update_cluster_policy_to_enable_mig_vgpu: Requested so the cluster policy
            patch is in place first; its value is not used.
        admin_client: Client used to list the daemonsets.
    """
    ds_name_prefix = "nvidia-vgpu-manager-daemonset"
    ds_namespace = "nvidia-gpu-operator"
    patches = {}
    for ds in get_daemonsets(admin_client=admin_client, namespace=ds_namespace):
        if not ds.name.startswith(ds_name_prefix):
            continue
        # Copy the first container's current fields and override only the pull policy.
        container_patch = dict(ds.instance.spec.template.spec.containers[0].items())
        container_patch["imagePullPolicy"] = "Always"
        patches[ds] = {"spec": {"template": {"spec": {"containers": [container_patch]}}}}

    if not patches:
        pytest.fail(reason=f"No daemonset starting with '{ds_name_prefix}' found in '{ds_namespace}' namespace.")

    with ResourceEditor(patches=patches):
        yield
@pytest.fixture(scope="class")
def mig_gpu_nodes_labeled_with_vgpu_config(
    nodes_with_supported_mig_gpus,
    mig_gpu_nodes_labeled_with_vm_vgpu,
    nvidia_sandbox_validator_ds,
    gpu_nodes,
):
    """Label the MIG GPU nodes with the "A30-1-6C" vGPU config for the duration of the class.

    The ``label_nodes`` generator is driven by hand: the first ``next`` runs its
    setup part, and the second one (in ``finally``) runs whatever it does after its
    own yield - presumably reverting the labels; verify against ``label_nodes``.
    The ``try/finally`` guarantees that second step also happens when the readiness
    wait raises, not only on normal teardown.

    Args:
        nodes_with_supported_mig_gpus: Nodes to label.
        mig_gpu_nodes_labeled_with_vm_vgpu: Requested for ordering only; value unused.
        nvidia_sandbox_validator_ds: Daemonset waited on after labeling.
        gpu_nodes: Its length is the expected count passed to the readiness wait.
    """
    label_gen = label_nodes(
        nodes=nodes_with_supported_mig_gpus,
        labels={"nvidia.com/vgpu.config": "A30-1-6C"},
    )

    next(label_gen)
    try:
        wait_for_ds_ready(ds=nvidia_sandbox_validator_ds, expected=len(gpu_nodes))
        yield
    finally:
        # The default argument absorbs generator exhaustion instead of raising StopIteration.
        next(label_gen, None)
@pytest.fixture(scope="class")
def non_existent_mdev_bus_mig_nodes(
    workers_utility_pods,
    mig_gpu_nodes_labeled_with_vm_vgpu,
    nodes_with_supported_mig_gpus,
):
    """
    Verify that mdev_bus, needed for vGPU, is present on the nodes provided by
    nodes_with_supported_mig_gpus.

    mig_gpu_nodes_labeled_with_vm_vgpu is requested only so that its setup runs
    before this check; its value is not used.
    """
    check_kwargs = {
        "workers_utility_pods": workers_utility_pods,
        "nodes": nodes_with_supported_mig_gpus,
    }
    assert_mdev_bus_exists_on_nodes(**check_kwargs)
@pytest.mark.usefixtures("update_cluster_policy_to_enable_mig_vgpu", "update_daemon_set_to_enable_mig_vgpu")
class TestMIGVGPURHELGPUSSpec:
    """MIG vGPU tests for RHEL VMs that request the device through the GPUs spec."""

    @pytest.mark.polarion("CNV-12572")
    def test_permitted_hostdevices_mig_vgpu_visible(
        self,
        nodes_with_supported_mig_gpus,
        supported_mig_gpu_device,
        hco_cr_mig_configuration,
        mig_gpu_vma,
    ):
        """
        Test Permitted HostDevice is visible and count updated under Capacity/Allocatable
        section of the GPU Node.
        """
        vgpu_device_name = supported_mig_gpu_device[VGPU_DEVICE_NAME_STR]
        verify_gpu_device_exists_on_node(gpu_nodes=nodes_with_supported_mig_gpus, device_name=vgpu_device_name)
        verify_gpu_expected_count_updated_on_node(
            gpu_nodes=nodes_with_supported_mig_gpus,
            device_name=vgpu_device_name,
            expected_count=supported_mig_gpu_device[MDEV_AVAILABLE_INSTANCES_STR],
        )

    # test_access_mig_vgpus_rhel_vm must run before test_access_vgpus_in_both_rhel_vm_using_same_mig_gpu
    # because the second test reuses the running VM created by the first test.
    # NOTE(review): TESTS_CLASS_NAME is "TestVGPURHELGPUSSpec", which is not this class's name;
    # it only serves as a dependency label here, but confirm the mismatch is intended.
    @pytest.mark.dependency(name=f"{TESTS_CLASS_NAME}::test_access_mig_vgpus_rhel_vm")
    @pytest.mark.polarion("CNV-12573")
    def test_access_mig_vgpus_rhel_vm(self, supported_mig_gpu_device, mig_gpu_vma):
        """
        Test vGPU is accessible in VM with GPUs spec.
        """
        verify_gpu_device_exists_in_vm(vm=mig_gpu_vma, supported_gpu_device=supported_mig_gpu_device)

    @pytest.mark.dependency(depends=[f"{TESTS_CLASS_NAME}::test_access_mig_vgpus_rhel_vm"])
    @pytest.mark.polarion("CNV-12574")
    def test_access_vgpus_in_both_rhel_vm_using_same_mig_gpu(self, mig_gpu_vma, mig_gpu_vmb):
        """
        Test vGPU is accessible in both the RHEL VMs, using same GPU, using GPUs spec.
        """
        # A VM is reported when its GPU device count is anything other than exactly one.
        vm_with_no_gpu = [vm.name for vm in (mig_gpu_vma, mig_gpu_vmb) if get_num_gpu_devices_in_rhel_vm(vm=vm) != 1]
        assert not vm_with_no_gpu, f"GPU does not exist in following vms: {vm_with_no_gpu}"