Skip to content

Commit 50800dd

Browse files
authored
[VIRT] Fix descheduler tests (RedHatQE#1987)
* Fix for descheduler test.
* Check boot time instead of the ping process — saves more than 15 minutes of the test run.
* Change profile from DevKubeVirtRelieveAndMigrate to KubeVirtRelieveAndMigrate.
* Updates per review comments: update the wait_for_migration_finished function to use namespace; use cache for is_jira_67515_open.
1 parent 466d48c commit 50800dd

12 files changed

Lines changed: 85 additions & 90 deletions

File tree

tests/scale/test_scale_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ def test_mass_vm_live_migration(
449449
for batch in scale_vms:
450450
for vm in batch:
451451
wait_for_migration_finished(
452-
vm=vm,
452+
namespace=vm.namespace,
453453
migration=vm_migration_info[vm.name][MIGRATION_INSTANCE_STR],
454454
)
455455
verify_vm_migrated(

tests/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,9 @@ def hotplug_instance_type_vm_and_verify(vm, client, instance_type):
183183

184184
def verify_hotplug(vm, client, sockets=None, memory_guest=None):
185185
vmim = get_created_migration_job(vm=vm, client=client)
186-
wait_for_migration_finished(vm=vm, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN)
186+
wait_for_migration_finished(
187+
namespace=vm.namespace, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN
188+
)
187189
wait_for_ssh_connectivity(vm=vm)
188190
vmi_spec_domain = vm.vmi.instance.spec.domain
189191
if sockets:

tests/virt/node/descheduler/conftest.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,28 +5,33 @@
55
from ocp_resources.deployment import Deployment
66
from ocp_resources.pod_disruption_budget import PodDisruptionBudget
77
from ocp_resources.resource import Resource, ResourceEditor
8+
from ocp_resources.virtual_machine_instance_migration import VirtualMachineInstanceMigration
89
from ocp_utilities.infra import get_pods_by_name_prefix
910

1011
from tests.virt.node.descheduler.constants import (
1112
DESCHEDULER_LABEL_KEY,
1213
DESCHEDULER_LABEL_VALUE,
1314
DESCHEDULER_TEST_LABEL,
14-
RUNNING_PING_PROCESS_NAME_IN_VM,
1515
)
1616
from tests.virt.node.descheduler.utils import (
1717
calculate_vm_deployment,
1818
create_kube_descheduler,
1919
deploy_vms,
20-
start_vms_with_process,
2120
vm_nodes,
2221
vms_per_nodes,
2322
wait_vmi_failover,
2423
)
25-
from tests.virt.utils import build_node_affinity_dict, get_non_terminated_pods, start_stress_on_vm
26-
from utilities.constants import TIMEOUT_5SEC
24+
from tests.virt.utils import (
25+
build_node_affinity_dict,
26+
get_boot_time_for_multiple_vms,
27+
get_non_terminated_pods,
28+
start_stress_on_vm,
29+
)
30+
from utilities.constants import TIMEOUT_5MIN, TIMEOUT_5SEC
2731
from utilities.infra import wait_for_pods_deletion
2832
from utilities.virt import (
2933
node_mgmt_console,
34+
wait_for_migration_finished,
3035
wait_for_node_schedulable_status,
3136
)
3237

@@ -57,7 +62,7 @@ def descheduler_kubevirt_relieve_and_migrate_profile(
5762
):
5863
with create_kube_descheduler(
5964
admin_client=admin_client,
60-
profiles=["DevKubeVirtRelieveAndMigrate"],
65+
profiles=["KubeVirtRelieveAndMigrate"],
6166
profile_customizations={
6267
"devActualUtilizationProfile": "PrometheusCPUCombined",
6368
},
@@ -124,14 +129,10 @@ def vms_orig_nodes_before_node_drain(deployed_vms_for_descheduler_test):
124129

125130

126131
@pytest.fixture(scope="class")
127-
def vms_started_process_for_node_drain(
132+
def vms_boot_time_before_node_drain(
128133
deployed_vms_for_descheduler_test,
129134
):
130-
return start_vms_with_process(
131-
vms=deployed_vms_for_descheduler_test,
132-
process_name=RUNNING_PING_PROCESS_NAME_IN_VM,
133-
args=LOCALHOST,
134-
)
135+
yield get_boot_time_for_multiple_vms(vm_list=deployed_vms_for_descheduler_test)
135136

136137

137138
@pytest.fixture(scope="class")
@@ -161,6 +162,13 @@ def drain_uncordon_node(
161162
wait_vmi_failover(vm=vm, orig_node=vms_orig_nodes_before_node_drain[vm.name])
162163

163164

165+
@pytest.fixture()
166+
def all_existing_migrations_completed(admin_client, namespace):
167+
# Descheduler may trigger multiple migrations, need to wait when all succeeded
168+
for migration in VirtualMachineInstanceMigration.get(dyn_client=admin_client, namespace=namespace):
169+
wait_for_migration_finished(namespace=namespace.name, migration=migration, timeout=TIMEOUT_5MIN)
170+
171+
164172
@pytest.fixture(scope="class")
165173
def node_with_min_memory_labeled_for_descheduler_test(node_with_least_available_memory):
166174
with ResourceEditor(patches={node_with_least_available_memory: {"metadata": {"labels": DESCHEDULER_TEST_LABEL}}}):
@@ -237,14 +245,10 @@ def deployed_vms_on_labeled_node(
237245

238246

239247
@pytest.fixture(scope="class")
240-
def vms_started_process_for_utilization_imbalance(
248+
def vms_boot_time_before_utilization_imbalance(
241249
deployed_vms_for_utilization_imbalance,
242250
):
243-
return start_vms_with_process(
244-
vms=deployed_vms_for_utilization_imbalance,
245-
process_name=RUNNING_PING_PROCESS_NAME_IN_VM,
246-
args=LOCALHOST,
247-
)
251+
yield get_boot_time_for_multiple_vms(vm_list=deployed_vms_for_utilization_imbalance)
248252

249253

250254
@pytest.fixture(scope="class")
@@ -317,6 +321,7 @@ def node_to_run_stress(schedulable_nodes, deployed_vms_for_descheduler_test):
317321
vm_per_node_counters = vms_per_nodes(vms=vm_nodes(vms=deployed_vms_for_descheduler_test))
318322
for node in schedulable_nodes:
319323
if vm_per_node_counters[node.name] > 0:
324+
LOGGER.info(f"Node to run stress: {node.name}")
320325
return node
321326

322327
raise ValueError("No suitable node to run stress")

tests/virt/node/descheduler/test_descheduler.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55

66
from tests.virt.node.descheduler.constants import DESCHEDULER_TEST_LABEL
77
from tests.virt.node.descheduler.utils import (
8-
assert_running_process_after_failover,
98
assert_vms_consistent_virt_launcher_pods,
109
assert_vms_distribution_after_failover,
1110
verify_at_least_one_vm_migrated,
1211
)
12+
from tests.virt.utils import verify_linux_boot_time
1313

1414
LOGGER = logging.getLogger(__name__)
1515

@@ -39,7 +39,7 @@ def test_descheduler_evicts_vm_after_drain_uncordon(
3939
self,
4040
schedulable_nodes,
4141
deployed_vms_for_descheduler_test,
42-
vms_started_process_for_node_drain,
42+
vms_boot_time_before_node_drain,
4343
drain_uncordon_node,
4444
):
4545
assert_vms_distribution_after_failover(
@@ -55,20 +55,21 @@ def test_descheduler_evicts_vm_after_drain_uncordon(
5555
def test_no_migrations_storm(
5656
self,
5757
deployed_vms_for_descheduler_test,
58+
all_existing_migrations_completed,
5859
):
5960
LOGGER.info(NO_MIGRATION_STORM_ASSERT_MESSAGE)
6061
assert_vms_consistent_virt_launcher_pods(running_vms=deployed_vms_for_descheduler_test)
6162

6263
@pytest.mark.dependency(depends=[f"{TESTS_CLASS_NAME}::test_no_migrations_storm"])
6364
@pytest.mark.polarion("CNV-8288")
64-
def test_running_process_after_migrations_complete(
65+
def test_boot_time_after_migrations_complete(
6566
self,
6667
deployed_vms_for_descheduler_test,
67-
vms_started_process_for_node_drain,
68+
vms_boot_time_before_node_drain,
6869
):
69-
assert_running_process_after_failover(
70-
vms_list=deployed_vms_for_descheduler_test,
71-
process_dict=vms_started_process_for_node_drain,
70+
verify_linux_boot_time(
71+
vm_list=deployed_vms_for_descheduler_test,
72+
initial_boot_time=vms_boot_time_before_node_drain,
7273
)
7374

7475

@@ -95,7 +96,7 @@ def test_descheduler_evicts_vm_from_utilization_imbalance(
9596
node_with_least_available_memory,
9697
node_with_min_memory_labeled_for_descheduler_test,
9798
deployed_vms_for_utilization_imbalance,
98-
vms_started_process_for_utilization_imbalance,
99+
vms_boot_time_before_utilization_imbalance,
99100
utilization_imbalance,
100101
node_with_max_memory_labeled_for_descheduler_test,
101102
):
@@ -111,20 +112,21 @@ def test_descheduler_evicts_vm_from_utilization_imbalance(
111112
def test_no_migrations_storm(
112113
self,
113114
deployed_vms_for_utilization_imbalance,
115+
all_existing_migrations_completed,
114116
):
115117
LOGGER.info(NO_MIGRATION_STORM_ASSERT_MESSAGE)
116118
assert_vms_consistent_virt_launcher_pods(running_vms=deployed_vms_for_utilization_imbalance)
117119

118120
@pytest.mark.dependency(depends=[f"{TESTS_CLASS_NAME}::test_no_migrations_storm"])
119121
@pytest.mark.polarion("CNV-8919")
120-
def test_running_process_after_migrations_complete(
122+
def test_boot_time_after_migrations_complete(
121123
self,
122124
deployed_vms_for_utilization_imbalance,
123-
vms_started_process_for_utilization_imbalance,
125+
vms_boot_time_before_utilization_imbalance,
124126
):
125-
assert_running_process_after_failover(
126-
vms_list=deployed_vms_for_utilization_imbalance,
127-
process_dict=vms_started_process_for_utilization_imbalance,
127+
verify_linux_boot_time(
128+
vm_list=deployed_vms_for_utilization_imbalance,
129+
initial_boot_time=vms_boot_time_before_utilization_imbalance,
128130
)
129131

130132

tests/virt/node/descheduler/test_descheduler_psi_metrics.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pytest
44

55
from tests.virt.node.descheduler.utils import verify_at_least_one_vm_migrated, wait_for_overutilized_soft_taint
6+
from utilities.constants import TIMEOUT_15MIN
67

78
LOGGER = logging.getLogger(__name__)
89

@@ -45,5 +46,6 @@ def test_rebalancing_when_node_overloaded(
4546
def test_soft_taint_removed_when_node_not_overloaded(
4647
self,
4748
node_to_run_stress,
49+
all_existing_migrations_completed,
4850
):
49-
wait_for_overutilized_soft_taint(node=node_to_run_stress, taint_expected=False)
51+
wait_for_overutilized_soft_taint(node=node_to_run_stress, taint_expected=False, wait_timeout=TIMEOUT_15MIN)

tests/virt/node/descheduler/utils.py

Lines changed: 7 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,16 @@
44

55
from ocp_resources.deployment import Deployment
66
from ocp_resources.kube_descheduler import KubeDescheduler
7+
from ocp_resources.resource import ResourceEditor
78
from ocp_resources.virtual_machine import VirtualMachine
89
from timeout_sampler import TimeoutExpiredError, TimeoutSampler
910

1011
from tests.virt.node.descheduler.constants import (
1112
DESCHEDULER_DEPLOYMENT_NAME,
1213
DESCHEDULER_SOFT_TAINT_KEY,
1314
DESCHEDULING_INTERVAL_120SEC,
14-
RUNNING_PING_PROCESS_NAME_IN_VM,
1515
)
16+
from tests.virt.utils import is_jira_67515_open
1617
from utilities.constants import (
1718
TIMEOUT_1MIN,
1819
TIMEOUT_5MIN,
@@ -25,9 +26,7 @@
2526
from utilities.virt import (
2627
VirtualMachineForTests,
2728
fedora_vm_body,
28-
fetch_pid_from_linux_vm,
2929
running_vm,
30-
start_and_fetch_processid_on_linux_vm,
3130
)
3231

3332
LOGGER = logging.getLogger(__name__)
@@ -108,23 +107,6 @@ def wait_vmi_failover(vm, orig_node):
108107
raise
109108

110109

111-
def assert_running_process_after_failover(vms_list, process_dict):
112-
LOGGER.info(f"Verify {RUNNING_PING_PROCESS_NAME_IN_VM} is running after migrations.")
113-
failed_vms = []
114-
for vm in vms_list:
115-
vm_name = vm.name
116-
new_pid = None
117-
try:
118-
new_pid = fetch_pid_from_linux_vm(vm=vm, process_name=RUNNING_PING_PROCESS_NAME_IN_VM)
119-
except (ValueError, AssertionError):
120-
failed_vms.append(vm_name)
121-
continue
122-
if new_pid != process_dict[vm_name]:
123-
failed_vms.append(vm_name)
124-
125-
assert not failed_vms, f"The following VMs process ID has changed after migration: {failed_vms}"
126-
127-
128110
def assert_vms_distribution_after_failover(vms, nodes, all_nodes=True):
129111
def _get_vms_per_nodes():
130112
return vms_per_nodes(vms=vm_nodes(vms=vms))
@@ -212,16 +194,6 @@ def _vms_launcher_pod_names():
212194
LOGGER.info("No VMs were migrated.")
213195

214196

215-
def start_vms_with_process(vms, process_name, args):
216-
vms_process_id_dict = {}
217-
for vm in vms:
218-
vms_process_id_dict[vm.name] = start_and_fetch_processid_on_linux_vm(
219-
vm=vm, process_name=process_name, args=args
220-
)
221-
222-
return vms_process_id_dict
223-
224-
225197
def deploy_vms(
226198
vm_prefix,
227199
client,
@@ -261,7 +233,9 @@ def deploy_vms(
261233
vm.delete()
262234

263235
for vm in vms:
264-
vm.wait_deleted()
236+
# Due to the bug - VM may hang in terminating state, need to remove the finalizer from VMI
237+
if not vm.wait_deleted() and is_jira_67515_open():
238+
ResourceEditor(patches={vm.vmi: {"metadata": {"finalizers": []}}}).update()
265239

266240

267241
def verify_at_least_one_vm_migrated(vms, node_before):
@@ -296,10 +270,10 @@ def create_kube_descheduler(admin_client, profiles, profile_customizations):
296270
yield kd
297271

298272

299-
def wait_for_overutilized_soft_taint(node, taint_expected):
273+
def wait_for_overutilized_soft_taint(node, taint_expected, wait_timeout=TIMEOUT_10MIN):
300274
taint_key = f"{DESCHEDULER_SOFT_TAINT_KEY}/overutilized"
301275
sampler = TimeoutSampler(
302-
wait_timeout=TIMEOUT_10MIN,
276+
wait_timeout=wait_timeout,
303277
sleep=TIMEOUT_5SEC,
304278
func=lambda: any(taint_key in taint.values() for taint in node.instance.spec.taints),
305279
)

tests/virt/node/log_verbosity/test_log_virt_launcher.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
from tests.virt.node.log_verbosity.constants import (
88
VIRT_LOG_VERBOSITY_LEVEL_6,
99
)
10+
from tests.virt.utils import is_jira_67515_open
1011
from utilities.constants import MIGRATION_POLICY_VM_LABEL, TIMEOUT_1MIN, TIMEOUT_5SEC
11-
from utilities.infra import is_jira_open
1212
from utilities.virt import (
1313
VirtualMachineForTests,
1414
fedora_vm_body,
@@ -82,7 +82,7 @@ def vm_for_migration_progress_test(
8282
running_vm(vm=vm)
8383
yield vm
8484
# Due to the bug - migration job should be removed before stopping the VM
85-
if is_jira_open(jira_id="CNV-67515"):
85+
if is_jira_67515_open():
8686
clean_up_migration_jobs(client=admin_client, vm=vm)
8787

8888

tests/virt/upgrade/conftest.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
vm_from_template,
1818
wait_for_automatic_vm_migrations,
1919
)
20+
from tests.virt.utils import get_boot_time_for_multiple_vms
2021
from utilities.constants import (
2122
ES_LIVE_MIGRATE_IF_POSSIBLE,
2223
OS_FLAVOR_RHEL,
@@ -311,11 +312,8 @@ def _vm_is_migrateable(vm):
311312

312313

313314
@pytest.fixture(scope="session")
314-
def linux_boot_time_before_upgrade(virt_migratable_vms):
315-
boot_time_dict = {}
316-
for vm in virt_migratable_vms:
317-
boot_time_dict[vm.name] = get_vm_boot_time(vm=vm)
318-
yield boot_time_dict
315+
def linux_boot_time_before_upgrade(vms_for_upgrade):
316+
return get_boot_time_for_multiple_vms(vm_list=vms_for_upgrade)
319317

320318

321319
@pytest.fixture(scope="session")

tests/virt/upgrade/test_upgrade_virt.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,11 @@
1616
)
1717
from tests.virt.upgrade.utils import (
1818
mismatching_src_pvc_names,
19-
verify_linux_boot_time,
2019
verify_run_strategy_vmi_status,
2120
verify_vms_ssh_connectivity,
2221
verify_windows_boot_time,
2322
)
24-
from tests.virt.utils import assert_migration_post_copy_mode
23+
from tests.virt.utils import assert_migration_post_copy_mode, verify_linux_boot_time
2524
from utilities.constants import DATA_SOURCE_NAME, DEPENDENCY_SCOPE_SESSION
2625
from utilities.exceptions import ResourceValueError
2726
from utilities.virt import migrate_vm_and_verify, vm_console_run_commands

tests/virt/upgrade/utils.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -164,15 +164,6 @@ def verify_run_strategy_vmi_status(run_strategy_vmi_list):
164164
return run_strategy_vmi_list
165165

166166

167-
def verify_linux_boot_time(vm_list, initial_boot_time):
168-
rebooted_vms = {}
169-
for vm in vm_list:
170-
current_boot_time = get_vm_boot_time(vm=vm)
171-
if initial_boot_time[vm.name] != current_boot_time:
172-
rebooted_vms[vm.name] = {"initial": initial_boot_time[vm.name], "current": current_boot_time}
173-
assert not rebooted_vms, f"Boot time changed for VMs:\n {rebooted_vms}"
174-
175-
176167
def verify_windows_boot_time(windows_vm, initial_boot_time):
177168
current_boot_time = get_vm_boot_time(vm=windows_vm)
178169
assert initial_boot_time == current_boot_time, (

0 commit comments

Comments (0)