Skip to content

Commit ce14de0

Browse files
authored
Cherry-pick: 4.20 [VIRT] Stabilize node drain/cordon during maintenance (RedHatQE#3727)
Cherry-pick [3467](RedHatQE#3467) into cnv-4.20. jira-ticket: (not provided). Signed-off-by: Samuel Albershtein <salbersh@redhat.com>
1 parent 5096de8 commit ce14de0

File tree

8 files changed: +45 −40 lines changed

tests/chaos/oadp/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -74,8 +74,8 @@ def rebooted_vm_source_node(rhel_vm_with_dv_running, oadp_backup_in_progress, wo
7474

7575

7676
@pytest.fixture()
77-
def drain_vm_source_node(rhel_vm_with_dv_running, oadp_backup_in_progress):
77+
def drain_vm_source_node(admin_client, rhel_vm_with_dv_running, oadp_backup_in_progress):
7878
vm_node = rhel_vm_with_dv_running.vmi.node
79-
with node_mgmt_console(node=vm_node, node_mgmt="drain"):
79+
with node_mgmt_console(admin_client=admin_client, node=vm_node, node_mgmt="drain"):
8080
wait_for_node_schedulable_status(node=vm_node, status=False)
8181
yield vm_node

tests/virt/cluster/migration_and_maintenance/test_evictionstrategy.py

Lines changed: 4 additions & 8 deletions
Original file line number · Diff line number · Diff line change
@@ -42,9 +42,9 @@ def assert_vm_restarts_after_node_drain(source_node, vmi, vmi_old_uid):
4242

4343

4444
@pytest.fixture()
45-
def drained_node(vm_for_test_from_template_scope_class):
45+
def drained_node(admin_client, vm_for_test_from_template_scope_class):
4646
source_node = vm_for_test_from_template_scope_class.privileged_vmi.node
47-
with node_mgmt_console(node=source_node, node_mgmt="drain"):
47+
with node_mgmt_console(admin_client=admin_client, node=source_node, node_mgmt="drain"):
4848
yield source_node
4949

5050

@@ -125,9 +125,7 @@ class TestEvictionStrategy:
125125
def test_hco_evictionstrategy_livemigrate_vm_no_evictionstrategy(
126126
self, unprivileged_client, vm_for_test_from_template_scope_class, drained_node
127127
):
128-
check_migration_process_after_node_drain(
129-
dyn_client=unprivileged_client, vm=vm_for_test_from_template_scope_class
130-
)
128+
check_migration_process_after_node_drain(client=unprivileged_client, vm=vm_for_test_from_template_scope_class)
131129

132130
@pytest.mark.polarion("CNV-10088")
133131
def test_hco_evictionstrategy_none_vm_no_evictionstrategy(
@@ -169,6 +167,4 @@ def test_hco_evictionstrategy_none_vm_evictionstrategy_livemigrate(
169167
added_vm_evictionstrategy,
170168
drained_node,
171169
):
172-
check_migration_process_after_node_drain(
173-
dyn_client=unprivileged_client, vm=vm_for_test_from_template_scope_class
174-
)
170+
check_migration_process_after_node_drain(client=unprivileged_client, vm=vm_for_test_from_template_scope_class)

tests/virt/node/descheduler/conftest.py

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -150,12 +150,13 @@ def node_to_drain(
150150

151151
@pytest.fixture()
152152
def drain_uncordon_node(
153+
admin_client,
153154
deployed_vms_for_descheduler_test,
154155
vms_orig_nodes_before_node_drain,
155156
node_to_drain,
156157
):
157158
"""Return when node is schedulable again after uncordon"""
158-
with node_mgmt_console(node=node_to_drain, node_mgmt="drain"):
159+
with node_mgmt_console(admin_client=admin_client, node=node_to_drain, node_mgmt="drain"):
159160
wait_for_node_schedulable_status(node=node_to_drain, status=False)
160161
for vm in deployed_vms_for_descheduler_test:
161162
if vms_orig_nodes_before_node_drain[vm.name].name == node_to_drain.name:

tests/virt/node/migration_and_maintenance/test_node_maintenance.py

Lines changed: 19 additions & 18 deletions
Original file line number · Diff line number · Diff line change
@@ -3,7 +3,6 @@
33
"""
44

55
import logging
6-
import random
76

87
import pytest
98
from ocp_resources.virtual_machine_instance_migration import (
@@ -37,20 +36,20 @@
3736
LOGGER = logging.getLogger(__name__)
3837

3938

40-
def drain_using_console(dyn_client, source_node, vm):
39+
def drain_using_console(admin_client, source_node, vm):
4140
with running_sleep_in_linux(vm=vm):
42-
with node_mgmt_console(node=source_node, node_mgmt="drain"):
43-
check_migration_process_after_node_drain(dyn_client=dyn_client, vm=vm)
41+
with node_mgmt_console(admin_client=admin_client, node=source_node, node_mgmt="drain"):
42+
check_migration_process_after_node_drain(client=admin_client, vm=vm)
4443

4544

46-
def drain_using_console_windows(dyn_client, source_node, vm):
45+
def drain_using_console_windows(admin_client, source_node, vm):
4746
process_name = OS_PROC_NAME["windows"]
4847
pre_migrate_processid = start_and_fetch_processid_on_windows_vm(
4948
vm=vm,
5049
process_name=process_name,
5150
)
52-
with node_mgmt_console(node=source_node, node_mgmt="drain"):
53-
check_migration_process_after_node_drain(dyn_client=dyn_client, vm=vm)
51+
with node_mgmt_console(admin_client=admin_client, node=source_node, node_mgmt="drain"):
52+
check_migration_process_after_node_drain(client=admin_client, vm=vm)
5453
post_migrate_processid = fetch_pid_from_windows_vm(vm=vm, process_name=process_name)
5554
assert post_migrate_processid == pre_migrate_processid, (
5655
f"Post migrate processid is: {post_migrate_processid}. Pre migrate processid is: {pre_migrate_processid}"
@@ -74,7 +73,7 @@ def vm_container_disk_fedora(
7473
namespace,
7574
unprivileged_client,
7675
):
77-
name = f"vm-nodemaintenance-{random.randrange(99999)}"
76+
name = "vm-nodemaintenance"
7877
with VirtualMachineForTests(
7978
name=name,
8079
namespace=namespace.name,
@@ -120,7 +119,7 @@ def test_node_drain_using_console_fedora(
120119
):
121120
privileged_virt_launcher_pod = vm_container_disk_fedora.privileged_vmi.virt_launcher_pod
122121
drain_using_console(
123-
dyn_client=admin_client, source_node=privileged_virt_launcher_pod.node, vm=vm_container_disk_fedora
122+
admin_client=admin_client, source_node=privileged_virt_launcher_pod.node, vm=vm_container_disk_fedora
124123
)
125124

126125

@@ -144,21 +143,20 @@ def test_node_drain_using_console_fedora(
144143
)
145144
@pytest.mark.usefixtures("cluster_cpu_model_scope_class", "golden_image_data_volume_multi_storage_scope_class")
146145
@pytest.mark.ibm_bare_metal
146+
@pytest.mark.usefixtures("no_migration_job")
147147
class TestNodeMaintenanceRHEL:
148148
@pytest.mark.polarion("CNV-2292")
149149
def test_node_drain_using_console_rhel(
150150
self,
151-
no_migration_job,
152151
golden_image_vm_instance_from_template_multi_storage_scope_class,
153152
admin_client,
154153
):
155154
vm = golden_image_vm_instance_from_template_multi_storage_scope_class
156-
drain_using_console(dyn_client=admin_client, source_node=vm.privileged_vmi.virt_launcher_pod.node, vm=vm)
155+
drain_using_console(admin_client=admin_client, source_node=vm.privileged_vmi.virt_launcher_pod.node, vm=vm)
157156

158157
@pytest.mark.polarion("CNV-4995")
159158
def test_migration_when_multiple_nodes_unschedulable_using_console_rhel(
160159
self,
161-
no_migration_job,
162160
golden_image_vm_instance_from_template_multi_storage_scope_class,
163161
schedulable_nodes,
164162
admin_client,
@@ -181,8 +179,8 @@ def test_migration_when_multiple_nodes_unschedulable_using_console_rhel(
181179
pod=vm.privileged_vmi.virt_launcher_pod,
182180
schedulable_nodes=schedulable_nodes,
183181
)
184-
with node_mgmt_console(node=cordon_nodes[0], node_mgmt="cordon"):
185-
drain_using_console(dyn_client=admin_client, source_node=vm.privileged_vmi.virt_launcher_pod.node, vm=vm)
182+
with node_mgmt_console(admin_client=admin_client, node=cordon_nodes[0], node_mgmt="cordon"):
183+
drain_using_console(admin_client=admin_client, source_node=vm.privileged_vmi.virt_launcher_pod.node, vm=vm)
186184

187185

188186
@pytest.mark.parametrize(
@@ -206,28 +204,31 @@ def test_migration_when_multiple_nodes_unschedulable_using_console_rhel(
206204
)
207205
@pytest.mark.usefixtures("cluster_modern_cpu_model_scope_class", "golden_image_data_volume_multi_storage_scope_class")
208206
@pytest.mark.ibm_bare_metal
207+
@pytest.mark.usefixtures("no_migration_job")
209208
class TestNodeCordonAndDrain:
210209
@pytest.mark.polarion("CNV-2048")
211210
def test_node_drain_template_windows(
212211
self,
213-
no_migration_job,
214212
golden_image_vm_instance_from_template_multi_storage_scope_class,
215213
admin_client,
216214
):
217215
vm = golden_image_vm_instance_from_template_multi_storage_scope_class
218216
drain_using_console_windows(
219-
dyn_client=admin_client, source_node=vm.privileged_vmi.virt_launcher_pod.node, vm=vm
217+
admin_client=admin_client, source_node=vm.privileged_vmi.virt_launcher_pod.node, vm=vm
220218
)
221219

222220
@pytest.mark.polarion("CNV-4906")
223221
def test_node_cordon_template_windows(
224222
self,
225-
no_migration_job,
226223
golden_image_vm_instance_from_template_multi_storage_scope_class,
227224
admin_client,
228225
):
229226
vm = golden_image_vm_instance_from_template_multi_storage_scope_class
230-
with node_mgmt_console(node=vm.privileged_vmi.virt_launcher_pod.node, node_mgmt="cordon"):
227+
with node_mgmt_console(
228+
admin_client=admin_client,
229+
node=vm.privileged_vmi.virt_launcher_pod.node,
230+
node_mgmt="cordon",
231+
):
231232
with pytest.raises(TimeoutExpiredError):
232233
migration_job_sampler(
233234
dyn_client=admin_client,

tests/virt/node/migration_and_maintenance/test_post_copy_migration.py

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -76,8 +76,8 @@ def migrated_hotplugged_vm(hotplugged_vm):
7676

7777
@pytest.fixture()
7878
def drained_node_with_hotplugged_vm(admin_client, hotplugged_vm):
79-
with node_mgmt_console(node=hotplugged_vm.privileged_vmi.node, node_mgmt="drain"):
80-
check_migration_process_after_node_drain(dyn_client=admin_client, vm=hotplugged_vm)
79+
with node_mgmt_console(admin_client=admin_client, node=hotplugged_vm.privileged_vmi.node, node_mgmt="drain"):
80+
check_migration_process_after_node_drain(client=admin_client, vm=hotplugged_vm)
8181
clean_up_migration_jobs(client=admin_client, vm=hotplugged_vm)
8282

8383

tests/virt/node/migration_and_maintenance/test_vm_unscheduled_node.py

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -54,7 +54,7 @@ def unscheduled_node_vm(
5454
indirect=True,
5555
)
5656
@pytest.mark.polarion("CNV-4157")
57-
def test_schedule_vm_on_cordoned_node(worker_node1, data_volume_scope_function, unscheduled_node_vm):
57+
def test_schedule_vm_on_cordoned_node(admin_client, worker_node1, unscheduled_node_vm):
5858
"""Test VM scheduling on a node under maintenance.
5959
1. Cordon the target node specified in the VM's nodeAffinity (worker_node1).
6060
2. Wait until the node status becomes 'Ready,SchedulingDisabled'.
@@ -65,7 +65,7 @@ def test_schedule_vm_on_cordoned_node(worker_node1, data_volume_scope_function,
6565
7. Verify that the VMI is running on the expected node (worker_node1).
6666
"""
6767

68-
with node_mgmt_console(node=worker_node1, node_mgmt="cordon"):
68+
with node_mgmt_console(admin_client=admin_client, node=worker_node1, node_mgmt="cordon"):
6969
wait_for_node_schedulable_status(node=worker_node1, status=False)
7070
unscheduled_node_vm.start()
7171
unscheduled_node_vm.vmi.wait_for_status(status=VirtualMachineInstance.Status.SCHEDULING, timeout=TIMEOUT_20SEC)

tests/virt/node/migration_and_maintenance/utils.py

Lines changed: 3 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -132,10 +132,10 @@ def _get_source_migration_logs():
132132
assert len(matches) == 6, f"Not all migration logs found. Found {len(matches)} of 6"
133133

134134

135-
def assert_node_drain_and_vm_migration(dyn_client, vm, virt_handler_pods):
135+
def assert_node_drain_and_vm_migration(admin_client, vm, virt_handler_pods):
136136
source_node = vm.privileged_vmi.node
137-
with node_mgmt_console(node=source_node, node_mgmt="drain"):
138-
check_migration_process_after_node_drain(dyn_client=dyn_client, vm=vm)
137+
with node_mgmt_console(admin_client=admin_client, node=source_node, node_mgmt="drain"):
138+
check_migration_process_after_node_drain(client=admin_client, vm=vm)
139139
assert_vm_migrated_through_dedicated_network_with_logs(
140140
source_node=source_node, vm=vm, virt_handler_pods=virt_handler_pods
141141
)

utilities/virt.py

Lines changed: 11 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -83,7 +83,7 @@
8383
Images,
8484
)
8585
from utilities.data_collector import collect_vnc_screenshot_for_vms
86-
from utilities.hco import wait_for_hco_conditions
86+
from utilities.hco import get_hco_namespace, wait_for_hco_conditions
8787
from utilities.storage import get_default_storage_class
8888

8989
if TYPE_CHECKING:
@@ -2020,7 +2020,8 @@ def vm_instance_from_template(
20202020

20212021

20222022
@contextmanager
2023-
def node_mgmt_console(node, node_mgmt):
2023+
def node_mgmt_console(admin_client, node, node_mgmt):
2024+
hco_namespace = get_hco_namespace(admin_client=admin_client)
20242025
try:
20252026
LOGGER.info(f"{node_mgmt.capitalize()} the node {node.name}")
20262027
extra_opts = "--delete-emptydir-data --ignore-daemonsets=true --force" if node_mgmt == "drain" else ""
@@ -2030,9 +2031,15 @@ def node_mgmt_console(node, node_mgmt):
20302031
)
20312032
yield
20322033
finally:
2034+
if node_mgmt == "drain":
2035+
LOGGER.info("Terminate drain process")
2036+
run(
2037+
shlex.split('pkill -f "oc adm drain"'),
2038+
)
20332039
LOGGER.info(f"Uncordon node {node.name}")
20342040
run(f"oc adm uncordon {node.name}", shell=True)
20352041
wait_for_node_schedulable_status(node=node, status=True)
2042+
wait_for_kv_stabilize(admin_client=admin_client, hco_namespace=hco_namespace)
20362043

20372044

20382045
@contextmanager
@@ -2212,15 +2219,15 @@ def get_created_migration_job(vm, timeout=TIMEOUT_1MIN, client=None):
22122219
raise
22132220

22142221

2215-
def check_migration_process_after_node_drain(dyn_client, vm):
2222+
def check_migration_process_after_node_drain(client, vm):
22162223
"""
22172224
Wait for migration process to succeed and verify that VM indeed moved to new node.
22182225
"""
22192226
vmi_old_uid = vm.vmi.instance.metadata.uid
22202227
source_node = vm.privileged_vmi.virt_launcher_pod.node
22212228
LOGGER.info(f"The VMI was running on {source_node.name}")
22222229
wait_for_node_schedulable_status(node=source_node, status=False)
2223-
vmim = get_created_migration_job(vm=vm, client=dyn_client, timeout=TIMEOUT_5MIN)
2230+
vmim = get_created_migration_job(vm=vm, client=client, timeout=TIMEOUT_5MIN)
22242231
wait_for_migration_finished(
22252232
namespace=vm.namespace, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN
22262233
)

0 commit comments

Comments (0)