From b0d89c5cb846579ee33b3169fc4eb286fd19ae63 Mon Sep 17 00:00:00 2001 From: Rushikesh Jadhav Date: Fri, 14 Mar 2025 00:30:02 +0530 Subject: [PATCH 1/2] Added `test_linstor_sr_fail_disk` which - Simulates failure of a LVM PV on a random host in the LINSTOR SR pool by offlining a selected disk. - Verifies VM start/shutdown on all hosts despite the degraded pool state. - Also ensures SR and PBDs recover after reboot of the affected host. Signed-off-by: Rushikesh Jadhav --- tests/storage/linstor/test_linstor_sr.py | 47 +++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/tests/storage/linstor/test_linstor_sr.py b/tests/storage/linstor/test_linstor_sr.py index 7dc6f4597..ba5256b7f 100644 --- a/tests/storage/linstor/test_linstor_sr.py +++ b/tests/storage/linstor/test_linstor_sr.py @@ -2,7 +2,7 @@ import pytest import time -from .conftest import LINSTOR_PACKAGE +from .conftest import GROUP_NAME, LINSTOR_PACKAGE from lib.commands import SSHCommandFailed from lib.common import wait_for, vm_image from tests.storage import vdi_is_open @@ -131,6 +131,51 @@ def test_linstor_missing(self, linstor_sr, host): if not linstor_installed: host.yum_install([LINSTOR_PACKAGE]) + @pytest.mark.reboot + @pytest.mark.small_vm + def test_linstor_sr_fail_disk(self, linstor_sr, vm_on_linstor_sr, provisioning_type): + """ + Identify random host within the same pool, detect used disks, fail one, and test VM useability on LINSTOR SR. 
+ """ + import random + + sr = linstor_sr + if provisioning_type == "thick": + time.sleep(45) # Let xcp-persistent-database come in sync across the nodes + + vm = vm_on_linstor_sr + + # Fail a disk from random host of Linstor pool + try: + random_host = random.choice(sr.pool.hosts) # TBD: Choose Linstor Diskfull node + logging.info("Working on %s", random_host.hostname_or_ip) + devices = random_host.ssh('vgs ' + GROUP_NAME + ' -o pv_name --no-headings').split("\n") + # Choosing last device from list, assuming its least filled + fail_device = devices[-1].strip() # /dev/sdb + fail_device = random_host.ssh(['lsblk', fail_device, '--nodeps --output NAME --noheadings']) # sdb + logging.info("Attempting to fail device: %s", fail_device) + random_host.ssh(['echo', '"offline"', '>', '/sys/block/' + fail_device + '/device/state']) + except Exception as e: + # Offline disk shall connect back after host reboot. Teardown normally. + random_host.reboot(verify=True) + pytest.fail("Failed to simulate device failure. Error %s", e.stdout) + + # Ensure that VM is able to start on all hosts despite Linstor pool disk failure + for h in sr.pool.hosts: + logging.info(f"Checking VM on host {h.hostname_or_ip}") + vm.start(on=h.uuid) + vm.wait_for_os_booted() + vm.shutdown(verify=True) + + random_host.reboot(verify=True) + + # Ensure PBDs are attached post reboot + if not sr.all_pbds_attached(): + sr.plug_pbds() + + # Ensure SR scan works and proceed for teardown + sr.scan() + # *** End of tests with reboots # --- Test diskless resources -------------------------------------------------- From 52d2a71422d5a37098895382c562c51fc3383a9e Mon Sep 17 00:00:00 2001 From: Rushikesh Jadhav Date: Mon, 26 May 2025 14:25:46 +0530 Subject: [PATCH 2/2] Handle scenario where if VM.start, `xcp-persistent-database` is `InUse`, are on failing disk-host, then VM.start may get stuck. The state can be recovered by bringing the failed device online however it means that the test failed. 
Signed-off-by: Rushikesh Jadhav
---
 tests/storage/linstor/test_linstor_sr.py | 26 +++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/tests/storage/linstor/test_linstor_sr.py b/tests/storage/linstor/test_linstor_sr.py
index ba5256b7f..ce711e31d 100644
--- a/tests/storage/linstor/test_linstor_sr.py
+++ b/tests/storage/linstor/test_linstor_sr.py
@@ -138,6 +138,7 @@ def test_linstor_sr_fail_disk(self, linstor_sr, vm_on_linstor_sr, provisioning_t
         Identify random host within the same pool, detect used disks, fail one, and test VM useability on LINSTOR SR.
         """
         import random
+        import multiprocessing
 
         sr = linstor_sr
         if provisioning_type == "thick":
@@ -158,14 +159,29 @@ def test_linstor_sr_fail_disk(self, linstor_sr, vm_on_linstor_sr, provisioning_t
         except Exception as e:
             # Offline disk shall connect back after host reboot. Teardown normally.
             random_host.reboot(verify=True)
-            pytest.fail("Failed to simulate device failure. Error %s", e.stdout)
+            pytest.fail("Failed to simulate device failure. Error %s" % e)
 
         # Ensure that VM is able to start on all hosts despite Linstor pool disk failure
         for h in sr.pool.hosts:
-            logging.info(f"Checking VM on host {h.hostname_or_ip}")
-            vm.start(on=h.uuid)
-            vm.wait_for_os_booted()
-            vm.shutdown(verify=True)
+            logging.info("Checking VM on host %s", h.hostname_or_ip)
+            try:
+                proc = multiprocessing.Process(target=vm.start, kwargs={'on': h.uuid})
+                proc.start()
+                proc.join(timeout=30)
+                if proc.is_alive():
+                    proc.terminate()
+                    proc.join()
+                    logging.warning("VM start on host %s timed out. Recovering failed disk.", h.hostname_or_ip)
+                    random_host.ssh(['echo', '"running"', '>', f'/sys/block/{fail_device}/device/state'])
+                    # Handle in case VM.start succeed after disk becomes online
+                    if vm.is_running():
+                        vm.shutdown(verify=True, force_if_fails=True)
+                    pytest.fail("VM start timed out on host %s after 30s. Disk recovered." % h.hostname_or_ip)
+                else: # VM booted fine
+                    vm.wait_for_os_booted()
+                    vm.shutdown(verify=True)
+            except Exception as e:
+                logging.info("Caught exception in multiprocessing: %s", e)
 
         random_host.reboot(verify=True)