From b0d89c5cb846579ee33b3169fc4eb286fd19ae63 Mon Sep 17 00:00:00 2001 From: Rushikesh Jadhav Date: Fri, 14 Mar 2025 00:30:02 +0530 Subject: [PATCH 1/2] Added `test_linstor_sr_fail_disk` which - Simulates failure of a LVM PV on a random host in the LINSTOR SR pool by offlining a selected disk. - Verifies VM start/shutdown on all hosts despite the degraded pool state. - Also ensures SR and PBDs recover after reboot of the affected host. Signed-off-by: Rushikesh Jadhav --- tests/storage/linstor/test_linstor_sr.py | 47 +++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/tests/storage/linstor/test_linstor_sr.py b/tests/storage/linstor/test_linstor_sr.py index 7dc6f4597..ba5256b7f 100644 --- a/tests/storage/linstor/test_linstor_sr.py +++ b/tests/storage/linstor/test_linstor_sr.py @@ -2,7 +2,7 @@ import pytest import time -from .conftest import LINSTOR_PACKAGE +from .conftest import GROUP_NAME, LINSTOR_PACKAGE from lib.commands import SSHCommandFailed from lib.common import wait_for, vm_image from tests.storage import vdi_is_open @@ -131,6 +131,51 @@ def test_linstor_missing(self, linstor_sr, host): if not linstor_installed: host.yum_install([LINSTOR_PACKAGE]) + @pytest.mark.reboot + @pytest.mark.small_vm + def test_linstor_sr_fail_disk(self, linstor_sr, vm_on_linstor_sr, provisioning_type): + """ + Identify random host within the same pool, detect used disks, fail one, and test VM useability on LINSTOR SR. 
+ """ + import random + + sr = linstor_sr + if provisioning_type == "thick": + time.sleep(45) # Let xcp-persistent-database come in sync across the nodes + + vm = vm_on_linstor_sr + + # Fail a disk from random host of Linstor pool + try: + random_host = random.choice(sr.pool.hosts) # TBD: Choose Linstor Diskfull node + logging.info("Working on %s", random_host.hostname_or_ip) + devices = random_host.ssh('vgs ' + GROUP_NAME + ' -o pv_name --no-headings').split("\n") + # Choosing last device from list, assuming its least filled + fail_device = devices[-1].strip() # /dev/sdb + fail_device = random_host.ssh(['lsblk', fail_device, '--nodeps --output NAME --noheadings']) # sdb + logging.info("Attempting to fail device: %s", fail_device) + random_host.ssh(['echo', '"offline"', '>', '/sys/block/' + fail_device + '/device/state']) + except Exception as e: + # Offline disk shall connect back after host reboot. Teardown normally. + random_host.reboot(verify=True) + pytest.fail("Failed to simulate device failure. Error %s", e.stdout) + + # Ensure that VM is able to start on all hosts despite Linstor pool disk failure + for h in sr.pool.hosts: + logging.info(f"Checking VM on host {h.hostname_or_ip}") + vm.start(on=h.uuid) + vm.wait_for_os_booted() + vm.shutdown(verify=True) + + random_host.reboot(verify=True) + + # Ensure PBDs are attached post reboot + if not sr.all_pbds_attached(): + sr.plug_pbds() + + # Ensure SR scan works and proceed for teardown + sr.scan() + # *** End of tests with reboots # --- Test diskless resources -------------------------------------------------- From 52d2a71422d5a37098895382c562c51fc3383a9e Mon Sep 17 00:00:00 2001 From: Rushikesh Jadhav Date: Mon, 26 May 2025 14:25:46 +0530 Subject: [PATCH 2/2] Handle scenario where if VM.start, `xcp-persistent-database` is `InUse`, are on failing disk-host, then VM.start may get stuck. The state can be recovered by bringing the failed device online however it means that the test failed. 
Signed-off-by: Rushikesh Jadhav
---
 tests/storage/linstor/test_linstor_sr.py | 26 +++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/tests/storage/linstor/test_linstor_sr.py b/tests/storage/linstor/test_linstor_sr.py
index ba5256b7f..ce711e31d 100644
--- a/tests/storage/linstor/test_linstor_sr.py
+++ b/tests/storage/linstor/test_linstor_sr.py
@@ -138,6 +138,7 @@ def test_linstor_sr_fail_disk(self, linstor_sr, vm_on_linstor_sr, provisioning_t
         Identify random host within the same pool, detect used disks, fail one, and test VM useability on LINSTOR SR.
         """
         import random
+        import multiprocessing
 
         sr = linstor_sr
         if provisioning_type == "thick":
@@ -158,14 +159,29 @@ def test_linstor_sr_fail_disk(self, linstor_sr, vm_on_linstor_sr, provisioning_t
         except Exception as e:
             # Offline disk shall connect back after host reboot. Teardown normally.
             random_host.reboot(verify=True)
-            pytest.fail("Failed to simulate device failure. Error %s", e.stdout)
+            pytest.fail("Failed to simulate device failure. Error %s" % e)
 
         # Ensure that VM is able to start on all hosts despite Linstor pool disk failure
         for h in sr.pool.hosts:
-            logging.info(f"Checking VM on host {h.hostname_or_ip}")
-            vm.start(on=h.uuid)
-            vm.wait_for_os_booted()
-            vm.shutdown(verify=True)
+            logging.info("Checking VM on host %s", h.hostname_or_ip)
+            try:
+                proc = multiprocessing.Process(target=vm.start, kwargs={'on': h.uuid})
+                proc.start()
+                proc.join(timeout=30)
+                if proc.is_alive():
+                    proc.terminate()
+                    proc.join()
+                    logging.warning("VM start on host %s timed out. Recovering failed disk.", h.hostname_or_ip)
+                    random_host.ssh(['echo', '"running"', '>', f'/sys/block/{fail_device}/device/state'])
+                    # Handle in case VM.start succeed after disk becomes online
+                    if vm.is_running():
+                        vm.shutdown(verify=True, force_if_fails=True)
+                    pytest.fail("VM start timed out on host %s after 30s. Disk recovered." % h.hostname_or_ip)
+                else: # VM booted fine
+                    vm.wait_for_os_booted()
+                    vm.shutdown(verify=True)
+            except Exception as e:
+                logging.info("Caught exception in multiprocessing: %s", e)
 
         random_host.reboot(verify=True)