|
1 | 1 | import pytest |
2 | 2 |
|
| 3 | +import json |
3 | 4 | import logging |
4 | 5 | import time |
5 | 6 |
|
6 | 7 | from lib.commands import SSHCommandFailed |
7 | 8 | from lib.common import vm_image, wait_for |
| 9 | +from lib.host import Host |
| 10 | +from lib.vm import VM |
8 | 11 | from tests.storage import vdi_is_open |
9 | 12 |
|
10 | 13 | from .conftest import LINSTOR_PACKAGE |
11 | 14 |
|
| 15 | +from typing import Tuple |
| 16 | + |
12 | 17 | # Requirements: |
13 | 18 | # - two or more XCP-ng hosts >= 8.2 with additional unused disk(s) for the SR |
14 | 19 | # - access to XCP-ng RPM repository from the host |
@@ -52,6 +57,29 @@ def test_create_and_destroy_sr(self, pool_with_linstor, provisioning_type, stora |
52 | 57 | vm.destroy(verify=True) |
53 | 58 | sr.destroy(verify=True) |
54 | 59 |
|
| 60 | + |
def get_drbd_status(host: Host, resource: str):
    """Return the decoded JSON output of `drbdsetup status` for one resource.

    The command `drbdsetup status <resource> --json` is executed on `host`
    through `host.ssh`, and its output is parsed with `json.loads`.

    :param host: host on which the command is run.
    :param resource: name of the DRBD resource to query.
    :return: the Python object decoded from the command's JSON output.
    """
    logging.debug("[%s] Fetching DRBD status for resource `%s`...", host, resource)
    command = ["drbdsetup", "status", resource, "--json"]
    raw_status = host.ssh(command)
    return json.loads(raw_status)
| 64 | + |
def get_corrupted_resources(host: Host, resource: str):
    """List the peer devices of a DRBD resource that report out-of-sync data.

    The DRBD status of `resource` is fetched from `host`, then every
    resource / connection / peer device entry is visited. An entry is kept
    when its "out-of-sync" value is strictly positive.

    :param host: host queried for the DRBD status.
    :param resource: name of the DRBD resource to inspect.
    :return: a list of `(resource name, connection name, out-of-sync value)`
        tuples, empty when nothing is out of sync. Missing names default to
        an empty string.
    """
    out_of_sync_entries = []
    for res in get_drbd_status(host, resource):
        for conn in res.get("connections", []):
            for peer in conn.get("peer_devices", []):
                pending = peer.get("out-of-sync", 0)
                if pending > 0:
                    out_of_sync_entries.append(
                        (res.get("name", ""), conn.get("name", ""), pending)
                    )
    return out_of_sync_entries
| 77 | + |
def wait_drbd_sync(host: Host, resource: str):
    """Run `drbdadm wait-sync <resource>` on `host`.

    The call returns once the remote command run through `host.ssh` returns.

    :param host: host on which the command is run.
    :param resource: name of the DRBD resource to wait on.
    """
    logging.info("[%s] Waiting for DRBD sync on resource `%s`...", host, resource)
    wait_command = ["drbdadm", "wait-sync", resource]
    host.ssh(wait_command)
| 81 | + |
| 82 | + |
55 | 83 | @pytest.mark.usefixtures("linstor_sr") |
56 | 84 | class TestLinstorSR: |
57 | 85 | @pytest.mark.quicktest |
@@ -88,6 +116,63 @@ def test_snapshot(self, vm_on_linstor_sr): |
88 | 116 | finally: |
89 | 117 | vm.shutdown(verify=True) |
90 | 118 |
|
@pytest.mark.small_vm
def test_resynchronization(
    self, host_and_corrupted_vdi_on_linstor_sr: Tuple[VM, Host, str]
):
    """Verify that DRBD detects out-of-sync data on a host and repairs it.

    Steps:
      1. Elect a pool host whose DRBD peer disk state is "UpToDate".
      2. From that peer, run `drbdadm verify` against `host` (up to three
         attempts) until out-of-sync data is reported for this resource
         and this host.
      3. Run `drbdadm invalidate-remote` from the peer, wait for the sync
         and check that nothing is reported out of sync anymore.
      4. Start the VM on `host` and run the snapshot test on it.

    The fixture provides a `(vm, host, resource_name)` tuple.
    NOTE(review): presumably `host` holds the deliberately corrupted replica
    of `resource_name` - confirm against the fixture definition.
    """
    (vm, host, resource_name) = host_and_corrupted_vdi_on_linstor_sr
    hostname = host.hostname()

    try:
        # Single flat generator on purpose: a nested `next()` inside a
        # generator expression would have its StopIteration converted to
        # RuntimeError (PEP 479) when a connection name matches no pool
        # host, bypassing the `except` below. Here an unmatched connection
        # is simply skipped and the next candidate is tried.
        other_host = next(
            h
            for res in get_drbd_status(host, resource_name)
            for conn in res.get("connections", [])
            for peer in conn.get("peer_devices", [])
            if peer.get("peer-disk-state", "") == "UpToDate"
            for h in host.pool.hosts
            if h.hostname() == conn.get("name", "")
        )
    except StopIteration:
        pytest.fail("Could not find an UpToDate peer host")
    logging.info("Elected `%s` as peer for verification and repair", other_host)

    corrupted = None
    max_attempts = 3
    # Attempting several times since testing revealed `drbdadm verify` can be flaky
    for attempt in range(1, max_attempts + 1):
        logging.info("`drbdadm verify` attempt %d/%d", attempt, max_attempts)
        logging.info("[%s] Running DRBD verify for %s...", other_host, resource_name)
        other_host.ssh(["drbdadm", "verify", f"{resource_name}:{hostname}/0"])
        wait_drbd_sync(other_host, resource_name)

        corrupted_resources = get_corrupted_resources(other_host, resource_name)
        if not corrupted_resources:
            logging.warning("No corrupted resources found on attempt #%d", attempt)
            continue
        # Only an entry for this resource AND this host counts as a detection.
        for res_name, peer_name, out_of_sync in corrupted_resources:
            if res_name == resource_name and peer_name == hostname:
                corrupted = (res_name, peer_name, out_of_sync)
        if corrupted:
            break
    if not corrupted:
        pytest.fail(f"Failed to identify corrupted resource after {max_attempts} attempts")

    logging.info("Invalidating remote resource `%s`...", resource_name)
    other_host.ssh([
        "drbdadm", "invalidate-remote",
        f"{resource_name}:{hostname}/0",
        "--reset-bitmap=no"
    ])
    wait_drbd_sync(other_host, resource_name)
    if get_corrupted_resources(other_host, resource_name):
        pytest.fail("Corrupted resource did not get fixed")

    # The VM must still boot and support snapshots from the repaired host.
    vm.start(on=host.uuid)
    try:
        vm.wait_for_os_booted()
        vm.test_snapshot_on_running_vm()
    finally:
        vm.shutdown(verify=True)
| 175 | + |
91 | 176 | # *** tests with reboots (longer tests). |
92 | 177 |
|
93 | 178 | @pytest.mark.reboot |
|
0 commit comments