Skip to content

Commit d6b53e9

Browse files
Add tests for corruption recovery on LINSTOR SR
Tests DRBD's corruption recovery using the `drbdadm verify` and `drbdadm invalidate-remote` commands, as well as basic VM operations (startup, snapshot, shutdown). Signed-off-by: Mathieu Labourier <mathieu.labourier@vates.tech>
1 parent 69dc883 commit d6b53e9

File tree

2 files changed

+142
-0
lines changed

2 files changed

+142
-0
lines changed

tests/storage/linstor/conftest.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@
33
import pytest
44

55
import functools
6+
import json
67
import logging
78
import os
89
from dataclasses import dataclass
910

1011
import lib.commands as commands
12+
from lib.common import safe_split
13+
from lib.sr import SR
1114

1215
# explicit import for package-scope fixtures
1316
from pkgfixtures import pool_with_saved_yum_state
@@ -19,6 +22,7 @@
1922
from lib.pool import Pool
2023
from lib.sr import SR
2124
from lib.vdi import VDI
25+
from lib.vm import VM
2226

2327
GROUP_NAME = 'linstor_group'
2428
STORAGE_POOL_NAME = f'{GROUP_NAME}/thin_device'
@@ -171,3 +175,56 @@ def vm_on_linstor_sr(host: Host, linstor_sr: SR, vm_ref: str):
171175
yield vm
172176
logging.info("<< Destroy VM")
173177
vm.destroy(verify=True)
178+
179+
@pytest.fixture(scope='function')
def host_and_corrupted_vdi_on_linstor_sr(host: Host, linstor_sr: SR, vm_ref: str):
    """Import a VM on the LINSTOR SR and overwrite one replica of its VDI with random data.

    Setup steps:
      1. import ``vm_ref`` on ``linstor_sr``;
      2. look up the LINSTOR volume name of the VM's first VDI located on that SR
         (via ``linstor-kv-tool --dump-volumes`` run on the pool master);
      3. find a pool host on which ``/dev/<GROUP_NAME>/<volume_name>_00000`` exists
         (presumably the LVM volume backing the DRBD resource -- confirm);
      4. overwrite the beginning of that device with ~40MB of ``/dev/urandom``.

    Yields:
        A ``(vm, vdi_host, volume_name)`` tuple: the imported VM, the host whose
        local device was corrupted, and the LINSTOR volume name.

    Raises:
        LookupError: if the imported VM has no VDI on ``linstor_sr``.
        FileNotFoundError: if the volume name or a host holding the device
            cannot be found.

    The VM is destroyed on teardown, including when setup fails after the import.
    """
    vm: VM = host.import_vm(vm_ref, sr_uuid=linstor_sr.uuid)
    pool: Pool = host.pool
    master: Host = pool.master

    def get_vdi_volume_name_from_linstor(vdi_uuid: str) -> str:
        """Return the value stored under the `volume-name` key of `vdi_uuid`."""
        result = master.ssh([
            "linstor-kv-tool",
            "--dump-volumes",
            "-g",
            f"xcp-sr-{GROUP_NAME}_thin_device"
        ])
        volumes = json.loads(result)
        for k, v in volumes.items():
            path = safe_split(k, "/")
            if len(path) < 4:
                continue
            # The third and fourth key components are read as the VDI UUID
            # and the kind of data stored under that key.
            uuid = path[2]
            data_type = path[3]
            if uuid == vdi_uuid and data_type == "volume-name":
                return v
        raise FileNotFoundError(f"Could not find matching linstor volume for `{vdi_uuid}`")

    def get_vdi_host(path: str, vdi_uuid: str) -> Host:
        """Return the first pool host on which `path` exists."""
        for h in pool.hosts:
            result = h.ssh(["test", "-e", path], simple_output=False, check=False)
            if result.returncode == 0:
                return h
        raise FileNotFoundError(f"Could not find matching host for `{vdi_uuid}`")

    try:
        # A bare `next()` here would leak StopIteration out of this generator
        # fixture, which Python turns into an opaque RuntimeError (PEP 479).
        vdi_uuid = next(
            (vdi.uuid for vdi in vm.vdis if vdi.sr.uuid == linstor_sr.uuid),
            None,
        )
        if vdi_uuid is None:
            raise LookupError(
                f"VM `{vm.uuid}` has no VDI on LINSTOR SR `{linstor_sr.uuid}`"
            )

        volume_name = get_vdi_volume_name_from_linstor(vdi_uuid)
        lv_path = f"/dev/{GROUP_NAME}/{volume_name}_00000"
        vdi_host = get_vdi_host(lv_path, vdi_uuid)
        # Log the host the corruption is actually performed on.
        logging.info("[%s]: corrupting `%s`", vdi_host, lv_path)
        vdi_host.ssh([
            "dd",
            "if=/dev/urandom",
            f"of={lv_path}",
            "bs=4096",
            # Lower values seem to go undetected sometimes
            "count=10000"  # ~40MB
        ])
        yield vm, vdi_host, volume_name
    finally:
        logging.info("<< Destroy corrupted VDI")
        vm.destroy(verify=True)

tests/storage/linstor/test_linstor_sr.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
import pytest
22

3+
import json
34
import logging
45
import time
56

67
from lib.commands import SSHCommandFailed
78
from lib.common import vm_image, wait_for
9+
from lib.host import Host
10+
from lib.vm import VM
811
from tests.storage import vdi_is_open
912

1013
from .conftest import LINSTOR_PACKAGE
1114

15+
from typing import Tuple
16+
1217
# Requirements:
1318
# - two or more XCP-ng hosts >= 8.2 with additional unused disk(s) for the SR
1419
# - access to XCP-ng RPM repository from the host
@@ -52,6 +57,29 @@ def test_create_and_destroy_sr(self, pool_with_linstor, provisioning_type, stora
5257
vm.destroy(verify=True)
5358
sr.destroy(verify=True)
5459

60+
61+
def get_drbd_status(host: Host, resource: str):
    """Run `drbdsetup status <resource> --json` on `host` and return the decoded JSON."""
    logging.debug("[%s] Fetching DRBD status for resource `%s`...", host, resource)
    raw_status = host.ssh(["drbdsetup", "status", resource, "--json"])
    return json.loads(raw_status)
64+
65+
def get_corrupted_resources(host: Host, resource: str):
    """List the peer devices that DRBD reports as out of sync for `resource`.

    The DRBD status is fetched from `host`. Each returned item is a
    `(resource name, connection name, out-of-sync amount)` tuple; only peer
    devices whose `out-of-sync` value is strictly positive are included.
    Missing names default to "" and a missing counter to 0.
    """
    out_of_sync_peers = []
    for res in get_drbd_status(host, resource):
        res_name = res.get("name", "")
        for conn in res.get("connections", []):
            conn_name = conn.get("name", "")
            for peer in conn.get("peer_devices", []):
                amount = peer.get("out-of-sync", 0)
                if amount > 0:
                    out_of_sync_peers.append((res_name, conn_name, amount))
    return out_of_sync_peers
77+
78+
def wait_drbd_sync(host: Host, resource: str):
    """Run `drbdadm wait-sync <resource>` on `host`, logging beforehand."""
    logging.info("[%s] Waiting for DRBD sync on resource `%s`...", host, resource)
    wait_cmd = ["drbdadm", "wait-sync", resource]
    host.ssh(wait_cmd)
81+
82+
5583
@pytest.mark.usefixtures("linstor_sr")
5684
class TestLinstorSR:
5785
@pytest.mark.quicktest
@@ -88,6 +116,63 @@ def test_snapshot(self, vm_on_linstor_sr):
88116
finally:
89117
vm.shutdown(verify=True)
90118

119+
@pytest.mark.small_vm
def test_resynchronization(
    self, host_and_corrupted_vdi_on_linstor_sr: Tuple[VM, Host, str]
):
    """Check that DRBD detects and repairs a corrupted replica, then that the VM works.

    Steps:
      1. elect a peer of the corrupted host whose disk state is `UpToDate`;
      2. from that peer, run `drbdadm verify` against the corrupted host until
         out-of-sync data is reported for it (up to 3 attempts);
      3. run `drbdadm invalidate-remote` from the peer and wait for the sync;
      4. check that nothing is out of sync anymore;
      5. start the VM on the previously corrupted host, wait for the OS to
         boot, take a snapshot, and shut the VM down.
    """
    (vm, host, resource_name) = host_and_corrupted_vdi_on_linstor_sr
    hostname = host.hostname()
    # Target argument handed to both `drbdadm verify` and `drbdadm invalidate-remote`.
    peer_volume = f"{resource_name}:{hostname}/0"

    # Resolve hostnames once instead of once per candidate peer device.
    # `setdefault` keeps the first host of the pool for a given name.
    hosts_by_name = {}
    for h in host.pool.hosts:
        hosts_by_name.setdefault(h.hostname(), h)

    # A nested `next()` inside a generator expression would leak StopIteration,
    # which Python converts to RuntimeError (PEP 479), so `except StopIteration`
    # would never reach `pytest.fail`. Use a default value instead.
    other_host = next(
        (
            hosts_by_name[conn.get("name", "")]
            for res in get_drbd_status(host, resource_name)
            for conn in res.get("connections", [])
            if conn.get("name", "") in hosts_by_name
            for peer in conn.get("peer_devices", [])
            if peer.get("peer-disk-state", "") == "UpToDate"
        ),
        None,
    )
    if other_host is None:
        pytest.fail("Could not find an UpToDate peer host")
    logging.info("Elected `%s` as peer for verification and repair", other_host)

    corrupted = None
    max_attempts = 3
    # Attempting several times since testing revealed `drbdadm verify` can be flaky
    for attempt in range(1, max_attempts + 1):
        logging.info("`drbdadm verify` attempt %d/%d", attempt, max_attempts)
        logging.info("[%s] Running DRBD verify for %s...", other_host, resource_name)
        other_host.ssh(["drbdadm", "verify", peer_volume])
        wait_drbd_sync(other_host, resource_name)

        corrupted_resources = get_corrupted_resources(other_host, resource_name)
        if not corrupted_resources:
            logging.warning("No corrupted resources found on attempt #%d", attempt)
            continue
        # Only an out-of-sync report that targets the corrupted host counts.
        corrupted = next(
            (
                entry for entry in corrupted_resources
                if entry[0] == resource_name and entry[1] == hostname
            ),
            None,
        )
        if corrupted:
            logging.info(
                "Out-of-sync data detected for `%s` on `%s`: %s",
                corrupted[0], corrupted[1], corrupted[2]
            )
            break
        logging.warning(
            "Out-of-sync data reported on attempt #%d, but not for `%s` on `%s`",
            attempt, resource_name, hostname
        )
    if not corrupted:
        pytest.fail(f"Failed to identify corrupted resource after {max_attempts} attempts")

    logging.info("Invalidating remote resource `%s`...", resource_name)
    other_host.ssh([
        "drbdadm", "invalidate-remote",
        peer_volume,
        "--reset-bitmap=no"
    ])
    wait_drbd_sync(other_host, resource_name)
    if get_corrupted_resources(other_host, resource_name):
        pytest.fail("Corrupted resource did not get fixed")

    # Boot from the repaired replica to make sure the data is usable again.
    vm.start(on=host.uuid)
    try:
        vm.wait_for_os_booted()
        vm.test_snapshot_on_running_vm()
    finally:
        vm.shutdown(verify=True)
175+
91176
# *** tests with reboots (longer tests).
92177

93178
@pytest.mark.reboot

0 commit comments

Comments
 (0)