From 9de83eb2e1575e8f0c0b645535a41ee7666845a4 Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Thu, 27 Feb 2025 15:30:26 +0000 Subject: [PATCH 1/5] CA-407343: do not remove the VHD parent property after leaf coalesce Long standing bug, introduced in ``` changeset: 1508:3c5a651ef2e4 user: Andrei Lifchits date: Wed Feb 10 09:41:59 2010 -0800 summary: CA-36637: fix the free space requirements calculation for the inline- and leaf-coalesce; plus: ``` The act of coalescing a leaf VHD into a parent does not make any structural changes to the parentage of the VHD into which the data has been coalesced. This, unnecessarily, introduced a race where a VDI would report not having a parent until the next SR scan fixed it up. Signed-off-by: Mark Syms --- libs/sm/cleanup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/sm/cleanup.py b/libs/sm/cleanup.py index 850086acd..494f7b823 100644 --- a/libs/sm/cleanup.py +++ b/libs/sm/cleanup.py @@ -2316,7 +2316,6 @@ def _doCoalesceLeaf(self, vdi): # garbage # update the VDI record - vdi.parent.delConfig(VDI.DB_VHD_PARENT) if vdi.parent.raw: vdi.parent.setConfig(VDI.DB_VDI_TYPE, vhdutil.VDI_TYPE_RAW) vdi.parent.delConfig(VDI.DB_VHD_BLOCKS) From 21d990b74cd34887341be44510e45dfc6ab2db91 Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Wed, 19 Mar 2025 09:21:49 +0000 Subject: [PATCH 2/5] CA-408452: remove vhd parent if it does not have one A previous change stopped the cleanup code unilaterally removing the `vhd-parent` configuration of the lef coalesced VHD which resulted in failures where VHDs that should have declared a parent no longer did until a subsequent SR scan restored the configuration value. This is the alternate mirror case where the leaf coalesce collapses the entire tree out and so the `vhd-parent` becomes redundant, again until a subsequent SR scan removes it. To satisfy both use cases remove the `vhd-parent` property if and only if the coalesced VHD has no parent. Signed-off-by: Mark Syms --- libs/sm/cleanup.py | 3 +++ tests/test_cleanup.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/libs/sm/cleanup.py b/libs/sm/cleanup.py index 494f7b823..401e110c4 100644 --- a/libs/sm/cleanup.py +++ b/libs/sm/cleanup.py @@ -2330,6 +2330,9 @@ def _doCoalesceLeaf(self, vdi): vdi.parent.children = [] vdi.parent = None + if parent.parent is None: + parent.delConfig(VDI.DB_VHD_PARENT) + extraSpace = self._calcExtraSpaceNeeded(vdi, parent) freeSpace = self.getFreeSpace() if freeSpace < extraSpace: diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py index e66f7ed9d..f47ef2d09 100644 --- a/tests/test_cleanup.py +++ b/tests/test_cleanup.py @@ -7,6 +7,7 @@ from uuid import uuid4 from sm import cleanup +from sm import fjournaler from sm.core import lock from sm.core import util @@ -16,6 +17,8 @@ import XenAPI +MEGA = 1024 * 1024 + class FakeFile(object): pass @@ -1815,6 +1818,46 @@ def test_not_plugged_retry(self): # Assert self.assertIsNotNone(sr) + def test_doCoalesceLeaf_no_parent_after(self): + sr = create_cleanup_sr(self.xapi_mock) + sr.journaler = mock.create_autospec(fjournaler.Journaler) + parent_uuid = str(uuid.uuid4()) + mock_parent = mock.MagicMock() + mock_parent.uuid = parent_uuid + mock_parent.parent = None + mock_parent.raw = False + mock_parent.getSizeVHD.return_value = 10 * MEGA + + vdi_uuid = str(uuid.uuid4()) + mock_vdi = mock.MagicMock() + mock_vdi.uuid = vdi_uuid + mock_vdi.getSizeVHD.return_value = 10 * MEGA + mock_vdi.parent = mock_parent + + sr._doCoalesceLeaf(mock_vdi) + + mock_parent.delConfig.assert_called_with("vhd-parent") + + def test_doCoalesceLeaf_parent_remains_after(self): + sr = create_cleanup_sr(self.xapi_mock) + sr.journaler = mock.create_autospec(fjournaler.Journaler) + parent_uuid = str(uuid.uuid4()) + mock_parent = mock.MagicMock() + mock_parent.uuid = parent_uuid + mock_parent.parent = mock.MagicMock() + mock_parent.raw = False + mock_parent.getSizeVHD.return_value = 10 * MEGA + + vdi_uuid = str(uuid.uuid4()) + mock_vdi = mock.MagicMock() + mock_vdi.uuid = vdi_uuid + mock_vdi.getSizeVHD.return_value = 10 * MEGA + mock_vdi.parent = mock_parent + + sr._doCoalesceLeaf(mock_vdi) + + self.assertNotIn('vhd-parent', mock_parent.delConfig.call_args) + class TestLockGCActive(unittest.TestCase): From d7f5b4015ae5b678ee04fc40bf1b92884b5d1d9c Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Fri, 28 Feb 2025 15:39:10 +0000 Subject: [PATCH 3/5] Revert "CA-397084 Log any user of LV at deactivate" This reverts commit 2979937bbb747280e76aa69cef43f78ddd68353c. Signed-off-by: Mark Syms --- libs/sm/lvutil.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/libs/sm/lvutil.py b/libs/sm/lvutil.py index d1e7eb976..e2ed04e1f 100644 --- a/libs/sm/lvutil.py +++ b/libs/sm/lvutil.py @@ -728,13 +728,6 @@ def deactivateNoRefcount(path): @lvmretry def _deactivate(path): - # Records what is using the LVM path in case there is an issue. - # In most cases this should be nothing. - try: - (rc, stdout, stderr) = util.doexec(['/usr/sbin/fuser', "-v", path]) - util.SMlog(f"fuser {path} => {rc} / '{stdout}' / '{stderr}'") - except: - pass text = cmd_lvm([CMD_LVCHANGE, "-an", path]) From 86a0712eb819b996e41d371c87a3453f69afe0d5 Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Mon, 10 Mar 2025 10:46:53 +0000 Subject: [PATCH 4/5] CA-408105: add logging to _finishInterruptedCoalesceLeaf No error logging is present in two error paths which make diagnosis difficult/impossible. Signed-off-by: Mark Syms --- libs/sm/cleanup.py | 2 ++ tests/test_cleanup.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/libs/sm/cleanup.py b/libs/sm/cleanup.py index 401e110c4..c248598f3 100644 --- a/libs/sm/cleanup.py +++ b/libs/sm/cleanup.py @@ -2635,10 +2635,12 @@ def _finishInterruptedCoalesceLeaf(self, childUuid, parentUuid): Util.log("*** FINISH LEAF-COALESCE") vdi = self.getVDI(childUuid) if not vdi: + Util.log(f"_finishInterruptedCoalesceLeaf, vdi {childUuid} not found, aborting") raise util.SMException("VDI %s not found" % childUuid) try: self.forgetVDI(parentUuid) except XenAPI.Failure: + Util.logException('_finishInterruptedCoalesceLeaf') pass self._updateSlavesOnResize(vdi) util.fistpoint.activate("LVHDRT_coaleaf_finish_end", self.uuid) diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py index f47ef2d09..4fde15e6b 100644 --- a/tests/test_cleanup.py +++ b/tests/test_cleanup.py @@ -9,6 +9,7 @@ from sm import cleanup from sm import fjournaler from sm.core import lock +from sm.core.util import SMException from sm.core import util from sm import vhdutil @@ -16,6 +17,7 @@ from sm import ipc import XenAPI +from XenAPI import Failure MEGA = 1024 * 1024 @@ -1962,3 +1964,46 @@ def test_cannot_acquire_if_other_process_holds_sr_lock(self): # When with self.assertRaises(BlockingIOError): gcLock.acquireNoblock() + + +class TestFileSR(TestSR): + """ + Exercise the specfic bits of the FileSR support + """ + + def setUp(self): + super().setUp() + + self.sr_uuid = str(uuid.uuid4()) + + def _make_test_sr(self): + self.mock_sr = cleanup.FileSR(self.sr_uuid, self.xapi_mock, + createLock=False, force=False) + + def test_finishInterruptedCoalesceLeaf_no_vdi(self): + self._make_test_sr() + self.mock_sr.vdis = {} + + child_vdi_uuid = str(uuid.uuid4()) + parent_vdi_uuid = str(uuid.uuid4()) + + with self.assertRaises(SMException) as sme: + self.mock_sr._finishInterruptedCoalesceLeaf(child_vdi_uuid, parent_vdi_uuid) + + self.assertRegex(str(sme.exception), r"VDI \S+ not found") + + def test_finishInterruptedCoalesceLeaf_forget_fail_and_complete(self): + child_vdi_uuid = str(uuid.uuid4()) + parent_vdi_uuid = str(uuid.uuid4()) + + self._make_test_sr() + self.mock_sr.vdis = { + child_vdi_uuid: mock.create_autospec(cleanup.FileVDI) + } + + def forgetVdi(self, uuid): + raise Failure(f"ForgetVDI {uuid} failed") + + self.xapi_mock.forgetVDI.side_effect = forgetVdi + + self.mock_sr._finishInterruptedCoalesceLeaf(child_vdi_uuid, parent_vdi_uuid) From 5e84b90db019198e5c527805695bcc4f140bf921 Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Thu, 15 May 2025 11:38:57 +0100 Subject: [PATCH 5/5] CA-411163: verify SCSI ids for SR PVs If the iSCSI target exposes multiple LUNs which contain Volume Groups with the same ID as is required by an SR it is somewhat non-deterministic as to which actual PV will be used by the LVM subsystem when attaching the SR. To prevent unexpected failures, detect this condition and refuse to attach the SR. Signed-off-by: Mark Syms --- libs/sm/core/XE_SR_ERRORCODES.xml | 6 ++++++ libs/sm/drivers/LVHDoISCSISR.py | 10 ++++++++++ libs/sm/lvutil.py | 14 ++++++++++++++ tests/test_LVHDoISCSISR.py | 3 +++ tests/test_lvutil.py | 23 +++++++++++++++++++++++ 5 files changed, 56 insertions(+) diff --git a/libs/sm/core/XE_SR_ERRORCODES.xml b/libs/sm/core/XE_SR_ERRORCODES.xml index 47fefd830..792fe17fc 100644 --- a/libs/sm/core/XE_SR_ERRORCODES.xml +++ b/libs/sm/core/XE_SR_ERRORCODES.xml @@ -513,6 +513,12 @@ 117 + + PVMultiIDs + PVs found with multiple SCSI IDs + 119 + + APISession diff --git a/libs/sm/drivers/LVHDoISCSISR.py b/libs/sm/drivers/LVHDoISCSISR.py index eb13f8cbe..0823cf28b 100644 --- a/libs/sm/drivers/LVHDoISCSISR.py +++ b/libs/sm/drivers/LVHDoISCSISR.py @@ -471,11 +471,21 @@ def attach(self, sr_uuid): scsiutil.rescan([self.iscsi.adapter[a]]) self._pathrefresh(LVHDoISCSISR) + + # Check that we only have PVs for the volume group with the expected SCSI ID + lvutil.checkPVScsiIds(self.vgname, self.SCSIid) + LVHDSR.LVHDSR.attach(self, sr_uuid) except Exception as inst: for i in self.iscsiSRs: i.detach(sr_uuid) + + # If we already have a proper error just raise it + if isinstance(inst, xs_errors.SROSError): + raise + raise xs_errors.XenError("SRUnavailable", opterr=inst) + self._setMultipathableFlag(SCSIid=self.SCSIid) def detach(self, sr_uuid): diff --git a/libs/sm/lvutil.py b/libs/sm/lvutil.py index e2ed04e1f..2566f9397 100644 --- a/libs/sm/lvutil.py +++ b/libs/sm/lvutil.py @@ -23,6 +23,7 @@ import time from fairlock import Fairlock +from sm.core import scsiutil from sm.core import util from sm.core import xs_errors import xml.dom.minidom @@ -613,6 +614,19 @@ def setActiveVG(path, active): text = cmd_lvm([CMD_VGCHANGE, "-a" + val, path]) +def checkPVScsiIds(vgname, SCSIid): + # Get all the PVs for the specified vgName even if not active + cmd = [CMD_PVS, '-a', '--select', f'vgname={vgname}', '--no-headings'] + text = cmd_lvm(cmd) + pv_paths = [x.split()[0] for x in text.splitlines()] + for pv_path in pv_paths: + pv_scsi_id = scsiutil.getSCSIid(pv_path) + if pv_scsi_id != SCSIid: + raise xs_errors.XenError( + 'PVMultiIDs', + opterr=f'Found PVs {",".join(pv_paths)} and unexpected ' + f'SCSI ID {pv_scsi_id}, expected {SCSIid}') + @lvmretry def create(name, size, vgname, tag=None, size_in_percentage=None): if size_in_percentage: diff --git a/tests/test_LVHDoISCSISR.py b/tests/test_LVHDoISCSISR.py index 8fa7f8647..4082317cb 100644 --- a/tests/test_LVHDoISCSISR.py +++ b/tests/test_LVHDoISCSISR.py @@ -159,6 +159,9 @@ def deepcopy(to_copy): lvlock_patcher = mock.patch('sm.drivers.LVHDSR.lvutil.Fairlock') self.mock_lvlock = lvlock_patcher.start() + pv_check_patcher = mock.patch('sm.drivers.LVHDoISCSISR.lvutil.checkPVScsiIds', autospec=True) + self.mock_pv_check = pv_check_patcher.start() + self.addCleanup(mock.patch.stopall) super().setUp() diff --git a/tests/test_lvutil.py b/tests/test_lvutil.py index 43b3b7127..176dd3c93 100644 --- a/tests/test_lvutil.py +++ b/tests/test_lvutil.py @@ -6,6 +6,7 @@ import testlib import lvmlib from sm.core import util +from sm.core import xs_errors from sm import lvutil @@ -394,6 +395,27 @@ def test_command_error(self, mock_smlog, mock_cmd_lvm): ]) mock_smlog.assert_called_with("PVs in VG vg1: []") + @mock.patch('sm.core.xs_errors.XML_DEFS', 'libs/sm/core/XE_SR_ERRORCODES.xml') + @mock.patch('sm.lvutil.scsiutil.getSCSIid', autospec=True) + def test_check_PV_SCSI_IDs_failure(self, mock_get_scsi_id, mock_smlog, mock_cmd_lvm): + mock_cmd_lvm.return_value = """ /dev/disk/by-id/scsi-360014057b7f26e2962843fa8a0645fbd VG_XenStorage-401d198b-60ab-1f21-1359-bd4f127b8f38 lvm2 d-- 0 0 + /dev/disk/by-id/scsi-36001405fdd436fa7f854cd685fc3b1fd VG_XenStorage-401d198b-60ab-1f21-1359-bd4f127b8f38 lvm2 a-- <49.99g 49.98g +""" # noqa: E501 + mock_get_scsi_id.side_effect = ['36001405fdd436fa7f854cd685fc3b1fd', + '360014057b7f26e2962843fa8a0645fbd'] + with self.assertRaises(xs_errors.SROSError) as srose: + lvutil.checkPVScsiIds('VG_XenStorage-401d198b-60ab-1f21-1359-bd4f127b8f38', + '36001405fdd436fa7f854cd685fc3b1fd') + self.assertEqual(119, srose.exception.errno) + + @mock.patch('sm.lvutil.scsiutil.getSCSIid', autospec=True) + def test_check_PV_SCSI_IDs_success(self, mock_get_scsi_id, mock_smlog, mock_cmd_lvm): + mock_cmd_lvm.return_value = """ /dev/disk/by-id/scsi-360014057b7f26e2962843fa8a0645fbd VG_XenStorage-401d198b-60ab-1f21-1359-bd4f127b8f38 lvm2 d-- 0 0""" # noqa: E501 + mock_get_scsi_id.side_effect = ['36001405fdd436fa7f854cd685fc3b1fd'] + lvutil.checkPVScsiIds('VG_XenStorage-401d198b-60ab-1f21-1359-bd4f127b8f38', + '36001405fdd436fa7f854cd685fc3b1fd') + + @mock.patch('sm.lvutil.cmd_lvm') @mock.patch('sm.core.util.SMlog', autospec=True) class TestGetPVsWithUUID(unittest.TestCase): @@ -431,3 +453,4 @@ def test_command_error(self, mock_smlog, mock_cmd_lvm): mock.call("Warning: Invalid or empty line in pvs output: Invalid return value."), mock.call("PVs with uuid uuid1: []") ]) +