Skip to content

Commit afed04d

Browse files
committed
fix: detect partial provisioning and escalate to CRITICAL severity (SCT-501)
When pre-provisioning partially fails (e.g. GCE zone exhaustion after creating some instances), the test setUp would silently continue with fewer nodes than requested. This happened because add_nodes found the partially-created instances via _get_instances but never validated their count against the requested count.

Root cause: the GCE instance provider uses parallel API inserts, so some instances can be created before a ZoneResourcesExhaustedError terminates the batch. These orphaned instances persist in GCE with matching TestId tags. When setUp later calls add_nodes, it discovers them via _get_instances and proceeds without checking the count.

Additionally, provisioning failures raised during setUp were published as TestFrameworkEvent with default ERROR severity, which does not trigger the EventsAnalyzer interrupt mechanism (only CRITICAL does).

Changes:
- cluster_gce.py: validate len(instances) >= count when pre-provisioned instances are found; raise ProvisionError if fewer than expected
- cluster_aws.py: add the same count validation on the non-REUSE pre-provisioned path (same bug pattern)
- tester.py: escalate ProvisionError and ProvisionUnrecoverableError to CRITICAL severity in teardown_on_exception so the EventsAnalyzer can interrupt the test run
- Add unit tests covering all three fixes
1 parent 20a3a25 commit afed04d

4 files changed

Lines changed: 284 additions & 2 deletions

File tree

sdcm/cluster_aws.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
)
4444
from sdcm.provision.scylla_yaml import SeedProvider
4545
from sdcm.provision.helpers.cloud_init import wait_cloud_init_completes
46+
from sdcm.provision.provisioner import ProvisionError
4647
from sdcm.sct_provision.aws.cluster import PlacementGroup
4748

4849
from sdcm.remote import LocalCmdRunner, shell_script_cmd, NETWORK_EXCEPTIONS
@@ -512,6 +513,11 @@ def _create_or_find_instances(
512513
self.log.info("Found instances to be reused from test [%s] = %s", self.test_config.REUSE_CLUSTER, instances)
513514
return instances
514515
if instances := self._get_instances(dc_idx, az_idx):
516+
if len(instances) < count:
517+
raise ProvisionError(
518+
f"Found only {len(instances)} pre-provisioned instance(s) but {count} were requested "
519+
f"for dc_idx={dc_idx}, az_idx={az_idx}. The pre-provisioning step may have partially failed."
520+
)
515521
self.log.info("Found provisioned instances = %s", instances)
516522
return instances
517523
self.log.info("Found no provisioned instances. Provision them.")

sdcm/cluster_gce.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -661,6 +661,11 @@ def add_nodes(
661661
if not instances:
662662
raise RuntimeError("No nodes found for testId %s " % (self.test_config.test_id(),))
663663
elif instances := self._get_instances(instance_dc):
664+
if len(instances) < count:
665+
raise ProvisionError(
666+
f"Found only {len(instances)} pre-provisioned instance(s) but {count} were requested "
667+
f"for dc_idx={instance_dc}. The pre-provisioning step may have partially failed."
668+
)
664669
self.log.info("Found provisioned instances = %s", instances)
665670
else:
666671
self.log.info("Found no provisioned instances. Provision them.")

sdcm/tester.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@
9292
from sdcm.provision.gce.provisioner import GceProvisioner
9393
from sdcm.provision.network_configuration import ssh_connection_ip_type
9494
from sdcm.provision.oci.provisioner import OciProvisioner
95-
from sdcm.provision.provisioner import provisioner_factory
95+
from sdcm.provision.provisioner import ProvisionError, ProvisionUnrecoverableError, provisioner_factory
9696
from sdcm.provision.helpers.certificate import (
9797
create_ca,
9898
update_certificate,
@@ -258,8 +258,19 @@ def wrapper(*args, **kwargs):
258258
try:
259259
return method(*args, **kwargs)
260260
except Exception as exc:
261+
# Provisioning errors are always forced to CRITICAL to trigger
262+
# EventsAnalyzer interrupts (exc.severity is NOT preserved for them).
263+
# Non-provisioning exceptions forward exc.severity when present;
264+
# otherwise leave as None so TestFrameworkEvent defaults to ERROR.
265+
if isinstance(exc, (ProvisionError, ProvisionUnrecoverableError)):
266+
severity = Severity.CRITICAL
267+
else:
268+
severity = getattr(exc, "severity", None)
261269
TestFrameworkEvent(
262-
source=args[0].__class__.__name__, source_method="SetUp", exception=exc
270+
source=args[0].__class__.__name__,
271+
source_method="SetUp",
272+
exception=exc,
273+
severity=severity,
263274
).publish_or_dump()
264275
TEST_LOG.exception("Exception in %s. Will call tearDown", method.__name__)
265276
# Try to initialize Argus if it hasn't been done yet, so the failure
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
# This program is free software; you can redistribute it and/or modify
2+
# it under the terms of the GNU Affero General Public License as published by
3+
# the Free Software Foundation; either version 3 of the License, or
4+
# (at your option) any later version.
5+
#
6+
# This program is distributed in the hope that it will be useful,
7+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
8+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
9+
#
10+
# See LICENSE for more details.
11+
#
12+
# Copyright (c) 2026 ScyllaDB
13+
14+
"""Tests that partial provisioning is detected and raises errors.
15+
16+
Covers SCT-501: when pre-provisioning partially fails (e.g. zone exhaustion
17+
after creating some instances), the test setUp should not silently continue
18+
with fewer nodes than requested.
19+
"""
20+
21+
from unittest.mock import MagicMock, patch, PropertyMock
22+
23+
import pytest
24+
25+
from sdcm.provision.provisioner import ProvisionError, ProvisionUnrecoverableError, ZoneResourcesExhaustedError
26+
from sdcm.sct_events import Severity
27+
from sdcm.sct_events.system import TestFrameworkEvent
28+
from sdcm.tester import teardown_on_exception
29+
30+
31+
# ---------------------------------------------------------------------------
32+
# GCE: add_nodes count validation
33+
# ---------------------------------------------------------------------------
34+
35+
36+
def _make_gce_cluster(monkeypatch, instance_count, requested_count, is_reuse=False):
    """Run the real ``GCECluster.add_nodes`` against a mocked cluster object.

    Args:
        monkeypatch: pytest monkeypatch fixture. Accepted so callers can pass
            it through, but this helper never references it.
        instance_count: number of fake instances ``_get_instances`` reports.
        requested_count: value forwarded to ``add_nodes`` as ``count``.
        is_reuse: value assigned to ``test_config.REUSE_CLUSTER``.

    Returns:
        Whatever ``add_nodes`` returns for the mocked cluster.
    """
    from sdcm.cluster_gce import GCECluster

    def lookup_param(key, *_args, **_kwargs):
        # Only "simulated_regions" is defined; every other key yields None.
        return {
            "simulated_regions": False,
        }.get(key)

    def build_node(inst, idx, dc, rack, after_config=None):
        return MagicMock(name=f"node-{idx}")

    config_stub = MagicMock()
    config_stub.REUSE_CLUSTER = is_reuse
    config_stub.test_id.return_value = "test-id-123"

    cluster = MagicMock(spec=GCECluster)
    cluster.log = MagicMock()
    cluster.params = MagicMock()
    cluster.params.get.side_effect = lookup_param
    cluster._node_index = 0
    cluster.nodes = []
    cluster.racks_count = 1
    cluster.test_config = config_stub
    cluster._get_instances = MagicMock(
        return_value=[MagicMock(name=f"instance-{n}") for n in range(instance_count)]
    )
    cluster._create_node = MagicMock(side_effect=build_node)

    # Invoke the real add_nodes (via __wrapped__) with the mock acting as self.
    real_add_nodes = GCECluster.add_nodes.__wrapped__
    return real_add_nodes(cluster, count=requested_count, dc_idx=0, rack=0)
71+
72+
73+
def test_gce_add_nodes_partial_provision_raises_error(monkeypatch):
    """Finding 2 pre-provisioned instances when 9 are requested must raise ProvisionError."""
    expected_message = r"Found only 2.*but 9 were requested"
    with pytest.raises(ProvisionError, match=expected_message):
        _make_gce_cluster(monkeypatch, instance_count=2, requested_count=9)
77+
78+
79+
def test_gce_add_nodes_exact_count_succeeds(monkeypatch):
    """Finding exactly the requested number of instances (3 of 3) must not raise."""
    nodes = _make_gce_cluster(monkeypatch, instance_count=3, requested_count=3)
    assert len(nodes) == 3
83+
84+
85+
def test_gce_add_nodes_more_instances_than_requested_succeeds(monkeypatch):
    """Finding more instances than requested (5 for 3) must not raise; all 5 are returned."""
    nodes = _make_gce_cluster(monkeypatch, instance_count=5, requested_count=3)
    assert len(nodes) == 5
89+
90+
91+
def test_gce_add_nodes_no_instances_provisions_inline(monkeypatch):
    """With no pre-provisioned instances, add_nodes must provision via _create_instances."""
    from sdcm.cluster_gce import GCECluster

    def lookup_param(key, *_args, **_kwargs):
        # Only "simulated_regions" is defined; every other key yields None.
        return {
            "simulated_regions": False,
        }.get(key)

    def fetch_instance(name, dc_idx):
        return MagicMock(name=name)

    def build_node(inst, idx, dc, rack, after_config=None):
        return MagicMock(name=f"node-{idx}")

    config_stub = MagicMock()
    config_stub.REUSE_CLUSTER = False

    cluster = MagicMock(spec=GCECluster)
    cluster.log = MagicMock()
    cluster.params = MagicMock()
    cluster.params.get.side_effect = lookup_param
    cluster._node_index = 0
    cluster.nodes = []
    cluster.racks_count = 1
    cluster.test_config = config_stub

    # An empty result from _get_instances is what routes add_nodes to _create_instances.
    cluster._get_instances = MagicMock(return_value=[])
    cluster._create_instances = MagicMock(
        return_value=[MagicMock(name=f"vm-{n}") for n in range(3)]
    )
    cluster._get_instance_with_retry = MagicMock(side_effect=fetch_instance)
    cluster._create_node = MagicMock(side_effect=build_node)

    nodes = GCECluster.add_nodes.__wrapped__(cluster, count=3, dc_idx=0, rack=0)

    cluster._create_instances.assert_called_once_with(3, 0, instance_type=None)
    assert len(nodes) == 3
120+
121+
122+
# ---------------------------------------------------------------------------
123+
# AWS: _create_or_find_instances count validation
124+
# ---------------------------------------------------------------------------
125+
126+
127+
def test_aws_create_or_find_partial_provision_raises_error():
    """Finding 2 pre-provisioned AWS instances when 5 are requested must raise ProvisionError."""
    from sdcm.cluster_aws import AWSCluster

    def lookup_param(key, *_args, **_kwargs):
        # Only "simulated_racks" is defined; every other key yields None.
        return {
            "simulated_racks": False,
        }.get(key)

    config_stub = MagicMock()
    config_stub.REUSE_CLUSTER = False

    cluster = MagicMock(spec=AWSCluster)
    cluster.log = MagicMock()
    cluster.params = MagicMock()
    cluster.params.get.side_effect = lookup_param
    cluster.nodes = []
    cluster.test_config = config_stub
    cluster._get_instances = MagicMock(return_value=[MagicMock() for _ in range(2)])

    with pytest.raises(ProvisionError, match=r"Found only 2.*but 5 were requested"):
        AWSCluster._create_or_find_instances(cluster, count=5, ec2_user_data="", dc_idx=0, az_idx=0)
148+
149+
150+
def test_aws_create_or_find_exact_count_succeeds():
    """Finding exactly the requested number of AWS instances (5 of 5) must return them."""
    from sdcm.cluster_aws import AWSCluster

    def lookup_param(key, *_args, **_kwargs):
        # Only "simulated_racks" is defined; every other key yields None.
        return {
            "simulated_racks": False,
        }.get(key)

    config_stub = MagicMock()
    config_stub.REUSE_CLUSTER = False

    cluster = MagicMock(spec=AWSCluster)
    cluster.log = MagicMock()
    cluster.params = MagicMock()
    cluster.params.get.side_effect = lookup_param
    cluster.nodes = []
    cluster.test_config = config_stub
    cluster._get_instances = MagicMock(return_value=[MagicMock() for _ in range(5)])

    found = AWSCluster._create_or_find_instances(cluster, count=5, ec2_user_data="", dc_idx=0, az_idx=0)
    assert len(found) == 5
171+
172+
173+
# ---------------------------------------------------------------------------
174+
# teardown_on_exception: severity escalation for provisioning errors
175+
# ---------------------------------------------------------------------------
176+
177+
178+
def _make_tester_mock():
    """Build a stand-in for ClusterTester to pass as ``self`` to decorated setUp fakes."""
    stub = MagicMock()
    # Named so that args[0].__class__.__name__ reads "ClusterTester".
    stub.__class__.__name__ = "ClusterTester"
    stub.params = MagicMock()
    stub.argus_heartbeat_stop_signal = MagicMock()
    stub.tearDown = MagicMock()
    return stub
186+
187+
188+
def test_teardown_on_exception_provision_error_publishes_critical():
    """ProvisionError in setUp should publish TestFrameworkEvent with CRITICAL severity."""
    tester = _make_tester_mock()

    @teardown_on_exception
    def fake_setup(self):
        raise ProvisionError("Failed to create instances")

    # Replacing the class name looked up inside sdcm.tester is sufficient: the
    # decorator then builds a mock event, so no separate patch of the real
    # TestFrameworkEvent.publish_or_dump is needed.
    with patch("sdcm.tester.TestFrameworkEvent") as mock_event_cls:
        mock_event_cls.return_value.publish_or_dump = MagicMock()
        with pytest.raises(ProvisionError):
            fake_setup(tester)

    mock_event_cls.assert_called_once()
    # call_args.kwargs and call_args[1] are the same mapping; one check is enough.
    assert mock_event_cls.call_args.kwargs.get("severity") == Severity.CRITICAL
206+
207+
208+
def test_teardown_on_exception_zone_exhausted_publishes_critical():
    """ZoneResourcesExhaustedError in setUp should publish TestFrameworkEvent with CRITICAL severity."""
    tester_stub = _make_tester_mock()

    @teardown_on_exception
    def fake_setup(self):
        raise ZoneResourcesExhaustedError("Zone us-east1-d exhausted")

    with patch("sdcm.tester.TestFrameworkEvent") as event_cls:
        event_cls.return_value.publish_or_dump = MagicMock()
        with pytest.raises(ZoneResourcesExhaustedError):
            fake_setup(tester_stub)

    event_cls.assert_called_once()
    passed_kwargs = event_cls.call_args.kwargs
    assert passed_kwargs.get("severity") == Severity.CRITICAL
223+
224+
225+
def test_teardown_on_exception_generic_error_publishes_default_severity():
    """Non-provisioning errors in setUp should publish TestFrameworkEvent with default (None) severity."""
    tester_stub = _make_tester_mock()

    @teardown_on_exception
    def fake_setup(self):
        raise RuntimeError("Something else went wrong")

    with patch("sdcm.tester.TestFrameworkEvent") as event_cls:
        event_cls.return_value.publish_or_dump = MagicMock()
        with pytest.raises(RuntimeError):
            fake_setup(tester_stub)

    event_cls.assert_called_once()
    passed_kwargs = event_cls.call_args.kwargs
    # .get() also yields None when the keyword is absent, matching the original check.
    assert passed_kwargs.get("severity") is None
240+
241+
242+
def test_teardown_on_exception_preserves_severity_from_exception():
    """A ``severity`` attribute on a non-provisioning exception is forwarded to the event."""
    tester_stub = _make_tester_mock()

    class ErrorWithSeverity(Exception):
        severity = Severity.WARNING

    @teardown_on_exception
    def fake_setup(self):
        raise ErrorWithSeverity("Warning-level error")

    with patch("sdcm.tester.TestFrameworkEvent") as event_cls:
        event_cls.return_value.publish_or_dump = MagicMock()
        with pytest.raises(ErrorWithSeverity):
            fake_setup(tester_stub)

    event_cls.assert_called_once()
    forwarded = event_cls.call_args.kwargs.get("severity")
    assert forwarded == Severity.WARNING

0 commit comments

Comments
 (0)