Skip to content

Commit 4b16bf4

Browse files
committed
fix: detect partial provisioning and escalate to CRITICAL severity (SCT-501)
When pre-provisioning partially fails (e.g. GCE zone exhaustion after creating some instances), the test setUp would silently continue with fewer nodes than requested. This happened because add_nodes found the partially created instances via _get_instances but never validated their count against the requested count.

Root cause: the GCE instance provider uses parallel API inserts, so some instances can be created before a ZoneResourcesExhaustedError terminates the batch. These orphaned instances persist in GCE with matching TestId tags. When setUp later calls add_nodes, it discovers them via _get_instances and proceeds without checking the count.

Additionally, provisioning failures raised during setUp were published as TestFrameworkEvent with the default ERROR severity, which does not trigger the EventsAnalyzer interrupt mechanism (only CRITICAL does).

Changes:
- cluster_gce.py: validate len(instances) >= count when pre-provisioned instances are found; raise ProvisionError if fewer than expected
- cluster_aws.py: add the same count validation on the non-REUSE pre-provisioned path (same bug pattern)
- tester.py: escalate ProvisionError and ProvisionUnrecoverableError to CRITICAL severity in teardown_on_exception so the EventsAnalyzer can interrupt the test run
- Add unit tests covering all three fixes
1 parent 20a3a25 commit 4b16bf4

4 files changed

Lines changed: 279 additions & 2 deletions

File tree

sdcm/cluster_aws.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
)
4444
from sdcm.provision.scylla_yaml import SeedProvider
4545
from sdcm.provision.helpers.cloud_init import wait_cloud_init_completes
46+
from sdcm.provision.provisioner import ProvisionError
4647
from sdcm.sct_provision.aws.cluster import PlacementGroup
4748

4849
from sdcm.remote import LocalCmdRunner, shell_script_cmd, NETWORK_EXCEPTIONS
@@ -512,6 +513,11 @@ def _create_or_find_instances(
512513
self.log.info("Found instances to be reused from test [%s] = %s", self.test_config.REUSE_CLUSTER, instances)
513514
return instances
514515
if instances := self._get_instances(dc_idx, az_idx):
516+
if len(instances) < count:
517+
raise ProvisionError(
518+
f"Found only {len(instances)} pre-provisioned instance(s) but {count} were requested "
519+
f"for dc_idx={dc_idx}, az_idx={az_idx}. The pre-provisioning step may have partially failed."
520+
)
515521
self.log.info("Found provisioned instances = %s", instances)
516522
return instances
517523
self.log.info("Found no provisioned instances. Provision them.")

sdcm/cluster_gce.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -661,6 +661,11 @@ def add_nodes(
661661
if not instances:
662662
raise RuntimeError("No nodes found for testId %s " % (self.test_config.test_id(),))
663663
elif instances := self._get_instances(instance_dc):
664+
if len(instances) < count:
665+
raise ProvisionError(
666+
f"Found only {len(instances)} pre-provisioned instance(s) but {count} were requested "
667+
f"for dc_idx={instance_dc}. The pre-provisioning step may have partially failed."
668+
)
664669
self.log.info("Found provisioned instances = %s", instances)
665670
else:
666671
self.log.info("Found no provisioned instances. Provision them.")

sdcm/tester.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@
9292
from sdcm.provision.gce.provisioner import GceProvisioner
9393
from sdcm.provision.network_configuration import ssh_connection_ip_type
9494
from sdcm.provision.oci.provisioner import OciProvisioner
95-
from sdcm.provision.provisioner import provisioner_factory
95+
from sdcm.provision.provisioner import ProvisionError, ProvisionUnrecoverableError, provisioner_factory
9696
from sdcm.provision.helpers.certificate import (
9797
create_ca,
9898
update_certificate,
@@ -258,8 +258,19 @@ def wrapper(*args, **kwargs):
258258
try:
259259
return method(*args, **kwargs)
260260
except Exception as exc:
261+
# Provisioning errors are always forced to CRITICAL to trigger
262+
# EventsAnalyzer interrupts (exc.severity is NOT preserved for them).
263+
# Non-provisioning exceptions forward exc.severity when present;
264+
# otherwise leave as None so TestFrameworkEvent defaults to ERROR.
265+
if isinstance(exc, (ProvisionError, ProvisionUnrecoverableError)):
266+
severity = Severity.CRITICAL
267+
else:
268+
severity = getattr(exc, "severity", None)
261269
TestFrameworkEvent(
262-
source=args[0].__class__.__name__, source_method="SetUp", exception=exc
270+
source=args[0].__class__.__name__,
271+
source_method="SetUp",
272+
exception=exc,
273+
severity=severity,
263274
).publish_or_dump()
264275
TEST_LOG.exception("Exception in %s. Will call tearDown", method.__name__)
265276
# Try to initialize Argus if it hasn't been done yet, so the failure
Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
# This program is free software; you can redistribute it and/or modify
2+
# it under the terms of the GNU Affero General Public License as published by
3+
# the Free Software Foundation; either version 3 of the License, or
4+
# (at your option) any later version.
5+
#
6+
# This program is distributed in the hope that it will be useful,
7+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
8+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
9+
#
10+
# See LICENSE for more details.
11+
#
12+
# Copyright (c) 2026 ScyllaDB
13+
14+
"""Tests that partial provisioning is detected and raises errors.
15+
16+
Covers SCT-501: when pre-provisioning partially fails (e.g. zone exhaustion
17+
after creating some instances), the test setUp should not silently continue
18+
with fewer nodes than requested.
19+
"""
20+
21+
from unittest.mock import MagicMock, patch
22+
23+
import pytest
24+
25+
from sdcm.cluster_aws import AWSCluster
26+
from sdcm.cluster_gce import GCECluster
27+
from sdcm.provision.provisioner import ProvisionError, ZoneResourcesExhaustedError
28+
from sdcm.sct_events import Severity
29+
from sdcm.tester import teardown_on_exception
30+
31+
32+
# ---------------------------------------------------------------------------
33+
# GCE: add_nodes count validation
34+
# ---------------------------------------------------------------------------
35+
36+
37+
def _make_gce_cluster(instance_count, requested_count, is_reuse=False):
    """Run the real ``GCECluster.add_nodes`` against a mocked cluster.

    Despite its name, this helper both builds the mock *and* invokes
    ``add_nodes``; whatever ``add_nodes`` returns (or raises) is passed
    straight through to the caller.

    Args:
        instance_count: number of fake instances ``_get_instances`` reports.
        requested_count: value forwarded to ``add_nodes`` as ``count``.
        is_reuse: value assigned to ``test_config.REUSE_CLUSTER``.
    """

    def params_get(key, *_args, **_kwargs):
        # Only "simulated_regions" is configured; every other key yields None.
        return {"simulated_regions": False}.get(key)

    def build_node(inst, idx, dc, rack, after_config=None):
        # NOTE: MagicMock's ``name`` argument only labels the mock's repr.
        return MagicMock(name=f"node-{idx}")

    config = MagicMock()
    config.REUSE_CLUSTER = is_reuse
    config.test_id.return_value = "test-id-123"

    gce = MagicMock(spec=GCECluster)
    gce.log = MagicMock()
    gce.params = MagicMock()
    gce.params.get.side_effect = params_get
    gce._node_index = 0
    gce.nodes = []
    gce.racks_count = 1
    gce.test_config = config
    gce._get_instances = MagicMock(
        return_value=[MagicMock(name=f"instance-{n}") for n in range(instance_count)]
    )
    gce._create_node = MagicMock(side_effect=build_node)

    # ``__wrapped__`` reaches the undecorated method so it can be called with
    # the mock standing in for ``self``.
    return GCECluster.add_nodes.__wrapped__(gce, count=requested_count, dc_idx=0, rack=0)
74+
75+
76+
def test_gce_add_nodes_partial_provision_raises_error():
    """A shortfall of pre-provisioned GCE instances must raise ProvisionError."""
    # 2 instances discovered while 9 are requested -> the count check must fire.
    expected_message = r"Found only 2.*but 9 were requested"
    with pytest.raises(ProvisionError, match=expected_message):
        _make_gce_cluster(instance_count=2, requested_count=9)
80+
81+
82+
def test_gce_add_nodes_exact_count_succeeds():
    """Finding exactly as many instances as requested is not an error."""
    nodes = _make_gce_cluster(instance_count=3, requested_count=3)
    assert len(nodes) == 3
86+
87+
88+
def test_gce_add_nodes_more_instances_than_requested_succeeds():
    """A surplus of pre-provisioned instances must not trip the count check."""
    nodes = _make_gce_cluster(instance_count=5, requested_count=3)
    # Only a lower bound is asserted: this test does not pin down whether
    # add_nodes trims the surplus instances or uses all of them.
    assert len(nodes) >= 3
92+
93+
94+
def test_gce_add_nodes_no_instances_provisions_inline():
    """With no pre-provisioned instances, add_nodes must create them itself."""

    def params_get(key, *_args, **_kwargs):
        # Only "simulated_regions" is configured; every other key yields None.
        return {"simulated_regions": False}.get(key)

    def lookup_instance(name, dc_idx):
        return MagicMock(name=name)

    def build_node(inst, idx, dc, rack, after_config=None):
        return MagicMock(name=f"node-{idx}")

    config = MagicMock()
    config.REUSE_CLUSTER = False

    gce = MagicMock(spec=GCECluster)
    gce.log = MagicMock()
    gce.params = MagicMock()
    gce.params.get.side_effect = params_get
    gce._node_index = 0
    gce.nodes = []
    gce.racks_count = 1
    gce.test_config = config

    # An empty result is falsy, so add_nodes is expected to skip the
    # "found provisioned instances" branch and provision inline instead.
    gce._get_instances = MagicMock(return_value=[])
    gce._create_instances = MagicMock(
        return_value=[MagicMock(name=f"vm-{n}") for n in range(3)]
    )
    gce._get_instance_with_retry = MagicMock(side_effect=lookup_instance)
    gce._create_node = MagicMock(side_effect=build_node)

    nodes = GCECluster.add_nodes.__wrapped__(gce, count=3, dc_idx=0, rack=0)

    gce._create_instances.assert_called_once_with(3, 0, instance_type=None)
    assert len(nodes) == 3
123+
124+
125+
# ---------------------------------------------------------------------------
126+
# AWS: _create_or_find_instances count validation
127+
# ---------------------------------------------------------------------------
128+
129+
130+
def test_aws_create_or_find_partial_provision_raises_error():
    """A shortfall of pre-provisioned AWS instances must raise ProvisionError."""

    def params_get(key, *_args, **_kwargs):
        # Only "simulated_racks" is configured; every other key yields None.
        return {"simulated_racks": False}.get(key)

    config = MagicMock()
    config.REUSE_CLUSTER = False  # exercise the non-REUSE lookup path

    aws = MagicMock(spec=AWSCluster)
    aws.log = MagicMock()
    aws.params = MagicMock()
    aws.params.get.side_effect = params_get
    aws.nodes = []
    aws.test_config = config
    # Two instances are discovered while five are requested below.
    aws._get_instances = MagicMock(return_value=[MagicMock() for _ in range(2)])

    with pytest.raises(ProvisionError, match=r"Found only 2.*but 5 were requested"):
        AWSCluster._create_or_find_instances(aws, count=5, ec2_user_data="", dc_idx=0, az_idx=0)
149+
150+
151+
def test_aws_create_or_find_exact_count_succeeds():
    """Finding exactly the requested number of AWS instances returns them."""

    def params_get(key, *_args, **_kwargs):
        # Only "simulated_racks" is configured; every other key yields None.
        return {"simulated_racks": False}.get(key)

    config = MagicMock()
    config.REUSE_CLUSTER = False  # exercise the non-REUSE lookup path

    aws = MagicMock(spec=AWSCluster)
    aws.log = MagicMock()
    aws.params = MagicMock()
    aws.params.get.side_effect = params_get
    aws.nodes = []
    aws.test_config = config
    aws._get_instances = MagicMock(return_value=[MagicMock() for _ in range(5)])

    found = AWSCluster._create_or_find_instances(aws, count=5, ec2_user_data="", dc_idx=0, az_idx=0)
    assert len(found) == 5
170+
171+
172+
# ---------------------------------------------------------------------------
173+
# teardown_on_exception: severity escalation for provisioning errors
174+
# ---------------------------------------------------------------------------
175+
176+
177+
def _make_tester_mock():
178+
"""Create a minimal mock of ClusterTester for teardown_on_exception tests."""
179+
tester = MagicMock()
180+
tester.__class__.__name__ = "ClusterTester"
181+
tester.params = MagicMock()
182+
tester.argus_heartbeat_stop_signal = MagicMock()
183+
tester.tearDown = MagicMock()
184+
return tester
185+
186+
187+
def test_teardown_on_exception_provision_error_publishes_critical():
    """A ProvisionError raised from a decorated setUp is published as CRITICAL."""

    @teardown_on_exception
    def fake_setup(self):
        raise ProvisionError("Failed to create instances")

    tester = _make_tester_mock()

    # Replace the event class so nothing is actually published and the
    # constructor arguments can be inspected.
    with patch("sdcm.tester.TestFrameworkEvent") as event_factory:
        event_factory.return_value.publish_or_dump = MagicMock()
        with pytest.raises(ProvisionError):
            fake_setup(tester)
        event_factory.assert_called_once()
        assert event_factory.call_args.kwargs["severity"] == Severity.CRITICAL
202+
203+
204+
def test_teardown_on_exception_zone_exhausted_publishes_critical():
    """A ZoneResourcesExhaustedError raised from setUp is published as CRITICAL."""

    @teardown_on_exception
    def fake_setup(self):
        raise ZoneResourcesExhaustedError("Zone us-east1-d exhausted")

    tester = _make_tester_mock()

    # Replace the event class so nothing is actually published and the
    # constructor arguments can be inspected.
    with patch("sdcm.tester.TestFrameworkEvent") as event_factory:
        event_factory.return_value.publish_or_dump = MagicMock()
        with pytest.raises(ZoneResourcesExhaustedError):
            fake_setup(tester)
        event_factory.assert_called_once()
        assert event_factory.call_args.kwargs["severity"] == Severity.CRITICAL
219+
220+
221+
def test_teardown_on_exception_generic_error_publishes_default_severity():
    """A non-provisioning error without a severity attribute passes severity=None."""

    @teardown_on_exception
    def fake_setup(self):
        raise RuntimeError("Something else went wrong")

    tester = _make_tester_mock()

    # Replace the event class so nothing is actually published and the
    # constructor arguments can be inspected.
    with patch("sdcm.tester.TestFrameworkEvent") as event_factory:
        event_factory.return_value.publish_or_dump = MagicMock()
        with pytest.raises(RuntimeError):
            fake_setup(tester)
        event_factory.assert_called_once()
        # None lets the event class apply its own default severity.
        assert event_factory.call_args.kwargs["severity"] is None
236+
237+
238+
def test_teardown_on_exception_preserves_severity_from_exception():
    """A ``severity`` attribute on a non-provisioning exception reaches the event."""

    class ErrorWithSeverity(Exception):
        # Class-level attribute picked up by the decorator via getattr().
        severity = Severity.WARNING

    @teardown_on_exception
    def fake_setup(self):
        raise ErrorWithSeverity("Warning-level error")

    tester = _make_tester_mock()

    # Replace the event class so nothing is actually published and the
    # constructor arguments can be inspected.
    with patch("sdcm.tester.TestFrameworkEvent") as event_factory:
        event_factory.return_value.publish_or_dump = MagicMock()
        with pytest.raises(ErrorWithSeverity):
            fake_setup(tester)
        event_factory.assert_called_once()
        assert event_factory.call_args.kwargs["severity"] == Severity.WARNING

0 commit comments

Comments
 (0)