Skip to content

fix(disrupt_terminate_and_replace_node): raise critical event on failure #10403

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions defaults/severities.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,4 @@ ScrubValidationErrorEvent: ERROR
PartitionRowsValidationEvent: CRITICAL
FailedResultEvent: ERROR
HDRFileMissed: ERROR
TopologyFailureEvent: CRITICAL
28 changes: 20 additions & 8 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
from sdcm.sct_events.health import DataValidatorEvent
from sdcm.sct_events.loaders import CassandraStressLogEvent, ScyllaBenchEvent
from sdcm.sct_events.nemesis import DisruptionEvent
from sdcm.sct_events.system import InfoEvent, CoreDumpEvent
from sdcm.sct_events.system import InfoEvent, CoreDumpEvent, TopologyFailureEvent
from sdcm.sla.sla_tests import SlaTests
from sdcm.stress_thread import DockerBasedStressThread
from sdcm.utils.aws_kms import AwsKms
Expand Down Expand Up @@ -1682,7 +1682,13 @@ def get_node_state(node_ip: str) -> List["str"] | None:
is_old_node_seed = self.target_node.is_seed
InfoEvent(message='StartEvent - Terminate node and wait 5 minutes').publish()
self._terminate_and_wait(target_node=self.target_node)
assert get_node_state(old_node_ip) == "DN", "Removed node state should be DN"
# The walrus must be parenthesized: `:=` binds more loosely than `!=`, so the
# unparenthesized form stores the boolean result of the comparison in `state`
# and the event message would report `True` instead of the actual node state.
if (state := get_node_state(old_node_ip)) != "DN":
    TopologyFailureEvent(
        source=self.__class__.__name__,
        message=f"Removed node state should be DN, but was {state}",
        severity=Severity.CRITICAL,
    ).publish()

InfoEvent(message='FinishEvent - target_node was terminated').publish()
new_node = self.replace_node(old_node_ip, host_id, rack=self.target_node.rack,
is_zero_node=self.target_node._is_zero_token_node)
Expand All @@ -1697,12 +1703,18 @@ def get_node_state(node_ip: str) -> List["str"] | None:
def wait_for_old_node_to_removed():
    """Poll the status of the old node's IP and publish a critical event on failure.

    If the replacement node reuses the old IP, the state is expected to become
    "UN"; otherwise the old IP is expected to disappear from the status output
    (state is None).

    The check is polled with the same retry policy as before (20 attempts,
    20 seconds apart). A TopologyFailureEvent is published only once, after
    all attempts failed, instead of on the very first poll.
    """

    @retrying(n=20, sleep_time=20, allowed_exceptions=(AssertionError,))
    def check_old_node_state():
        # Raise explicitly (rather than `assert`) so the check still works
        # when Python runs with -O; AssertionError is what `retrying` retries on.
        state = get_node_state(old_node_ip)
        if old_node_ip == new_node.ip_address:
            if state != "UN":
                raise AssertionError(
                    f"New node with the same IP as removed one should be in UN state but was: {state}")
        elif state is not None:
            raise AssertionError(
                f"Old node should have been removed from status but it wasn't. State was: {state}")

    try:
        check_old_node_state()
    except AssertionError as exc:
        # NOTE(review): assumes `retrying` re-raises the last AssertionError once
        # its attempts are exhausted - confirm against the decorator's implementation.
        TopologyFailureEvent(
            source=self.__class__.__name__,
            message=str(exc),
            severity=Severity.CRITICAL,
        ).publish()

wait_for_old_node_to_removed()

Expand Down
10 changes: 10 additions & 0 deletions sdcm/sct_events/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,3 +334,13 @@ def __init__(self, message: str, severity: Severity = Severity.ERROR):
@property
def msgfmt(self) -> str:
return super().msgfmt + ": message={0.message}"


class TopologyFailureEvent(SctEvent):
    """Event reporting that a cluster topology check did not hold.

    Args:
        message: human-readable description of the failed expectation.
        severity: severity to publish the event with; CRITICAL by default.
        source: optional name of the component that detected the failure
            (callers pass the nemesis class name). Kept optional so that
            existing positional/keyword usage without it keeps working.
    """

    def __init__(self, message: str, severity: Severity = Severity.CRITICAL, source: str | None = None):
        super().__init__(severity)
        self.message = message
        self.source = source

    @property
    def msgfmt(self) -> str:
        """Format string for the event; includes the source only when one was given."""
        fmt = super().msgfmt
        if self.source is not None:
            fmt += ": source={0.source}"
        return fmt + ": message={0.message}"