Skip to content

Commit dfeca9c

Browse files
committed
feature(nemesis): Unify Nemesis execution among Monkeys
Currently, two separate mechanisms exist to call disrupt methods: build_list_of_disruptions used by SisyphusMonkey call_random_disrupt_method used by all complex monkeys These two mechanism differ in yaml properties it supports, in usage of random_seed and how they discover disrupt methods. Former using NemesisRegistry, later filtering it directly. This commit unities both of the usecases under one code Remove AllMonkey and ChaosMonkey as they can be replaced by SisiphusMonkey
1 parent 9d21811 commit dfeca9c

File tree

3 files changed

+89
-112
lines changed

3 files changed

+89
-112
lines changed

sdcm/nemesis.py

Lines changed: 86 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
import enum
3333
from contextlib import ExitStack, contextmanager
3434
from typing import Any, List, Optional, Tuple, Callable, Dict, Set, Union, Iterable
35-
from functools import wraps, partial
35+
from functools import wraps, partial, cached_property
3636
from collections import defaultdict, Counter, namedtuple
3737
from concurrent.futures import ThreadPoolExecutor
3838
from threading import Lock
@@ -242,7 +242,6 @@ def __init__(self, tester_obj, termination_event, *args, nemesis_selector=None,
242242
self.monitoring_set = tester_obj.monitors
243243
self.target_node: BaseNode = None
244244
self.disruptions_list = []
245-
self.disruptions_cycle: Iterable[Callable] | None = None
246245
self.termination_event = termination_event
247246
self.operation_log = []
248247
self.current_disruption = None
@@ -264,7 +263,6 @@ def __init__(self, tester_obj, termination_event, *args, nemesis_selector=None,
264263
self.task_used_streaming = None
265264
self.filter_seed = self.cluster.params.get('nemesis_filter_seeds')
266265
self.nemesis_seed = nemesis_seed or random.randint(0, 1000)
267-
self._random_sequence = None
268266
self._add_drop_column_max_per_drop = 5
269267
self._add_drop_column_max_per_add = 5
270268
self._add_drop_column_max_column_name_size = 10
@@ -1808,80 +1806,43 @@ def _deprecated_disrupt_stop_start(self):
18081806
self.log.info('StopStart %s', self.target_node)
18091807
self.target_node.restart()
18101808

1811-
def call_random_disrupt_method(self, disrupt_methods=None, predefined_sequence=False):
1812-
# pylint: disable=too-many-branches
1813-
1814-
if disrupt_methods is None:
1815-
disrupt_methods = [attr[1] for attr in inspect.getmembers(self) if
1816-
attr[0].startswith('disrupt_') and
1817-
callable(attr[1])]
1818-
else:
1819-
disrupt_methods = [attr[1] for attr in inspect.getmembers(self) if
1820-
attr[0] in disrupt_methods and
1821-
callable(attr[1])]
1822-
if not disrupt_methods:
1823-
self.log.warning("No monkey to run")
1824-
return
1825-
if not predefined_sequence:
1826-
disrupt_method = random.choice(disrupt_methods)
1827-
else:
1828-
if not self._random_sequence:
1829-
# Generate random sequence, every method has same chance to be called.
1830-
# Here we use multiple original methods list, it will increase the chance
1831-
# to call same method continuously.
1832-
#
1833-
# Adjust the rate according to the test duration. Try to call more unique
1834-
# methods and don't wait to long time to meet the balance if the test
1835-
# duration is short.
1836-
test_duration = self.cluster.params.get('test_duration')
1837-
if test_duration < 600: # less than 10 hours
1838-
rate = 1
1839-
elif test_duration < 4320: # less than 3 days
1840-
rate = 2
1841-
else:
1842-
rate = 3
1843-
multiple_disrupt_methods = disrupt_methods * rate
1844-
random.shuffle(multiple_disrupt_methods)
1845-
self._random_sequence = multiple_disrupt_methods
1846-
# consume the random sequence
1847-
disrupt_method = self._random_sequence.pop()
1809+
# Nemesis running code
18481810

1849-
self.execute_disrupt_method(disrupt_method)
1811+
@cached_property
1812+
def all_disrupt_methods(self):
1813+
return self.nemesis_registry.get_disrupt_methods()
18501814

18511815
def execute_disrupt_method(self, disrupt_method):
1816+
"""Runs selected disrupt method"""
18521817
disrupt_method_name = disrupt_method.__name__.replace('disrupt_', '')
18531818
self.metrics_srv.event_start(disrupt_method_name)
18541819
try:
18551820
disrupt_method(self)
18561821
finally:
18571822
self.metrics_srv.event_stop(disrupt_method_name)
18581823

1859-
def build_list_of_disruptions_to_execute(self, nemesis_selector: str | None = None, nemesis_multiply_factor=1):
1860-
"""
1861-
Builds the list of disruptions that should be excuted during a test.
1824+
def build_disruptions_by_name(self, disrupt_methods: List[str]):
1825+
"""Builds list of available disruptions according to function names"""
1826+
filtered = [func for func in self.all_disrupt_methods if func.__name__ in disrupt_methods]
1827+
names = [func.__name__ for func in filtered]
1828+
assert names == disrupt_methods, f"Unable to find these disrupt methods: {set(disrupt_methods).difference(names)}"
1829+
return filtered
18621830

1863-
nemesis_selector: should be retrived from the test yaml by using the "nemesis_selector".
1864-
Here it kept for future usages and unit testing ability.
1865-
more about nemesis_selector behaviour in sct_config.py
1831+
def build_disruptions_by_selector(self, nemesis_selector: str | None = None):
1832+
"""
1833+
Filter available disruptions according to the logical phrase
18661834
1867-
nemesis_multiply_factor: should be retrived from the test yaml by using the "nemesis_multiply_factor".
1868-
Here it kept for future usages and unit testing ability.
1869-
more about nemesis_selector behaviour in sct_config.py
1835+
nemesis_selector: Logical phrase selector to filter available disruption methods.
1836+
To be able to be filtered, method needs to have corresponding class.
1837+
Usually retrieved from the test yaml by using the "nemesis_selector", more about nemesis_selector behaviour in sct_config.py
18701838
"""
1871-
nemesis_selector = nemesis_selector or self.nemesis_selector
18721839
if self._is_it_on_kubernetes():
18731840
if nemesis_selector:
18741841
nemesis_selector = nemesis_selector + " and kubernetes"
18751842
else:
18761843
nemesis_selector = "kubernetes"
1877-
nemesis_multiply_factor = self.cluster.params.get('nemesis_multiply_factor') or nemesis_multiply_factor
18781844
disruptions = self.nemesis_registry.get_disrupt_methods(nemesis_selector)
1879-
1880-
if nemesis_multiply_factor:
1881-
disruptions = disruptions * nemesis_multiply_factor
1882-
1883-
self.disruptions_list.extend(disruptions)
1884-
return self.disruptions_list
1845+
return disruptions
18851846

18861847
@property
18871848
def nemesis_selector(self) -> str:
@@ -1905,22 +1866,34 @@ def nemesis_selector(self, value: str):
19051866

19061867
@property
19071868
def _disruption_list_names(self):
1869+
"""Returns name of all collected nemesis"""
19081870
return [nemesis.__name__ for nemesis in self.disruptions_list]
19091871

1910-
def shuffle_list_of_disruptions(self):
1911-
self.log.debug(f'nemesis_seed to be used is {self.nemesis_seed}')
1872+
def shuffle_list_of_disruptions(self, disruption_list: List, nemesis_multiply_factor: int | None = None):
1873+
"""
1874+
Randomizes list of disruptions
19121875
1876+
nemesis_multiply_factor: How many times to multiply the original list before shuffle
1877+
Useful for increasing probability of the same nemesis twice in a row
1878+
Usually retrieved from the test yaml by using the "nemesis_selector", more about nemesis_selector behaviour in sct_config.py
1879+
"""
1880+
self.log.debug(f'nemesis_seed to be used is {self.nemesis_seed}')
19131881
self.log.debug(f"nemesis stack BEFORE SHUFFLE is {self._disruption_list_names}")
1914-
random.Random(self.nemesis_seed).shuffle(self.disruptions_list)
1882+
nemesis_multiply_factor = nemesis_multiply_factor or self.cluster.params.get('nemesis_multiply_factor') or 1
1883+
random.Random(self.nemesis_seed).shuffle(disruption_list * nemesis_multiply_factor)
19151884
self.log.info(f"List of Nemesis to execute: {self._disruption_list_names}")
19161885

1886+
@cached_property
1887+
def infinite_cycle(self):
1888+
"""Returns infinite cycle of all nemesis"""
1889+
return itertools.cycle(self.disruptions_list)
1890+
19171891
def call_next_nemesis(self):
1892+
"""Calls next nemesis in the order"""
19181893
assert self.disruptions_list, "no nemesis were selected"
1919-
if not self.disruptions_cycle:
1920-
self.disruptions_cycle = itertools.cycle(self.disruptions_list)
1921-
self.log.debug(f'Selecting the next nemesis out of stack {self._disruption_list_names[10:]}')
1922-
self.execute_disrupt_method(disrupt_method=next(self.disruptions_cycle))
1894+
self.execute_disrupt_method(disrupt_method=next(self.infinite_cycle))
19231895

1896+
# End of Nemesis running code
19241897
@latency_calculator_decorator(legend="Run repair process with nodetool repair")
19251898
def repair_nodetool_repair(self, node=None, publish_event=True):
19261899
node = node if node else self.target_node
@@ -5628,8 +5601,8 @@ def wrapper(*args, **kwargs): # pylint: disable=too-many-statements # noqa: PL
56285601
class SisyphusMonkey(Nemesis):
56295602
def __init__(self, *args, **kwargs):
56305603
super().__init__(*args, **kwargs)
5631-
self.build_list_of_disruptions_to_execute()
5632-
self.shuffle_list_of_disruptions()
5604+
self.disruptions_list = self.build_disruptions_by_selector(self.nemesis_selector)
5605+
self.shuffle_list_of_disruptions(self.disruptions_list)
56335606

56345607
def disrupt(self):
56355608
self.call_next_nemesis()
@@ -5740,13 +5713,14 @@ class EnableDisableTableEncryptionAwsKmsProviderMonkey(Nemesis):
57405713

57415714
def __init__(self, *args, **kwargs):
57425715
super().__init__(*args, **kwargs)
5743-
self.disrupt_methods_list = [
5716+
self.disruptions_list = self.build_disruptions_by_name([
57445717
'disrupt_enable_disable_table_encryption_aws_kms_provider_without_rotation',
57455718
'disrupt_enable_disable_table_encryption_aws_kms_provider_with_rotation',
5746-
]
5719+
])
5720+
self.shuffle_list_of_disruptions(self.disruptions_list)
57475721

57485722
def disrupt(self):
5749-
self.call_random_disrupt_method(disrupt_methods=self.disrupt_methods_list, predefined_sequence=True)
5723+
self.call_next_nemesis()
57505724

57515725

57525726
class RestartThenRepairNodeMonkey(Nemesis):
@@ -5974,12 +5948,6 @@ def disrupt(self):
59745948
self.disrupt_delete_overlapping_row_ranges()
59755949

59765950

5977-
class ChaosMonkey(Nemesis):
5978-
5979-
def disrupt(self):
5980-
self.call_random_disrupt_method()
5981-
5982-
59835951
class CategoricalMonkey(Nemesis):
59845952
"""Randomly picks disruptions to execute using the given categorical distribution.
59855953
@@ -6064,34 +6032,39 @@ def _random_disrupt(self):
60646032
self.execute_disrupt_method(method)
60656033

60666034

6067-
CLOUD_LIMITED_CHAOS_MONKEY = ['disrupt_nodetool_cleanup',
6068-
'disrupt_nodetool_drain', 'disrupt_nodetool_refresh',
6069-
'disrupt_stop_start_scylla_server', 'disrupt_major_compaction',
6070-
'disrupt_modify_table', 'disrupt_nodetool_enospc',
6071-
'disrupt_stop_wait_start_scylla_server',
6072-
'disrupt_soft_reboot_node',
6073-
'disrupt_truncate']
6074-
6075-
60766035
class ScyllaCloudLimitedChaosMonkey(Nemesis):
60776036

6078-
def disrupt(self):
6079-
# Limit the nemesis scope to only one relevant to scylla cloud, where we defined we don't have AWS api access:
6080-
self.call_random_disrupt_method(disrupt_methods=CLOUD_LIMITED_CHAOS_MONKEY)
6081-
6082-
6083-
class AllMonkey(Nemesis):
6037+
def __init__(self, *args, **kwargs):
6038+
super().__init__(*args, **kwargs)
6039+
self.disruptions_list = self.build_disruptions_by_name([
6040+
'disrupt_nodetool_cleanup',
6041+
'disrupt_nodetool_drain', 'disrupt_nodetool_refresh',
6042+
'disrupt_stop_start_scylla_server', 'disrupt_major_compaction',
6043+
'disrupt_modify_table', 'disrupt_nodetool_enospc',
6044+
'disrupt_stop_wait_start_scylla_server',
6045+
'disrupt_soft_reboot_node',
6046+
'disrupt_truncate'
6047+
])
6048+
self.shuffle_list_of_disruptions(self.disruptions_list)
60846049

60856050
def disrupt(self):
6086-
self.call_random_disrupt_method(predefined_sequence=True)
6051+
# Limit the nemesis scope to only one relevant to scylla cloud, where we defined we don't have AWS api access:
6052+
self.call_next_nemesis()
60876053

60886054

60896055
class MdcChaosMonkey(Nemesis):
60906056

6057+
def __init__(self, *args, **kwargs):
6058+
super().__init__(*args, **kwargs)
6059+
self.disruptions_list = self.build_disruptions_by_name([
6060+
'disrupt_destroy_data_then_repair',
6061+
'disrupt_no_corrupt_repair',
6062+
'disrupt_nodetool_decommission'
6063+
])
6064+
self.shuffle_list_of_disruptions(self.disruptions_list)
6065+
60916066
def disrupt(self):
6092-
self.call_random_disrupt_method(
6093-
disrupt_methods=['disrupt_destroy_data_then_repair', 'disrupt_no_corrupt_repair',
6094-
'disrupt_nodetool_decommission'])
6067+
self.call_next_nemesis()
60956068

60966069

60976070
class ModifyTableMonkey(Nemesis):
@@ -6239,13 +6212,14 @@ class DisruptKubernetesNodeThenReplaceScyllaNode(Nemesis):
62396212

62406213
def __init__(self, *args, **kwargs):
62416214
super().__init__(*args, **kwargs)
6242-
self.disrupt_methods_list = [
6215+
self.disruptions_list = self.build_disruptions_by_name([
62436216
'disrupt_drain_kubernetes_node_then_replace_scylla_node',
62446217
'disrupt_terminate_kubernetes_host_then_replace_scylla_node',
6245-
]
6218+
])
6219+
self.shuffle_list_of_disruptions(self.disruptions_list)
62466220

62476221
def disrupt(self):
6248-
self.call_random_disrupt_method(disrupt_methods=self.disrupt_methods_list)
6222+
self.call_next_nemesis()
62496223

62506224

62516225
class DrainKubernetesNodeThenDecommissionAndAddScyllaNode(Nemesis):
@@ -6270,13 +6244,14 @@ class DisruptKubernetesNodeThenDecommissionAndAddScyllaNode(Nemesis):
62706244

62716245
def __init__(self, *args, **kwargs):
62726246
super().__init__(*args, **kwargs)
6273-
self.disrupt_methods_list = [
6247+
self.disruptions_list = self.build_disruptions_by_name([
62746248
'disrupt_drain_kubernetes_node_then_decommission_and_add_scylla_node',
62756249
'disrupt_terminate_kubernetes_host_then_decommission_and_add_scylla_node',
6276-
]
6250+
])
6251+
self.shuffle_list_of_disruptions(self.disruptions_list)
62776252

62786253
def disrupt(self):
6279-
self.call_random_disrupt_method(disrupt_methods=self.disrupt_methods_list)
6254+
self.call_next_nemesis()
62806255

62816256

62826257
class K8sSetMonkey(Nemesis):
@@ -6285,16 +6260,16 @@ class K8sSetMonkey(Nemesis):
62856260

62866261
def __init__(self, *args, **kwargs):
62876262
super().__init__(*args, **kwargs)
6288-
self.disrupt_methods_list = [
6263+
self.disruptions_list = self.build_disruptions_by_name([
62896264
'disrupt_drain_kubernetes_node_then_replace_scylla_node',
62906265
'disrupt_terminate_kubernetes_host_then_replace_scylla_node',
62916266
'disrupt_drain_kubernetes_node_then_decommission_and_add_scylla_node',
62926267
'disrupt_terminate_kubernetes_host_then_decommission_and_add_scylla_node',
6293-
]
6268+
])
6269+
self.shuffle_list_of_disruptions(self.disruptions_list)
62946270

62956271
def disrupt(self):
6296-
self.call_random_disrupt_method(
6297-
disrupt_methods=self.disrupt_methods_list, predefined_sequence=True)
6272+
self.call_next_nemesis()
62986273

62996274

63006275
class OperatorNodeReplace(Nemesis):
@@ -6466,7 +6441,7 @@ class ScyllaOperatorBasicOperationsMonkey(Nemesis):
64666441

64676442
def __init__(self, *args, **kwargs):
64686443
super().__init__(*args, **kwargs)
6469-
self.disrupt_methods_list = [
6444+
self.disruptions_list = self.build_disruptions_by_name([
64706445
'disrupt_nodetool_flush_and_reshard_on_kubernetes',
64716446
'disrupt_rolling_restart_cluster',
64726447
'disrupt_grow_shrink_cluster',
@@ -6481,10 +6456,11 @@ def __init__(self, *args, **kwargs):
64816456
'disrupt_mgmt_repair_cli',
64826457
'disrupt_mgmt_backup_specific_keyspaces',
64836458
'disrupt_mgmt_backup',
6484-
]
6459+
])
6460+
self.shuffle_list_of_disruptions(self.disruptions_list)
64856461

64866462
def disrupt(self):
6487-
self.call_random_disrupt_method(disrupt_methods=self.disrupt_methods_list, predefined_sequence=True)
6463+
self.call_next_nemesis()
64886464

64896465

64906466
class NemesisSequence(Nemesis):
@@ -6552,8 +6528,8 @@ def disrupt(self):
65526528
self.disrupt_repair_streaming_err()
65536529

65546530

6555-
COMPLEX_NEMESIS = [NoOpMonkey, ChaosMonkey, ScyllaCloudLimitedChaosMonkey,
6556-
AllMonkey, MdcChaosMonkey, SisyphusMonkey,
6531+
COMPLEX_NEMESIS = [NoOpMonkey, ScyllaCloudLimitedChaosMonkey,
6532+
MdcChaosMonkey, SisyphusMonkey,
65576533
DisruptKubernetesNodeThenReplaceScyllaNode,
65586534
DisruptKubernetesNodeThenDecommissionAndAddScyllaNode,
65596535
CategoricalMonkey]

test-cases/features/2mv-backpressure-4d.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ user_prefix: 'longevity-2mv-backpressure-4d'
1212
instance_type_db: 'i4i.2xlarge'
1313
instance_type_loader: 'c6i.2xlarge'
1414

15-
nemesis_class_name: 'ChaosMonkey'
15+
nemesis_class_name: 'SisyphusMonkey'
1616
nemesis_during_prepare: false

unit_tests/test_nemesis.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ def test_list_nemesis_of_added_disrupt_methods(capsys):
134134
nemesis = ChaosMonkey(FakeTester(), None)
135135
assert 'disrupt_rnd_method' in [
136136
method.__name__ for method in nemesis.nemesis_registry.get_disrupt_methods()]
137-
assert nemesis.call_random_disrupt_method(disrupt_methods=['disrupt_rnd_method']) is None
137+
nemesis.disruptions_list = nemesis.build_disruptions_by_name(['disrupt_rnd_method'])
138+
nemesis.call_next_nemesis()
138139
captured = capsys.readouterr()
139140
assert "It Works!" in captured.out
140141

0 commit comments

Comments
 (0)