Skip to content

[Backport perf-v15] fix(LoaderUtilsMixin): table_enabled was allways True as this feature is allways enabled #10625

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: branch-perf-v15
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions sdcm/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -5049,7 +5049,22 @@ def get_node_ip_list(verification_node):
self.terminate_node(node) # pylint: disable=no-member
self.test_config.tester_obj().monitors.reconfigure_scylla_monitoring()

<<<<<<< HEAD
def decommission(self, node: BaseNode, timeout: int | float = None):
||||||| parent of b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)
def decommission(self, node: BaseNode, timeout: int | float = None) -> DataCenterTopologyRfControl | None:
if not node._is_zero_token_node:
with node.parent_cluster.cql_connection_patient(node) as session:
if tablets_enabled := is_tablets_feature_enabled(session):
dc_topology_rf_change = DataCenterTopologyRfControl(target_node=node)
dc_topology_rf_change.decrease_keyspaces_rf()
=======
def decommission(self, node: BaseNode, timeout: int | float = None) -> DataCenterTopologyRfControl | None:
if not node._is_zero_token_node:
if tablets_enabled := is_tablets_feature_enabled(node):
dc_topology_rf_change = DataCenterTopologyRfControl(target_node=node)
dc_topology_rf_change.decrease_keyspaces_rf()
>>>>>>> b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)
with adaptive_timeout(operation=Operations.DECOMMISSION, node=node):
node.run_nodetool("decommission", timeout=timeout, long_running=True, retry=0)
self.verify_decommission(node)
Expand Down
3 changes: 1 addition & 2 deletions sdcm/fill_db_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3186,8 +3186,7 @@ def enable_cdc_for_tables(self) -> bool:
@cached_property
def tablets_enabled(self) -> bool:
"""Check is tablets enabled on cluster"""
with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
return is_tablets_feature_enabled(session)
return is_tablets_feature_enabled(self.db_cluster.nodes[0])

@retrying(n=3, sleep_time=20, allowed_exceptions=ProtocolException)
def truncate_table(self, session, truncate): # pylint: disable=no-self-use
Expand Down
7 changes: 3 additions & 4 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1005,10 +1005,9 @@ def disrupt_restart_with_resharding(self):
"Run 'disrupt_nodetool_flush_and_reshard_on_kubernetes' instead")

# If tablets in use, skipping resharding since it is not supported.
with self.cluster.cql_connection_patient(self.target_node) as session:
if is_tablets_feature_enabled(session=session):
if SkipPerIssues('https://github.com/scylladb/scylladb/issues/16739', params=self.tester.params):
raise UnsupportedNemesis('https://github.com/scylladb/scylladb/issues/16739')
if is_tablets_feature_enabled(self.target_node):
if SkipPerIssues('https://github.com/scylladb/scylladb/issues/16739', params=self.tester.params):
raise UnsupportedNemesis('https://github.com/scylladb/scylladb/issues/16739')

murmur3_partitioner_ignore_msb_bits = 15 # pylint: disable=invalid-name
self.log.info(f'Restart node with resharding. New murmur3_partitioner_ignore_msb_bits value: '
Expand Down
17 changes: 17 additions & 0 deletions sdcm/tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,24 @@ def reliable_replication_factor(self) -> int:
:return:
"""
n_db_nodes = str(self.params.get('n_db_nodes'))
<<<<<<< HEAD
return min([int(nodes_num) for nodes_num in n_db_nodes.split() if int(nodes_num) > 0])
||||||| parent of b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)
min_nodes_dc = min([int(nodes_num) for nodes_num in n_db_nodes.split() if int(nodes_num) > 0])
with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
# In case tablets are enabled, it's better to set RF smaller than dc-nodes-number, so decommission is allowed.
rf_candidate = max([min_nodes_dc - 1, 1]) if is_tablets_feature_enabled(session) else min_nodes_dc
# NOTE: use RF=3 at max to avoid problems on big setups
return min(rf_candidate, 3)
=======
min_nodes_dc = min([int(nodes_num) for nodes_num in n_db_nodes.split() if int(nodes_num) > 0])

# In case tablets are enabled, it's better to set RF smaller than dc-nodes-number, so decommission is allowed.
rf_candidate = max([min_nodes_dc - 1, 1]
) if is_tablets_feature_enabled(self.db_cluster.nodes[0]) else min_nodes_dc
# NOTE: use RF=3 at max to avoid problems on big setups
return min(rf_candidate, 3)
>>>>>>> b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)

@property
def test_id(self):
Expand Down
13 changes: 9 additions & 4 deletions sdcm/utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

CONSISTENT_TOPOLOGY_CHANGES_FEATURE = "SUPPORTS_CONSISTENT_TOPOLOGY_CHANGES"
CONSISTENT_CLUSTER_MANAGEMENT_FEATURE = "SUPPORTS_RAFT_CLUSTER_MANAGEMENT"
TABLETS_FEATURE = "TABLETS"

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -71,8 +70,14 @@ def is_consistent_topology_changes_feature_enabled(session: Session) -> bool:
return CONSISTENT_TOPOLOGY_CHANGES_FEATURE in get_enabled_features(session)


def is_tablets_feature_enabled(session: Session) -> bool:
def is_tablets_feature_enabled(node) -> bool:
""" Check whether tablets enabled
if you need from a specific node use `patient_exclusive_cql_connection` session
"""
return TABLETS_FEATURE in get_enabled_features(session)
with node.remote_scylla_yaml() as scylla_yaml:
# for backward compatibility of 2024.1 and earlier
if "tablets" in scylla_yaml.experimental_features:
return True
if scylla_yaml.dict().get("enable_tablets"):
return True

return False
15 changes: 15 additions & 0 deletions sdcm/utils/loader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,21 @@ def assemble_and_run_all_stress_cmd_by_ks_names(self, stress_queue, stress_cmd,
}
self._run_all_stress_cmds(stress_queue, params)

<<<<<<< HEAD
||||||| parent of b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)
@cached_property
def tablets_enabled(self):
# is tablets feature enabled in Scylla configuration.
with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
return is_tablets_feature_enabled(session)

=======
@cached_property
def tablets_enabled(self):
# is tablets feature enabled in Scylla configuration.
return is_tablets_feature_enabled(self.db_cluster.nodes[0])

>>>>>>> b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)
def _run_all_stress_cmds(self, stress_queue, params):
stress_cmds = params['stress_cmd']
if not isinstance(stress_cmds, list):
Expand Down
53 changes: 53 additions & 0 deletions sdcm/utils/tablets/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,56 @@ def __str__(self):
value = str(v).lower() if isinstance(v, bool) else v
items.append(f"'{k}': {value}")
return '{' + ', '.join(items) + '}'
<<<<<<< HEAD
||||||| parent of b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)


def wait_no_tablets_migration_running(node):
"""
Waiting for having no ongoing tablets topology operations using REST API.
!!! It does not guarantee that tablets are balanced !!!
Keep in mind that it is good only to find when ongoing tablets topology operations is done.
Very next second another topology operation can be started.

doing it several times as there's a risk of:
"currently a small time window after adding nodes and before load balancing starts during which
topology may appear as quiesced because the state machine goes through an idle state before it enters load balancing state"
"""
with node.parent_cluster.cql_connection_patient(node=node) as session:
if not is_tablets_feature_enabled(session):
LOGGER.info("Tablets are disabled, skipping wait for balance")
return
time.sleep(60) # one minute gap before checking, just to give some time to the state machine
client = RemoteCurlClient(host="127.0.0.1:10000", endpoint="", node=node)
LOGGER.info("Waiting for having no ongoing tablets topology operations")
for _ in range(3):
client.run_remoter_curl(method="POST", path="storage_service/quiesce_topology",
params={}, timeout=3600, retry=3)
time.sleep(5)
LOGGER.info("All ongoing tablets topology operations are done")
=======


def wait_no_tablets_migration_running(node):
"""
Waiting for having no ongoing tablets topology operations using REST API.
!!! It does not guarantee that tablets are balanced !!!
Keep in mind that it is good only to find when ongoing tablets topology operations is done.
Very next second another topology operation can be started.

doing it several times as there's a risk of:
"currently a small time window after adding nodes and before load balancing starts during which
topology may appear as quiesced because the state machine goes through an idle state before it enters load balancing state"
"""
if not is_tablets_feature_enabled(node):
LOGGER.info("Tablets are disabled, skipping wait for balance")
return
time.sleep(60) # one minute gap before checking, just to give some time to the state machine
client = RemoteCurlClient(host="127.0.0.1:10000", endpoint="", node=node)
LOGGER.info("Waiting for having no ongoing tablets topology operations")
for _ in range(3):
client.run_remoter_curl(method="POST", path="storage_service/quiesce_topology",
params={}, timeout=3600, retry=3)
time.sleep(5)
LOGGER.info("All ongoing tablets topology operations are done")
>>>>>>> b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)
20 changes: 20 additions & 0 deletions sla_per_user_system_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,26 @@ def test_read_throughput_1to5_ratio(self):

def _two_users_load_througput_workload(self, shares, load, expected_shares_ratio=None):
session = self.prepare_schema()
<<<<<<< HEAD
||||||| parent of b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)

if is_tablets_feature_enabled(session=session):
self.run_pre_create_keyspace()
# after several test runs with Tablets decided to decrease by half of the percent(usually tests show about 96.8 - 97.5)
# due to unbalanced shards utilization with tablets(particular tablet belong to particular shard)
# during gauss distribution read
self.MIN_CPU_UTILIZATION = 96.5

=======

if is_tablets_feature_enabled(self.db_cluster.nodes[0]):
self.run_pre_create_keyspace()
# after several test runs with Tablets decided to decrease by half of the percent(usually tests show about 96.8 - 97.5)
# due to unbalanced shards utilization with tablets(particular tablet belong to particular shard)
# during gauss distribution read
self.MIN_CPU_UTILIZATION = 96.5

>>>>>>> b03a690e6 (fix(LoaderUtilsMixin): table_enabled was allways True as this feature is enabled)
self.create_test_data_and_wait_no_compaction()

# Define Service Levels/Roles/Users
Expand Down