Skip to content

Commit 367e6d1

Browse files
cezarmoiseactions-user
authored andcommitted
test(size-based-load-balancing) add test with different partition sizes
check node storage is balanced after writes finish (cherry picked from commit 065fdee)
1 parent d1e6c58 commit 367e6d1

File tree

4 files changed

+232
-0
lines changed

4 files changed

+232
-0
lines changed

.github/CODEOWNERS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,11 @@ upgrade_test.py @abramche @fruch
100100
/test-cases/features/out-of-space-prevention @pehala
101101
/jenkins-pipelines/oss/features/out-of-space-prevention @pehala
102102

103+
# Size-based load balancing
104+
/longevity_balancer_test.py @pehala
105+
/test-cases/features/size-based-load-balancing @pehala
106+
/jenkins-pipelines/oss/features/size-based-load-balancing @pehala
107+
103108
# Backup & Restore
104109
jenkins-pipelines/performance/branch-perf-v17/scylla-enterprise/perf-regression/scylla-enterprise-perf-manager-native-backup-nemesis.jenkinsfile @pehala
105110
jenkins-pipelines/performance/branch-perf-v17/scylla-enterprise/perf-regression/scylla-enterprise-perf-manager-rclone-backup-nemesis.jenkinsfile @pehala
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

// Run the standard SCT longevity pipeline for the size-based load balancing
// feature test; the test logic lives in longevity_balancer_test.py and the
// workload/cluster parameters in the referenced YAML test case.
longevityPipeline(
    backend: "aws",
    region: 'eu-west-1',
    test_name: 'longevity_balancer_test.LongevityBalancerTest.test_load_balance',
    test_config: 'test-cases/features/size-based-load-balancing/size-based-load-balancing.yaml',
)

longevity_balancer_test.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
#!/usr/bin/env python
2+
3+
# This program is free software; you can redistribute it and/or modify
4+
# it under the terms of the GNU Affero General Public License as published by
5+
# the Free Software Foundation; either version 3 of the License, or
6+
# (at your option) any later version.
7+
#
8+
# This program is distributed in the hope that it will be useful,
9+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11+
#
12+
# See LICENSE for more details.
13+
#
14+
# Copyright (c) 2025 ScyllaDB
15+
16+
17+
from collections import defaultdict
18+
from contextlib import contextmanager
19+
from longevity_test import LongevityTest
20+
from sdcm.argus_results import PeriodicDiskUsageToArgus
21+
from sdcm.cluster import MAX_TIME_WAIT_FOR_DECOMMISSION, MAX_TIME_WAIT_FOR_NEW_NODE_UP, BaseNode
22+
from sdcm.sct_events import Severity
23+
from sdcm.sct_events.system import InfoEvent, TestFrameworkEvent
24+
from sdcm.utils.adaptive_timeouts import Operations, adaptive_timeout
25+
from sdcm.utils.common import ParallelObject, get_node_disk_usage
26+
from sdcm.utils.tablets.common import wait_no_tablets_migration_running
27+
28+
# Per requirement, load balance difference should be below 5% for nodes in the same rack
29+
# https://scylladb.atlassian.net/wiki/spaces/RND/pages/5505671/Size-Based+Load+Balancing+Requirement+Document#Performance
30+
BALANCE_THRESHOLD = 5
31+
32+
33+
class LongevityBalancerTest(LongevityTest):
    """
    Test to ensure that the cluster is balanced correctly in difficult conditions.

    Required config params:
    - nemesis_add_node_cnt:
        - At the beginning of the test, this many nodes will be added to the cluster
        - In the middle of the test, this many nodes will be added and then removed from the cluster
    - nemesis_grow_shrink_instance_type: The instance type to use when adding nodes to the cluster
    - prepare_write_cmd: The stress command to use for the initial data population
    - stress_cmd: The stress command to use for the second data population, after adding and removing nodes
    """

    def _add_and_bootstrap_nodes(self, count) -> list[BaseNode]:
        """Add `count` nodes of the configured instance type and wait until they are up and normal.

        Shared bootstrap sequence used by both expand_cluster_heterogenous() and
        scale_out(), which previously duplicated it line-for-line.
        Returns the newly added nodes.
        """
        new_nodes = self.db_cluster.add_nodes(
            count=count,
            instance_type=self.params.get("nemesis_grow_shrink_instance_type"),
            enable_auto_bootstrap=True,
            rack=None,
        )
        self.monitors.reconfigure_scylla_monitoring()
        up_timeout = MAX_TIME_WAIT_FOR_NEW_NODE_UP
        with adaptive_timeout(Operations.NEW_NODE, node=self.db_cluster.data_nodes[0], timeout=up_timeout):
            self.db_cluster.wait_for_init(node_list=new_nodes, timeout=up_timeout, check_node_health=False)
        self.db_cluster.set_seeds()
        self.db_cluster.update_seed_provider()
        self.db_cluster.wait_for_nodes_up_and_normal(nodes=new_nodes)
        return new_nodes

    def expand_cluster_heterogenous(self):
        """Make the cluster heterogeneous by adding 'nemesis_add_node_cnt' nodes of a different instance type."""
        self._add_and_bootstrap_nodes(self.params.get("nemesis_add_node_cnt"))

    def wait_for_balance(self):
        """Wait until tablet migrations have quiesced on all data nodes.

        Run multiple times because `storage_service/quiesce_topology` only returns when
        the topology operations that were ongoing when the command was issued are done,
        but new operations can start right after that.
        """
        for _ in range(3):
            ParallelObject(objects=self.db_cluster.data_nodes, timeout=3600).run(wait_no_tablets_migration_running)

    def check_balance(self):
        """Compare per-node disk usage within each rack; publish an ERROR event if the
        spread exceeds BALANCE_THRESHOLD percentage points."""
        rack_usages = defaultdict(list)
        for node in self.db_cluster.data_nodes:
            rack_usages[node.rack].append(get_node_disk_usage(node))

        for rack, usages in rack_usages.items():
            min_utilization = min(usages)
            max_utilization = max(usages)
            if max_utilization - min_utilization > BALANCE_THRESHOLD:
                TestFrameworkEvent(
                    source="longevity_balancer_test",
                    message=f"Storage utilization is not balanced in rack {rack}. Min: {min_utilization:.2f}%, Max: {max_utilization:.2f}%",
                    severity=Severity.ERROR,
                ).publish()

    def scale_out(self):
        """Add one node per rack and wait for bootstrap; return the added nodes."""
        return self._add_and_bootstrap_nodes(self.db_cluster.racks_count)

    def scale_in(self, nodes: list[BaseNode]):
        """Decommission the given nodes in parallel and refresh monitoring."""
        for node in nodes:
            # Mark nodes as busy so no nemesis picks them while they are being removed.
            self.nemesis_allocator.set_running_nemesis(node, "decommissioning")
        parallel_obj = ParallelObject(objects=nodes, timeout=MAX_TIME_WAIT_FOR_DECOMMISSION, num_workers=len(nodes))
        InfoEvent(f"Started decommissioning {nodes}").publish()
        parallel_obj.run(self.db_cluster.decommission, ignore_exceptions=False, unpack_objects=True)
        InfoEvent(f"Finished decommissioning {nodes}").publish()
        self.monitors.reconfigure_scylla_monitoring()

    def run_stress_command(self):
        """Run all 'stress_cmd' workloads and verify each stress thread finished cleanly."""
        stress_queue = []
        self.assemble_and_run_all_stress_cmd(
            stress_queue, self.params.get("stress_cmd"), self.params.get("keyspace_num")
        )
        for stress in stress_queue:
            self.verify_stress_thread(stress)

    @contextmanager
    def enforce_balance(self):
        """
        Add a property to the cluster to enforce balance checking in Argus.
        """
        self.db_cluster._enforce_balance = True
        try:
            yield
        finally:
            self.db_cluster._enforce_balance = False

    def test_load_balance(self):
        """
        Test to ensure that the cluster is balanced correctly in difficult conditions:
        - heterogeneous nodes with different disk sizes.
        - multiple tables with different partition sizes.

        This test will:
        1. Expand the cluster by adding new nodes of different types.
           Not possible to start with heterogeneous nodes, as the cluster is created with a single type.
        2. Populate the cluster with data.
        3. Add some nodes to the cluster.
        4. Write more data to the cluster.
        5. Remove the added nodes.
        6. Wait for the cluster to balance.
        7. Check the final balance of the cluster.
        """
        self.expand_cluster_heterogenous()
        with PeriodicDiskUsageToArgus(
            self.db_cluster, self.test_config.argus_client(), interval=600, threshold=BALANCE_THRESHOLD
        ):
            with self.enforce_balance():
                self.run_prepare_write_cmd()

            new_nodes = self.scale_out()
            self.wait_for_balance()
            self.check_balance()

            with self.enforce_balance():
                self.run_stress_command()

            self.scale_in(new_nodes)
            self.wait_for_balance()
            self.check_balance()
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Size-based load balancing test.
2+
# This test ensures storage utilization within each rack stays within the allowed threshold.
3+
# - Work with a heterogeneous cluster: add 'nemesis_add_node_cnt' nodes of type 'nemesis_grow_shrink_instance_type' to the base cluster (n_db_nodes = 6).
4+
# - Initial data population: run the 'prepare_write_cmd' scylla-bench commands.
5+
# The commands are designed to create a significant amount of data (total ~1.2TB) with a wide distribution of partition sizes, which should trigger the size-based load balancing logic.
6+
# - Scale-out: add one node per rack ('nemesis_add_node_cnt'/ 'simulated_racks' = 3) of type 'nemesis_grow_shrink_instance_type'.
7+
# - Check intermediate balance: wait for tablets migration to quiesce and compare per-node disk usage per rack (threshold 5%, defined in test).
8+
# - Secondary data population: run the 'stress_cmd' scylla-bench commands.
9+
# - Scale-in: decommission the added nodes.
10+
# - Check final balance: wait for tablets migration to quiesce and compare per-node disk usage per rack (threshold 5%, defined in test).
11+
12+
test_duration: 720
13+
prepare_write_cmd: [
14+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table1 -partition-count=200000 -partition-offset=10000001 -clustering-row-count=10 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
15+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table2 -partition-count=100000 -partition-offset=20000001 -clustering-row-count=10 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
16+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table3 -partition-count=20000 -partition-offset=30000001 -clustering-row-count=100 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
17+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table1 -partition-count=10000 -partition-offset=40000001 -clustering-row-count=100 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
18+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table2 -partition-count=2000 -partition-offset=50000001 -clustering-row-count=1000 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
19+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table3 -partition-count=1000 -partition-offset=60000001 -clustering-row-count=1000 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
20+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table1 -partition-count=200 -partition-offset=70000001 -clustering-row-count=10000 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
21+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table2 -partition-count=100 -partition-offset=80000001 -clustering-row-count=10000 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
22+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table3 -partition-count=20 -partition-offset=90000001 -clustering-row-count=100000 -clustering-row-size=fixed:131072 -concurrency=20 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
23+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table1 -partition-count=10 -partition-offset=100000001 -clustering-row-count=100000 -clustering-row-size=fixed:131072 -concurrency=10 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
24+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table2 -partition-count=2 -partition-offset=110000001 -clustering-row-count=1000000 -clustering-row-size=fixed:131072 -concurrency=2 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
25+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table3 -partition-count=1 -partition-offset=120000001 -clustering-row-count=1000000 -clustering-row-size=fixed:131072 -concurrency=1 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
26+
]
27+
28+
stress_cmd: [
29+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table4 -partition-count=200000 -partition-offset=210000001 -clustering-row-count=7 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
30+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table5 -partition-count=100000 -partition-offset=220000001 -clustering-row-count=7 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
31+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table6 -partition-count=20000 -partition-offset=230000001 -clustering-row-count=70 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
32+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table4 -partition-count=10000 -partition-offset=240000001 -clustering-row-count=70 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
33+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table5 -partition-count=2000 -partition-offset=250000001 -clustering-row-count=700 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
34+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table6 -partition-count=1000 -partition-offset=260000001 -clustering-row-count=700 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
35+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table4 -partition-count=200 -partition-offset=270000001 -clustering-row-count=7000 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
36+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table5 -partition-count=100 -partition-offset=280000001 -clustering-row-count=7000 -clustering-row-size=fixed:131072 -concurrency=40 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
37+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table6 -partition-count=20 -partition-offset=290000001 -clustering-row-count=70000 -clustering-row-size=fixed:131072 -concurrency=20 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
38+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table4 -partition-count=10 -partition-offset=300000001 -clustering-row-count=70000 -clustering-row-size=fixed:131072 -concurrency=10 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
39+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table5 -partition-count=2 -partition-offset=310000001 -clustering-row-count=700000 -clustering-row-size=fixed:131072 -concurrency=2 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
40+
"scylla-bench -workload=sequential -mode=write -replication-factor=3 -table=table6 -partition-count=1 -partition-offset=320000001 -clustering-row-count=700000 -clustering-row-size=fixed:131072 -concurrency=1 -connection-count=40 -consistency-level=quorum -rows-per-request=20 -timeout=180s -retry-number=50 -retry-interval=100ms,1s",
41+
]
42+
43+
n_db_nodes: 6
44+
simulated_racks: 3
45+
instance_type_db: 'i4i.2xlarge'
46+
# instance types and number are chosen so at the end of the test the storage utilization is ~85-90%
47+
n_loaders: 6
48+
instance_type_loader: 'c6i.2xlarge'
49+
# the test should run on a heterogeneous cluster, so it adds different instance types
50+
# reuses the nemesis_ variables for simplicity
51+
nemesis_add_node_cnt: 3
52+
nemesis_grow_shrink_instance_type: 'i4i.large'
53+
54+
user_prefix: 'balancer-test'
55+
56+
round_robin: true
57+
append_scylla_args: '++ --logger-log-level load_balancer=debug'

0 commit comments

Comments
 (0)