| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# This program is free software; you can redistribute it and/or modify |
| 4 | +# it under the terms of the GNU Affero General Public License as published by |
| 5 | +# the Free Software Foundation; either version 3 of the License, or |
| 6 | +# (at your option) any later version. |
| 7 | +# |
| 8 | +# This program is distributed in the hope that it will be useful, |
| 9 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 11 | +# |
| 12 | +# See LICENSE for more details. |
| 13 | +# |
| 14 | +# Copyright (c) 2025 ScyllaDB |
| 15 | + |
| 16 | + |
| 17 | +from collections import defaultdict |
| 18 | +from contextlib import contextmanager |
| 19 | +from longevity_test import LongevityTest |
| 20 | +from sdcm.argus_results import PeriodicDiskUsageToArgus |
| 21 | +from sdcm.cluster import MAX_TIME_WAIT_FOR_DECOMMISSION, MAX_TIME_WAIT_FOR_NEW_NODE_UP, BaseNode |
| 22 | +from sdcm.sct_events import Severity |
| 23 | +from sdcm.sct_events.system import InfoEvent, TestFrameworkEvent |
| 24 | +from sdcm.utils.adaptive_timeouts import Operations, adaptive_timeout |
| 25 | +from sdcm.utils.common import ParallelObject, get_node_disk_usage |
| 26 | +from sdcm.utils.tablets.common import wait_no_tablets_migration_running |
| 27 | + |
| 28 | +# Per the requirement, the load balance difference should be below 5% for nodes in the same rack |
| 29 | +# https://scylladb.atlassian.net/wiki/spaces/RND/pages/5505671/Size-Based+Load+Balancing+Requirement+Document#Performance |
| 30 | +BALANCE_THRESHOLD = 5 |
| 31 | + |
| 32 | + |
| 33 | +class LongevityBalancerTest(LongevityTest): |
| 34 | + """ |
| 35 | + Test to ensure that the cluster is balanced correctly in difficult conditions. |
| 36 | + |
| 37 | + Required config params: |
| 38 | + - nemesis_add_node_cnt: |
| 39 | + - At the beginning of the test, this many nodes will be added to the cluster |
| 40 | + - In the middle of the test, as many nodes as there are racks will be added and then removed from the cluster |
| 41 | + - nemesis_grow_shrink_instance_type: The instance type to use when adding nodes to the cluster |
| 42 | + - prepare_write_cmd: The stress command to use for the initial data population |
| 43 | + - stress_cmd: The stress command to use for the second data population, after adding and removing nodes |
| 44 | + """ |
| 45 | + |
| 46 | + def expand_cluster_heterogeneous(self): |
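|  | + """Add nemesis_add_node_cnt nodes of nemesis_grow_shrink_instance_type so the cluster becomes heterogeneous.""" |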
| 47 | + new_nodes = self.db_cluster.add_nodes( |
| 48 | + count=self.params.get("nemesis_add_node_cnt"), |
| 49 | + instance_type=self.params.get("nemesis_grow_shrink_instance_type"), |
| 50 | + enable_auto_bootstrap=True, |
| 51 | + rack=None, |
| 52 | + ) |
| 53 | + self.monitors.reconfigure_scylla_monitoring() |
| 54 | + up_timeout = MAX_TIME_WAIT_FOR_NEW_NODE_UP |
| 55 | + with adaptive_timeout(Operations.NEW_NODE, node=self.db_cluster.data_nodes[0], timeout=up_timeout): |
| 56 | + self.db_cluster.wait_for_init(node_list=new_nodes, timeout=up_timeout, check_node_health=False) |
| 57 | + self.db_cluster.set_seeds() |
| 58 | + self.db_cluster.update_seed_provider() |
| 59 | + self.db_cluster.wait_for_nodes_up_and_normal(nodes=new_nodes) |
| 60 | + |
| 61 | + def wait_for_balance(self): |
| 62 | + # Run multiple times because `storage_service/quiesce_topology` only returns when |
| 63 | + # the topology operations that were ongoing when the command was issued are done, |
| 64 | + # but new operations can start right after that. |
| 65 | + for _ in range(3): |
| 66 | + ParallelObject(objects=self.db_cluster.data_nodes, timeout=3600).run(wait_no_tablets_migration_running) |
| 67 | + |
| 68 | + def check_balance(self): |
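|  | + """Report an error if disk usage across data nodes in any rack differs by more than BALANCE_THRESHOLD percent.""" |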
| 69 | + rack_usages = defaultdict(list) |
| 70 | + for node in self.db_cluster.data_nodes: |
| 71 | + rack_usages[node.rack].append(get_node_disk_usage(node)) |
| 72 | + |
| 73 | + for rack, usages in rack_usages.items(): |
| 74 | + min_utilization = min(usages) |
| 75 | + max_utilization = max(usages) |
| 76 | + if max_utilization - min_utilization > BALANCE_THRESHOLD: |
| 77 | + TestFrameworkEvent( |
| 78 | + source="longevity_balancer_test", |
| 79 | + message=f"Storage utilization is not balanced in rack {rack}. Min: {min_utilization:.2f}%, Max: {max_utilization:.2f}%", |
| 80 | + severity=Severity.ERROR, |
| 81 | + ).publish() |
| 82 | + |
| 83 | + def scale_out(self): |
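|  | + """Add as many nodes as there are racks, wait for them to join the cluster, and return the new nodes.""" |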
| 84 | + added_nodes = self.db_cluster.add_nodes( |
| 85 | + count=self.db_cluster.racks_count, |
| 86 | + instance_type=self.params.get("nemesis_grow_shrink_instance_type"), |
| 87 | + enable_auto_bootstrap=True, |
| 88 | + rack=None, |
| 89 | + ) |
| 90 | + self.monitors.reconfigure_scylla_monitoring() |
| 91 | + up_timeout = MAX_TIME_WAIT_FOR_NEW_NODE_UP |
| 92 | + with adaptive_timeout(Operations.NEW_NODE, node=self.db_cluster.data_nodes[0], timeout=up_timeout): |
| 93 | + self.db_cluster.wait_for_init(node_list=added_nodes, timeout=up_timeout, check_node_health=False) |
| 94 | + self.db_cluster.set_seeds() |
| 95 | + self.db_cluster.update_seed_provider() |
| 96 | + self.db_cluster.wait_for_nodes_up_and_normal(nodes=added_nodes) |
| 97 | + return added_nodes |
| 98 | + |
| 99 | + def scale_in(self, nodes: list[BaseNode]): |
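|  | + """Mark the given nodes as busy for the nemesis allocator, then decommission them in parallel.""" |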
| 100 | + for node in nodes: |
| 101 | + self.nemesis_allocator.set_running_nemesis(node, "decommissioning") |
| 102 | + parallel_obj = ParallelObject(objects=nodes, timeout=MAX_TIME_WAIT_FOR_DECOMMISSION, num_workers=len(nodes)) |
| 103 | + InfoEvent(f"Started decommissioning {[node for node in nodes]}").publish() |
| 104 | + parallel_obj.run(self.db_cluster.decommission, ignore_exceptions=False, unpack_objects=True) |
| 105 | + InfoEvent(f"Finished decommissioning {[node for node in nodes]}").publish() |
| 106 | + self.monitors.reconfigure_scylla_monitoring() |
| 107 | + |
| 108 | + def run_stress_command(self): |
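|  | + """Run the configured stress_cmd and verify every resulting stress thread.""" |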
| 109 | + stress_queue = [] |
| 110 | + self.assemble_and_run_all_stress_cmd( |
| 111 | + stress_queue, self.params.get("stress_cmd"), self.params.get("keyspace_num") |
| 112 | + ) |
| 113 | + for stress in stress_queue: |
| 114 | + self.verify_stress_thread(stress) |
| 115 | + |
| 116 | + @contextmanager |
| 117 | + def enforce_balance(self): |
| 118 | + """ |
| 119 | + Temporarily set a flag on the cluster to enforce balance checking in Argus. |
| 120 | + """ |
| 121 | + self.db_cluster._enforce_balance = True |
| 122 | + try: |
| 123 | + yield |
| 124 | + finally: |
| 125 | + self.db_cluster._enforce_balance = False |
| 126 | + |
| 127 | + def test_load_balance(self): |
| 128 | + """ |
| 129 | + Test to ensure that the cluster is balanced correctly in difficult conditions: |
| 130 | + - heterogeneous nodes with different disk sizes. |
| 131 | + - multiple tables with different partition sizes. |
| 132 | + |
| 133 | + This test will: |
| 134 | + 1. Expand the cluster by adding new nodes of different types. |
| 135 | + It is not possible to start with heterogeneous nodes, as the cluster is created with a single instance type. |
| 136 | + 2. Populate the cluster with data. |
| 137 | + 3. Add some nodes to the cluster. |
| 138 | + 4. Write more data to the cluster. |
| 139 | + 5. Remove the added nodes. |
| 140 | + 6. Wait for the cluster to balance. |
| 141 | + 7. Check the final balance of the cluster. |
| 142 | + """ |
| 143 | + self.expand_cluster_heterogeneous() |
| 144 | + with PeriodicDiskUsageToArgus( |
| 145 | + self.db_cluster, self.test_config.argus_client(), interval=600, threshold=BALANCE_THRESHOLD |
| 146 | + ): |
| 147 | + with self.enforce_balance(): |
| 148 | + self.run_prepare_write_cmd() |
| 149 | + |
| 150 | + new_nodes = self.scale_out() |
| 151 | + self.wait_for_balance() |
| 152 | + self.check_balance() |
| 153 | + |
| 154 | + with self.enforce_balance(): |
| 155 | + self.run_stress_command() |
| 156 | + |
| 157 | + self.scale_in(new_nodes) |
| 158 | + self.wait_for_balance() |
| 159 | + self.check_balance() |