
Commit a8e775a

Adding e2e maint notifications tests covering node migration and rebind
1 parent 3b21ec0 commit a8e775a

3 files changed, +232 -27 lines changed

tests/test_scenario/fault_injector_client.py

Lines changed: 34 additions & 23 deletions
@@ -113,6 +113,7 @@ def execute_migrate(
         endpoint_config: Dict[str, Any],
         target_node: str,
         empty_node: str,
+        skip_end_notification: bool = False,
     ) -> str:
         pass

@@ -452,6 +453,7 @@ def execute_migrate(
         endpoint_config: Dict[str, Any],
         target_node: str,
         empty_node: str,
+        skip_end_notification: bool = False,
     ) -> str:
         """Execute rladmin migrate command and wait for completion."""
         command = f"migrate node {target_node} all_shards target_node {empty_node}"
@@ -554,15 +556,20 @@ def __init__(self, oss_cluster: bool = False):
         self.proxy_helper.set_cluster_slots(
             self.CLUSTER_SLOTS_INTERCEPTOR_NAME, self.DEFAULT_CLUSTER_SLOTS
         )
-        logging.info("Sleeping for 2 seconds to allow proxy to apply the changes...")
-        time.sleep(2)

         self.seq_id = 0

     def _get_seq_id(self):
         self.seq_id += 1
         return self.seq_id

+    def get_operation_result(
+        self,
+        action_id: str,
+        timeout: int = 60,
+    ) -> Dict[str, Any]:
+        return {"status": "done"}
+
     def find_target_node_and_empty_node(
         self,
         endpoint_config: Dict[str, Any],
@@ -633,7 +640,11 @@ def execute_failover(
         return {"status": "done"}

     def execute_migrate(
-        self, endpoint_config: Dict[str, Any], target_node: str, empty_node: str
+        self,
+        endpoint_config: Dict[str, Any],
+        target_node: str,
+        empty_node: str,
+        skip_end_notification: bool = False,
     ) -> str:
         """
         Simulate migrate command execution.
@@ -661,25 +672,27 @@ def execute_migrate(
             time.sleep(self.SLEEP_TIME_BETWEEN_START_END_NOTIFICATIONS)

         if self.oss_cluster:
-            # intercept cluster slots
-            self.proxy_helper.set_cluster_slots(
-                self.CLUSTER_SLOTS_INTERCEPTOR_NAME,
-                [
-                    SlotsRange("127.0.0.1", self.NODE_PORT_2, 0, 200),
-                    SlotsRange("127.0.0.1", self.NODE_PORT_1, 201, 8191),
-                    SlotsRange("127.0.0.1", self.NODE_PORT_2, 8192, 16383),
-                ],
-            )
-            # send smigrated
-            end_maint_notif = RespTranslator.oss_maint_notification_to_resp(
-                f"SMIGRATED {self._get_seq_id()} 127.0.0.1:{self.NODE_PORT_2} 0-200"
-            )
+            if not skip_end_notification:
+                # intercept cluster slots
+                self.proxy_helper.set_cluster_slots(
+                    self.CLUSTER_SLOTS_INTERCEPTOR_NAME,
+                    [
+                        SlotsRange("127.0.0.1", self.NODE_PORT_2, 0, 200),
+                        SlotsRange("127.0.0.1", self.NODE_PORT_1, 201, 8191),
+                        SlotsRange("127.0.0.1", self.NODE_PORT_2, 8192, 16383),
+                    ],
+                )
+                # send smigrated
+                end_maint_notif = RespTranslator.oss_maint_notification_to_resp(
+                    f"SMIGRATED {self._get_seq_id()} 127.0.0.1:{self.NODE_PORT_2} 0-200"
+                )
+                self.proxy_helper.send_notification(self.NODE_PORT_1, end_maint_notif)
         else:
             # send migrated
             end_maint_notif = RespTranslator.re_cluster_maint_notification_to_resp(
                 f"MIGRATED {self._get_seq_id()} [1]"
             )
-        self.proxy_helper.send_notification(self.NODE_PORT_1, end_maint_notif)
+            self.proxy_helper.send_notification(self.NODE_PORT_1, end_maint_notif)

         return "done"

@@ -695,17 +708,15 @@ def execute_rebind(self, endpoint_config: Dict[str, Any], endpoint_id: str) -> s
         """
         sleep_time = self.SLEEP_TIME_BETWEEN_START_END_NOTIFICATIONS
         if self.oss_cluster:
-            # send smigrating
-            maint_start_notif = RespTranslator.oss_maint_notification_to_resp(
-                f"SMIGRATING {self._get_seq_id()} 0-8191"
-            )
+            # smigrating should be sent as part of the migrate flow
+            pass
         else:
             # send moving
             sleep_time = self.MOVING_TTL
             maint_start_notif = RespTranslator.re_cluster_maint_notification_to_resp(
                 f"MOVING {self._get_seq_id()} {sleep_time} 127.0.0.1:{self.NODE_PORT_3}"
             )
-        self.proxy_helper.send_notification(self.NODE_PORT_1, maint_start_notif)
+            self.proxy_helper.send_notification(self.NODE_PORT_1, maint_start_notif)

         # sleep to allow the client to receive the notification
         time.sleep(sleep_time)
@@ -725,7 +736,7 @@ def execute_rebind(self, endpoint_config: Dict[str, Any], endpoint_id: str) -> s
             )
             self.proxy_helper.send_notification(self.NODE_PORT_1, smigrated_node_1)
         else:
-            # TODO drop connections to node 1
+            # TODO drop connections to node 1 to simulate that the node is removed
            pass

         return "done"

tests/test_scenario/maint_notifications_helpers.py

Lines changed: 4 additions & 1 deletion
@@ -127,9 +127,12 @@ def execute_migrate(
         endpoint_config: Dict[str, Any],
         target_node: str,
         empty_node: str,
+        skip_end_notification: bool = False,
     ) -> str:
         """Execute rladmin migrate command and wait for completion."""
-        return fault_injector.execute_migrate(endpoint_config, target_node, empty_node)
+        return fault_injector.execute_migrate(
+            endpoint_config, target_node, empty_node, skip_end_notification
+        )

     @staticmethod
     def execute_rebind(
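
At the helper layer the new flag is a straight pass-through to whichever fault injector is in use, and it defaults to False, so existing migrate-only callers keep their current behavior. A hedged fragment of the keyword-style call the tests below use; the import path and the fixture-provided arguments are assumptions:

# Assumed import path; ClusterOperations is the helper edited in this file.
from tests.test_scenario.maint_notifications_helpers import ClusterOperations

# fault_injector_client, endpoints_config, and the node ids come from test fixtures.
action_id = ClusterOperations.execute_migrate(
    fault_injector=fault_injector_client,
    endpoint_config=endpoints_config,
    target_node=target_node,
    empty_node=empty_node,
    skip_end_notification=True,  # defer the SMIGRATED end notification to the rebind step
)

Only the combined migrate-and-bind flow opts in with True; the existing migrate-only tests keep passing the default.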

tests/test_scenario/test_maint_notifications.py

Lines changed: 194 additions & 3 deletions
@@ -73,12 +73,14 @@ def _execute_migration(
         endpoints_config: Dict[str, Any],
         target_node: str,
         empty_node: str,
+        skip_end_notification: bool = False,
     ):
         migrate_action_id = ClusterOperations.execute_migrate(
             fault_injector=fault_injector_client,
             endpoint_config=endpoints_config,
             target_node=target_node,
             empty_node=empty_node,
+            skip_end_notification=skip_end_notification,
         )

         self._migration_executed = True
@@ -118,6 +120,7 @@ def _execute_migrate_bind_flow(
             endpoints_config=endpoints_config,
             target_node=target_node,
             empty_node=empty_node,
+            skip_end_notification=True,
         )
         self._execute_bind(
             fault_injector_client=fault_injector_client,
@@ -1364,7 +1367,6 @@ def execute_commands(duration: int, errors: Queue):
                    )
            logging.debug(f"{threading.current_thread().name}: Thread ended")

-        logging.info("Creating one connection in the pool.")
         # get the node covering first shard - it is the node we will failover
         target_node = (
             cluster_client_maint_notifications.nodes_manager.get_node_from_slot(0)
@@ -1447,7 +1449,7 @@ def test_notification_handling_during_migration_without_node_replacement(
             cluster_client_maint_notifications.nodes_manager.nodes_cache.copy()
         )

-        logging.info("Executing failover command...")
+        logging.info("Executing migrate command...")
         migration_thread = Thread(
             target=self._execute_migration,
             name="migration_thread",
@@ -1566,7 +1568,7 @@ def execute_commands(duration: int, errors: Queue):
            thread.start()
            threads.append(thread)

-        logging.info("Executing failover command...")
+        logging.info("Executing migrate command...")
         migration_thread = Thread(
             target=self._execute_migration,
             name="migration_thread",
@@ -1605,3 +1607,192 @@ def execute_commands(duration: int, errors: Queue):

         # validate no errors were raised in the command execution threads
         assert errors.empty(), f"Errors occurred in threads: {errors.queue}"
+
+    @pytest.mark.timeout(300)  # 5 minutes timeout for this test
+    def test_notification_handling_during_migration_and_re_bind(
+        self,
+        cluster_client_maint_notifications: RedisCluster,
+        fault_injector_client_oss_api: FaultInjectorClient,
+        cluster_endpoints_config: Dict[str, Any],
+    ):
+        """
+        Test that push notifications are received when executing RE cluster migrate and bind operations.
+
+        """
+        # get the node covering the first shard - it is the node whose slots will be migrated
+        target_node = (
+            cluster_client_maint_notifications.nodes_manager.get_node_from_slot(0)
+        )
+        logging.info(
+            f"Creating one connection in the pool using node {target_node.name}."
+        )
+        conn = target_node.redis_connection.connection_pool.get_connection()
+        cluster_nodes = (
+            cluster_client_maint_notifications.nodes_manager.nodes_cache.copy()
+        )
+
+        logging.info("Executing migrate and bind flow ...")
+        migrate_and_bind_thread = Thread(
+            target=self._execute_migrate_bind_flow,
+            name="migrate_and_bind_thread",
+            args=(
+                fault_injector_client_oss_api,
+                cluster_endpoints_config,
+                self.target_node.node_id,
+                self.empty_node.node_id,
+                self.endpoint_id,
+            ),
+        )
+        migrate_and_bind_thread.start()
+
+        logging.info("Waiting for SMIGRATING push notifications...")
+        ClientValidations.wait_push_notification(
+            cluster_client_maint_notifications,
+            timeout=SMIGRATING_TIMEOUT,
+            connection=conn,
+        )
+
+        logging.info("Validating connection maintenance state...")
+        assert conn.maintenance_state == MaintenanceState.MAINTENANCE
+        assert conn._sock.gettimeout() == RELAXED_TIMEOUT
+        assert conn.should_reconnect() is False
+
+        assert len(cluster_nodes) == len(
+            cluster_client_maint_notifications.nodes_manager.nodes_cache
+        )
+        for node_key in cluster_nodes.keys():
+            assert (
+                node_key in cluster_client_maint_notifications.nodes_manager.nodes_cache
+            )
+
+        logging.info("Waiting for SMIGRATED push notifications...")
+        ClientValidations.wait_push_notification(
+            cluster_client_maint_notifications,
+            timeout=SMIGRATED_TIMEOUT,
+            connection=conn,
+        )
+
+        logging.info("Validating connection state after SMIGRATED ...")
+
+        assert conn.should_reconnect() is True
+
+        # the overall number of nodes should be the same - one removed and one added
+        assert len(cluster_nodes) == len(
+            cluster_client_maint_notifications.nodes_manager.nodes_cache
+        )
+        assert (
+            target_node.name
+            not in cluster_client_maint_notifications.nodes_manager.nodes_cache
+        )
+
+        logging.info("Releasing connection back to the pool...")
+        target_node.redis_connection.connection_pool.release(conn)
+
+        migrate_and_bind_thread.join()
+
+    @pytest.mark.timeout(300)  # 5 minutes timeout for this test
+    def test_command_execution_during_migration_and_re_bind(
+        self,
+        fault_injector_client_oss_api: FaultInjectorClient,
+        cluster_endpoints_config: Dict[str, Any],
+    ):
+        """
+        Test that commands keep executing successfully while RE cluster migrate and bind operations are in progress.
+        """

+        errors = Queue()
+        if isinstance(fault_injector_client_oss_api, ProxyServerFaultInjector):
+            execution_duration = 20
+        else:
+            execution_duration = 180
+
+        socket_timeout = 0.5
+
+        cluster_client_maint_notifications = _get_cluster_client_maint_notifications(
+            endpoints_config=cluster_endpoints_config,
+            disable_retries=True,
+            socket_timeout=socket_timeout,
+            enable_maintenance_notifications=True,
+        )
+
+        def execute_commands(duration: int, errors: Queue):
+            start = time.time()
+            while time.time() - start < duration:
+                try:
+                    # the slot is covered by the first shard - this one will have slots migrated
+                    cluster_client_maint_notifications.set("key:{3}", "value")
+                    cluster_client_maint_notifications.get("key:{3}")
+                    # execute also commands that will run on the second shard
+                    cluster_client_maint_notifications.set("key:{0}", "value")
+                    cluster_client_maint_notifications.get("key:{0}")
+                except Exception as e:
+                    logging.error(
+                        f"Error in thread {threading.current_thread().name}: {e}"
+                    )
+                    errors.put(
+                        f"Command failed in thread {threading.current_thread().name}: {e}"
+                    )
+            logging.debug(f"{threading.current_thread().name}: Thread ended")
+
+        # get the node covering the first shard - it is the node we will migrate and remove
+        target_node = (
+            cluster_client_maint_notifications.nodes_manager.get_node_from_slot(0)
+        )
+
+        cluster_nodes = (
+            cluster_client_maint_notifications.nodes_manager.nodes_cache.copy()
+        )
+
+        threads = []
+        for i in range(10):
+            thread = Thread(
+                target=execute_commands,
+                name=f"command_execution_thread_{i}",
+                args=(
+                    execution_duration,
+                    errors,
+                ),
+            )
+            thread.start()
+            threads.append(thread)
+
+        logging.info("Executing migrate and bind flow...")
+        migrate_and_bind_thread = Thread(
+            target=self._execute_migrate_bind_flow,
+            name="migrate_and_bind_thread",
+            args=(
+                fault_injector_client_oss_api,
+                cluster_endpoints_config,
+                self.target_node.node_id,
+                self.empty_node.node_id,
+                self.endpoint_id,
+            ),
+        )
+        migrate_and_bind_thread.start()
+
+        for thread in threads:
+            thread.join()
+
+        migrate_and_bind_thread.join()
+
+        # validate cluster nodes
+        assert len(cluster_nodes) == len(
+            cluster_client_maint_notifications.nodes_manager.nodes_cache
+        )
+        assert (
+            target_node.name
+            not in cluster_client_maint_notifications.nodes_manager.nodes_cache
+        )
+
+        for (
+            node
+        ) in cluster_client_maint_notifications.nodes_manager.nodes_cache.values():
+            # validate connection settings
+            self._validate_default_state(
+                node.redis_connection,
+                expected_matching_conns_count=10,
+                configured_timeout=socket_timeout,
+            )
+
+        # validate no errors were raised in the command execution threads
+        assert errors.empty(), f"Errors occurred in threads: {errors.queue}"
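
To run just the two new scenarios locally, an invocation along the following lines should work; it assumes the scenario-test environment (proxy or fault injector) is available and that the pytest-timeout plugin implied by @pytest.mark.timeout(300) is installed:

# Hypothetical local run; the -k expression matches both new tests by name.
import pytest

pytest.main(
    [
        "tests/test_scenario/test_maint_notifications.py",
        "-k", "migration_and_re_bind",
        "--log-cli-level=INFO",
    ]
)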
