diff --git a/sdcm/nemesis.py b/sdcm/nemesis.py
index c83612580c..7d345702f6 100644
--- a/sdcm/nemesis.py
+++ b/sdcm/nemesis.py
@@ -5525,15 +5525,15 @@ def drop_keyspace(node):
             target_host_id = self.target_node.host_id
             stack.callback(self._remove_node_add_node, verification_node=working_node,
                            node_to_remove=self.target_node, remove_node_host_id=target_host_id)
-
+            stack.enter_context(node_operations.block_loaders_payload_for_scylla_node(
+                self.target_node, loader_nodes=self.loaders.nodes))
             self.tester.create_keyspace(keyspace_name, replication_factor=3)
             self.tester.create_table(name=table_name, keyspace_name=keyspace_name,
                                      key_type="bigint", columns={"name": "text"})
-            stack.callback(drop_keyspace, node=working_node)
             with simulate_node_unavailability(self.target_node):
                 # target node stopped by Contextmanger. Wait while its status will be updated
-                wait_for(node_operations.is_node_seen_as_down, timeout=600, throw_exc=True,
+                wait_for(node_operations.is_node_seen_as_down, step=5, timeout=600, throw_exc=True,
                          down_node=self.target_node, verification_node=working_node,
                          text=f"Wait other nodes see {self.target_node.name} as DOWN...")
                 self.log.debug("Remove node %s : hostid: %s with blocked scylla from cluster",
                                self.target_node.name, target_host_id)
@@ -5560,12 +5560,12 @@ def drop_keyspace(node):
 
         with self.cluster.cql_connection_patient(working_node) as session:
             LOGGER.debug("Check keyspace %s.%s is empty", keyspace_name, table_name)
-            result = list(session.execute(f"SELECT * from {keyspace_name}.{table_name}"))
+            stmt = SimpleStatement(f"SELECT * from {keyspace_name}.{table_name}",
+                                   consistency_level=ConsistencyLevel.QUORUM)
+            result = list(session.execute(stmt))
             LOGGER.debug("Query result %s", result)
             assert not result, f"New rows were added from banned node, {result}"
 
-        drop_keyspace(working_node)
-
 
 def disrupt_method_wrapper(method, is_exclusive=False):  # pylint: disable=too-many-statements  # noqa: PLR0915
     """
diff --git a/sdcm/utils/nemesis_utils/node_operations.py b/sdcm/utils/nemesis_utils/node_operations.py
index cce60171de..c27e9eecbd 100644
--- a/sdcm/utils/nemesis_utils/node_operations.py
+++ b/sdcm/utils/nemesis_utils/node_operations.py
@@ -9,7 +9,7 @@
 
 
 @contextlib.contextmanager
-def block_scylla_ports(target_node: "BaseNode", ports: list[int] | None = None):
+def block_scylla_ports(target_node: BaseNode, ports: list[int] | None = None):
     ports = ports or [7001, 7000, 9042, 9142, 19042, 19142]
     target_node.install_package("iptables")
     target_node.start_service("iptables", ignore_status=True)
@@ -17,16 +17,20 @@ def block_scylla_ports(target_node: "BaseNode", ports: list[int] | None = None):
     for port in ports:
         target_node.remoter.sudo(f"iptables -A INPUT -p tcp --dport {port} -j DROP")
         target_node.remoter.sudo(f"iptables -A OUTPUT -p tcp --dport {port} -j DROP")
+        target_node.remoter.sudo(f"ip6tables -A INPUT -p tcp --dport {port} -j DROP")
+        target_node.remoter.sudo(f"ip6tables -A OUTPUT -p tcp --dport {port} -j DROP")
     yield
     target_node.log.debug("Remove all iptable rules %s", target_node.name)
     for port in ports:
         target_node.remoter.sudo(f"iptables -D INPUT -p tcp --dport {port} -j DROP")
         target_node.remoter.sudo(f"iptables -D OUTPUT -p tcp --dport {port} -j DROP")
+        target_node.remoter.sudo(f"ip6tables -D INPUT -p tcp --dport {port} -j DROP")
+        target_node.remoter.sudo(f"ip6tables -D OUTPUT -p tcp --dport {port} -j DROP")
     target_node.stop_service("iptables", ignore_status=True)
 
 
 @contextlib.contextmanager
-def pause_scylla_with_sigstop(target_node: "BaseNode"):
+def pause_scylla_with_sigstop(target_node: BaseNode):
     target_node.log.debug("Send signal SIGSTOP to scylla process on node %s", target_node.name)
     target_node.remoter.sudo("pkill --signal SIGSTOP -e scylla", timeout=60)
     yield
@@ -34,6 +38,37 @@ def pause_scylla_with_sigstop(target_node: BaseNode):
     target_node.remoter.sudo(cmd="pkill --signal SIGCONT -e scylla", timeout=60)
 
 
+@contextlib.contextmanager
+def block_loaders_payload_for_scylla_node(scylla_node: BaseNode, loader_nodes: list[BaseNode]):
+    """ Block connections from loaders to cql ports on scylla node
+
+    Make the Scylla node inaccessible to loaders by blocking
+    any subsequent connections to the Scylla node.
+    This ensures that the stress tool can continue to operate without failure
+    even if the Scylla node is banned and removed from the cluster.
+    """
+    ports = [9042, 9142, 19042, 19142]
+    scylla_node.install_package("iptables")
+    scylla_node.start_service("iptables", ignore_status=True)
+    loader_nodes_names = [node.name for node in loader_nodes]
+    blocking_ips = [node.ip_address for node in loader_nodes]
+    scylla_node.log.debug("Block connections on %s from loader nodes %s", scylla_node.name, loader_nodes_names)
+    for port in ports:
+        scylla_node.remoter.sudo(
+            f"iptables -A INPUT -s {','.join(blocking_ips)} -p tcp --dport {port} -j DROP", ignore_status=True)
+        scylla_node.remoter.sudo(
+            f"ip6tables -A INPUT -s {','.join(blocking_ips)} -p tcp --dport {port} -j DROP", ignore_status=True)
+    yield
+    # if scylla_node is alive, then delete the iptables rules
+    if scylla_node.remoter.is_up():
+        for port in ports:
+            scylla_node.remoter.sudo(
+                f"iptables -D INPUT -s {','.join(blocking_ips)} -p tcp --dport {port} -j DROP", ignore_status=True)
+            scylla_node.remoter.sudo(
+                f"ip6tables -D INPUT -s {','.join(blocking_ips)} -p tcp --dport {port} -j DROP", ignore_status=True)
+        scylla_node.stop_service("iptables", ignore_status=True)
+
+
 def is_node_removed_from_cluster(removed_node: BaseNode, verification_node: BaseNode) -> bool:
     LOGGER.debug("Verification node %s", verification_node.name)
     cluster_status: Optional[dict] = removed_node.parent_cluster.get_nodetool_status(
@@ -48,4 +83,6 @@ def is_node_removed_from_cluster(removed_node: BaseNode, verification_node: Base
 
 def is_node_seen_as_down(down_node: BaseNode, verification_node: BaseNode) -> bool:
     LOGGER.debug("Verification node %s", verification_node.name)
-    return down_node not in verification_node.parent_cluster.get_nodes_up_and_normal(verification_node)
+    nodes_status = verification_node.parent_cluster.get_nodetool_status(verification_node, dc_aware=False)
+    down_node_status = nodes_status.get(down_node.ip_address)
+    return (not down_node_status or down_node_status["state"] == "DN")
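
For context, a minimal usage sketch of the new context manager outside the diff; target_node and loader_nodes are assumed bindings standing in for the self.target_node and self.loaders.nodes used in the nemesis hunk above:

    import contextlib
    from sdcm.utils.nemesis_utils import node_operations

    with contextlib.ExitStack() as stack:
        # Drop loader CQL traffic (ports 9042/9142/19042/19142) to the target
        # node for the lifetime of the stack; on exit the DROP rules are
        # deleted only if the node is still reachable.
        stack.enter_context(node_operations.block_loaders_payload_for_scylla_node(
            target_node, loader_nodes=loader_nodes))
        ...  # ban and remove the node while loaders keep driving the rest of the cluster

Note that iptables expands a comma-separated "-s addr1,addr2" list into one rule per source address, so a single sudo call per port covers every loader.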