Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(nemesis): add IPv6 support for refusing connections from a banned node #10594

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -5525,15 +5525,15 @@ def drop_keyspace(node):
target_host_id = self.target_node.host_id
stack.callback(self._remove_node_add_node, verification_node=working_node, node_to_remove=self.target_node,
remove_node_host_id=target_host_id)

stack.enter_context(node_operations.block_loaders_payload_for_scylla_node(
self.target_node, loader_nodes=self.loaders.nodes))
self.tester.create_keyspace(keyspace_name, replication_factor=3)
self.tester.create_table(name=table_name, keyspace_name=keyspace_name, key_type="bigint",
columns={"name": "text"})
stack.callback(drop_keyspace, node=working_node)

with simulate_node_unavailability(self.target_node):
# target node stopped by the context manager. Wait until its status is updated
wait_for(node_operations.is_node_seen_as_down, timeout=600, throw_exc=True,
wait_for(node_operations.is_node_seen_as_down, step=5, timeout=600, throw_exc=True,
down_node=self.target_node, verification_node=working_node, text=f"Wait other nodes see {self.target_node.name} as DOWN...")
self.log.debug("Remove node %s : hostid: %s with blocked scylla from cluster",
self.target_node.name, target_host_id)
Expand All @@ -5560,12 +5560,12 @@ def drop_keyspace(node):

with self.cluster.cql_connection_patient(working_node) as session:
LOGGER.debug("Check keyspace %s.%s is empty", keyspace_name, table_name)
result = list(session.execute(f"SELECT * from {keyspace_name}.{table_name}"))
stmt = SimpleStatement(f"SELECT * from {keyspace_name}.{table_name}",
consistency_level=ConsistencyLevel.QUORUM)
result = list(session.execute(stmt))
LOGGER.debug("Query result %s", result)
assert not result, f"New rows were added from banned node, {result}"

drop_keyspace(working_node)


def disrupt_method_wrapper(method, is_exclusive=False): # pylint: disable=too-many-statements # noqa: PLR0915
"""
Expand Down
43 changes: 40 additions & 3 deletions sdcm/utils/nemesis_utils/node_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,66 @@


@contextlib.contextmanager
def block_scylla_ports(target_node: "BaseNode", ports: list[int] | None = None):
def block_scylla_ports(target_node: BaseNode, ports: list[int] | None = None):
ports = ports or [7001, 7000, 9042, 9142, 19042, 19142]
target_node.install_package("iptables")
target_node.start_service("iptables", ignore_status=True)
target_node.log.debug("Block connections %s", target_node.name)
for port in ports:
target_node.remoter.sudo(f"iptables -A INPUT -p tcp --dport {port} -j DROP")
target_node.remoter.sudo(f"iptables -A OUTPUT -p tcp --dport {port} -j DROP")
target_node.remoter.sudo(f"ip6tables -A INPUT -p tcp --dport {port} -j DROP")
target_node.remoter.sudo(f"ip6tables -A OUTPUT -p tcp --dport {port} -j DROP")
yield
target_node.log.debug("Remove all iptable rules %s", target_node.name)
for port in ports:
target_node.remoter.sudo(f"iptables -D INPUT -p tcp --dport {port} -j DROP")
target_node.remoter.sudo(f"iptables -D OUTPUT -p tcp --dport {port} -j DROP")
target_node.remoter.sudo(f"ip6tables -D INPUT -p tcp --dport {port} -j DROP")
target_node.remoter.sudo(f"ip6tables -D OUTPUT -p tcp --dport {port} -j DROP")
target_node.stop_service("iptables", ignore_status=True)


@contextlib.contextmanager
def pause_scylla_with_sigstop(target_node: "BaseNode"):
    """Freeze the scylla process on *target_node* for the duration of the context.

    Sends SIGSTOP on entry and guarantees SIGCONT on exit via ``finally``,
    so scylla is never left stopped when the body raises.

    :param target_node: node whose scylla process is paused (annotation
        quoted: BaseNode is presumably a TYPE_CHECKING import — confirm).
    """
    target_node.log.debug("Send signal SIGSTOP to scylla process on node %s", target_node.name)
    target_node.remoter.sudo("pkill --signal SIGSTOP -e scylla", timeout=60)
    try:
        yield
    finally:
        target_node.log.debug("Send signal SIGCONT to scylla process on node %s", target_node.name)
        target_node.remoter.sudo(cmd="pkill --signal SIGCONT -e scylla", timeout=60)


@contextlib.contextmanager
def block_loaders_payload_for_scylla_node(scylla_node: "BaseNode", loader_nodes: list["BaseNode"]):
    """Block connections from loaders to CQL ports on a Scylla node.

    Make the Scylla node inaccessible to loaders by blocking any subsequent
    connections to the Scylla node. This ensures that the stress tool can
    continue to operate without failure even if the Scylla node is banned
    and removed from the cluster.

    Rules are added with both iptables and ip6tables using
    ``ignore_status=True``: each loader address is either IPv4 or IPv6, so
    one of the two invocations is expected to fail for a given address list.
    """
    ports = [9042, 9142, 19042, 19142]
    scylla_node.install_package("iptables")
    scylla_node.start_service("iptables", ignore_status=True)
    loader_nodes_names = [node.name for node in loader_nodes]
    blocking_ips = [node.ip_address for node in loader_nodes]
    scylla_node.log.debug("Block connections on %s from loader nodes %s", scylla_node.name, loader_nodes_names)
    for port in ports:
        scylla_node.remoter.sudo(
            f"iptables -A INPUT -s {','.join(blocking_ips)} -p tcp --dport {port} -j DROP", ignore_status=True)
        scylla_node.remoter.sudo(
            f"ip6tables -A INPUT -s {','.join(blocking_ips)} -p tcp --dport {port} -j DROP", ignore_status=True)
    try:
        yield
    finally:
        # Clean up even if the body raised, but only when the node is still
        # reachable: it may have been banned/removed while rules were active.
        if scylla_node.remoter.is_up():
            for port in ports:
                scylla_node.remoter.sudo(
                    f"iptables -D INPUT -s {','.join(blocking_ips)} -p tcp --dport {port} -j DROP", ignore_status=True)
                scylla_node.remoter.sudo(
                    f"ip6tables -D INPUT -s {','.join(blocking_ips)} -p tcp --dport {port} -j DROP", ignore_status=True)
            scylla_node.stop_service("iptables", ignore_status=True)


def is_node_removed_from_cluster(removed_node: BaseNode, verification_node: BaseNode) -> bool:
LOGGER.debug("Verification node %s", verification_node.name)
cluster_status: Optional[dict] = removed_node.parent_cluster.get_nodetool_status(
Expand All @@ -48,4 +83,6 @@ def is_node_removed_from_cluster(removed_node: BaseNode, verification_node: Base

def is_node_seen_as_down(down_node: BaseNode, verification_node: BaseNode) -> bool:
    """Return True when *verification_node* reports *down_node* as DOWN or gone.

    Looks the node up in nodetool status output keyed by ``ip_address``
    (per the PR discussion, listen_address equals ip_address in these
    clusters, and ip_address is used with nodetool status everywhere).
    A node absent from the status output is treated as down/removed.

    NOTE(review): assumes get_nodetool_status(dc_aware=False) returns a flat
    {ip: {"state": ...}} mapping — confirm against its implementation.
    """
    LOGGER.debug("Verification node %s", verification_node.name)
    nodes_status = verification_node.parent_cluster.get_nodetool_status(verification_node, dc_aware=False)
    down_node_status = nodes_status.get(down_node.ip_address)
    # Missing entry => already removed from the ring; "DN" => Down/Normal.
    return (not down_node_status or down_node_status["state"] == "DN")