Skip to content

Commit e6dbd2f

Browse files
[DPE-4183] - fix: only handle quorum removal on relation-departed (#146)
1 parent 9846005 commit e6dbd2f

12 files changed

Lines changed: 202 additions & 83 deletions

File tree

.github/workflows/ci.yaml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,50 @@ jobs:
9393
env:
9494
CI_PACKED_CHARMS: ${{ needs.build.outputs.charms }}
9595

96+
integration-test-scaling:
97+
strategy:
98+
fail-fast: false
99+
matrix:
100+
tox-environments:
101+
- integration-scaling
102+
name: ${{ matrix.tox-environments }}
103+
needs:
104+
- lint
105+
- unit-test
106+
- build
107+
- integration-test
108+
runs-on: [self-hosted, linux, X64, jammy, xlarge]
109+
timeout-minutes: 120
110+
steps:
111+
- name: Checkout
112+
uses: actions/checkout@v3
113+
- name: Setup operator environment
114+
# TODO: Replace with custom image on self-hosted runner
115+
uses: charmed-kubernetes/actions-operator@main
116+
with:
117+
provider: lxd
118+
juju-channel: 3.4/stable
119+
bootstrap-options: "--agent-version 3.4.2"
120+
- name: Download packed charm(s)
121+
uses: actions/download-artifact@v3
122+
with:
123+
name: ${{ needs.build.outputs.artifact-name }}
124+
- name: Select tests
125+
id: select-tests
126+
run: |
127+
if [ "${{ github.event_name }}" == "schedule" ]
128+
then
129+
echo Running unstable and stable tests
130+
echo "mark_expression=" >> $GITHUB_OUTPUT
131+
else
132+
echo Skipping unstable tests
133+
echo "mark_expression=not unstable" >> $GITHUB_OUTPUT
134+
fi
135+
- name: Run integration tests
136+
run: tox run -e ${{ matrix.tox-environments }} -- -m '${{ steps.select-tests.outputs.mark_expression }}'
137+
env:
138+
CI_PACKED_CHARMS: ${{ needs.build.outputs.charms }}
139+
96140
integration-test-ha:
97141
strategy:
98142
fail-fast: false
@@ -105,6 +149,7 @@ jobs:
105149
- unit-test
106150
- build
107151
- integration-test
152+
- integration-test-scaling
108153
runs-on: ubuntu-latest
109154
timeout-minutes: 120
110155
steps:

metadata.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,5 @@ storage:
4242
data:
4343
type: filesystem
4444
description: Directories where snapshot and transaction data is stored
45-
minimum-size: 10G
45+
minimum-size: 1G
4646
location: /var/snap/charmed-zookeeper/common/var/lib/zookeeper

src/charm.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@
99

1010
from charms.grafana_agent.v0.cos_agent import COSAgentProvider
1111
from charms.rolling_ops.v0.rollingops import RollingOpsManager
12+
from charms.zookeeper.v0.client import QuorumLeaderNotFoundError
13+
from kazoo.exceptions import BadVersionError, ReconfigInProcessError
1214
from ops.charm import (
1315
CharmBase,
1416
InstallEvent,
15-
LeaderElectedEvent,
1617
RelationDepartedEvent,
1718
SecretChangedEvent,
1819
StorageAttachedEvent,
@@ -115,7 +116,7 @@ def __init__(self, *args):
115116
getattr(self.on, "cluster_relation_joined"), self._on_cluster_relation_changed
116117
)
117118
self.framework.observe(
118-
getattr(self.on, "cluster_relation_departed"), self._on_cluster_relation_changed
119+
getattr(self.on, "cluster_relation_departed"), self._on_cluster_relation_departed
119120
)
120121

121122
self.framework.observe(
@@ -183,10 +184,6 @@ def _on_cluster_relation_changed(self, event: EventBase) -> None:
183184
# even if leader has not started, attempt update quorum
184185
self.update_quorum(event=event)
185186

186-
# don't delay scale-down leader ops by restarting dying unit
187-
if getattr(event, "departing_unit", None) == self.unit:
188-
return
189-
190187
# check whether restart is needed for all `*_changed` events
191188
# only restart where necessary to avoid slowdowns
192189
# config_changed call here implicitly updates jaas + zoo.cfg
@@ -212,9 +209,39 @@ def _on_cluster_relation_changed(self, event: EventBase) -> None:
212209
self._set_status(Status.SERVICE_UNHEALTHY)
213210
return
214211

212+
# in case server was erroneously removed from the quorum
213+
if not self.state.stale_quorum and not self.quorum_manager.server_in_quorum:
214+
self._set_status(Status.SERVICE_NOT_QUORUM)
215+
return
216+
215217
self._set_status(Status.ACTIVE)
216218

217-
def _on_storage_attached(self, event: StorageAttachedEvent) -> None:
219+
def _on_cluster_relation_departed(self, event: RelationDepartedEvent) -> None:
220+
"""Handler for `relation_departed` events."""
221+
# this is related to an issue found in https://bugs.launchpad.net/juju/+bug/2053055
222+
# likely due to a controller upgrade or a cloud maintenance with machines being reshuffled
223+
# periodically, juju would emit a LeaderElected event, and would return no peer units
224+
# the leader would then remove all other units from the quorum, which when restarted, would fail
225+
if not event.departing_unit:
226+
return
227+
228+
departing_server_id = (
229+
int(event.departing_unit.name.split("/")[1]) + 1
230+
) # server-ids must be positive integers
231+
232+
try:
233+
self.quorum_manager.client.remove_members(members=[f"server.{departing_server_id}"])
234+
except (
235+
ReconfigInProcessError, # another unit already handling
236+
BadVersionError, # another unit handled
237+
QuorumLeaderNotFoundError, # this unit is departing, can't find leader in peer data
238+
):
239+
pass
240+
241+
# NOTE: if the leader is also going down, it may miss the event to set {unit.id: removed}
242+
# to avoid this, eventual clean up occurs during update-status calling update_quorum
243+
244+
def _on_storage_attached(self, _: StorageAttachedEvent) -> None:
218245
"""Handler for `storage_attached` events."""
219246
self.workload.exec(["chmod", "750", f"{self.workload.paths.data_path}"])
220247
self.workload.exec(["chown", f"{USER}:{GROUP}", f"{self.workload.paths.data_path}"])
@@ -356,10 +383,6 @@ def update_quorum(self, event: EventBase) -> None:
356383

357384
if (
358385
self.state.stale_quorum # in the case of scale-up
359-
or isinstance( # to run without delay to maintain quorum on scale down
360-
event,
361-
(RelationDepartedEvent, LeaderElectedEvent),
362-
)
363386
or self.state.healthy # to ensure run on update-status
364387
):
365388
updated_servers = self.quorum_manager.update_cluster()
@@ -386,7 +409,7 @@ def update_quorum(self, event: EventBase) -> None:
386409
logger.debug("tls disabled - switching to non-ssl")
387410
self.state.cluster.update({"quorum": "non-ssl"})
388411

389-
if self.state.all_units_quorum:
412+
if self.state.all_units_same_encryption:
390413
logger.debug(
391414
"all units running desired encryption - removing switching-encryption"
392415
)

src/core/cluster.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def all_units_unified(self) -> bool:
315315
return True
316316

317317
@property
318-
def all_units_quorum(self) -> bool:
318+
def all_units_same_encryption(self) -> bool:
319319
"""Flag to check if all units are using the same quorum encryption."""
320320
if not self.cluster:
321321
return False
@@ -369,7 +369,7 @@ def stable(self) -> Status:
369369
@property
370370
def ready(self) -> Status:
371371
"""Gets appropriate Status if the charm is ready to handle related applications."""
372-
if not self.all_units_quorum:
372+
if not self.all_units_same_encryption:
373373
return Status.NOT_ALL_QUORUM
374374

375375
if self.cluster.switching_encryption:

src/core/models.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,6 @@ def __init__(
191191
substrate: SUBSTRATES,
192192
):
193193
super().__init__(relation, data_interface, component, substrate)
194-
# Lint :-/ It can't resolve the subtype otherwise, even though the same assignment happens in super()
195194
self.data_interface = data_interface
196195
self.app = component
197196

src/literals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ class Status(Enum):
6868
BlockedStatus("unable to install zookeeper service"), "ERROR"
6969
)
7070
SERVICE_NOT_RUNNING = StatusLevel(BlockedStatus("zookeeper service not running"), "ERROR")
71+
SERVICE_NOT_QUORUM = StatusLevel(BlockedStatus("unit not in the zookeeper quorum"), "ERROR")
7172
CONTAINER_NOT_CONNECTED = StatusLevel(
7273
MaintenanceStatus("zookeeper container not ready"), "DEBUG"
7374
)

src/managers/quorum.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from ops.charm import RelationEvent
2323

2424
from core.cluster import ClusterState
25+
from core.models import ZKServer
2526
from literals import CLIENT_PORT
2627

2728
logger = logging.getLogger(__name__)
@@ -102,17 +103,19 @@ def get_hostname_mapping(self) -> dict[str, str]:
102103

103104
return {"hostname": hostname, "fqdn": fqdn, "ip": ip}
104105

105-
def _get_updated_servers(self, add: list[str], remove: list[str]) -> dict[str, str]:
106+
def _get_updated_servers(self, add: list[str]) -> dict[str, str]:
106107
"""Simple wrapper for building `updated_servers` for passing to app data updates."""
107-
servers_to_update = add + remove
108-
109108
updated_servers = {}
110-
for server_string in servers_to_update:
109+
for server_string in add:
111110
unit_id = str(int(re.findall(r"server.([0-9]+)", server_string)[0]) - 1)
112111
if server_string in add:
113112
updated_servers[unit_id] = "added"
114-
elif server_string in remove:
115-
updated_servers[unit_id] = "removed"
113+
114+
# setting units to 'removed' that were handled in relation-departed
115+
# also set here in case leader missed the event
116+
for added_id in self.state.cluster.added_unit_ids:
117+
if added_id not in [server.unit_id for server in self.state.servers]:
118+
updated_servers[str(added_id)] = "removed"
116119

117120
return updated_servers
118121

@@ -123,7 +126,6 @@ def update_cluster(self) -> dict[str, str]:
123126
124127
After grabbing all the "started" units that the leader can see in the peer relation
125128
unit data.
126-
Removes members not in the quorum anymore (i.e `relation_departed`/`leader_elected` event)
127129
Adds new members to the quorum (i.e `relation_joined` event).
128130
129131
Returns:
@@ -138,21 +140,12 @@ def update_cluster(self) -> dict[str, str]:
138140
active_server_strings = {server.server_string for server in self.state.started_servers}
139141

140142
try:
141-
# remove units first, faster due to no startup/sync delay
142-
zk_members = self.client.server_members
143-
servers_to_remove = list(zk_members - active_server_strings)
144-
logger.debug(f"{servers_to_remove=}")
145-
146-
self.client.remove_members(members=servers_to_remove)
147-
148143
# sorting units to ensure units are added in id order
149144
zk_members = self.client.server_members
150145
servers_to_add = sorted(active_server_strings - zk_members)
151-
logger.debug(f"{servers_to_add=}")
152-
153146
self.client.add_members(members=servers_to_add)
154147

155-
return self._get_updated_servers(add=servers_to_add, remove=servers_to_remove)
148+
return self._get_updated_servers(add=servers_to_add)
156149

157150
# caught errors relate to a unit/zk_server not yet being ready to change
158151
except (
@@ -165,6 +158,17 @@ def update_cluster(self) -> dict[str, str]:
165158
logger.warning(str(e))
166159
return {}
167160

161+
def server_in_quorum(self, server: ZKServer) -> bool:
162+
"""Checks if server is in current quorum.
163+
164+
Args:
165+
server: the server to check
166+
167+
Returns:
168+
True if server is found in the quorum. Otherwise False.
169+
"""
170+
return server.server_string in self.client.server_members
171+
168172
@staticmethod
169173
def _is_child_of(path: str, chroots: Set[str]) -> bool:
170174
"""Checks if given path is a child znode from a set of chroot paths.
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2023 Canonical Ltd.
3+
# See LICENSE file for licensing details.
4+
5+
import logging
6+
7+
import helpers
8+
import pytest
9+
from pytest_operator.plugin import OpsTest
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
@pytest.mark.skip_if_deployed
15+
@pytest.mark.abort_on_fail
16+
async def test_deploy_active(ops_test: OpsTest):
17+
charm = await ops_test.build_charm(".")
18+
await ops_test.model.deploy(
19+
charm,
20+
application_name=helpers.APP_NAME,
21+
num_units=3,
22+
storage={"data": {"pool": "lxd-btrfs", "size": 10240}},
23+
)
24+
await helpers.wait_idle(ops_test, units=3)
25+
26+
27+
@pytest.mark.abort_on_fail
28+
async def test_simple_scale_up(ops_test: OpsTest):
29+
await ops_test.model.applications[helpers.APP_NAME].add_units(count=3)
30+
await helpers.wait_idle(ops_test, units=6)
31+
32+
33+
@pytest.mark.abort_on_fail
34+
async def test_simple_scale_down(ops_test: OpsTest):
35+
await ops_test.model.applications[helpers.APP_NAME].destroy_units(
36+
f"{helpers.APP_NAME}/5", f"{helpers.APP_NAME}/4", f"{helpers.APP_NAME}/3"
37+
)
38+
await helpers.wait_idle(ops_test, units=3)
39+
40+
# scaling back up
41+
await ops_test.model.applications[helpers.APP_NAME].add_units(count=3)
42+
await helpers.wait_idle(ops_test, units=6)
43+
44+
45+
@pytest.mark.abort_on_fail
46+
async def test_complex_scale_down(ops_test: OpsTest):
47+
hosts = helpers.get_hosts(ops_test)
48+
49+
quorum_leader_name = helpers.get_leader_name(ops_test, hosts)
50+
charm_leader_name = None
51+
other_unit_name = None
52+
53+
for unit in ops_test.model.applications[helpers.APP_NAME].units:
54+
if await unit.is_leader_from_status():
55+
charm_leader_name = unit.name
56+
57+
if unit.name not in [charm_leader_name, quorum_leader_name]:
58+
other_unit_name = unit.name
59+
60+
await ops_test.model.applications[helpers.APP_NAME].destroy_units(
61+
quorum_leader_name, charm_leader_name, other_unit_name
62+
)
63+
await helpers.wait_idle(ops_test, units=3)

0 commit comments

Comments
 (0)