99
1010from charms .grafana_agent .v0 .cos_agent import COSAgentProvider
1111from charms .rolling_ops .v0 .rollingops import RollingOpsManager
12+ from charms .zookeeper .v0 .client import QuorumLeaderNotFoundError
13+ from kazoo .exceptions import BadVersionError , ReconfigInProcessError
1214from ops .charm import (
1315 CharmBase ,
1416 InstallEvent ,
15- LeaderElectedEvent ,
1617 RelationDepartedEvent ,
1718 SecretChangedEvent ,
1819 StorageAttachedEvent ,
@@ -115,7 +116,7 @@ def __init__(self, *args):
115116 getattr (self .on , "cluster_relation_joined" ), self ._on_cluster_relation_changed
116117 )
117118 self .framework .observe (
118- getattr (self .on , "cluster_relation_departed" ), self ._on_cluster_relation_changed
119+ getattr (self .on , "cluster_relation_departed" ), self ._on_cluster_relation_departed
119120 )
120121
121122 self .framework .observe (
@@ -183,10 +184,6 @@ def _on_cluster_relation_changed(self, event: EventBase) -> None:
183184 # even if leader has not started, attempt update quorum
184185 self .update_quorum (event = event )
185186
186- # don't delay scale-down leader ops by restarting dying unit
187- if getattr (event , "departing_unit" , None ) == self .unit :
188- return
189-
190187 # check whether restart is needed for all `*_changed` events
191188 # only restart where necessary to avoid slowdowns
192189 # config_changed call here implicitly updates jaas + zoo.cfg
@@ -212,9 +209,39 @@ def _on_cluster_relation_changed(self, event: EventBase) -> None:
212209 self ._set_status (Status .SERVICE_UNHEALTHY )
213210 return
214211
212+ # in case server was erroneously removed from the quorum
213+ if not self .state .stale_quorum and not self .quorum_manager .server_in_quorum :
214+ self ._set_status (Status .SERVICE_NOT_QUORUM )
215+ return
216+
215217 self ._set_status (Status .ACTIVE )
216218
217- def _on_storage_attached (self , event : StorageAttachedEvent ) -> None :
def _on_cluster_relation_departed(self, event: RelationDepartedEvent) -> None:
    """Handler for `relation_departed` events.

    Makes a best-effort attempt to remove the departing unit's server from
    the quorum via the quorum manager's client.

    Args:
        event: the `relation_departed` event. Events that carry no
            `departing_unit` are ignored.

    The removal is attempted by every unit that handles the event, so the
    errors listed below are expected and are logged at debug level rather
    than raised.
    """
    # is related to issue found in https://bugs.launchpad.net/juju/+bug/2053055
    # likely due to a controller upgrade or a cloud maintenance with machines being reshuffled
    # periodically, juju would emit a LeaderElected event, and would return no peer units
    # the leader would then remove all other units from the quorum, which when restarted,
    # would fail - hence only ever act on an explicitly named departing unit
    if not event.departing_unit:
        return

    # unit names look like `<app>/<number>`; server-ids must be positive integers,
    # so the unit number is offset by one
    departing_server_id = int(event.departing_unit.name.split("/")[1]) + 1
    departing_member = f"server.{departing_server_id}"

    try:
        self.quorum_manager.client.remove_members(members=[departing_member])
    except ReconfigInProcessError:
        # another unit already handling
        logger.debug("reconfig in progress - skipping removal of %s", departing_member)
    except BadVersionError:
        # another unit handled
        logger.debug("quorum config changed - skipping removal of %s", departing_member)
    except QuorumLeaderNotFoundError:
        # this unit is departing, can't find leader in peer data
        logger.debug("quorum leader not found - skipping removal of %s", departing_member)

    # NOTE: if the leader is also going down, it may miss the event to set {unit.id: removed}
    # to avoid this, eventual clean up occurs during update-status calling update_quorum
244+ def _on_storage_attached (self , _ : StorageAttachedEvent ) -> None :
218245 """Handler for `storage_attached` events."""
219246 self .workload .exec (["chmod" , "750" , f"{ self .workload .paths .data_path } " ])
220247 self .workload .exec (["chown" , f"{ USER } :{ GROUP } " , f"{ self .workload .paths .data_path } " ])
@@ -356,10 +383,6 @@ def update_quorum(self, event: EventBase) -> None:
356383
357384 if (
358385 self .state .stale_quorum # in the case of scale-up
359- or isinstance ( # to run without delay to maintain quorum on scale down
360- event ,
361- (RelationDepartedEvent , LeaderElectedEvent ),
362- )
363386 or self .state .healthy # to ensure run on update-status
364387 ):
365388 updated_servers = self .quorum_manager .update_cluster ()
@@ -386,7 +409,7 @@ def update_quorum(self, event: EventBase) -> None:
386409 logger .debug ("tls disabled - switching to non-ssl" )
387410 self .state .cluster .update ({"quorum" : "non-ssl" })
388411
389- if self .state .all_units_quorum :
412+ if self .state .all_units_same_encryption :
390413 logger .debug (
391414 "all units running desired encryption - removing switching-encryption"
392415 )
0 commit comments