Commit bfc997d
pglookout: support explicit failover priorities

Support explicit prioritization between instances. This can be configured via the ``failover_priorities`` key, which is consulted when picking the standby that should perform the promotion in cases where multiple nodes share the same replication position. Previously, and still as the default, the selection was based on the sort order of the remote node names. The new configuration option adds flexibility and supports e.g. topologies with more and less desirable standbys in multiple network locations.
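As an illustration, a minimal sketch of how the new key could look; the instance names and priority values below are hypothetical, and the shape mirrors what the new test sets via ``pgl.config["failover_priorities"]``:

    # Hypothetical excerpt of a pglookout configuration (instance names made up).
    config = {
        "failover_priorities": {
            "standby-dc1-a": 1000,  # preferred standbys, e.g. in the primary network location
            "standby-dc1-b": 1000,
            "standby-dc2-a": 0,     # less preferred standby in a secondary location
        },
    }

Instances missing from the mapping count as priority 0, and ties on priority fall back to the existing name-based ordering.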
1 parent: 60f65b2

3 files changed: +117, -8 lines

README.rst (+8)

@@ -295,6 +295,14 @@ over_warning_limit_command and to create a warning file.
 
 Shell command to execute in case the node has deemed itself in need of promotion
 
+``failover_priorities`` (default ``{}``)
+
+Defines the promotion priority of nodes for cases where multiple candidates
+share the same replication position. This makes it possible to ensure that all
+pglookout instances elect the same standby for promotion, while still allowing
+topologies with e.g. less preferred standbys in secondary network locations.
+By default, pglookout uses remote connection ids for the same selection purpose.
+
 ``known_gone_nodes`` (default ``[]``)
 
 Lists nodes that are explicitly known to have left the cluster. If the old
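To make the documented default concrete: with no ``failover_priorities`` configured, the tie between equally advanced standbys is broken purely by the sort order of the instance names. A one-line sketch with hypothetical names:

    # Default tie-break: the "highest" remote connection id wins (hypothetical names).
    print(max(["standby-a", "standby-b", "standby-c"]))  # -> standby-c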

pglookout/pglookout.py (+25, -8)

@@ -643,19 +643,36 @@ def do_failover_decision(self, standby_nodes):
         if not known_replication_positions:
             self.log.warning("No known replication positions, canceling failover consideration")
             return
-        # If there are multiple nodes with the same replication positions pick the one with the "highest" name
-        # to make sure pglookouts running on all standbys make the same decision. The rationale for picking
-        # the "highest" node is that there's no obvious way for pglookout to decide which of the nodes is
-        # "best" beyond looking at replication positions, but picking the highest id supports environments
-        # where nodes are assigned identifiers from an incrementing sequence identifiers and where we want to
-        # promote the latest and greatest node. In static environments node identifiers can be priority
-        # numbers, with the highest number being the one that should be preferred.
-        furthest_along_instance = max(known_replication_positions[max(known_replication_positions)])
+
+        # Find the instance that is furthest along.
+        # If there are multiple nodes with the same replication position, try to identify one to promote, either
+        # via explicit failover priority configuration or by picking the one with the "highest" name by sort order.
+        # The rationale of this logic is to ensure that the pglookouts running on all standbys make the same
+        # decision. The "highest" name works well in environments where nodes are assigned identifiers from an
+        # incrementing sequence and where we want to promote the latest and greatest node.
+
+        # First, find the instances that share the most recent replication position
+        furthest_along_instances = known_replication_positions[max(known_replication_positions)]
+        # Second, sort them by instance name, "highest" first
+        furthest_along_instances = sorted(furthest_along_instances, reverse=True)
+        # Third, if explicit failover priorities are configured, use those to select the instance to be
+        # promoted; instances without a configured priority default to 0
+        if "failover_priorities" in self.config:
+            highest_priority = max(
+                self.config["failover_priorities"].get(instance, 0)
+                for instance in furthest_along_instances
+            )
+            furthest_along_instances = [
+                instance
+                for instance in furthest_along_instances
+                if self.config["failover_priorities"].get(instance, 0) == highest_priority
+            ]
+        furthest_along_instance = furthest_along_instances[0]
         self.log.warning(
             "Node that is furthest along is: %r, all replication positions were: %r",
             furthest_along_instance,
             sorted(known_replication_positions),
         )
+
         total_observers = len(self.connected_observer_nodes) + len(self.disconnected_observer_nodes)
         # +1 in the calculation comes from the master node
         total_amount_of_nodes = len(standby_nodes) + 1 - len(self.never_promote_these_nodes) + total_observers
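Taken together, the selection rule reduces to a few lines. The following is a self-contained sketch that replays the tie-breaking outside of pglookout, using the instance names and priorities from the new test; it is only an illustration of the rule, not pglookout's API:

    # Hypothetical standbys that all share the most recent replication position.
    candidates = ["192.168.63.4", "192.168.62.4", "192.168.54.183", "192.168.52.183"]
    # Assumed failover_priorities configuration; unlisted instances count as 0.
    failover_priorities = {"192.168.54.183": 1000, "192.168.52.183": 1000, "192.168.63.4": 0}

    # Sort by name, "highest" first, so every pglookout starts from the same order.
    candidates = sorted(candidates, reverse=True)
    # Keep only the candidates carrying the highest configured priority.
    highest = max(failover_priorities.get(instance, 0) for instance in candidates)
    candidates = [i for i in candidates if failover_priorities.get(i, 0) == highest]
    # Remaining ties are broken by the name sort: the first entry wins.
    print(candidates[0])  # -> 192.168.54.183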

test/test_lookout.py (+84)

@@ -1005,6 +1005,90 @@ def test_standbys_failover_equal_replication_positions(pgl):
     assert pgl.execute_external_command.call_count == 1
 
 
+def test_standbys_failover_equal_replication_positions_with_priorities(pgl):
+    now = datetime.datetime.utcnow()
+    _set_instance_cluster_state(
+        pgl,
+        instance="192.168.54.183",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=400.435871,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    _set_instance_cluster_state(
+        pgl,
+        instance="192.168.57.180",
+        pg_last_xlog_receive_location=None,
+        pg_is_in_recovery=False,
+        connection=False,
+        replication_time_lag=0.0,
+        fetch_time=now - datetime.timedelta(seconds=3600),
+        db_time=now - datetime.timedelta(seconds=3600),
+        conn_info="foobar",
+    )
+    _set_instance_cluster_state(
+        pgl,
+        instance="192.168.63.4",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    _set_instance_cluster_state(
+        pgl,
+        instance="192.168.62.4",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    _set_instance_cluster_state(
+        pgl,
+        instance="192.168.52.183",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+
+    pgl.current_master = "192.168.57.180"
+
+    pgl.config["failover_priorities"] = {
+        "192.168.54.183": 1000,
+        "192.168.52.183": 1000,
+        "192.168.63.4": 0,
+    }
+
+    # This is highest by instance, but lower in priority
+    pgl.own_db = "192.168.63.4"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # This is second highest by instance, but no priority set - it's counted at 0
+    pgl.own_db = "192.168.62.4"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # This node shares highest priority == 1000, but is lower by instance
+    pgl.own_db = "192.168.52.183"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # Second lowest by instance, but with priority == 1000
+    pgl.own_db = "192.168.54.183"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 1
+
+
 def test_node_map_when_only_observer_sees_master(pgl):
     cluster_state = {
         "10.255.255.10": {
