Skip to content

Commit f7704d3

Browse files
committed
pglookout: support explicit failover priorities
Support explicit prioritization between instances. This can be configured via the ``failover_priorities`` key, and will be consulted when picking the standby that should perform the promotion in cases where multiple nodes have a matching replication position. Previously, and also as the current default, the selection was based on the sorting order of the remote nodes. The configuration option allows some additional flexibility, and supports e.g. topologies where we have more favorable and less desirable standbys in multiple different network locations.
1 parent 7bb297f commit f7704d3

File tree

3 files changed

+117
-8
lines changed

3 files changed

+117
-8
lines changed

README.rst

+8
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,14 @@ over_warning_limit_command and to create a warning file.
295295

296296
Shell command to execute in case the node has deemed itself in need of promotion
297297

298+
``failover_priorities`` (default ``{}``)
299+
300+
Define priority of nodes for promotion, in case there are multiple candidates
301+
with the same replication position. This makes it possible to ensure that all pglookout instances
302+
would elect the same standby for promotion, while still allowing for topologies
303+
with e.g. less preferred standbys in secondary network locations. By default,
304+
pglookout uses remote connection ids for the same selection purpose.
305+
298306
``known_gone_nodes`` (default ``[]``)
299307

300308
Lists nodes that are explicitly known to have left the cluster. If the old

pglookout/pglookout.py

+25-8
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
This file is under the Apache License, Version 2.0.
88
See the file `LICENSE` for details.
99
"""
10+
1011
from . import logutil, statsd, version
1112
from .cluster_monitor import ClusterMonitor
1213
from .common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime
@@ -643,19 +644,35 @@ def do_failover_decision(self, standby_nodes):
643644
if not known_replication_positions:
644645
self.log.warning("No known replication positions, canceling failover consideration")
645646
return
646-
# If there are multiple nodes with the same replication positions pick the one with the "highest" name
647-
# to make sure pglookouts running on all standbys make the same decision. The rationale for picking
648-
# the "highest" node is that there's no obvious way for pglookout to decide which of the nodes is
649-
# "best" beyond looking at replication positions, but picking the highest id supports environments
650-
# where nodes are assigned identifiers from an incrementing sequence identifiers and where we want to
651-
# promote the latest and greatest node. In static environments node identifiers can be priority
652-
# numbers, with the highest number being the one that should be preferred.
653-
furthest_along_instance = max(known_replication_positions[max(known_replication_positions)])
647+
648+
# Find the instance that is furthest along.
649+
# If there are multiple nodes with the same replication positions, try to identify one to promote either
650+
# via explicit failover priority configuration or pick the one with the "highest" name by sort order.
651+
# The rationale of this logic is to ensure all participating pglookouts running on all standbys make
652+
# the same decision. The "highest" name works well in environments where nodes are assigned identifiers
653+
# from an incrementing sequence and where we want to promote the latest and greatest node.
654+
655+
# First, find the list of instances that share the most recent replication position
656+
furthest_along_instances = known_replication_positions[max(known_replication_positions)]
657+
# Second, sort them by "instance name"
658+
furthest_along_instances = sorted(furthest_along_instances, reverse=True)
659+
# Third, if we have explicit failover priorities, use those to select the instance to be promoted
660+
if "failover_priorities" in self.config:
661+
highest_priority = max(
662+
self.config["failover_priorities"].get(instance, 0) for instance in furthest_along_instances
663+
)
664+
furthest_along_instances = [
665+
instance
666+
for instance in furthest_along_instances
667+
if self.config["failover_priorities"].get(instance) == highest_priority
668+
]
669+
furthest_along_instance = furthest_along_instances[0]
654670
self.log.warning(
655671
"Node that is furthest along is: %r, all replication positions were: %r",
656672
furthest_along_instance,
657673
sorted(known_replication_positions),
658674
)
675+
659676
total_observers = len(self.connected_observer_nodes) + len(self.disconnected_observer_nodes)
660677
# +1 in the calculation comes from the master node
661678
total_amount_of_nodes = len(standby_nodes) + 1 - len(self.never_promote_these_nodes) + total_observers

test/test_lookout.py

+84
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,90 @@ def test_standbys_failover_equal_replication_positions(pgl):
931931
assert pgl.execute_external_command.call_count == 1
932932

933933

934+
def test_standbys_failover_equal_replication_positions_with_priorities(pgl):
935+
now = datetime.datetime.utcnow()
936+
set_instance_cluster_state(
937+
pgl,
938+
instance="192.168.54.183",
939+
pg_last_xlog_receive_location="0/70004D8",
940+
pg_is_in_recovery=True,
941+
connection=True,
942+
replication_time_lag=400.435871,
943+
fetch_time=now,
944+
db_time=now,
945+
conn_info="foobar",
946+
)
947+
set_instance_cluster_state(
948+
pgl,
949+
instance="192.168.57.180",
950+
pg_last_xlog_receive_location=None,
951+
pg_is_in_recovery=False,
952+
connection=False,
953+
replication_time_lag=0.0,
954+
fetch_time=now - datetime.timedelta(seconds=3600),
955+
db_time=now - datetime.timedelta(seconds=3600),
956+
conn_info="foobar",
957+
)
958+
set_instance_cluster_state(
959+
pgl,
960+
instance="192.168.63.4",
961+
pg_last_xlog_receive_location="0/70004D8",
962+
pg_is_in_recovery=True,
963+
connection=True,
964+
replication_time_lag=401.104655,
965+
fetch_time=now,
966+
db_time=now,
967+
conn_info="foobar",
968+
)
969+
set_instance_cluster_state(
970+
pgl,
971+
instance="192.168.62.4",
972+
pg_last_xlog_receive_location="0/70004D8",
973+
pg_is_in_recovery=True,
974+
connection=True,
975+
replication_time_lag=401.104655,
976+
fetch_time=now,
977+
db_time=now,
978+
conn_info="foobar",
979+
)
980+
set_instance_cluster_state(
981+
pgl,
982+
instance="192.168.52.183",
983+
pg_last_xlog_receive_location="0/70004D8",
984+
pg_is_in_recovery=True,
985+
connection=True,
986+
replication_time_lag=401.104655,
987+
fetch_time=now,
988+
db_time=now,
989+
conn_info="foobar",
990+
)
991+
992+
pgl.current_master = "192.168.57.180"
993+
994+
pgl.config["failover_priorities"] = {
995+
"192.168.54.183": 1000,
996+
"192.168.52.183": 1000,
997+
"192.168.63.4": 0,
998+
}
999+
1000+
# This is highest by instance, but lower in priority
1001+
pgl.own_db = "192.168.63.4"
1002+
pgl.check_cluster_state()
1003+
assert pgl.execute_external_command.call_count == 0
1004+
# This is second highest by instance, but no priority set - it's counted at 0
1005+
pgl.own_db = "192.168.62.4"
1006+
pgl.check_cluster_state()
1007+
assert pgl.execute_external_command.call_count == 0
1008+
# This node shares highest priority == 1000, but is lower by instance
1009+
pgl.own_db = "192.168.52.183"
1010+
pgl.check_cluster_state()
1011+
assert pgl.execute_external_command.call_count == 0
1012+
# Second lowest by instance, but with priority == 1000
1013+
pgl.own_db = "192.168.54.183"
1014+
pgl.check_cluster_state()
1015+
assert pgl.execute_external_command.call_count == 1
1016+
1017+
9341018
def test_node_map_when_only_observer_sees_master(pgl):
9351019
cluster_state = {
9361020
"10.255.255.10": {

0 commit comments

Comments
 (0)