|
7 | 7 | This file is under the Apache License, Version 2.0.
|
8 | 8 | See the file `LICENSE` for details.
|
9 | 9 | """
|
| 10 | + |
10 | 11 | from . import logutil, statsd, version
|
11 | 12 | from .cluster_monitor import ClusterMonitor
|
12 | 13 | from .common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime
|
@@ -643,19 +644,35 @@ def do_failover_decision(self, standby_nodes):
|
643 | 644 | if not known_replication_positions:
|
644 | 645 | self.log.warning("No known replication positions, canceling failover consideration")
|
645 | 646 | return
|
646 |
| - # If there are multiple nodes with the same replication positions pick the one with the "highest" name |
647 |
| - # to make sure pglookouts running on all standbys make the same decision. The rationale for picking |
648 |
| - # the "highest" node is that there's no obvious way for pglookout to decide which of the nodes is |
649 |
| - # "best" beyond looking at replication positions, but picking the highest id supports environments |
650 |
| - # where nodes are assigned identifiers from an incrementing sequence identifiers and where we want to |
651 |
| - # promote the latest and greatest node. In static environments node identifiers can be priority |
652 |
| - # numbers, with the highest number being the one that should be preferred. |
653 |
| - furthest_along_instance = max(known_replication_positions[max(known_replication_positions)]) |
| 647 | + |
| 648 | + # Find the instance that is furthest along. |
| 649 | + # If there are multiple nodes with the same replication positions, try to identify one to promote either |
| 650 | + # via explicit failover priority configuration or pick the one with the "highest" name by sort order. |
| 651 | + # The rationale of this logic is to ensure all participating pglookouts running on all standbys make |
| 652 | + # the same decision. The "highest" name works well in environments where nodes are assigned identifiers |
| 653 | + # from an incrementing sequence and where we want to promote the latest and greatest node. |
| 654 | + |
| 655 | + # First, find the list of instances that share the most recent replication position |
| 656 | + furthest_along_instances = known_replication_positions[max(known_replication_positions)] |
| 657 | + # Second, sort them by "instance name" in descending order, so the "highest" name comes first |
| 658 | + furthest_along_instances = sorted(furthest_along_instances, reverse=True) |
| 659 | + # Third, if we have explicit failover priorities, use those to select the instance to be promoted |
| 660 | + if "failover_priorities" in self.config: |
| 661 | + highest_priority = max( |
| 662 | + self.config["failover_priorities"].get(instance, 0) for instance in furthest_along_instances |
| 663 | + ) |
| 664 | + furthest_along_instances = [ |
| 665 | + instance |
| 666 | + for instance in furthest_along_instances |
| 667 | + if self.config["failover_priorities"].get(instance) == highest_priority |
| 668 | + ] |
| 669 | + furthest_along_instance = furthest_along_instances[0] |
654 | 670 | self.log.warning(
|
655 | 671 | "Node that is furthest along is: %r, all replication positions were: %r",
|
656 | 672 | furthest_along_instance,
|
657 | 673 | sorted(known_replication_positions),
|
658 | 674 | )
|
| 675 | + |
659 | 676 | total_observers = len(self.connected_observer_nodes) + len(self.disconnected_observer_nodes)
|
660 | 677 | # +1 in the calculation comes from the master node
|
661 | 678 | total_amount_of_nodes = len(standby_nodes) + 1 - len(self.never_promote_these_nodes) + total_observers
|
|
0 commit comments