Skip to content

Commit 15122ac

Browse files
committed
[WIP] Auto leader rebalancing
1 parent d235a0d commit 15122ac

File tree

10 files changed

+425
-9
lines changed

10 files changed

+425
-9
lines changed

fluss-common/src/main/java/com/alibaba/fluss/config/ConfigOptions.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,34 @@ public class ConfigOptions {
228228
public static final ConfigOption<List<String>> SERVER_SASL_ENABLED_MECHANISMS_CONFIG =
229229
key("security.sasl.enabled.mechanisms").stringType().asList().noDefaultValue();
230230

231+
public static final ConfigOption<Integer> LEADER_IMBALANCE_PER_TABLET_SERVER_PERCENTAGE =
232+
key("leader.imbalance.per.tablet-server.percentage")
233+
.intType()
234+
.defaultValue(10)
235+
.withDescription(
236+
"The ratio of leader imbalance allowed per tablet server. "
237+
+ "The coordinator would trigger a leader balance if it goes above this value per tablet server. "
238+
+ "The value is specified in percentage.");
239+
240+
public static final ConfigOption<Integer> LEADER_IMBALANCE_CHECK_INTERVAL_SECONDS =
241+
key("leader.imbalance.check.interval.seconds")
242+
.intType()
243+
.defaultValue(300)
244+
.withDeprecatedKeys(
245+
"The frequency with which the replica rebalance check is triggered by the coordinator.");
246+
247+
public static final ConfigOption<Boolean> AUTO_LEADER_REBALANCE_ENABLE =
248+
key("auto.leader.rebalance.enable")
249+
.booleanType()
250+
.defaultValue(true)
251+
.withDescription(
252+
"Enables auto leader balancing. A background thread checks the distribution of replica leaders at regular intervals, "
253+
+ "configurable by "
254+
+ LEADER_IMBALANCE_CHECK_INTERVAL_SECONDS.key()
255+
+ ". If the leader imbalance exceeds "
256+
+ LEADER_IMBALANCE_PER_TABLET_SERVER_PERCENTAGE.key()
257+
+ ", leader rebalance to the preferred leader for replicas is triggered.");
258+
231259
// ------------------------------------------------------------------------
232260
// ConfigOptions for Coordinator Server
233261
// ------------------------------------------------------------------------

fluss-server/src/main/java/com/alibaba/fluss/server/coordinator/CoordinatorEventProcessor.java

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
import com.alibaba.fluss.rpc.protocol.ApiError;
4646
import com.alibaba.fluss.server.coordinator.event.AccessContextEvent;
4747
import com.alibaba.fluss.server.coordinator.event.AdjustIsrReceivedEvent;
48+
import com.alibaba.fluss.server.coordinator.event.AutoPreferredReplicaLeaderElection;
4849
import com.alibaba.fluss.server.coordinator.event.CommitKvSnapshotEvent;
4950
import com.alibaba.fluss.server.coordinator.event.CommitLakeTableSnapshotEvent;
5051
import com.alibaba.fluss.server.coordinator.event.CommitRemoteLogManifestEvent;
@@ -87,6 +88,7 @@
8788
import com.alibaba.fluss.server.zk.data.TabletServerRegistration;
8889
import com.alibaba.fluss.server.zk.data.ZkData.PartitionIdsZNode;
8990
import com.alibaba.fluss.server.zk.data.ZkData.TableIdsZNode;
91+
import com.alibaba.fluss.utils.concurrent.Scheduler;
9092
import com.alibaba.fluss.utils.types.Tuple2;
9193

9294
import org.slf4j.Logger;
@@ -97,6 +99,7 @@
9799

98100
import java.util.ArrayList;
99101
import java.util.Collections;
102+
import java.util.HashMap;
100103
import java.util.HashSet;
101104
import java.util.List;
102105
import java.util.Map;
@@ -108,7 +111,9 @@
108111

109112
import static com.alibaba.fluss.server.coordinator.statemachine.BucketState.OfflineBucket;
110113
import static com.alibaba.fluss.server.coordinator.statemachine.BucketState.OnlineBucket;
114+
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.preferredReplicaLeaderElection;
111115
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.CONTROLLED_SHUTDOWN_ELECTION;
116+
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.PREFERRED_LEADER_ELECTION;
112117
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaState.OfflineReplica;
113118
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaState.OnlineReplica;
114119
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaState.ReplicaDeletionStarted;
@@ -138,9 +143,12 @@ public class CoordinatorEventProcessor implements EventProcessor {
138143
private final CoordinatorRequestBatch coordinatorRequestBatch;
139144
private final CoordinatorMetricGroup coordinatorMetricGroup;
140145
private final String internalListenerName;
146+
private final Configuration conf;
141147

142148
private final CompletedSnapshotStoreManager completedSnapshotStoreManager;
143149

150+
private final Scheduler scheduler;
151+
144152
// metrics
145153
private volatile int tabletServerCount;
146154
private volatile int offlineBucketCount;
@@ -157,7 +165,8 @@ public CoordinatorEventProcessor(
157165
LakeTableTieringManager lakeTableTieringManager,
158166
CoordinatorMetricGroup coordinatorMetricGroup,
159167
Configuration conf,
160-
ExecutorService ioExecutor) {
168+
ExecutorService ioExecutor,
169+
Scheduler scheduler) {
161170
this.zooKeeperClient = zooKeeperClient;
162171
this.serverMetadataCache = serverMetadataCache;
163172
this.coordinatorChannelManager = coordinatorChannelManager;
@@ -202,7 +211,9 @@ public CoordinatorEventProcessor(
202211
this.autoPartitionManager = autoPartitionManager;
203212
this.lakeTableTieringManager = lakeTableTieringManager;
204213
this.coordinatorMetricGroup = coordinatorMetricGroup;
214+
this.conf = conf;
205215
this.internalListenerName = conf.getString(ConfigOptions.INTERNAL_LISTENER_NAME);
216+
this.scheduler = scheduler;
206217
registerMetrics();
207218
}
208219

@@ -250,6 +261,10 @@ public void startup() {
250261

251262
// start the event manager which will then process the event
252263
coordinatorEventManager.start();
264+
265+
if (conf.getBoolean(ConfigOptions.AUTO_LEADER_REBALANCE_ENABLE)) {
266+
scheduleAutoLeaderRebalanceTask(5000);
267+
}
253268
}
254269

255270
public void shutdown() {
@@ -283,6 +298,13 @@ private ServerInfo getCoordinatorServerInfo() {
283298
}
284299
}
285300

301+
private void scheduleAutoLeaderRebalanceTask(long delayMs) {
302+
scheduler.scheduleOnce(
303+
"auto-leader-rebalance-task",
304+
() -> coordinatorEventManager.put(new AutoPreferredReplicaLeaderElection()),
305+
delayMs);
306+
}
307+
286308
public int getCoordinatorEpoch() {
287309
return coordinatorContext.getCoordinatorEpoch();
288310
}
@@ -505,6 +527,8 @@ public void process(CoordinatorEvent event) {
505527
} else if (event instanceof AccessContextEvent) {
506528
AccessContextEvent<?> accessContextEvent = (AccessContextEvent<?>) event;
507529
processAccessContext(accessContextEvent);
530+
} else if (event instanceof AutoPreferredReplicaLeaderElection) {
531+
processAutoPreferredReplicaLeaderElection();
508532
} else {
509533
LOG.warn("Unknown event type: {}", event.getClass().getName());
510534
}
@@ -1053,6 +1077,88 @@ private <T> void processAccessContext(AccessContextEvent<T> event) {
10531077
}
10541078
}
10551079

1080+
private void processAutoPreferredReplicaLeaderElection() {
1081+
try {
1082+
LOG.info("Processing automatic preferred replica leader election");
1083+
checkAndTriggerAutoLeaderRebalance();
1084+
} finally {
1085+
scheduleAutoLeaderRebalanceTask(
1086+
conf.getInt(ConfigOptions.LEADER_IMBALANCE_CHECK_INTERVAL_SECONDS) * 1000L);
1087+
}
1088+
}
1089+
1090+
private void checkAndTriggerAutoLeaderRebalance() {
1091+
LOG.trace("Checking need to trigger auto leader balancing");
1092+
Map<Integer, Map<TableBucket, List<Integer>>> preferredReplicasForTopicsByTabletServers =
1093+
new HashMap<>();
1094+
coordinatorContext.allBuckets().stream()
1095+
.filter(tb -> !coordinatorContext.isToBeDeleted(tb))
1096+
.collect(Collectors.toMap(tb -> tb, coordinatorContext::getAssignment))
1097+
.forEach(
1098+
(tb, assignment) ->
1099+
preferredReplicasForTopicsByTabletServers
1100+
.computeIfAbsent(assignment.get(0), k -> new HashMap<>())
1101+
.put(tb, assignment));
1102+
1103+
// for each tablet server, check if a preferred replica election needs to be triggered
1104+
for (Map.Entry<Integer, Map<TableBucket, List<Integer>>> entry :
1105+
preferredReplicasForTopicsByTabletServers.entrySet()) {
1106+
int leader = entry.getKey();
1107+
Set<TableBucket> tableBucketsNotInPreferredReplica = new HashSet<>();
1108+
for (TableBucket tableBucket : entry.getValue().keySet()) {
1109+
Optional<LeaderAndIsr> leaderAndIsrOp =
1110+
coordinatorContext.getBucketLeaderAndIsr(tableBucket);
1111+
leaderAndIsrOp
1112+
.filter(leaderAndIsr -> leaderAndIsr.leader() != leader)
1113+
.ifPresent(
1114+
leaderAndIsr -> tableBucketsNotInPreferredReplica.add(tableBucket));
1115+
}
1116+
LOG.debug(
1117+
"Table buckets not in preferred replica for tablet server {} {}",
1118+
leader,
1119+
tableBucketsNotInPreferredReplica);
1120+
1121+
double imbalanceRatio =
1122+
(double) tableBucketsNotInPreferredReplica.size() / entry.getValue().size();
1123+
LOG.trace("Leader imbalance ratio for tablet server {} is {}", leader, imbalanceRatio);
1124+
1125+
// check ratio and if greater than desired ratio, trigger a rebalance for the table
1126+
// buckets
1127+
// that need to be on this tablet server
1128+
if (imbalanceRatio
1129+
> ((double)
1130+
conf.getInt(
1131+
ConfigOptions
1132+
.LEADER_IMBALANCE_PER_TABLET_SERVER_PERCENTAGE)
1133+
/ 100)) {
1134+
// do this check only if the tablet server is live and preferred replica election is
1135+
// not in progress
1136+
Set<TableBucket> candidateTableBuckets =
1137+
tableBucketsNotInPreferredReplica.stream()
1138+
.filter(
1139+
tb ->
1140+
!coordinatorContext.isToBeDeleted(tb)
1141+
&& coordinatorContext
1142+
.allBuckets()
1143+
.contains(tb)
1144+
&& canPreferredReplicaBeLeader(tb))
1145+
.collect(Collectors.toSet());
1146+
tableBucketStateMachine.handleStateChange(
1147+
candidateTableBuckets, OnlineBucket, PREFERRED_LEADER_ELECTION);
1148+
}
1149+
}
1150+
}
1151+
1152+
private boolean canPreferredReplicaBeLeader(TableBucket tableBucket) {
1153+
List<Integer> assignment = coordinatorContext.getAssignment(tableBucket);
1154+
List<Integer> liveReplicas =
1155+
assignment.stream()
1156+
.filter(replica -> coordinatorContext.isReplicaOnline(replica, tableBucket))
1157+
.collect(Collectors.toList());
1158+
List<Integer> isr = coordinatorContext.getBucketLeaderAndIsr(tableBucket).get().isr();
1159+
return preferredReplicaLeaderElection(assignment, liveReplicas, isr).isPresent();
1160+
}
1161+
10561162
private CommitLakeTableSnapshotResponse tryProcessCommitLakeTableSnapshot(
10571163
CommitLakeTableSnapshotEvent commitLakeTableSnapshotEvent) {
10581164
CommitLakeTableSnapshotData commitLakeTableSnapshotData =

fluss-server/src/main/java/com/alibaba/fluss/server/coordinator/CoordinatorServer.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@
4949
import com.alibaba.fluss.utils.ExceptionUtils;
5050
import com.alibaba.fluss.utils.ExecutorUtils;
5151
import com.alibaba.fluss.utils.concurrent.ExecutorThreadFactory;
52+
import com.alibaba.fluss.utils.concurrent.FlussScheduler;
5253
import com.alibaba.fluss.utils.concurrent.FutureUtils;
54+
import com.alibaba.fluss.utils.concurrent.Scheduler;
5355

5456
import org.slf4j.Logger;
5557
import org.slf4j.LoggerFactory;
@@ -69,6 +71,7 @@
6971
import java.util.concurrent.TimeUnit;
7072
import java.util.concurrent.atomic.AtomicBoolean;
7173

74+
import static com.alibaba.fluss.config.ConfigOptions.BACKGROUND_THREADS;
7275
import static com.alibaba.fluss.server.utils.LakeStorageUtils.extractLakeProperties;
7376
import static com.alibaba.fluss.utils.Preconditions.checkNotNull;
7477

@@ -137,6 +140,9 @@ public class CoordinatorServer extends ServerBase {
137140
@GuardedBy("lock")
138141
private ExecutorService ioExecutor;
139142

143+
@GuardedBy("lock")
144+
private Scheduler scheduler;
145+
140146
@GuardedBy("lock")
141147
@Nullable
142148
private Authorizer authorizer;
@@ -178,6 +184,9 @@ protected void startServices() throws Exception {
178184
this.coordinatorContext = new CoordinatorContext();
179185
this.metadataCache = new CoordinatorMetadataCache();
180186

187+
this.scheduler = new FlussScheduler(conf.get(BACKGROUND_THREADS));
188+
scheduler.startup();
189+
181190
this.authorizer = AuthorizerLoader.createAuthorizer(conf, zkClient, pluginManager);
182191
if (authorizer != null) {
183192
authorizer.startup();
@@ -240,7 +249,8 @@ protected void startServices() throws Exception {
240249
lakeTableTieringManager,
241250
serverMetricGroup,
242251
conf,
243-
ioExecutor);
252+
ioExecutor,
253+
scheduler);
244254
coordinatorEventProcessor.startup();
245255

246256
createDefaultDatabase();
@@ -379,6 +389,16 @@ CompletableFuture<Void> stopServices() {
379389
exception = ExceptionUtils.firstOrSuppressed(t, exception);
380390
}
381391

392+
// We must shut down the scheduler early because otherwise, the scheduler could
393+
// touch other resources that might have been shutdown and cause exceptions.
394+
try {
395+
if (scheduler != null) {
396+
scheduler.shutdown();
397+
}
398+
} catch (Throwable t) {
399+
exception = ExceptionUtils.firstOrSuppressed(t, exception);
400+
}
401+
382402
try {
383403
if (autoPartitionManager != null) {
384404
autoPartitionManager.close();
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package com.alibaba.fluss.server.coordinator.event;
19+
20+
/**
 * A coordinator event that triggers the automatic preferred replica leader election check, which
 * rebalances bucket leadership back to the preferred replicas.
 */
21+
public class AutoPreferredReplicaLeaderElection implements CoordinatorEvent {}

fluss-server/src/main/java/com/alibaba/fluss/server/coordinator/statemachine/ReplicaLeaderElectionAlgorithms.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,11 @@ public static Optional<Integer> controlledShutdownReplicaLeaderElection(
5252
}
5353
return Optional.empty();
5454
}
55+
56+
public static Optional<Integer> preferredReplicaLeaderElection(
57+
List<Integer> assignments, List<Integer> aliveReplicas, List<Integer> isr) {
58+
return assignments.stream()
59+
.findFirst()
60+
.filter(id -> aliveReplicas.contains(id) && isr.contains(id));
61+
}
5562
}

fluss-server/src/main/java/com/alibaba/fluss/server/coordinator/statemachine/ReplicaLeaderElectionStrategy.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@
2020
/** The strategies to elect the replica leader. */
2121
public enum ReplicaLeaderElectionStrategy {
2222
DEFAULT_ELECTION,
23-
CONTROLLED_SHUTDOWN_ELECTION
23+
CONTROLLED_SHUTDOWN_ELECTION,
24+
PREFERRED_LEADER_ELECTION
2425
}

fluss-server/src/main/java/com/alibaba/fluss/server/coordinator/statemachine/TableBucketStateMachine.java

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,10 @@
4242

4343
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.controlledShutdownReplicaLeaderElection;
4444
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.defaultReplicaLeaderElection;
45+
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.preferredReplicaLeaderElection;
4546
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.CONTROLLED_SHUTDOWN_ELECTION;
4647
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.DEFAULT_ELECTION;
48+
import static com.alibaba.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.PREFERRED_LEADER_ELECTION;
4749

4850
/* This file is based on source code of Apache Kafka Project (https://kafka.apache.org/), licensed by the Apache
4951
* Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
@@ -168,10 +170,16 @@ public void handleStateChange(
168170
* leader since the previous leader fail. Do: choose a new leader, send the leader info to the
169171
* servers that hold the replicas of the bucket and mark it as OnlineBucket.
170172
*
171-
* <p>-- For OnlineBucket -> OnlineBucket, it happens on tablet server that holds leaders of
172-
* bucket shutdown graceful. Coordinator server receives the shutdown request from tablet server
173-
* and choose other replicas as the leader. Do: choose a new leader, send the leader info to the
174-
* servers that hold the replicas of the bucket and mark it as OnlineBucket.
173+
* <p>-- For OnlineBucket -> OnlineBucket:
174+
*
175+
* <p>-- Case1: it happens when a tablet server that holds bucket leaders shuts down gracefully.
176+
* The coordinator server receives the shutdown request from the tablet server and chooses another replica
177+
* as the leader. Do: choose a new leader, send the leader info to the servers that hold the
178+
* replicas of the bucket and mark it as OnlineBucket.
179+
*
180+
* <p>-- Case2: it happens when the leader imbalance of the replicas exceeds the set threshold.
181+
* Do: elect the preferred leader, send the leader info to the servers that hold the replicas of
182+
* the bucket and mark it as OnlineBucket.
175183
*
176184
* <p>NewBucket, OnlineBucket, OfflineBucket -> OfflineBucket
177185
*
@@ -585,6 +593,7 @@ private String stringifyBucket(TableBucket tableBucket) {
585593
* <ol>
586594
* <li>new or offline bucket
587595
* <li>tabletServer controlled shutdown
596+
* <li>preferred replica leader rebalance
588597
* </ol>
589598
*/
590599
private Optional<ElectionResult> electLeader(
@@ -619,6 +628,9 @@ private Optional<ElectionResult> electLeader(
619628
leaderAndIsr.isr(),
620629
liveReplicas,
621630
shuttingDownTabletServers);
631+
} else if (electionStrategy == PREFERRED_LEADER_ELECTION) {
632+
leaderOpt =
633+
preferredReplicaLeaderElection(assignment, liveReplicas, leaderAndIsr.isr());
622634
}
623635

624636
if (!leaderOpt.isPresent()) {

0 commit comments

Comments
 (0)