|
20 | 20 | import com.alibaba.fluss.metadata.TableBucket; |
21 | 21 | import com.alibaba.fluss.server.coordinator.CoordinatorContext; |
22 | 22 | import com.alibaba.fluss.server.coordinator.CoordinatorRequestBatch; |
| 23 | +import com.alibaba.fluss.server.entity.BatchRegisterLeadAndIsr; |
| 24 | +import com.alibaba.fluss.server.entity.RegisterTableBucketLeadAndIsrInfo; |
23 | 25 | import com.alibaba.fluss.server.zk.ZooKeeperClient; |
24 | 26 | import com.alibaba.fluss.server.zk.data.LeaderAndIsr; |
25 | 27 | import com.alibaba.fluss.shaded.guava32.com.google.common.collect.Sets; |
|
29 | 31 |
|
30 | 32 | import javax.annotation.Nullable; |
31 | 33 |
|
| 34 | +import java.util.ArrayList; |
32 | 35 | import java.util.HashSet; |
33 | 36 | import java.util.List; |
| 37 | +import java.util.Objects; |
34 | 38 | import java.util.Optional; |
35 | 39 | import java.util.Set; |
36 | 40 | import java.util.stream.Collectors; |
@@ -112,8 +116,14 @@ public void shutdown() { |
112 | 116 | public void handleStateChange(Set<TableBucket> tableBuckets, BucketState targetState) { |
113 | 117 | try { |
114 | 118 | coordinatorRequestBatch.newBatch(); |
115 | | - for (TableBucket tableBucket : tableBuckets) { |
116 | | - doHandleStateChange(tableBucket, targetState); |
| 119 | + |
| 120 | + if (checkIfCreateTablePartitionRequest(tableBuckets, targetState)) { |
| 121 | + // batch register table bucket lead and isr |
| 122 | + batchHandleOnlineChangeAndInitLeader(tableBuckets); |
| 123 | + } else { |
| 124 | + for (TableBucket tableBucket : tableBuckets) { |
| 125 | + doHandleStateChange(tableBucket, targetState); |
| 126 | + } |
117 | 127 | } |
118 | 128 | coordinatorRequestBatch.sendRequestToTabletServers( |
119 | 129 | coordinatorContext.getCoordinatorEpoch()); |
@@ -242,8 +252,139 @@ private void doHandleStateChange(TableBucket tableBucket, BucketState targetStat |
242 | 252 | } |
243 | 253 | } |
244 | 254 |
|
| 255 | + private boolean checkIfCreateTablePartitionRequest( |
| 256 | + Set<TableBucket> tableBuckets, BucketState targetState) { |
| 257 | + // Check if the state is from NewBucket -> OnlineBucket |
| 258 | + // and all buckets belong to a same table (partition). |
| 259 | + // If so, we will merge the register zk requests to speed up |
| 260 | + if (targetState != BucketState.OnlineBucket) { |
| 261 | + return false; |
| 262 | + } |
| 263 | + |
| 264 | + if (tableBuckets.isEmpty()) { |
| 265 | + return false; |
| 266 | + } |
| 267 | + |
| 268 | + TableBucket first = tableBuckets.iterator().next(); |
| 269 | + |
| 270 | + for (TableBucket tableBucket : tableBuckets) { |
| 271 | + BucketState currentState = coordinatorContext.getBucketState(tableBucket); |
| 272 | + if (currentState != BucketState.NewBucket) { |
| 273 | + return false; |
| 274 | + } |
| 275 | + |
| 276 | + if (tableBucket.getTableId() != first.getTableId() |
| 277 | + || !Objects.equals(tableBucket.getPartitionId(), first.getPartitionId())) { |
| 278 | + // not belong to the same table(partition). |
| 279 | + return false; |
| 280 | + } |
| 281 | + } |
| 282 | + return true; |
| 283 | + } |
| 284 | + |
245 | 285 | private Optional<ElectionResult> initLeaderForTableBuckets( |
246 | 286 | TableBucket tableBucket, List<Integer> assignedServers) { |
| 287 | + Optional<ElectionResult> optionalElectionResult = |
| 288 | + doInitElectionForBucket(tableBucket, assignedServers); |
| 289 | + if (optionalElectionResult.isPresent()) { |
| 290 | + ElectionResult electionResult = optionalElectionResult.get(); |
| 291 | + LeaderAndIsr leaderAndIsr = electionResult.leaderAndIsr; |
| 292 | + try { |
| 293 | + zooKeeperClient.registerLeaderAndIsr(tableBucket, leaderAndIsr); |
| 294 | + } catch (Exception e) { |
| 295 | + LOG.error( |
| 296 | + "Fail to create state node for table bucket {} in zookeeper.", |
| 297 | + stringifyBucket(tableBucket), |
| 298 | + e); |
| 299 | + return Optional.empty(); |
| 300 | + } |
| 301 | + coordinatorContext.putBucketLeaderAndIsr(tableBucket, leaderAndIsr); |
| 302 | + } |
| 303 | + return optionalElectionResult; |
| 304 | + } |
| 305 | + |
| 306 | + public void batchHandleOnlineChangeAndInitLeader(Set<TableBucket> tableBuckets) { |
| 307 | + if (tableBuckets.isEmpty()) { |
| 308 | + return; |
| 309 | + } |
| 310 | + |
| 311 | + TableBucket first = tableBuckets.iterator().next(); |
| 312 | + BatchRegisterLeadAndIsr batchRegister = |
| 313 | + new BatchRegisterLeadAndIsr(first.getTableId(), first.getPartitionId()); |
| 314 | + for (TableBucket tableBucket : tableBuckets) { |
| 315 | + // precheck partition name |
| 316 | + BucketState currentState = coordinatorContext.getBucketState(tableBucket); |
| 317 | + String partitionName = null; |
| 318 | + if (tableBucket.getPartitionId() != null) { |
| 319 | + partitionName = coordinatorContext.getPartitionName(tableBucket.getPartitionId()); |
| 320 | + if (partitionName == null) { |
| 321 | + LOG.error( |
| 322 | + "Can't find partition name for partition: {}.", |
| 323 | + tableBucket.getBucket()); |
| 324 | + logFailedStateChange(tableBucket, currentState, BucketState.OnlineBucket); |
| 325 | + continue; |
| 326 | + } |
| 327 | + } |
| 328 | + |
| 329 | + List<Integer> assignedServers = coordinatorContext.getAssignment(tableBucket); |
| 330 | + |
| 331 | + Optional<ElectionResult> optionalElectionResult = |
| 332 | + doInitElectionForBucket(tableBucket, assignedServers); |
| 333 | + if (!optionalElectionResult.isPresent()) { |
| 334 | + logFailedStateChange(tableBucket, currentState, BucketState.OnlineBucket); |
| 335 | + continue; |
| 336 | + } |
| 337 | + ElectionResult electionResult = optionalElectionResult.get(); |
| 338 | + |
| 339 | + batchRegister.add( |
| 340 | + tableBucket, |
| 341 | + electionResult.leaderAndIsr, |
| 342 | + partitionName, |
| 343 | + electionResult.liveReplicas); |
| 344 | + } |
| 345 | + |
| 346 | + List<RegisterTableBucketLeadAndIsrInfo> registerSuccessList = new ArrayList<>(); |
| 347 | + List<RegisterTableBucketLeadAndIsrInfo> tableBucketLeadAndIsrInfos = |
| 348 | + batchRegister.getRegisterList(); |
| 349 | + |
| 350 | + // Register the initial leader and isr. |
| 351 | + if (!tableBucketLeadAndIsrInfos.isEmpty()) { |
| 352 | + try { |
| 353 | + zooKeeperClient.batchRegisterLeaderAndIsrForTablePartition( |
| 354 | + tableBucketLeadAndIsrInfos); |
| 355 | + registerSuccessList.addAll(tableBucketLeadAndIsrInfos); |
| 356 | + } catch (Exception e) { |
| 357 | + LOG.error( |
| 358 | + "Fail to batch create state node for table buckets in zookeeper. The first bucket info: {}", |
| 359 | + stringifyBucket(tableBucketLeadAndIsrInfos.get(0).getTableBucket()), |
| 360 | + e); |
| 361 | + // Failed in batch mode, try to register one by one. |
| 362 | + registerSuccessList.addAll( |
| 363 | + tryRegisterLeaderAndIsrOneByOne(tableBucketLeadAndIsrInfos)); |
| 364 | + } |
| 365 | + } |
| 366 | + |
| 367 | + for (RegisterTableBucketLeadAndIsrInfo info : registerSuccessList) { |
| 368 | + TableBucket tableBucket = info.getTableBucket(); |
| 369 | + LeaderAndIsr leaderAndIsr = info.getLeaderAndIsr(); |
| 370 | + coordinatorContext.putBucketLeaderAndIsr(tableBucket, leaderAndIsr); |
| 371 | + |
| 372 | + // transmit state |
| 373 | + doStateChange(tableBucket, BucketState.OnlineBucket); |
| 374 | + // then send request to the tablet servers |
| 375 | + coordinatorRequestBatch.addNotifyLeaderRequestForTabletServers( |
| 376 | + new HashSet<>(info.getLiveReplicas()), |
| 377 | + PhysicalTablePath.of( |
| 378 | + coordinatorContext.getTablePathById(tableBucket.getTableId()), |
| 379 | + info.getPartitionName()), |
| 380 | + tableBucket, |
| 381 | + coordinatorContext.getAssignment(tableBucket), |
| 382 | + leaderAndIsr); |
| 383 | + } |
| 384 | + } |
| 385 | + |
| 386 | + private Optional<ElectionResult> doInitElectionForBucket( |
| 387 | + TableBucket tableBucket, List<Integer> assignedServers) { |
247 | 388 | // filter out the live servers |
248 | 389 | List<Integer> liveServers = |
249 | 390 | assignedServers.stream() |
@@ -286,19 +427,27 @@ private Optional<ElectionResult> initLeaderForTableBuckets( |
286 | 427 | // Register the initial leader and isr. |
287 | 428 | LeaderAndIsr leaderAndIsr = |
288 | 429 | new LeaderAndIsr(leader, 0, isr, coordinatorContext.getCoordinatorEpoch(), 0); |
289 | | - try { |
290 | | - zooKeeperClient.registerLeaderAndIsr(tableBucket, leaderAndIsr); |
291 | | - } catch (Exception e) { |
292 | | - LOG.error( |
293 | | - "Fail to create state node for table bucket {} in zookeeper.", |
294 | | - stringifyBucket(tableBucket), |
295 | | - e); |
296 | | - return Optional.empty(); |
297 | | - } |
298 | | - coordinatorContext.putBucketLeaderAndIsr(tableBucket, leaderAndIsr); |
| 430 | + |
299 | 431 | return Optional.of(new ElectionResult(liveServers, leaderAndIsr)); |
300 | 432 | } |
301 | 433 |
|
| 434 | + private List<RegisterTableBucketLeadAndIsrInfo> tryRegisterLeaderAndIsrOneByOne( |
| 435 | + List<RegisterTableBucketLeadAndIsrInfo> registerList) { |
| 436 | + List<RegisterTableBucketLeadAndIsrInfo> registerSuccessList = new ArrayList<>(); |
| 437 | + for (RegisterTableBucketLeadAndIsrInfo info : registerList) { |
| 438 | + try { |
| 439 | + zooKeeperClient.registerLeaderAndIsr(info.getTableBucket(), info.getLeaderAndIsr()); |
| 440 | + registerSuccessList.add(info); |
| 441 | + } catch (Exception e) { |
| 442 | + LOG.error( |
| 443 | + "Fail to create state node for table bucket {} in zookeeper.", |
| 444 | + stringifyBucket(info.getTableBucket()), |
| 445 | + e); |
| 446 | + } |
| 447 | + } |
| 448 | + return registerSuccessList; |
| 449 | + } |
| 450 | + |
302 | 451 | private Optional<ElectionResult> electNewLeaderForTableBuckets(TableBucket tableBucket) { |
303 | 452 | LeaderAndIsr leaderAndIsr; |
304 | 453 | try { |
|
0 commit comments