@@ -239,8 +239,7 @@ public void startup() {
239239 // can process the LeaderRequests that are generated by replicaStateMachine.startup() and
240240 // partitionStateMachine.startup().
241241 // update coordinator metadata cache when CoordinatorServer starts.
242- HashSet <ServerInfo > tabletServerInfoList =
243- new HashSet <>(coordinatorContext .getLiveTabletServers ().values ());
242+ Set <ServerInfo > tabletServerInfoList = coordinatorContext .liveTabletServerInfos ();
244243 serverMetadataCache .updateMetadata (
245244 coordinatorContext .getCoordinatorServerInfo (), tabletServerInfoList );
246245 updateTabletServerMetadataCacheWhenStartup (tabletServerInfoList );
@@ -515,7 +514,7 @@ public void process(CoordinatorEvent event) {
515514 }
516515
517516 private void updateMetrics () {
518- tabletServerCount = coordinatorContext .getLiveTabletServers ().size ();
517+ tabletServerCount = coordinatorContext .liveOrShuttingDownTabletServers ().size ();
519518 tableCount = coordinatorContext .allTables ().size ();
520519 bucketCount = coordinatorContext .bucketLeaderAndIsr ().size ();
521520 offlineBucketCount = coordinatorContext .getOfflineBucketCount ();
@@ -570,13 +569,10 @@ private void processCreateTable(CreateTableEvent createTableEvent) {
570569 .keySet ()
571570 .forEach (bucketId -> tableBuckets .add (new TableBucket (tableId , bucketId )));
572571 updateTabletServerMetadataCache (
573- new HashSet <>(coordinatorContext .getLiveTabletServers ().values ()),
574- null ,
575- null ,
576- tableBuckets );
572+ coordinatorContext .liveTabletServerInfos (), null , null , tableBuckets );
577573 } else {
578574 updateTabletServerMetadataCache (
579- new HashSet <>( coordinatorContext .getLiveTabletServers (). values () ),
575+ coordinatorContext .liveTabletServerInfos ( ),
580576 tableId ,
581577 null ,
582578 Collections .emptySet ());
@@ -609,10 +605,7 @@ private void processCreatePartition(CreatePartitionEvent createPartitionEvent) {
609605 bucketId ->
610606 tableBuckets .add (new TableBucket (tableId , partitionId , bucketId )));
611607 updateTabletServerMetadataCache (
612- new HashSet <>(coordinatorContext .getLiveTabletServers ().values ()),
613- null ,
614- null ,
615- tableBuckets );
608+ coordinatorContext .liveTabletServerInfos (), null , null , tableBuckets );
616609 }
617610
618611 private void processDropTable (DropTableEvent dropTableEvent ) {
@@ -636,10 +629,7 @@ private void processDropTable(DropTableEvent dropTableEvent) {
636629
637630 // send update metadata request.
638631 updateTabletServerMetadataCache (
639- new HashSet <>(coordinatorContext .getLiveTabletServers ().values ()),
640- tableId ,
641- null ,
642- Collections .emptySet ());
632+ coordinatorContext .liveTabletServerInfos (), tableId , null , Collections .emptySet ());
643633 }
644634
645635 private void processDropPartition (DropPartitionEvent dropPartitionEvent ) {
@@ -663,7 +653,7 @@ private void processDropPartition(DropPartitionEvent dropPartitionEvent) {
663653
664654 // send update metadata request.
665655 updateTabletServerMetadataCache (
666- new HashSet <>( coordinatorContext .getLiveTabletServers (). values () ),
656+ coordinatorContext .liveTabletServerInfos ( ),
667657 tableId ,
668658 tablePartition .getPartitionId (),
669659 Collections .emptySet ());
@@ -781,7 +771,7 @@ private void processNewTabletServer(NewTabletServerEvent newTabletServerEvent) {
781771 // when we finish the logic of tablet server
782772 ServerInfo serverInfo = newTabletServerEvent .getServerInfo ();
783773 int tabletServerId = serverInfo .id ();
784- if (coordinatorContext .getLiveTabletServers ().containsKey (serverInfo .id ())) {
774+ if (coordinatorContext .liveTabletServerIds ().contains (serverInfo .id ())) {
785775 // if the new server is already in live servers, return directly
786776 // it may happen during coordinator server initiation, the watcher watch a new tablet
787777 // server register event and put it to event manager, but after that, the coordinator
@@ -803,13 +793,10 @@ private void processNewTabletServer(NewTabletServerEvent newTabletServerEvent) {
803793 // update coordinatorServer metadata cache for the newly added tablet server.
804794 serverMetadataCache .updateMetadata (
805795 coordinatorContext .getCoordinatorServerInfo (),
806- new HashSet <>( coordinatorContext .getLiveTabletServers (). values () ));
796+ coordinatorContext .liveTabletServerInfos ( ));
807797 // update server info for all tablet servers.
808798 updateTabletServerMetadataCache (
809- new HashSet <>(coordinatorContext .getLiveTabletServers ().values ()),
810- null ,
811- null ,
812- Collections .emptySet ());
799+ coordinatorContext .liveTabletServerInfos (), null , null , Collections .emptySet ());
813800 // update table info for the newly added tablet server.
814801 updateTabletServerMetadataCache (
815802 Collections .singleton (serverInfo ),
@@ -838,7 +825,7 @@ private void processNewTabletServer(NewTabletServerEvent newTabletServerEvent) {
838825
839826 private void processDeadTabletServer (DeadTabletServerEvent deadTabletServerEvent ) {
840827 int tabletServerId = deadTabletServerEvent .getServerId ();
841- if (!coordinatorContext .getLiveTabletServers ().containsKey (tabletServerId )) {
828+ if (!coordinatorContext .liveOrShuttingDownTabletServers ().contains (tabletServerId )) {
842829 // if the dead server is already not in live servers, return directly
843830 // it may happen during coordinator server initiation, the watcher watch a new tablet
844831 // server unregister event, but the coordinator server also don't read it from zk and
@@ -856,8 +843,7 @@ private void processDeadTabletServer(DeadTabletServerEvent deadTabletServerEvent
856843 // coordinatorServer metadata. The purpose of this approach is to prevent the scenario where
857844 // NotifyLeaderAndIsrRequest gets sent before UpdateMetadataRequest, which could cause the
858845 // leader to incorrectly adjust isr.
859- Set <ServerInfo > serverInfos =
860- new HashSet <>(coordinatorContext .getLiveTabletServers ().values ());
846+ Set <ServerInfo > serverInfos = coordinatorContext .liveTabletServerInfos ();
861847 // update coordinatorServer metadata cache.
862848 serverMetadataCache .updateMetadata (
863849 coordinatorContext .getCoordinatorServerInfo (), serverInfos );
@@ -1118,7 +1104,7 @@ private ControlledShutdownResponse tryProcessControlledShutdown(
11181104 ControlledShutdownResponse response = new ControlledShutdownResponse ();
11191105
11201106 // TODO here we need to check tabletServerEpoch, avoid to receive controlled shutdown
1121- // request from and old tabletServer.
1107+ // request from an old tabletServer. Tracked by https://github.com/alibaba/fluss/issues/1153
11221108 int tabletServerEpoch = controlledShutdownEvent .getTabletServerEpoch ();
11231109
11241110 int tabletServerId = controlledShutdownEvent .getTabletServerId ();
@@ -1135,14 +1121,14 @@ private ControlledShutdownResponse tryProcessControlledShutdown(
11351121 LOG .debug (
11361122 "All shutting down tabletServers: {}" ,
11371123 coordinatorContext .shuttingDownTabletServers ());
1138- LOG .debug ("All live tabletServers: {}" , coordinatorContext .liveTabletServerSet ());
1124+ LOG .debug ("All live tabletServers: {}" , coordinatorContext .liveTabletServerIds ());
11391125
11401126 List <TableBucketReplica > replicasToActOn =
11411127 coordinatorContext .replicasOnTabletServer (tabletServerId ).stream ()
11421128 .filter (
11431129 replica -> {
11441130 TableBucket tableBucket = replica .getTableBucket ();
1145- return coordinatorContext .getAssignment (tableBucket ).size () >= 1
1131+ return ! coordinatorContext .getAssignment (tableBucket ).isEmpty ()
11461132 && coordinatorContext
11471133 .getBucketLeaderAndIsr (tableBucket )
11481134 .isPresent ()
@@ -1165,16 +1151,7 @@ private ControlledShutdownResponse tryProcessControlledShutdown(
11651151 tableBucketStateMachine .handleStateChange (
11661152 bucketsLedByServer , OnlineBucket , CONTROLLED_SHUTDOWN_ELECTION );
11671153
1168- coordinatorRequestBatch .newBatch ();
1169- replicasFollowedByServer .forEach (
1170- replica ->
1171- coordinatorRequestBatch .addStopReplicaRequestForTabletServers (
1172- Collections .singleton (tabletServerId ),
1173- replica .getTableBucket (),
1174- false ,
1175- coordinatorContext .getBucketLeaderEpoch (replica .getTableBucket ())));
1176- coordinatorRequestBatch .sendRequestToTabletServers (
1177- coordinatorContext .getCoordinatorEpoch ());
1154+ // TODO: do we need to send a stop request to the leader?
11781155
11791156 // If the tabletServer is a follower, updates the isr in ZK and notifies the current leader.
11801157 replicaStateMachine .handleStateChanges (replicasFollowedByServer , OfflineReplica );
0 commit comments