@@ -156,14 +156,14 @@ func (m *ChannelManagerImpl) Startup(ctx context.Context, legacyNodes, allNodes
156
156
m .mu .Lock ()
157
157
nodeChannels := m .store .GetNodeChannelsBy (
158
158
WithAllNodes (),
159
- func (ch * StateChannel ) bool {
159
+ func (ch * StateChannel ) bool { // Channel with drop-mark
160
160
return m .h .CheckShouldDropChannel (ch .GetName ())
161
161
})
162
- m .mu .Unlock ()
163
162
164
163
for _ , info := range nodeChannels {
165
164
m .finishRemoveChannel (info .NodeID , lo .Values (info .Channels )... )
166
165
}
166
+ m .mu .Unlock ()
167
167
168
168
if m .balanceCheckLoop != nil {
169
169
log .Ctx (ctx ).Info ("starting channel balance loop" )
@@ -238,6 +238,7 @@ func (m *ChannelManagerImpl) Watch(ctx context.Context, ch RWChannel) error {
238
238
zap .Array ("updates" , updates ), zap .Error (err ))
239
239
}
240
240
241
+ // Speed up channel assignment
241
242
// channel already written into meta, try to assign it to the cluster
242
243
// no error is returned if the assignment fails; it will be retried later
243
244
updates = m .assignPolicy (m .store .GetNodesChannels (), m .store .GetBufferChannelInfo (), m .legacyNodes .Collect ())
@@ -286,11 +287,8 @@ func (m *ChannelManagerImpl) DeleteNode(nodeID UniqueID) error {
286
287
return nil
287
288
}
288
289
289
- // reassign reassigns a channel to another DataNode.
290
+ // reassign reassigns a channel to another DataNode. It is an inner method: the caller must hold m.mu before calling it.
290
291
func (m * ChannelManagerImpl ) reassign (original * NodeChannelInfo ) error {
291
- m .mu .Lock ()
292
- defer m .mu .Unlock ()
293
-
294
292
updates := m .assignPolicy (m .store .GetNodesChannels (), original , m .legacyNodes .Collect ())
295
293
if updates != nil {
296
294
return m .execute (updates )
@@ -436,15 +434,16 @@ func (m *ChannelManagerImpl) CheckLoop(ctx context.Context) {
436
434
}
437
435
438
436
func (m * ChannelManagerImpl ) AdvanceChannelState (ctx context.Context ) {
439
- m .mu .RLock ()
437
+ m .mu .Lock ()
440
438
standbys := m .store .GetNodeChannelsBy (WithAllNodes (), WithChannelStates (Standby ))
441
439
toNotifies := m .store .GetNodeChannelsBy (WithoutBufferNode (), WithChannelStates (ToWatch , ToRelease ))
442
440
toChecks := m .store .GetNodeChannelsBy (WithoutBufferNode (), WithChannelStates (Watching , Releasing ))
443
- m .mu .RUnlock ()
444
441
445
- // Processing standby channels
446
- updatedStandbys := false
447
- updatedStandbys = m .advanceStandbys (ctx , standbys )
442
+ // Reassign standby channels while holding the lock to avoid concurrent assignment with Watch, Remove, AddNode, DeleteNode
443
+ updatedStandbys := m .advanceStandbys (ctx , standbys )
444
+ m .mu .Unlock ()
445
+
446
+ // RPCs stay outside the lock
448
447
updatedToCheckes := m .advanceToChecks (ctx , toChecks )
449
448
updatedToNotifies := m .advanceToNotifies (ctx , toNotifies )
450
449
@@ -453,9 +452,8 @@ func (m *ChannelManagerImpl) AdvanceChannelState(ctx context.Context) {
453
452
}
454
453
}
455
454
455
+ // finishRemoveChannel is an inner method: the caller must hold m.mu before calling it.
456
456
func (m * ChannelManagerImpl ) finishRemoveChannel (nodeID int64 , channels ... RWChannel ) {
457
- m .mu .Lock ()
458
- defer m .mu .Unlock ()
459
457
for _ , ch := range channels {
460
458
if err := m .removeChannel (nodeID , ch ); err != nil {
461
459
log .Warn ("Failed to remove channel" , zap .Any ("channel" , ch ), zap .Error (err ))
@@ -469,6 +467,7 @@ func (m *ChannelManagerImpl) finishRemoveChannel(nodeID int64, channels ...RWCha
469
467
}
470
468
}
471
469
470
+ // advanceStandbys is an inner method: the caller must hold m.mu before calling it.
472
471
func (m * ChannelManagerImpl ) advanceStandbys (ctx context.Context , standbys []* NodeChannelInfo ) bool {
473
472
var advanced bool = false
474
473
for _ , nodeAssign := range standbys {
@@ -576,7 +575,7 @@ func (m *ChannelManagerImpl) advanceToNotifies(ctx context.Context, toNotifies [
576
575
}
577
576
578
577
m .mu .Lock ()
579
- m .store .UpdateState (err == nil , nodeID , res .ch , res .opID )
578
+ m .store .UpdateState (err , nodeID , res .ch , res .opID )
580
579
m .mu .Unlock ()
581
580
}
582
581
@@ -592,9 +591,9 @@ func (m *ChannelManagerImpl) advanceToNotifies(ctx context.Context, toNotifies [
592
591
}
593
592
594
593
type poolResult struct {
595
- successful bool
596
- ch RWChannel
597
- opID int64
594
+ err error
595
+ ch RWChannel
596
+ opID int64
598
597
}
599
598
600
599
func (m * ChannelManagerImpl ) advanceToChecks (ctx context.Context , toChecks []* NodeChannelInfo ) bool {
@@ -620,10 +619,14 @@ func (m *ChannelManagerImpl) advanceToChecks(ctx context.Context, toChecks []*No
620
619
future := getOrCreateIOPool ().Submit (func () (any , error ) {
621
620
successful , got := m .Check (ctx , nodeID , tmpWatchInfo )
622
621
if got {
622
+ var err error
623
+ if ! successful {
624
+ err = errors .New ("operation in progress" )
625
+ }
623
626
return poolResult {
624
- successful : successful ,
625
- ch : innerCh ,
626
- opID : tmpWatchInfo .GetOpID (),
627
+ err : err ,
628
+ ch : innerCh ,
629
+ opID : tmpWatchInfo .GetOpID (),
627
630
}, nil
628
631
}
629
632
return nil , errors .New ("Got results with no progress" )
@@ -636,7 +639,7 @@ func (m *ChannelManagerImpl) advanceToChecks(ctx context.Context, toChecks []*No
636
639
if err == nil {
637
640
m .mu .Lock ()
638
641
result := got .(poolResult )
639
- m .store .UpdateState (result .successful , nodeID , result .ch , result .opID )
642
+ m .store .UpdateState (result .err , nodeID , result .ch , result .opID )
640
643
m .mu .Unlock ()
641
644
642
645
advanced = true
@@ -712,6 +715,7 @@ func (m *ChannelManagerImpl) Check(ctx context.Context, nodeID int64, info *data
712
715
return false , false
713
716
}
714
717
718
+ // execute is an inner method: the caller must hold the lock before calling it.
715
719
func (m * ChannelManagerImpl ) execute (updates * ChannelOpSet ) error {
716
720
for _ , op := range updates .ops {
717
721
if op .Type != Delete {
0 commit comments