@@ -79,24 +79,25 @@ type (
79
79
}
80
80
81
81
matchingEngineImpl struct {
82
- shutdownCompletion * sync.WaitGroup
83
- shutdown chan struct {}
84
- taskManager persistence.TaskManager
85
- clusterMetadata cluster.Metadata
86
- historyService history.Client
87
- matchingClient matching.Client
88
- tokenSerializer common.TaskTokenSerializer
89
- logger log.Logger
90
- metricsClient metrics.Client
91
- taskListsLock sync.RWMutex // locks mutation of taskLists
92
- taskLists map [tasklist.Identifier ]tasklist.Manager // Convert to LRU cache
93
- config * config.Config
94
- lockableQueryTaskMap lockableQueryTaskMap
95
- domainCache cache.DomainCache
96
- versionChecker client.VersionChecker
97
- membershipResolver membership.Resolver
98
- isolationState isolationgroup.State
99
- timeSource clock.TimeSource
82
+ shutdownCompletion * sync.WaitGroup
83
+ shutdown chan struct {}
84
+ taskManager persistence.TaskManager
85
+ clusterMetadata cluster.Metadata
86
+ historyService history.Client
87
+ matchingClient matching.Client
88
+ tokenSerializer common.TaskTokenSerializer
89
+ logger log.Logger
90
+ metricsClient metrics.Client
91
+ taskListsLock sync.RWMutex // locks mutation of taskLists
92
+ taskLists map [tasklist.Identifier ]tasklist.Manager // Convert to LRU cache
93
+ config * config.Config
94
+ lockableQueryTaskMap lockableQueryTaskMap
95
+ domainCache cache.DomainCache
96
+ versionChecker client.VersionChecker
97
+ membershipResolver membership.Resolver
98
+ isolationState isolationgroup.State
99
+ timeSource clock.TimeSource
100
+ failoverNotificationVersion int64
100
101
}
101
102
102
103
// HistoryInfo consists of two integer regarding the history size and history count
@@ -162,6 +163,7 @@ func NewEngine(
162
163
}
163
164
164
165
func (e * matchingEngineImpl ) Start () {
166
+ e .registerDomainFailoverCallback ()
165
167
}
166
168
167
169
func (e * matchingEngineImpl ) Stop () {
@@ -170,6 +172,7 @@ func (e *matchingEngineImpl) Stop() {
170
172
for _ , l := range e .getTaskLists (math .MaxInt32 ) {
171
173
l .Stop ()
172
174
}
175
+ e .unregisterDomainFailoverCallback ()
173
176
e .shutdownCompletion .Wait ()
174
177
}
175
178
@@ -535,7 +538,7 @@ pollLoop:
535
538
pollerCtx = tasklist .ContextWithIsolationGroup (pollerCtx , req .GetIsolationGroup ())
536
539
tlMgr , err := e .getTaskListManager (taskListID , taskListKind )
537
540
if err != nil {
538
- return nil , fmt .Errorf ("couldn't load tasklist namanger : %w" , err )
541
+ return nil , fmt .Errorf ("couldn't load tasklist manager : %w" , err )
539
542
}
540
543
startT := time .Now () // Record the start time
541
544
task , err := tlMgr .GetTask (pollerCtx , nil )
@@ -724,7 +727,7 @@ pollLoop:
724
727
taskListKind := request .TaskList .Kind
725
728
tlMgr , err := e .getTaskListManager (taskListID , taskListKind )
726
729
if err != nil {
727
- return nil , fmt .Errorf ("couldn't load tasklist namanger : %w" , err )
730
+ return nil , fmt .Errorf ("couldn't load tasklist manager : %w" , err )
728
731
}
729
732
startT := time .Now () // Record the start time
730
733
task , err := tlMgr .GetTask (pollerCtx , maxDispatch )
@@ -1425,6 +1428,82 @@ func (e *matchingEngineImpl) isShuttingDown() bool {
1425
1428
}
1426
1429
}
1427
1430
1431
+ func (e * matchingEngineImpl ) domainChangeCallback (nextDomains []* cache.DomainCacheEntry ) {
1432
+ newFailoverNotificationVersion := e .failoverNotificationVersion
1433
+
1434
+ for _ , domain := range nextDomains {
1435
+ if domain .GetFailoverNotificationVersion () > newFailoverNotificationVersion {
1436
+ newFailoverNotificationVersion = domain .GetFailoverNotificationVersion ()
1437
+ }
1438
+
1439
+ if ! isDomainEligibleToDisconnectPollers (domain , e .failoverNotificationVersion ) {
1440
+ continue
1441
+ }
1442
+
1443
+ req := & types.GetTaskListsByDomainRequest {
1444
+ Domain : domain .GetInfo ().Name ,
1445
+ }
1446
+
1447
+ resp , err := e .GetTaskListsByDomain (nil , req )
1448
+ if err != nil {
1449
+ continue
1450
+ }
1451
+
1452
+ for taskListName := range resp .DecisionTaskListMap {
1453
+ e .disconnectTaskListPollersAfterDomainFailover (taskListName , domain , persistence .TaskListTypeDecision )
1454
+ }
1455
+
1456
+ for taskListName := range resp .ActivityTaskListMap {
1457
+ e .disconnectTaskListPollersAfterDomainFailover (taskListName , domain , persistence .TaskListTypeActivity )
1458
+ }
1459
+ }
1460
+ e .failoverNotificationVersion = newFailoverNotificationVersion
1461
+ }
1462
+
1463
+ func (e * matchingEngineImpl ) registerDomainFailoverCallback () {
1464
+ catchUpFn := func (domainCache cache.DomainCache , _ cache.PrepareCallbackFn , _ cache.CallbackFn ) {
1465
+ for _ , domain := range domainCache .GetAllDomain () {
1466
+ if domain .GetFailoverNotificationVersion () > e .failoverNotificationVersion {
1467
+ e .failoverNotificationVersion = domain .GetFailoverNotificationVersion ()
1468
+ }
1469
+ }
1470
+ }
1471
+
1472
+ e .domainCache .RegisterDomainChangeCallback (
1473
+ service .Matching ,
1474
+ catchUpFn ,
1475
+ func () {},
1476
+ e .domainChangeCallback )
1477
+ }
1478
+
1479
+ func (e * matchingEngineImpl ) unregisterDomainFailoverCallback () {
1480
+ e .domainCache .UnregisterDomainChangeCallback (service .Matching )
1481
+ }
1482
+
1483
+ func (e * matchingEngineImpl ) disconnectTaskListPollersAfterDomainFailover (taskListName string , domain * cache.DomainCacheEntry , taskType int ) {
1484
+ taskList , err := tasklist .NewIdentifier (domain .GetInfo ().ID , taskListName , taskType )
1485
+ if err != nil {
1486
+ return
1487
+ }
1488
+ tlMgr , err := e .getTaskListManager (taskList , types .TaskListKindNormal .Ptr ())
1489
+ if err != nil {
1490
+ e .logger .Error ("Couldn't load tasklist manager" , tag .Error (err ))
1491
+ return
1492
+ }
1493
+
1494
+ err = tlMgr .ReleaseBlockedPollers ()
1495
+ if err != nil {
1496
+ e .logger .Error ("Couldn't disconnect tasklist pollers after domain failover" ,
1497
+ tag .Error (err ),
1498
+ tag .WorkflowDomainID (domain .GetInfo ().ID ),
1499
+ tag .WorkflowDomainName (domain .GetInfo ().Name ),
1500
+ tag .WorkflowTaskListName (taskListName ),
1501
+ tag .WorkflowTaskListType (taskType ),
1502
+ )
1503
+ return
1504
+ }
1505
+ }
1506
+
1428
1507
func (m * lockableQueryTaskMap ) put (key string , value chan * queryResult ) {
1429
1508
m .Lock ()
1430
1509
defer m .Unlock ()
@@ -1451,3 +1530,10 @@ func isMatchingRetryableError(err error) bool {
1451
1530
}
1452
1531
return true
1453
1532
}
1533
+
1534
+ func isDomainEligibleToDisconnectPollers (domain * cache.DomainCacheEntry , currentVersion int64 ) bool {
1535
+ return domain .IsGlobalDomain () &&
1536
+ domain .GetReplicationConfig () != nil &&
1537
+ ! domain .GetReplicationConfig ().IsActiveActive () &&
1538
+ domain .GetFailoverNotificationVersion () > currentVersion
1539
+ }
0 commit comments