-
Notifications
You must be signed in to change notification settings - Fork 831
Disconnect tasklist pollers on domain failover using callback #6903
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
4adcd83
9b675bd
2645cd3
378b387
6f24d28
237b41c
c2c6426
803bb59
8627f29
dcd86ce
c036145
ec8d6b5
ef8cb30
8292668
f3f2a53
cdd8f8d
fb32497
69957f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -97,6 +97,7 @@ type ( | |
membershipResolver membership.Resolver | ||
isolationState isolationgroup.State | ||
timeSource clock.TimeSource | ||
notificationVersion int64 | ||
} | ||
|
||
// HistoryInfo consists of two integers regarding the history size and history count | ||
|
@@ -162,6 +163,7 @@ func NewEngine( | |
} | ||
|
||
func (e *matchingEngineImpl) Start() { | ||
e.registerDomainFailoverCallback() | ||
} | ||
|
||
func (e *matchingEngineImpl) Stop() { | ||
|
@@ -170,6 +172,7 @@ func (e *matchingEngineImpl) Stop() { | |
for _, l := range e.getTaskLists(math.MaxInt32) { | ||
l.Stop() | ||
} | ||
e.unregisterDomainFailoverCallback() | ||
e.shutdownCompletion.Wait() | ||
} | ||
|
||
|
@@ -535,7 +538,7 @@ pollLoop: | |
pollerCtx = tasklist.ContextWithIsolationGroup(pollerCtx, req.GetIsolationGroup()) | ||
tlMgr, err := e.getTaskListManager(taskListID, taskListKind) | ||
if err != nil { | ||
return nil, fmt.Errorf("couldn't load tasklist namanger: %w", err) | ||
return nil, fmt.Errorf("couldn't load tasklist manager: %w", err) | ||
} | ||
startT := time.Now() // Record the start time | ||
task, err := tlMgr.GetTask(pollerCtx, nil) | ||
|
@@ -724,7 +727,7 @@ pollLoop: | |
taskListKind := request.TaskList.Kind | ||
tlMgr, err := e.getTaskListManager(taskListID, taskListKind) | ||
if err != nil { | ||
return nil, fmt.Errorf("couldn't load tasklist namanger: %w", err) | ||
return nil, fmt.Errorf("couldn't load tasklist manager: %w", err) | ||
} | ||
startT := time.Now() // Record the start time | ||
task, err := tlMgr.GetTask(pollerCtx, maxDispatch) | ||
|
@@ -1425,6 +1428,66 @@ func (e *matchingEngineImpl) isShuttingDown() bool { | |
} | ||
} | ||
|
||
func (e *matchingEngineImpl) domainChangeCallback(nextDomains []*cache.DomainCacheEntry) { | ||
newNotificationVersion := e.notificationVersion | ||
|
||
for _, domain := range nextDomains { | ||
if domain.GetNotificationVersion() > newNotificationVersion { | ||
newNotificationVersion = domain.GetNotificationVersion() | ||
} | ||
|
||
if !isDomainEligibleToDisconnectPollers(domain, e.notificationVersion) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does domain notification version only change when active-> passive switch happens? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is true. I guess it can be more efficient, I'll change that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. notification version should change for every domain change I should think. I guess my question is here: what's the use-case or thing you're guarding against here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was trying to be more efficient when getting domain updates, but if we are monotonically increasing the value I'm not sure if it adds any value. If we always get values that were higher than the stored one, it'll not make any difference. Is my understanding correct here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. took me a second to get my head around the code structure, that makes sense. No concerns. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess this could be more efficient by checking the failover version of the domain. From my understanding, notification version is also updated when the domain metadata is updated. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
But the failover version is independent for each domain, right? I'd have to keep track of each domain failover version independently in the manager, not in the engine. I guess I could use that to track failover instead of using the domain's active name. Does that make sense? |
||
continue | ||
} | ||
|
||
req := &types.GetTaskListsByDomainRequest{ | ||
Domain: domain.GetInfo().Name, | ||
} | ||
|
||
resp, err := e.GetTaskListsByDomain(nil, req) | ||
if err != nil { | ||
continue | ||
} | ||
|
||
for taskListName := range resp.DecisionTaskListMap { | ||
e.disconnectTaskListPollersAfterDomainFailover(taskListName, domain, persistence.TaskListTypeDecision) | ||
} | ||
|
||
for taskListName := range resp.ActivityTaskListMap { | ||
e.disconnectTaskListPollersAfterDomainFailover(taskListName, domain, persistence.TaskListTypeActivity) | ||
} | ||
} | ||
e.notificationVersion = newNotificationVersion | ||
} | ||
|
||
func (e *matchingEngineImpl) registerDomainFailoverCallback() { | ||
e.domainCache.RegisterDomainChangeCallback( | ||
service.Matching, | ||
func(_ cache.DomainCache, _ cache.PrepareCallbackFn, _ cache.CallbackFn) {}, | ||
func() {}, | ||
e.domainChangeCallback) | ||
} | ||
|
||
func (e *matchingEngineImpl) unregisterDomainFailoverCallback() { | ||
e.domainCache.UnregisterDomainChangeCallback(service.Matching) | ||
} | ||
|
||
func (e *matchingEngineImpl) disconnectTaskListPollersAfterDomainFailover(taskListName string, domain *cache.DomainCacheEntry, taskType int) { | ||
taskList, err := tasklist.NewIdentifier(domain.GetInfo().ID, taskListName, taskType) | ||
if err != nil { | ||
return | ||
} | ||
tlMgr, err := e.getTaskListManager(taskList, types.TaskListKindNormal.Ptr()) | ||
if err != nil { | ||
e.logger.Error("Couldn't load tasklist manager", tag.Error(err)) | ||
return | ||
} | ||
|
||
if tlMgr.GetDomainActiveCluster() != "" && tlMgr.GetDomainActiveCluster() != domain.GetReplicationConfig().ActiveClusterName { | ||
tlMgr.DisconnectBlockedPollers(&domain.GetReplicationConfig().ActiveClusterName) | ||
} | ||
} | ||
|
||
func (m *lockableQueryTaskMap) put(key string, value chan *queryResult) { | ||
m.Lock() | ||
defer m.Unlock() | ||
|
@@ -1451,3 +1514,10 @@ func isMatchingRetryableError(err error) bool { | |
} | ||
return true | ||
} | ||
|
||
func isDomainEligibleToDisconnectPollers(domain *cache.DomainCacheEntry, currentVersion int64) bool { | ||
return domain.IsGlobalDomain() && | ||
domain.GetReplicationConfig() != nil && | ||
!domain.GetReplicationConfig().IsActiveActive() && | ||
domain.GetNotificationVersion() > currentVersion | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The matching engine is not created on demand, so it probably doesn't matter, but for consistency let's unregister during Stop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The matching engine can be completely removed and everything can be put inside the handler directly, but that change is not necessary.