-
Notifications
You must be signed in to change notification settings - Fork 846
Disconnect tasklist pollers on domain failover using callback #6903
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4adcd83
9b675bd
2645cd3
378b387
6f24d28
237b41c
c2c6426
803bb59
8627f29
dcd86ce
c036145
ec8d6b5
ef8cb30
8292668
f3f2a53
cdd8f8d
fb32497
69957f5
9f65af9
02c47eb
08d395c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,24 +79,25 @@ type ( | |
} | ||
|
||
matchingEngineImpl struct { | ||
shutdownCompletion *sync.WaitGroup | ||
shutdown chan struct{} | ||
taskManager persistence.TaskManager | ||
clusterMetadata cluster.Metadata | ||
historyService history.Client | ||
matchingClient matching.Client | ||
tokenSerializer common.TaskTokenSerializer | ||
logger log.Logger | ||
metricsClient metrics.Client | ||
taskListsLock sync.RWMutex // locks mutation of taskLists | ||
taskLists map[tasklist.Identifier]tasklist.Manager // Convert to LRU cache | ||
config *config.Config | ||
lockableQueryTaskMap lockableQueryTaskMap | ||
domainCache cache.DomainCache | ||
versionChecker client.VersionChecker | ||
membershipResolver membership.Resolver | ||
isolationState isolationgroup.State | ||
timeSource clock.TimeSource | ||
shutdownCompletion *sync.WaitGroup | ||
shutdown chan struct{} | ||
taskManager persistence.TaskManager | ||
clusterMetadata cluster.Metadata | ||
historyService history.Client | ||
matchingClient matching.Client | ||
tokenSerializer common.TaskTokenSerializer | ||
logger log.Logger | ||
metricsClient metrics.Client | ||
taskListsLock sync.RWMutex // locks mutation of taskLists | ||
taskLists map[tasklist.Identifier]tasklist.Manager // Convert to LRU cache | ||
config *config.Config | ||
lockableQueryTaskMap lockableQueryTaskMap | ||
domainCache cache.DomainCache | ||
versionChecker client.VersionChecker | ||
membershipResolver membership.Resolver | ||
isolationState isolationgroup.State | ||
timeSource clock.TimeSource | ||
failoverNotificationVersion int64 | ||
} | ||
|
||
// HistoryInfo consists of two integer regarding the history size and history count | ||
|
@@ -162,6 +163,7 @@ func NewEngine( | |
} | ||
|
||
// Start registers the engine's domain failover callback via registerDomainFailoverCallback. | ||
func (e *matchingEngineImpl) Start() { | ||
e.registerDomainFailoverCallback() | ||
} | ||
|
||
func (e *matchingEngineImpl) Stop() { | ||
|
@@ -170,6 +172,7 @@ func (e *matchingEngineImpl) Stop() { | |
for _, l := range e.getTaskLists(math.MaxInt32) { | ||
l.Stop() | ||
} | ||
e.unregisterDomainFailoverCallback() | ||
e.shutdownCompletion.Wait() | ||
} | ||
|
||
|
@@ -535,7 +538,7 @@ pollLoop: | |
pollerCtx = tasklist.ContextWithIsolationGroup(pollerCtx, req.GetIsolationGroup()) | ||
tlMgr, err := e.getTaskListManager(taskListID, taskListKind) | ||
if err != nil { | ||
return nil, fmt.Errorf("couldn't load tasklist namanger: %w", err) | ||
return nil, fmt.Errorf("couldn't load tasklist manager: %w", err) | ||
} | ||
startT := time.Now() // Record the start time | ||
task, err := tlMgr.GetTask(pollerCtx, nil) | ||
|
@@ -724,7 +727,7 @@ pollLoop: | |
taskListKind := request.TaskList.Kind | ||
tlMgr, err := e.getTaskListManager(taskListID, taskListKind) | ||
if err != nil { | ||
return nil, fmt.Errorf("couldn't load tasklist namanger: %w", err) | ||
return nil, fmt.Errorf("couldn't load tasklist manager: %w", err) | ||
} | ||
startT := time.Now() // Record the start time | ||
task, err := tlMgr.GetTask(pollerCtx, maxDispatch) | ||
|
@@ -1425,6 +1428,82 @@ func (e *matchingEngineImpl) isShuttingDown() bool { | |
} | ||
} | ||
|
||
// domainChangeCallback is the callback handed to the domain cache by | ||
// registerDomainFailoverCallback. For every entry in nextDomains that passes | ||
// isDomainEligibleToDisconnectPollers it looks up the domain's task lists and calls | ||
// disconnectTaskListPollersAfterDomainFailover for each decision and activity task list. | ||
// It finishes by setting e.failoverNotificationVersion to the highest failover | ||
// notification version seen (never lower than the value it started with). | ||
func (e *matchingEngineImpl) domainChangeCallback(nextDomains []*cache.DomainCacheEntry) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit side note: Domain cache calls these callbacks one by one and waits for them to return. This is the one and only domain change callback in a matching service instance so sync processing here is fine. It's a bigger problem on history side that all the callbacks have to be invoked sequentially before domain cache continues its work. This can potentially delay processing of new domain updates. Something to consider revisiting at some point if we decide to shorten domain cache refresh interval |
||
// Track the maximum locally: eligibility below is still checked against the version | ||
// stored on the engine before this call, which is only overwritten after the loop. | ||
newFailoverNotificationVersion := e.failoverNotificationVersion | ||
|
||
for _, domain := range nextDomains { | ||
if domain.GetFailoverNotificationVersion() > newFailoverNotificationVersion { | ||
newFailoverNotificationVersion = domain.GetFailoverNotificationVersion() | ||
} | ||
|
||
if !isDomainEligibleToDisconnectPollers(domain, e.failoverNotificationVersion) { | ||
continue | ||
} | ||
|
||
req := &types.GetTaskListsByDomainRequest{ | ||
Domain: domain.GetInfo().Name, | ||
} | ||
|
||
// NOTE(review): nil is passed as the first argument (presumably a context.Context — confirm), | ||
// and a lookup error skips the domain without logging, so its pollers stay connected. | ||
resp, err := e.GetTaskListsByDomain(nil, req) | ||
if err != nil { | ||
continue | ||
} | ||
|
||
for taskListName := range resp.DecisionTaskListMap { | ||
e.disconnectTaskListPollersAfterDomainFailover(taskListName, domain, persistence.TaskListTypeDecision) | ||
} | ||
|
||
for taskListName := range resp.ActivityTaskListMap { | ||
e.disconnectTaskListPollersAfterDomainFailover(taskListName, domain, persistence.TaskListTypeActivity) | ||
} | ||
} | ||
e.failoverNotificationVersion = newFailoverNotificationVersion | ||
} | ||
|
||
// registerDomainFailoverCallback registers domainChangeCallback with the domain cache | ||
// under the service.Matching key. | ||
func (e *matchingEngineImpl) registerDomainFailoverCallback() { | ||
// catchUpFn raises e.failoverNotificationVersion to the highest failover notification | ||
// version among the domains returned by GetAllDomain; its two callback arguments are ignored. | ||
// NOTE(review): presumably invoked by the cache at registration so that failovers which | ||
// happened before startup are not treated as new — confirm against the domain cache. | ||
catchUpFn := func(domainCache cache.DomainCache, _ cache.PrepareCallbackFn, _ cache.CallbackFn) { | ||
for _, domain := range domainCache.GetAllDomain() { | ||
if domain.GetFailoverNotificationVersion() > e.failoverNotificationVersion { | ||
e.failoverNotificationVersion = domain.GetFailoverNotificationVersion() | ||
} | ||
} | ||
} | ||
|
||
// The third argument is a no-op function; presumably the prepare callback — confirm. | ||
e.domainCache.RegisterDomainChangeCallback( | ||
service.Matching, | ||
catchUpFn, | ||
func() {}, | ||
e.domainChangeCallback) | ||
} | ||
|
||
// unregisterDomainFailoverCallback asks the domain cache to unregister the change | ||
// callback stored under the service.Matching key. | ||
func (e *matchingEngineImpl) unregisterDomainFailoverCallback() { | ||
e.domainCache.UnregisterDomainChangeCallback(service.Matching) | ||
} | ||
|
||
// disconnectTaskListPollersAfterDomainFailover resolves the task list manager for the | ||
// given domain ID, task list name and task type, then calls ReleaseBlockedPollers on it. | ||
// Nothing is returned to the caller: manager-load and release failures are logged, | ||
// while a failure to build the task list identifier returns without logging. | ||
func (e *matchingEngineImpl) disconnectTaskListPollersAfterDomainFailover(taskListName string, domain *cache.DomainCacheEntry, taskType int) { | ||
taskList, err := tasklist.NewIdentifier(domain.GetInfo().ID, taskListName, taskType) | ||
if err != nil { | ||
return | ||
} | ||
// The manager is always requested with TaskListKindNormal, whatever kind the task list has. | ||
tlMgr, err := e.getTaskListManager(taskList, types.TaskListKindNormal.Ptr()) | ||
if err != nil { | ||
e.logger.Error("Couldn't load tasklist manager", tag.Error(err)) | ||
return | ||
} | ||
|
||
err = tlMgr.ReleaseBlockedPollers() | ||
if err != nil { | ||
e.logger.Error("Couldn't disconnect tasklist pollers after domain failover", | ||
tag.Error(err), | ||
tag.WorkflowDomainID(domain.GetInfo().ID), | ||
tag.WorkflowDomainName(domain.GetInfo().Name), | ||
tag.WorkflowTaskListName(taskListName), | ||
tag.WorkflowTaskListType(taskType), | ||
) | ||
return | ||
} | ||
} | ||
|
||
func (m *lockableQueryTaskMap) put(key string, value chan *queryResult) { | ||
m.Lock() | ||
defer m.Unlock() | ||
|
@@ -1451,3 +1530,10 @@ func isMatchingRetryableError(err error) bool { | |
} | ||
return true | ||
} | ||
|
||
// isDomainEligibleToDisconnectPollers reports whether domain is a global domain with a | ||
// non-nil replication config that is not active-active, and whose failover notification | ||
// version is strictly greater than currentVersion. | ||
func isDomainEligibleToDisconnectPollers(domain *cache.DomainCacheEntry, currentVersion int64) bool { | ||
return domain.IsGlobalDomain() && | ||
domain.GetReplicationConfig() != nil && | ||
!domain.GetReplicationConfig().IsActiveActive() && | ||
domain.GetFailoverNotificationVersion() > currentVersion | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The matching engine is not created on demand, so it probably doesn't matter, but for consistency let's unregister during Stop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The matching engine can be completely removed and everything can be put inside the handler directly, but the change is not necessary.