Skip to content

Disconnect tasklist pollers on domain failover using callback #6903

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 106 additions & 20 deletions service/matching/handler/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,24 +79,25 @@ type (
}

// matchingEngineImpl groups the clients, caches, configuration and task list
// bookkeeping that the matching engine methods operate on.
matchingEngineImpl struct {
shutdownCompletion *sync.WaitGroup
shutdown chan struct{}
taskManager persistence.TaskManager
clusterMetadata cluster.Metadata
historyService history.Client
matchingClient matching.Client
tokenSerializer common.TaskTokenSerializer
logger log.Logger
metricsClient metrics.Client
taskListsLock sync.RWMutex // locks mutation of taskLists
taskLists map[tasklist.Identifier]tasklist.Manager // Convert to LRU cache
config *config.Config
lockableQueryTaskMap lockableQueryTaskMap
domainCache cache.DomainCache
versionChecker client.VersionChecker
membershipResolver membership.Resolver
isolationState isolationgroup.State
timeSource clock.TimeSource
shutdownCompletion *sync.WaitGroup
shutdown chan struct{}
taskManager persistence.TaskManager
clusterMetadata cluster.Metadata
historyService history.Client
matchingClient matching.Client
tokenSerializer common.TaskTokenSerializer
logger log.Logger
metricsClient metrics.Client
taskListsLock sync.RWMutex // locks mutation of taskLists
taskLists map[tasklist.Identifier]tasklist.Manager // Convert to LRU cache
config *config.Config
lockableQueryTaskMap lockableQueryTaskMap
domainCache cache.DomainCache
versionChecker client.VersionChecker
membershipResolver membership.Resolver
isolationState isolationgroup.State
timeSource clock.TimeSource
// failoverNotificationVersion is the highest domain failover notification
// version recorded so far. It is seeded by the catch-up function given to
// the domain cache and advanced by domainChangeCallback.
failoverNotificationVersion int64
}

// HistoryInfo consists of two integer regarding the history size and history count
Expand Down Expand Up @@ -162,6 +163,7 @@ func NewEngine(
}

// Start registers the engine's domain failover callback; it performs no
// other work.
func (e *matchingEngineImpl) Start() {
e.registerDomainFailoverCallback()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Matching engine is not created on-demand so it doesn't matter probably but for consistency reasons let's unregister during Stop.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The matching engine can be completely removed and everything can be put inside handler directly, but the change is not unnecessary.

}

func (e *matchingEngineImpl) Stop() {
Expand All @@ -170,6 +172,7 @@ func (e *matchingEngineImpl) Stop() {
for _, l := range e.getTaskLists(math.MaxInt32) {
l.Stop()
}
e.unregisterDomainFailoverCallback()
e.shutdownCompletion.Wait()
}

Expand Down Expand Up @@ -535,7 +538,7 @@ pollLoop:
pollerCtx = tasklist.ContextWithIsolationGroup(pollerCtx, req.GetIsolationGroup())
tlMgr, err := e.getTaskListManager(taskListID, taskListKind)
if err != nil {
return nil, fmt.Errorf("couldn't load tasklist namanger: %w", err)
return nil, fmt.Errorf("couldn't load tasklist manager: %w", err)
}
startT := time.Now() // Record the start time
task, err := tlMgr.GetTask(pollerCtx, nil)
Expand Down Expand Up @@ -724,7 +727,7 @@ pollLoop:
taskListKind := request.TaskList.Kind
tlMgr, err := e.getTaskListManager(taskListID, taskListKind)
if err != nil {
return nil, fmt.Errorf("couldn't load tasklist namanger: %w", err)
return nil, fmt.Errorf("couldn't load tasklist manager: %w", err)
}
startT := time.Now() // Record the start time
task, err := tlMgr.GetTask(pollerCtx, maxDispatch)
Expand Down Expand Up @@ -1425,6 +1428,82 @@ func (e *matchingEngineImpl) isShuttingDown() bool {
}
}

func (e *matchingEngineImpl) domainChangeCallback(nextDomains []*cache.DomainCacheEntry) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit side note: Domain cache calls these callbacks one by one and waits for them to return. This is the one and only domain change callback in a matching service instance so sync processing here is fine. It's a bigger problem on history side that all the callbacks have to be invoked sequentially before domain cache continues its work. This can potentially delay processing of new domain updates. Something to consider revisiting at some point if we decide to shorten domain cache refresh interval

newFailoverNotificationVersion := e.failoverNotificationVersion

for _, domain := range nextDomains {
if domain.GetFailoverNotificationVersion() > newFailoverNotificationVersion {
newFailoverNotificationVersion = domain.GetFailoverNotificationVersion()
}

if !isDomainEligibleToDisconnectPollers(domain, e.failoverNotificationVersion) {
continue
}

req := &types.GetTaskListsByDomainRequest{
Domain: domain.GetInfo().Name,
}

resp, err := e.GetTaskListsByDomain(nil, req)
if err != nil {
continue
}

for taskListName := range resp.DecisionTaskListMap {
e.disconnectTaskListPollersAfterDomainFailover(taskListName, domain, persistence.TaskListTypeDecision)
}

for taskListName := range resp.ActivityTaskListMap {
e.disconnectTaskListPollersAfterDomainFailover(taskListName, domain, persistence.TaskListTypeActivity)
}
}
e.failoverNotificationVersion = newFailoverNotificationVersion
}

// registerDomainFailoverCallback registers e.domainChangeCallback with the
// domain cache under the service.Matching ID, together with a catch-up
// function and a no-op prepare callback.
func (e *matchingEngineImpl) registerDomainFailoverCallback() {
	// Catch-up: raise the recorded version to the highest failover
	// notification version among the domains returned by GetAllDomain.
	seedVersion := func(domains cache.DomainCache, _ cache.PrepareCallbackFn, _ cache.CallbackFn) {
		for _, entry := range domains.GetAllDomain() {
			if entry.GetFailoverNotificationVersion() <= e.failoverNotificationVersion {
				continue
			}
			e.failoverNotificationVersion = entry.GetFailoverNotificationVersion()
		}
	}

	// The prepare callback intentionally does nothing.
	noopPrepare := func() {}

	e.domainCache.RegisterDomainChangeCallback(service.Matching, seedVersion, noopPrepare, e.domainChangeCallback)
}

// unregisterDomainFailoverCallback removes the domain change callback that
// was registered with the domain cache under the service.Matching ID.
func (e *matchingEngineImpl) unregisterDomainFailoverCallback() {
	// Same ID that registerDomainFailoverCallback uses when registering.
	const callbackID = service.Matching

	e.domainCache.UnregisterDomainChangeCallback(callbackID)
}

// disconnectTaskListPollersAfterDomainFailover builds the task list
// identifier from the domain ID, taskListName and taskType, obtains the
// corresponding task list manager (requested with the normal task list kind)
// and calls ReleaseBlockedPollers on it.
//
// The operation is best effort: every failure is logged with the domain and
// task list it concerns, and nothing is returned to the caller.
func (e *matchingEngineImpl) disconnectTaskListPollersAfterDomainFailover(taskListName string, domain *cache.DomainCacheEntry, taskType int) {
	domainInfo := domain.GetInfo()

	// All failure paths share the same tags so that each log line identifies
	// the affected domain and task list.
	logFailure := func(msg string, err error) {
		e.logger.Error(msg,
			tag.Error(err),
			tag.WorkflowDomainID(domainInfo.ID),
			tag.WorkflowDomainName(domainInfo.Name),
			tag.WorkflowTaskListName(taskListName),
			tag.WorkflowTaskListType(taskType),
		)
	}

	taskList, err := tasklist.NewIdentifier(domainInfo.ID, taskListName, taskType)
	if err != nil {
		// Previously dropped without a trace; log it like the other failures.
		logFailure("Couldn't create tasklist identifier", err)
		return
	}

	tlMgr, err := e.getTaskListManager(taskList, types.TaskListKindNormal.Ptr())
	if err != nil {
		logFailure("Couldn't load tasklist manager", err)
		return
	}

	if err := tlMgr.ReleaseBlockedPollers(); err != nil {
		logFailure("Couldn't disconnect tasklist pollers after domain failover", err)
	}
}

func (m *lockableQueryTaskMap) put(key string, value chan *queryResult) {
m.Lock()
defer m.Unlock()
Expand All @@ -1451,3 +1530,10 @@ func isMatchingRetryableError(err error) bool {
}
return true
}

// isDomainEligibleToDisconnectPollers reports whether all of the following
// hold for domain: it is a global domain, it has a non-nil replication config
// that is not active-active, and its failover notification version is
// strictly greater than currentVersion.
func isDomainEligibleToDisconnectPollers(domain *cache.DomainCacheEntry, currentVersion int64) bool {
	if !domain.IsGlobalDomain() {
		return false
	}

	replication := domain.GetReplicationConfig()
	if replication == nil || replication.IsActiveActive() {
		return false
	}

	return domain.GetFailoverNotificationVersion() > currentVersion
}
2 changes: 2 additions & 0 deletions service/matching/handler/engine_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ func (s *matchingEngineSuite) SetupTest() {
s.mockDomainCache.EXPECT().GetDomainByID(gomock.Any()).Return(cache.CreateDomainCacheEntry(matchingTestDomainName), nil).AnyTimes()
s.mockDomainCache.EXPECT().GetDomain(gomock.Any()).Return(cache.CreateDomainCacheEntry(matchingTestDomainName), nil).AnyTimes()
s.mockDomainCache.EXPECT().GetDomainName(gomock.Any()).Return(matchingTestDomainName, nil).AnyTimes()
s.mockDomainCache.EXPECT().RegisterDomainChangeCallback(service.Matching, gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
s.mockDomainCache.EXPECT().UnregisterDomainChangeCallback(service.Matching).AnyTimes()
s.mockMembershipResolver = membership.NewMockResolver(s.controller)
s.mockMembershipResolver.EXPECT().Lookup(gomock.Any(), gomock.Any()).Return(membership.HostInfo{}, nil).AnyTimes()
s.mockMembershipResolver.EXPECT().WhoAmI().Return(membership.HostInfo{}, nil).AnyTimes()
Expand Down
Loading