Skip to content

Commit 622fdce

Browse files
authored
Added exponential retry to the domain cache (#6676)
What changed? Added a proper retry policy to the domain cache so it does not just retry every second Why? The load from retrying every second can cause too much load on the database. How did you test it? Unit test Potential risks Might cause domain data to be out of date, but we are already in that situation when it's failing. Release notes Documentation Changes
1 parent f717213 commit 622fdce

File tree

3 files changed

+52
-18
lines changed

3 files changed

+52
-18
lines changed

common/cache/domainCache.go

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"time"
3434

3535
"github.com/uber/cadence/common"
36+
"github.com/uber/cadence/common/backoff"
3637
"github.com/uber/cadence/common/clock"
3738
"github.com/uber/cadence/common/cluster"
3839
"github.com/uber/cadence/common/errors"
@@ -60,10 +61,7 @@ const (
6061
domainCacheMinRefreshInterval = 1 * time.Second
6162
// DomainCacheRefreshInterval domain cache refresh interval
6263
DomainCacheRefreshInterval = 10 * time.Second
63-
// DomainCacheRefreshFailureRetryInterval is the wait time
64-
// if refreshment encounters error
65-
DomainCacheRefreshFailureRetryInterval = 1 * time.Second
66-
domainCacheRefreshPageSize = 200
64+
domainCacheRefreshPageSize = 200
6765

6866
domainCachePersistenceTimeout = 3 * time.Second
6967

@@ -119,6 +117,8 @@ type (
119117
callbackLock sync.Mutex
120118
prepareCallbacks map[int]PrepareCallbackFn
121119
callbacks map[int]CallbackFn
120+
121+
throttleRetry *backoff.ThrottleRetry
122122
}
123123

124124
// DomainCacheEntries is DomainCacheEntry slice
@@ -160,6 +160,11 @@ func NewDomainCache(
160160
opts ...DomainCacheOption,
161161
) *DefaultDomainCache {
162162

163+
throttleRetry := backoff.NewThrottleRetry(
164+
backoff.WithRetryPolicy(common.CreateDomainCacheRetryPolicy()),
165+
backoff.WithRetryableError(common.IsServiceTransientError),
166+
)
167+
163168
cache := &DefaultDomainCache{
164169
status: domainCacheInitialized,
165170
shutdownChan: make(chan struct{}),
@@ -172,6 +177,7 @@ func NewDomainCache(
172177
logger: logger,
173178
prepareCallbacks: make(map[int]PrepareCallbackFn),
174179
callbacks: make(map[int]CallbackFn),
180+
throttleRetry: throttleRetry,
175181
}
176182
cache.cacheNameToID.Store(newDomainCache())
177183
cache.cacheByID.Store(newDomainCache())
@@ -411,15 +417,12 @@ func (c *DefaultDomainCache) refreshLoop() {
411417
case <-c.shutdownChan:
412418
return
413419
case <-timer.Chan():
414-
for err := c.refreshDomains(); err != nil; err = c.refreshDomains() {
415-
select {
416-
case <-c.shutdownChan:
417-
return
418-
default:
419-
c.logger.Error("Error refreshing domain cache", tag.Error(err))
420-
c.timeSource.Sleep(DomainCacheRefreshFailureRetryInterval)
421-
}
420+
err := c.refreshDomains()
421+
if err != nil {
422+
c.logger.Error("Error refreshing domain cache", tag.Error(err))
423+
continue
422424
}
425+
423426
c.logger.Debug("Domain cache refreshed")
424427
}
425428
}
@@ -428,7 +431,7 @@ func (c *DefaultDomainCache) refreshLoop() {
428431
func (c *DefaultDomainCache) refreshDomains() error {
429432
c.refreshLock.Lock()
430433
defer c.refreshLock.Unlock()
431-
return c.refreshDomainsLocked()
434+
return c.throttleRetry.Do(context.Background(), c.refreshDomainsLocked)
432435
}
433436

434437
// this function only refresh the domains in the v2 table

common/cache/domainCache_test.go

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -897,13 +897,11 @@ func (s *domainCacheSuite) Test_refreshLoop_domainCacheRefreshedError() {
897897

898898
s.domainCache.timeSource = mockedTimeSource
899899

900-
s.metadataMgr.On("GetMetadata", mock.Anything).Return(nil, assert.AnError).Twice()
900+
s.metadataMgr.On("GetMetadata", mock.Anything).Return(nil, assert.AnError).Once()
901901

902902
go func() {
903903
mockedTimeSource.BlockUntil(1)
904904
mockedTimeSource.Advance(DomainCacheRefreshInterval)
905-
mockedTimeSource.BlockUntil(2)
906-
mockedTimeSource.Advance(DomainCacheRefreshFailureRetryInterval)
907905
s.domainCache.shutdownChan <- struct{}{}
908906
}()
909907

@@ -921,14 +919,34 @@ func (s *domainCacheSuite) Test_refreshDomainsLocked_IntervalTooShort() {
921919
s.NoError(err)
922920
}
923921

924-
func (s *domainCacheSuite) Test_refreshDomainsLocked_ListDomainsError() {
922+
func (s *domainCacheSuite) Test_refreshDomains_ListDomainsNonRetryableError() {
925923
s.metadataMgr.On("GetMetadata", mock.Anything).Return(&persistence.GetMetadataResponse{NotificationVersion: 0}, nil).Once()
926924
s.metadataMgr.On("ListDomains", mock.Anything, mock.Anything).Return(nil, assert.AnError).Once()
927925

928-
err := s.domainCache.refreshDomainsLocked()
926+
err := s.domainCache.refreshDomains()
929927
s.ErrorIs(err, assert.AnError)
930928
}
931929

930+
func (s *domainCacheSuite) Test_refreshDomains_ListDomainsRetryableError() {
931+
retryableError := &types.ServiceBusyError{
932+
Message: "Service is busy",
933+
}
934+
935+
// We expect the metadataMgr to be called twice, once for the initial attempt and once for the retry
936+
s.metadataMgr.On("GetMetadata", mock.Anything).Return(&persistence.GetMetadataResponse{NotificationVersion: 0}, nil).Times(2)
937+
938+
// First time return retryable error
939+
s.metadataMgr.On("ListDomains", mock.Anything, mock.Anything).Return(nil, retryableError).Once()
940+
941+
// Second time return non-retryable error
942+
s.metadataMgr.On("ListDomains", mock.Anything, mock.Anything).Return(nil, assert.AnError).Once()
943+
944+
err := s.domainCache.refreshDomains()
945+
946+
// We expect the error to be the first error
947+
s.ErrorIs(err, retryableError)
948+
}
949+
932950
func (s *domainCacheSuite) TestDomainCacheEntry_Getters() {
933951
gen := testdatagen.New(s.T())
934952

common/util.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ const (
8888
taskCompleterMaxInterval = 10 * time.Second
8989
taskCompleterExpirationInterval = 5 * time.Minute
9090

91+
domainCacheInitialInterval = 1 * time.Second
92+
domainCacheMaxInterval = 5 * time.Second
93+
domainCacheExpirationInterval = 2 * time.Minute
94+
9195
contextExpireThreshold = 10 * time.Millisecond
9296

9397
// FailureReasonCompleteResultExceedsLimit is failureReason for complete result exceeds limit
@@ -228,6 +232,15 @@ func CreateTaskCompleterRetryPolicy() backoff.RetryPolicy {
228232
return policy
229233
}
230234

235+
// CreateDomainCacheRetryPolicy creates a retry policy to handle domain cache refresh failures
236+
func CreateDomainCacheRetryPolicy() backoff.RetryPolicy {
237+
policy := backoff.NewExponentialRetryPolicy(domainCacheInitialInterval)
238+
policy.SetMaximumInterval(domainCacheMaxInterval)
239+
policy.SetExpirationInterval(domainCacheExpirationInterval)
240+
241+
return policy
242+
}
243+
231244
// IsValidIDLength checks if id is valid according to its length
232245
func IsValidIDLength(
233246
id string,

0 commit comments

Comments
 (0)