Skip to content

Commit 264dfde

Browse files
Add support for periodic health-checks
1 parent 34e39a2 commit 264dfde

File tree

5 files changed

+203
-38
lines changed

5 files changed

+203
-38
lines changed

Diff for: CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
Release v1.2.12 (2023-01-10)
2+
===
3+
* Add support for healthcheck infrastructure
4+
15
Release v1.2.11 (2022-12-28)
26
===
37
* Made IdleConnectionReapDelay configurable for clients.

Diff for: dax/internal/client/cluster.go

+107-33
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ type Config struct {
6161
ClusterUpdateThreshold time.Duration
6262
ClusterUpdateInterval time.Duration
6363
IdleConnectionReapDelay time.Duration
64+
ClientHealthCheckInterval time.Duration
6465

6566
HostPorts []string
6667
Region string
@@ -110,6 +111,7 @@ var defaultConfig = Config{
110111
MaxPendingConnectionsPerHost: 10,
111112
ClusterUpdateInterval: time.Second * 4,
112113
ClusterUpdateThreshold: time.Millisecond * 125,
114+
ClientHealthCheckInterval: time.Second * 5,
113115

114116
Credentials: defaults.CredChain(defaults.Config(), defaults.Handlers()),
115117

@@ -418,10 +420,10 @@ func (cc *ClusterDaxClient) shouldRetry(o RequestOptions, err error) (request.Re
418420

419421
type cluster struct {
420422
lock sync.RWMutex
421-
active map[hostPort]DaxAPI // protected by lock
422-
routes []DaxAPI // protected by lock
423-
closed bool // protected by lock
424-
lastRefreshErr error // protected by lock
423+
active map[hostPort]clientAndConfig // protected by lock
424+
routes []DaxAPI // protected by lock
425+
closed bool // protected by lock
426+
lastRefreshErr error // protected by lock
425427

426428
lastUpdateNs int64
427429
executor *taskExecutor
@@ -431,6 +433,11 @@ type cluster struct {
431433
clientBuilder clientBuilder
432434
}
433435

436+
type clientAndConfig struct {
437+
client DaxAPI
438+
cfg serviceEndpoint
439+
}
440+
434441
func newCluster(cfg Config) (*cluster, error) {
435442
if err := cfg.validate(); err != nil {
436443
return nil, err
@@ -599,7 +606,7 @@ func (c *cluster) refresh(force bool) error {
599606
func (c *cluster) refreshNow() error {
600607
cfg, err := c.pullEndpoints()
601608
if err != nil {
602-
c.config.logger.Log(fmt.Sprintf("ERROR: Failed to refresh endpoint : %s", err))
609+
c.debugLog(fmt.Sprintf("ERROR: Failed to refresh endpoint : %s", err))
603610
return err
604611
}
605612
if !c.hasChanged(cfg) {
@@ -608,55 +615,116 @@ func (c *cluster) refreshNow() error {
608615
return c.update(cfg)
609616
}
610617

618+
// This method is responsible for updating the set of active routes tracked by
619+
// the clsuter-dax-client in response to updates in the roster.
611620
func (c *cluster) update(config []serviceEndpoint) error {
612621
newEndpoints := make(map[hostPort]struct{}, len(config))
613622
for _, cfg := range config {
614623
newEndpoints[cfg.hostPort()] = struct{}{}
615624
}
616625

617-
newActive := make(map[hostPort]DaxAPI, len(config))
626+
newActive := make(map[hostPort]clientAndConfig, len(config))
618627
newRoutes := make([]DaxAPI, len(config))
628+
shouldUpdateRoutes := true
629+
var toClose []clientAndConfig
630+
// Track the newly created client instances, so that we can clean them up in case of partial failures.
631+
var newCliCfg []clientAndConfig
632+
633+
c.lock.Lock()
619634

620-
c.lock.RLock()
621635
cls := c.closed
622636
oldActive := c.active
623-
c.lock.RUnlock()
624-
if cls {
625-
return nil
626-
}
627637

628-
var toClose []DaxAPI
629-
for ep, cli := range oldActive {
630-
_, ok := newEndpoints[ep]
631-
if !ok {
632-
toClose = append(toClose, cli)
638+
if cls {
639+
shouldUpdateRoutes = false
640+
} else {
641+
// Close the client instances that are no longer part of roster.
642+
for ep, clicfg := range oldActive {
643+
_, isPartOfUpdatedEndpointsConfig := newEndpoints[ep]
644+
if !isPartOfUpdatedEndpointsConfig {
645+
c.debugLog(fmt.Sprintf("Found updated endpoing configs, will close inactive endpoint client : %s", ep.host))
646+
toClose = append(toClose, clicfg)
647+
}
633648
}
634-
}
635-
for i, ep := range config {
636-
cli, ok := oldActive[ep.hostPort()]
637-
var err error
638-
if !ok {
639-
cli, err = c.newSingleClient(ep)
640-
if err != nil {
641-
return nil
649+
650+
// Create client instances for the new endpoints in roster.
651+
for i, ep := range config {
652+
cliAndCfg, alreadyExists := oldActive[ep.hostPort()]
653+
if !alreadyExists {
654+
cli, err := c.newSingleClient(ep)
655+
if err != nil {
656+
shouldUpdateRoutes = false
657+
break
658+
} else {
659+
cliAndCfg = clientAndConfig{client: cli, cfg: ep}
660+
newCliCfg = append(newCliCfg, cliAndCfg)
661+
}
662+
663+
if singleCli, ok := cli.(HealthCheckDaxAPI); ok {
664+
singleCli.startHealthChecks(c, ep.hostPort())
665+
}
642666
}
667+
newActive[ep.hostPort()] = cliAndCfg
668+
newRoutes[i] = cliAndCfg.client
643669
}
644-
newActive[ep.hostPort()] = cli
645-
newRoutes[i] = cli
646670
}
647-
c.lock.Lock()
648-
c.active = newActive
649-
c.routes = newRoutes
671+
672+
if shouldUpdateRoutes {
673+
c.active = newActive
674+
c.routes = newRoutes
675+
} else {
676+
// cleanup newly created clients if they are not going to be tracked further.
677+
toClose = append(toClose, newCliCfg...)
678+
}
650679
c.lock.Unlock()
651680

652681
go func() {
653682
for _, client := range toClose {
654-
c.closeClient(client)
683+
c.debugLog(fmt.Sprintf("Closing client for : %s", client.cfg.hostname))
684+
c.closeClient(client.client)
655685
}
656686
}()
657687
return nil
658688
}
659689

690+
func (c *cluster) onHealthCheckFailed(host hostPort) {
691+
c.lock.Lock()
692+
c.debugLog("DEBUG: Refreshing cache for host: " + host.host)
693+
shouldCloseOldClient := true
694+
var oldClientConfig, ok = c.active[host]
695+
if ok {
696+
var err error
697+
var cli DaxAPI
698+
cli, err = c.newSingleClient(oldClientConfig.cfg)
699+
if singleCli, ok := cli.(HealthCheckDaxAPI); ok {
700+
singleCli.startHealthChecks(c, host)
701+
}
702+
703+
if err == nil {
704+
c.active[host] = clientAndConfig{client: cli, cfg: oldClientConfig.cfg}
705+
706+
newRoutes := make([]DaxAPI, len(c.active))
707+
i := 0
708+
for _, cliAndCfg := range c.active {
709+
newRoutes[i] = cliAndCfg.client
710+
i++
711+
}
712+
c.routes = newRoutes
713+
} else {
714+
shouldCloseOldClient = false
715+
c.debugLog(fmt.Sprintf("DEBUG: Failed to refresh cache for host: " + host.host))
716+
}
717+
} else {
718+
c.debugLog(fmt.Sprintf("DEBUG: The node is not part of active routes. Ignoring the health check failure for host: " + host.host))
719+
}
720+
c.lock.Unlock()
721+
722+
if shouldCloseOldClient {
723+
c.debugLog(fmt.Sprintf("DEBUG: Closing old instance of a replaced client for endpoint: %s", oldClientConfig.cfg.hostPort().host))
724+
c.closeClient(oldClientConfig.client)
725+
}
726+
}
727+
660728
func (c *cluster) hasChanged(cfg []serviceEndpoint) bool {
661729
c.lock.RLock()
662730
defer c.lock.RUnlock()
@@ -692,9 +760,7 @@ func (c *cluster) pullEndpoints() ([]serviceEndpoint, error) {
692760
lastErr = err
693761
continue
694762
}
695-
if c.config.logger != nil && c.config.logLevel.AtLeast(aws.LogDebug) {
696-
c.config.logger.Log(fmt.Sprintf("DEBUG: Pulled endpoints from %s : %v", ip, endpoints))
697-
}
763+
c.debugLog(fmt.Sprintf("DEBUG: Pulled endpoints from %s : %v", ip, endpoints))
698764
if len(endpoints) > 0 {
699765
return endpoints, nil
700766
}
@@ -720,6 +786,14 @@ func (c *cluster) closeClient(client DaxAPI) {
720786
}
721787
}
722788

789+
func (c *cluster) debugLog(args ...interface{}) {
790+
if c.config.logger != nil && c.config.logLevel.AtLeast(aws.LogDebug) {
791+
{
792+
c.config.logger.Log(args)
793+
}
794+
}
795+
}
796+
723797
func (c *cluster) newSingleClient(cfg serviceEndpoint) (DaxAPI, error) {
724798
return c.clientBuilder.newClient(net.IP(cfg.address), cfg.port, c.config.connConfig, c.config.Region, c.config.Credentials, c.config.MaxPendingConnectionsPerHost, c.config.DialContext)
725799
}

Diff for: dax/internal/client/cluster_test.go

+66-4
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,7 @@ func TestCluster_update(t *testing.T) {
468468
}
469469
assertNumRoutes(cluster, 1, t)
470470
assertConnections(cluster, first, t)
471+
assertHealthCheckCalls(cluster, t)
471472

472473
// add new hosts
473474
second := []serviceEndpoint{{hostname: "localhost", port: 8121}, {hostname: "localhost", port: 8122}, {hostname: "localhost", port: 8123}}
@@ -479,6 +480,7 @@ func TestCluster_update(t *testing.T) {
479480
}
480481
assertNumRoutes(cluster, 3, t)
481482
assertConnections(cluster, second, t)
483+
assertHealthCheckCalls(cluster, t)
482484

483485
// replace host
484486
third := []serviceEndpoint{{hostname: "localhost", port: 8121}, {hostname: "localhost", port: 8122}, {hostname: "localhost", port: 8124}}
@@ -490,6 +492,7 @@ func TestCluster_update(t *testing.T) {
490492
}
491493
assertNumRoutes(cluster, 3, t)
492494
assertConnections(cluster, third, t)
495+
assertHealthCheckCalls(cluster, t)
493496

494497
// remove host
495498
fourth := []serviceEndpoint{{hostname: "localhost", port: 8122}, {hostname: "localhost", port: 8124}}
@@ -501,14 +504,50 @@ func TestCluster_update(t *testing.T) {
501504
}
502505
assertNumRoutes(cluster, 2, t)
503506
assertConnections(cluster, fourth, t)
507+
assertHealthCheckCalls(cluster, t)
504508

505509
// no change
506510
fifth := []serviceEndpoint{{hostname: "localhost", port: 8122}, {hostname: "localhost", port: 8124}}
507511
if cluster.hasChanged(fifth) {
508512
t.Errorf("unexpected config change")
509513
}
514+
if err := cluster.update(fifth); err != nil {
515+
t.Errorf("unexpected error %v", err)
516+
}
510517
assertNumRoutes(cluster, 2, t)
511518
assertConnections(cluster, fifth, t)
519+
assertHealthCheckCalls(cluster, t)
520+
}
521+
522+
func TestCluster_onHealthCheckFailed(t *testing.T) {
523+
cluster, clientBuilder := newTestCluster([]string{"127.0.0.1:8888"})
524+
endpoint := serviceEndpoint{hostname: "localhost", port: 8123}
525+
first := []serviceEndpoint{endpoint, {hostname: "localhost", port: 8124}, {hostname: "localhost", port: 8125}}
526+
cluster.update(first)
527+
528+
assertNumRoutes(cluster, 3, t)
529+
assertConnections(cluster, first, t)
530+
assertHealthCheckCalls(cluster, t)
531+
// Replace old instance of client with new one. Total client instances: 3 + 0
532+
assert.Equal(t, 3, len(clientBuilder.clients))
533+
assertCloseCalls(cluster, 0, t)
534+
535+
cluster.onHealthCheckFailed(endpoint.hostPort())
536+
assertNumRoutes(cluster, 3, t)
537+
assertConnections(cluster, first, t)
538+
assertHealthCheckCalls(cluster, t)
539+
// Replace old instance of client with new one. Total client instances: 3 + 1
540+
assert.Equal(t, 4, len(clientBuilder.clients))
541+
assertCloseCalls(cluster, 1, t)
542+
543+
// Another failure
544+
cluster.onHealthCheckFailed(endpoint.hostPort())
545+
assertNumRoutes(cluster, 3, t)
546+
assertConnections(cluster, first, t)
547+
assertHealthCheckCalls(cluster, t)
548+
// Replace old instance of client with new one. Total client instances: 3 + 2
549+
assert.Equal(t, 5, len(clientBuilder.clients))
550+
assertCloseCalls(cluster, 2, t)
512551
}
513552

514553
func TestCluster_client(t *testing.T) {
@@ -635,7 +674,7 @@ func assertConnections(cluster *cluster, endpoints []serviceEndpoint, t *testing
635674
if !ok {
636675
t.Errorf("missing client %v", hp)
637676
}
638-
if tc, ok := c.(*testClient); ok {
677+
if tc, ok := c.client.(*testClient); ok {
639678
if tc.hp != hp {
640679
t.Errorf("expected %v, got %v", hp, tc.hp)
641680
}
@@ -653,6 +692,25 @@ func assertNumRoutes(cluster *cluster, num int, t *testing.T) {
653692
}
654693
}
655694

695+
func assertHealthCheckCalls(cluster *cluster, t *testing.T) {
696+
for _, cliAndCfg := range cluster.active {
697+
healtCheckCalls := cliAndCfg.client.(*testClient).healthCheckCalls
698+
if healtCheckCalls != 1 {
699+
t.Errorf("expected 1 healthcheck call, got %d", healtCheckCalls)
700+
}
701+
}
702+
}
703+
704+
func assertCloseCalls(cluster *cluster, num int, t *testing.T) {
705+
cnt := 0
706+
for _, client := range cluster.clientBuilder.(*testClientBuilder).clients {
707+
if client.closeCalls == 1 {
708+
cnt++
709+
}
710+
}
711+
assert.Equal(t, num, cnt)
712+
}
713+
656714
func assertDiscoveryClient(client *testClient, t *testing.T) {
657715
if client.endpointsCalls != 1 {
658716
t.Errorf("expected 1, got %d", client.endpointsCalls)
@@ -751,9 +809,13 @@ func (b *testClientBuilder) newClient(ip net.IP, port int, connConfigData connCo
751809
}
752810

753811
type testClient struct {
754-
hp hostPort
755-
ep []serviceEndpoint
756-
endpointsCalls, closeCalls int
812+
hp hostPort
813+
ep []serviceEndpoint
814+
endpointsCalls, closeCalls, healthCheckCalls int
815+
}
816+
817+
func (c *testClient) startHealthChecks(cc *cluster, host hostPort) {
818+
c.healthCheckCalls++
757819
}
758820

759821
func (c *testClient) endpoints(opt RequestOptions) ([]serviceEndpoint, error) {

0 commit comments

Comments
 (0)