Skip to content

Commit 4684265

Browse files
committed
discovery: Fix tablets removed from healthcheck on topo error
Signed-off-by: Brendan Dougherty <[email protected]>
1 parent e1151b9 commit 4684265

File tree

2 files changed

+78
-6
lines changed

2 files changed

+78
-6
lines changed

go/vt/discovery/topology_watcher.go

+18-6
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,8 @@ func (tw *TopologyWatcher) loadTablets() {
213213
wg.Wait()
214214
tw.mu.Lock()
215215

216+
partialResult := len(tabletAliases) != len(newTablets)
217+
216218
for alias, newVal := range newTablets {
217219
if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) {
218220
continue
@@ -236,14 +238,24 @@ func (tw *TopologyWatcher) loadTablets() {
236238
}
237239
}
238240

239-
for _, val := range tw.tablets {
240-
if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(val.tablet) {
241-
continue
241+
if partialResult {
242+
for _, val := range tw.tablets {
243+
if _, ok := newTablets[val.alias]; !ok {
244+
// We don't know if the tablet was removed or if we simply failed to fetch it.
245+
// We'll assume it was not removed and ensure it remains in the tablet list
246+
newTablets[val.alias] = val
247+
}
242248
}
249+
} else {
250+
for _, val := range tw.tablets {
251+
if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(val.tablet) {
252+
continue
253+
}
243254

244-
if _, ok := newTablets[val.alias]; !ok {
245-
tw.healthcheck.RemoveTablet(val.tablet)
246-
topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1)
255+
if _, ok := newTablets[val.alias]; !ok {
256+
tw.healthcheck.RemoveTablet(val.tablet)
257+
topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1)
258+
}
247259
}
248260
}
249261
tw.tablets = newTablets

go/vt/discovery/topology_watcher_test.go

+60
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"vitess.io/vitess/go/vt/logutil"
3030
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
3131
"vitess.io/vitess/go/vt/topo"
32+
"vitess.io/vitess/go/vt/topo/faketopo"
3233
"vitess.io/vitess/go/vt/topo/memorytopo"
3334
)
3435

@@ -614,3 +615,62 @@ func TestFilterByKeypsaceSkipsIgnoredTablets(t *testing.T) {
614615

615616
tw.Stop()
616617
}
618+
619+
func TestGetTabletErrorDoesNotRemoveFromHealthcheck(t *testing.T) {
620+
factory := faketopo.NewFakeTopoFactory()
621+
// add cell to the factory. This returns a fake connection which we will use to set the get and update errors as we require.
622+
fakeConn := factory.AddCell("aa")
623+
624+
ts := faketopo.NewFakeTopoServer(factory)
625+
if err := ts.CreateCellInfo(context.Background(), "aa", &topodatapb.CellInfo{}); err != nil {
626+
t.Fatalf("CreateCellInfo failed: %v", err)
627+
}
628+
629+
fhc := NewFakeHealthCheck(nil)
630+
topologyWatcherOperations.ZeroAll()
631+
counts := topologyWatcherOperations.Counts()
632+
tw := NewCellTabletsWatcher(context.Background(), ts, fhc, nil, "aa", 10*time.Minute, true, 5)
633+
634+
counts = checkOpCounts(t, counts, map[string]int64{})
635+
checkChecksum(t, tw, 0)
636+
637+
// Add a tablet to the topology.
638+
tablet := &topodatapb.Tablet{
639+
Alias: &topodatapb.TabletAlias{
640+
Cell: "aa",
641+
Uid: 0,
642+
},
643+
Hostname: "host1",
644+
PortMap: map[string]int32{
645+
"vt": 123,
646+
},
647+
Keyspace: "keyspace",
648+
Shard: "shard",
649+
}
650+
if err := ts.CreateTablet(context.Background(), tablet); err != nil {
651+
t.Fatalf("CreateTablet failed: %v", err)
652+
}
653+
tw.loadTablets()
654+
counts = checkOpCounts(t, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "AddTablet": 1})
655+
checkChecksum(t, tw, 3238442862)
656+
657+
// Check the tablet is returned by GetAllTablets().
658+
allTablets := fhc.GetAllTablets()
659+
key := TabletToMapKey(tablet)
660+
if _, ok := allTablets[key]; !ok || len(allTablets) != 1 || !proto.Equal(allTablets[key], tablet) {
661+
t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet)
662+
}
663+
664+
// Force the next topo Get call to return an error (the ListDir call for the tablet aliases will still succeed)
665+
fakeConn.AddGetError(true)
666+
667+
tw.loadTablets()
668+
checkOpCounts(t, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1})
669+
checkChecksum(t, tw, 3238442862)
670+
671+
// Check the tablet is still returned by GetAllTablets().
672+
allTablets2 := fhc.GetAllTablets()
673+
if _, ok := allTablets2[key]; !ok || len(allTablets2) != 1 || !proto.Equal(allTablets2[key], tablet) {
674+
t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets2, tablet)
675+
}
676+
}

0 commit comments

Comments
 (0)