Skip to content

Commit 7b40359

Browse files
Trigger additional dns probes on peer conn status
1 parent 6016d2f commit 7b40359

File tree

4 files changed

+178
-73
lines changed

4 files changed

+178
-73
lines changed

client/internal/dns/host.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func newNoopHostMocker() hostManager {
7878
}
7979
}
8080

81-
func dnsConfigToHostDNSConfig(dnsConfig nbdns.Config, ip string, port int) HostDNSConfig {
81+
func dnsConfigToHostDNSConfig(dnsConfig nbdns.Config, ip string, port int, connectedPeers int) HostDNSConfig {
8282
config := HostDNSConfig{
8383
RouteAll: false,
8484
ServerIP: ip,
@@ -88,13 +88,14 @@ func dnsConfigToHostDNSConfig(dnsConfig nbdns.Config, ip string, port int) HostD
8888
if len(nsConfig.NameServers) == 0 {
8989
continue
9090
}
91-
if nsConfig.Primary {
91+
if nsConfig.Primary && connectedPeers != 0 {
9292
config.RouteAll = true
9393
}
9494

9595
for _, domain := range nsConfig.Domains {
9696
config.Domains = append(config.Domains, DomainConfig{
9797
Domain: strings.TrimSuffix(domain, "."),
98+
Disabled: connectedPeers == 0,
9899
MatchOnly: !nsConfig.SearchDomainsEnabled,
99100
})
100101
}

client/internal/dns/server.go

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"net/netip"
77
"runtime"
8+
"slices"
89
"strings"
910
"sync"
1011

@@ -116,7 +117,7 @@ func NewDefaultServerPermanentUpstream(
116117
ds.hostsDNSHolder.set(hostsDnsList)
117118
ds.permanent = true
118119
ds.addHostRootZone()
119-
ds.currentConfig = dnsConfigToHostDNSConfig(config, ds.service.RuntimeIP(), ds.service.RuntimePort())
120+
ds.currentConfig = dnsConfigToHostDNSConfig(config, ds.service.RuntimeIP(), ds.service.RuntimePort(), 1)
120121
ds.searchDomainNotifier = newNotifier(ds.SearchDomains())
121122
ds.searchDomainNotifier.setListener(listener)
122123
setServerDns(ds)
@@ -305,11 +306,18 @@ func (s *DefaultServer) applyConfiguration(update nbdns.Config) error {
305306
if err != nil {
306307
return fmt.Errorf("not applying dns update, error: %v", err)
307308
}
308-
muxUpdates := append(localMuxUpdates, upstreamMuxUpdates...) //nolint:gocritic
309+
310+
var muxUpdates []muxUpdate
311+
if s.statusRecorder.GetConnectedPeersCount() == 0 {
312+
log.Infof("0 connected peers, not registering upstream handlers")
313+
muxUpdates = localMuxUpdates
314+
} else {
315+
muxUpdates = append(localMuxUpdates, upstreamMuxUpdates...) //nolint:gocritic
316+
}
309317

310318
s.updateMux(muxUpdates)
311319
s.updateLocalResolver(localRecords)
312-
s.currentConfig = dnsConfigToHostDNSConfig(update, s.service.RuntimeIP(), s.service.RuntimePort())
320+
s.currentConfig = dnsConfigToHostDNSConfig(update, s.service.RuntimeIP(), s.service.RuntimePort(), s.statusRecorder.GetConnectedPeersCount())
313321

314322
hostUpdate := s.currentConfig
315323
if s.service.RuntimePort() != defaultPort && !s.hostManager.supportCustomPort() {
@@ -359,8 +367,8 @@ func (s *DefaultServer) buildLocalHandlerUpdate(customZones []nbdns.CustomZone)
359367
}
360368

361369
func (s *DefaultServer) buildUpstreamHandlerUpdate(nameServerGroups []*nbdns.NameServerGroup) ([]muxUpdate, error) {
362-
363370
var muxUpdates []muxUpdate
371+
log.Infof("length of nameServerGroups %d", len(nameServerGroups))
364372
for _, nsGroup := range nameServerGroups {
365373
if len(nsGroup.NameServers) == 0 {
366374
log.Warn("received a nameserver group with empty nameserver list")
@@ -495,29 +503,22 @@ func (s *DefaultServer) upstreamCallbacks(
495503
nsGroup *nbdns.NameServerGroup,
496504
handler dns.Handler,
497505
) (deactivate func(error), reactivate func()) {
498-
var removeIndex map[string]int
499506
deactivate = func(err error) {
500507
s.mux.Lock()
501508
defer s.mux.Unlock()
502509

503510
l := log.WithField("nameservers", nsGroup.NameServers)
504511
l.Info("Temporarily deactivating nameservers group due to timeout")
505512

506-
removeIndex = make(map[string]int)
507-
for _, domain := range nsGroup.Domains {
508-
removeIndex[domain] = -1
509-
}
510513
if nsGroup.Primary {
511-
removeIndex[nbdns.RootZone] = -1
512514
s.currentConfig.RouteAll = false
513515
s.service.DeregisterMux(nbdns.RootZone)
514516
}
515517

516518
for i, item := range s.currentConfig.Domains {
517-
if _, found := removeIndex[item.Domain]; found {
519+
if slices.Contains(nsGroup.Domains, item.Domain) {
518520
s.currentConfig.Domains[i].Disabled = true
519521
s.service.DeregisterMux(item.Domain)
520-
removeIndex[item.Domain] = i
521522
}
522523
}
523524

@@ -530,18 +531,16 @@ func (s *DefaultServer) upstreamCallbacks(
530531
}
531532

532533
s.updateNSState(nsGroup, err, false)
533-
534534
}
535535
reactivate = func() {
536536
s.mux.Lock()
537537
defer s.mux.Unlock()
538538

539-
for domain, i := range removeIndex {
540-
if i == -1 || i >= len(s.currentConfig.Domains) || s.currentConfig.Domains[i].Domain != domain {
541-
continue
539+
for i, item := range s.currentConfig.Domains {
540+
if slices.Contains(nsGroup.Domains, item.Domain) {
541+
s.currentConfig.Domains[i].Disabled = false
542+
s.service.RegisterMux(item.Domain, handler)
542543
}
543-
s.currentConfig.Domains[i].Disabled = false
544-
s.service.RegisterMux(domain, handler)
545544
}
546545

547546
l := log.WithField("nameservers", nsGroup.NameServers)

client/internal/dns/upstream.go

Lines changed: 105 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,101 @@ type upstreamResolverBase struct {
5656
func newUpstreamResolverBase(ctx context.Context, statusRecorder *peer.Status) *upstreamResolverBase {
5757
ctx, cancel := context.WithCancel(ctx)
5858

59-
return &upstreamResolverBase{
59+
resolverBase := &upstreamResolverBase{
6060
ctx: ctx,
6161
cancel: cancel,
6262
upstreamTimeout: upstreamTimeout,
6363
reactivatePeriod: reactivatePeriod,
6464
failsTillDeact: failsTillDeact,
6565
statusRecorder: statusRecorder,
6666
}
67+
68+
go resolverBase.watchPeersConnStatusChanges()
69+
70+
return resolverBase
71+
}
72+
73+
func (u *upstreamResolverBase) watchPeersConnStatusChanges() {
74+
var probeRunning atomic.Bool
75+
var cancelBackOff context.CancelFunc
76+
77+
exponentialBackOff := &backoff.ExponentialBackOff{
78+
InitialInterval: 200 * time.Millisecond,
79+
RandomizationFactor: 0.5,
80+
Multiplier: 1.1,
81+
MaxInterval: 5 * time.Second,
82+
MaxElapsedTime: 15 * time.Second,
83+
Stop: backoff.Stop,
84+
Clock: backoff.SystemClock,
85+
}
86+
operation := func() error {
87+
select {
88+
case <-u.ctx.Done():
89+
return backoff.Permanent(fmt.Errorf("exiting upstream retry loop for upstreams %s: parent context : %s", u.upstreamServers, u.ctx.Err()))
90+
default:
91+
}
92+
93+
u.probeAvailability()
94+
if u.disabled {
95+
return fmt.Errorf("probe failed")
96+
}
97+
return nil
98+
}
99+
100+
continualProbe := func() {
101+
// probe continually for 30s when peer count >= 1
102+
if u.statusRecorder.GetConnectedPeersCount() == 0 {
103+
log.Debug("0 peers connected, running one more DNS probe")
104+
// cancel backoff operation
105+
if cancelBackOff != nil {
106+
cancelBackOff()
107+
cancelBackOff = nil
108+
}
109+
u.probeAvailability()
110+
return
111+
}
112+
113+
if probeRunning.Load() {
114+
log.Info("restarting DNS probing")
115+
cancelBackOff()
116+
cancelBackOff = nil
117+
}
118+
defer func() {
119+
u.mutex.Lock()
120+
log.Infof("DNS probing finished, servers %s disabled: %t", u.upstreamServers, u.disabled)
121+
u.mutex.Unlock()
122+
probeRunning.Store(false)
123+
}()
124+
probeRunning.Store(true)
125+
126+
ctx, cancel := context.WithCancel(context.Background())
127+
cancelBackOff = cancel
128+
err := backoff.Retry(func() error {
129+
select {
130+
case <-ctx.Done():
131+
log.Warn("DNS probing cancelled")
132+
return backoff.Permanent(ctx.Err())
133+
default:
134+
return operation()
135+
}
136+
}, backoff.WithContext(exponentialBackOff, ctx))
137+
cancelBackOff = nil
138+
if err != nil {
139+
log.Warn("DNS probe triggered by peer connection failed")
140+
u.disable(err)
141+
return
142+
}
143+
}
144+
145+
for {
146+
select {
147+
case <-u.ctx.Done():
148+
return
149+
case <-u.statusRecorder.GetPeersConnStatusChangeNotifier():
150+
log.Debugf("probing DNS availability on/off for 30s")
151+
go continualProbe()
152+
}
153+
}
67154
}
68155

69156
func (u *upstreamResolverBase) stop() {
@@ -163,7 +250,7 @@ func (u *upstreamResolverBase) checkUpstreamFails(err error) {
163250
}
164251

165252
// probeAvailability tests all upstream servers simultaneously and
166-
// disables the resolver if none work
253+
// disables/enable the resolver
167254
func (u *upstreamResolverBase) probeAvailability() {
168255
u.mutex.Lock()
169256
defer u.mutex.Unlock()
@@ -174,11 +261,6 @@ func (u *upstreamResolverBase) probeAvailability() {
174261
default:
175262
}
176263

177-
// avoid probe if upstreams could resolve at least one query and fails count is less than failsTillDeact
178-
if u.successCount.Load() > 0 && u.failsCount.Load() < u.failsTillDeact {
179-
return
180-
}
181-
182264
var success bool
183265
var mu sync.Mutex
184266
var wg sync.WaitGroup
@@ -190,7 +272,7 @@ func (u *upstreamResolverBase) probeAvailability() {
190272
wg.Add(1)
191273
go func() {
192274
defer wg.Done()
193-
err := u.testNameserver(upstream, 500*time.Millisecond)
275+
err := u.testNameserver(upstream, probeTimeout)
194276
if err != nil {
195277
errors = multierror.Append(errors, err)
196278
log.Warnf("probing upstream nameserver %s: %s", upstream, err)
@@ -208,6 +290,15 @@ func (u *upstreamResolverBase) probeAvailability() {
208290
// didn't find a working upstream server, let's disable and try later
209291
if !success {
210292
u.disable(errors.ErrorOrNil())
293+
return
294+
}
295+
296+
if u.disabled {
297+
log.Infof("upstreams %s are responsive again. Adding them back to system", u.upstreamServers)
298+
u.failsCount.Store(0)
299+
u.successCount.Add(1)
300+
u.reactivate()
301+
u.disabled = false
211302
}
212303
}
213304

@@ -223,37 +314,17 @@ func (u *upstreamResolverBase) waitUntilResponse() {
223314
Clock: backoff.SystemClock,
224315
}
225316

226-
operation := func() error {
227-
select {
228-
case <-u.ctx.Done():
229-
return backoff.Permanent(fmt.Errorf("exiting upstream retry loop for upstreams %s: parent context has been canceled", u.upstreamServers))
230-
default:
231-
}
232-
233-
for _, upstream := range u.upstreamServers {
234-
if err := u.testNameserver(upstream, probeTimeout); err != nil {
235-
log.Tracef("upstream check for %s: %s", upstream, err)
236-
} else {
237-
// at least one upstream server is available, stop probing
238-
return nil
239-
}
317+
err := backoff.Retry(func() error {
318+
u.probeAvailability()
319+
if u.disabled {
320+
return fmt.Errorf("failed to enable upstream")
240321
}
241-
242-
log.Tracef("checking connectivity with upstreams %s failed. Retrying in %s", u.upstreamServers, exponentialBackOff.NextBackOff())
243-
return fmt.Errorf("upstream check call error")
244-
}
245-
246-
err := backoff.Retry(operation, exponentialBackOff)
322+
return nil
323+
}, exponentialBackOff)
247324
if err != nil {
248325
log.Warn(err)
249326
return
250327
}
251-
252-
log.Infof("upstreams %s are responsive again. Adding them back to system", u.upstreamServers)
253-
u.failsCount.Store(0)
254-
u.successCount.Add(1)
255-
u.reactivate()
256-
u.disabled = false
257328
}
258329

259330
// isTimeout returns true if the given error is a network timeout error.

0 commit comments

Comments
 (0)