Skip to content

Commit cf80476

Browse files
committed
server: stagger initial reconnects
This commit adds optional jitter to the initial reconnection to our persistent peers. Currently we attempt reconnections to all peers simultaneously, which results in a large amount of contention as the number of channels a node has grows. We resolve this by adding a randomized delay between 0 and 30 seconds for persistent peers (the first 10 are still connected to immediately). This spreads out the load and contention on resources such as the database, read/write pools, and memory allocations. On my node, this allows it to start up with about 80% of the memory burst seen with the all-at-once approach. This also has a second-order effect of better distributing messages sent at constant intervals, such as pings. This reduces the number of concurrent jobs submitted to the read and write pools at any given time, resulting in better reuse of read/write buffers and fewer bursty allocation and garbage collection cycles.
1 parent 4de7d0c commit cf80476

File tree

2 files changed

+46
-1
lines changed

2 files changed

+46
-1
lines changed

config.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,8 @@ type config struct {
252252

253253
RejectPush bool `long:"rejectpush" description:"If true, lnd will not accept channel opening requests with non-zero push amounts. This should prevent accidental pushes to merchant nodes."`
254254

255+
StaggerInitialReconnect bool `long:"stagger-initial-reconnect" description:"If true, will apply a randomized staggering between 0s and 30s when reconnecting to persistent peers on startup. The first 10 reconnections will be attempted instantly, regardless of the flag's value"`
256+
255257
net tor.Net
256258

257259
Routing *routing.Conf `group:"routing" namespace:"routing"`

server.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"image/color"
1010
"math/big"
11+
prand "math/rand"
1112
"net"
1213
"path/filepath"
1314
"regexp"
@@ -60,6 +61,18 @@ const (
6061
// durations exceeding this value will be eligible to have their
6162
// backoffs reduced.
6263
defaultStableConnDuration = 10 * time.Minute
64+
65+
// numInstantInitReconnect specifies how many persistent peers we should
66+
// always attempt outbound connections to immediately. After this value
67+
// is surpassed, the remaining peers will be randomly delayed using
68+
// maxInitReconnectDelay.
69+
numInstantInitReconnect = 10
70+
71+
// maxInitReconnectDelay specifies the maximum delay in seconds we will
72+
// apply in attempting to reconnect to persistent peers on startup. The
73+
// value used for a particular peer will be chosen between 0s and this
74+
// value.
75+
maxInitReconnectDelay = 30
6376
)
6477

6578
var (
@@ -1931,6 +1944,7 @@ func (s *server) establishPersistentConnections() error {
19311944

19321945
// Iterate through the combined list of addresses from prior links and
19331946
// node announcements and attempt to reconnect to each node.
1947+
var numOutboundConns int
19341948
for pubStr, nodeAddr := range nodeAddrsMap {
19351949
// Add this peer to the set of peers we should maintain a
19361950
// persistent connection with.
@@ -1961,13 +1975,42 @@ func (s *server) establishPersistentConnections() error {
19611975
s.persistentConnReqs[pubStr] = append(
19621976
s.persistentConnReqs[pubStr], connReq)
19631977

1964-
go s.connMgr.Connect(connReq)
1978+
// We'll connect to the first 10 peers immediately, then
1979+
// randomly stagger any remaining connections if the
1980+
// stagger initial reconnect flag is set. This ensures
1981+
// that mobile nodes or nodes with a small number of
1982+
// channels obtain connectivity quickly, but larger
1983+
// nodes are able to disperse the costs of connecting to
1984+
// all peers at once.
1985+
if numOutboundConns < numInstantInitReconnect ||
1986+
!cfg.StaggerInitialReconnect {
1987+
1988+
go s.connMgr.Connect(connReq)
1989+
} else {
1990+
go s.delayInitialReconnect(connReq)
1991+
}
19651992
}
1993+
1994+
numOutboundConns++
19661995
}
19671996

19681997
return nil
19691998
}
19701999

2000+
// delayInitialReconnect will attempt a reconnection using the passed connReq
2001+
// after waiting for a randomly sampled whole number of seconds in the
2002+
// half-open range [0, maxInitReconnectDelay). If s.quit becomes readable
// before the delay elapses, the method returns without attempting the
// connection.
2003+
//
2004+
// NOTE: This method MUST be run as a goroutine.
2005+
func (s *server) delayInitialReconnect(connReq *connmgr.ConnReq) {
2006+
// prand.Intn samples uniformly from [0, n), so the delay is always
// strictly less than maxInitReconnectDelay seconds.
delay := time.Duration(prand.Intn(maxInitReconnectDelay)) * time.Second
2007+
// Block until either the delay expires or s.quit is signaled,
// whichever happens first.
select {
2008+
case <-time.After(delay):
2009+
s.connMgr.Connect(connReq)
2010+
// s.quit was signaled first; exit without connecting.
case <-s.quit:
2011+
}
2012+
}
2013+
19712014
// prunePersistentPeerConnection removes all internal state related to
19722015
// persistent connections to a peer within the server. This is used to avoid
19732016
// persistent connection retries to peers we do not have any open channels with.

0 commit comments

Comments
 (0)