Skip to content

Commit ddf551f

Browse files
committed
htlcswitch: add new LinkFailureDisconnect action
In this commit, we add a new LinkFailureDisconnect action that'll be used if we detect that the remote party hasn't sent a revoke and ack when it actually should. Before this commit, we would log our action, tear down the link, but then not actually force a connection recycle, as we assumed that if the TCP connection was actually stale, then the read/write timeout would expire. In practice this doesn't always seem to be the case, so we make a strong action here to actually force a disconnection in hopes that either side will reconnect and keep the good times rollin' 🕺.
1 parent 5d20d08 commit ddf551f

File tree

5 files changed

+42
-13
lines changed

5 files changed

+42
-13
lines changed
+14-4
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,27 @@
11
# Release Notes
22

3-
## Mempool
3+
## Mempool Optimizations
44

5-
Optimized [mempool
6-
management](https://github.com/lightningnetwork/lnd/pull/7681) to lower the CPU
7-
usage.
5+
* Optimized [mempool
6+
management](https://github.com/lightningnetwork/lnd/pull/7681) to lower the
7+
CPU usage.
88

99
## Bug Fixes
1010

1111
* [Re-encrypt/regenerate](https://github.com/lightningnetwork/lnd/pull/7705)
1212
all macaroon DB root keys on `ChangePassword`/`GenerateNewRootKey`
1313
respectively.
1414

15+
## Channel Link Bug Fix
16+
17+
* If we detect the remote link is inactive, [we'll now tear down the
18+
connection](https://github.com/lightningnetwork/lnd/pull/7711) in addition to
19+
stopping the link's statemachine. If we're persistently connected with the
20+
peer, then this'll force a reconnect, which may restart things and help avoid
21+
certain force close scenarios.
22+
1523
# Contributors (Alphabetical Order)
24+
1625
* Elle Mouton
26+
* Olaoluwa Osuntokun
1727
* Yong Yu

htlcswitch/link.go

+8-3
Original file line numberDiff line numberDiff line change
@@ -1037,7 +1037,7 @@ func (l *channelLink) htlcManager() {
10371037
l.fail(
10381038
LinkFailureError{
10391039
code: ErrSyncError,
1040-
FailureAction: LinkFailureForceClose, // nolint:lll
1040+
FailureAction: LinkFailureForceClose, //nolint:lll
10411041
},
10421042
"unable to synchronize channel "+
10431043
"states: %v", err,
@@ -1239,8 +1239,13 @@ func (l *channelLink) htlcManager() {
12391239
}
12401240

12411241
case <-l.cfg.PendingCommitTicker.Ticks():
1242-
l.fail(LinkFailureError{code: ErrRemoteUnresponsive},
1243-
"unable to complete dance")
1242+
l.fail(
1243+
LinkFailureError{
1244+
code: ErrRemoteUnresponsive,
1245+
FailureAction: LinkFailureDisconnect,
1246+
},
1247+
"unable to complete dance",
1248+
)
12441249
return
12451250

12461251
// A message from the switch was just received. This indicates

htlcswitch/link_test.go

+8-6
Original file line numberDiff line numberDiff line change
@@ -5457,7 +5457,8 @@ func TestChannelLinkFail(t *testing.T) {
54575457
// If we expect the link to force close the channel in this
54585458
// case, check that it happens. If not, make sure it does not
54595459
// happen.
5460-
isForceCloseErr := (linkErr.FailureAction == LinkFailureForceClose)
5460+
isForceCloseErr := (linkErr.FailureAction ==
5461+
LinkFailureForceClose)
54615462
require.True(
54625463
t, test.shouldForceClose == isForceCloseErr, test.name,
54635464
)
@@ -6343,11 +6344,12 @@ func TestPendingCommitTicker(t *testing.T) {
63436344
// Assert that we get the expected link failure from Alice.
63446345
select {
63456346
case linkErr := <-linkErrs:
6346-
if linkErr.code != ErrRemoteUnresponsive {
6347-
t.Fatalf("error code mismatch, "+
6348-
"want: ErrRemoteUnresponsive, got: %v",
6349-
linkErr.code)
6350-
}
6347+
require.Equal(
6348+
t, linkErr.code, ErrRemoteUnresponsive,
6349+
fmt.Sprintf("error code mismatch, want: "+
6350+
"ErrRemoteUnresponsive, got: %v", linkErr.code),
6351+
)
6352+
require.Equal(t, linkErr.FailureAction, LinkFailureDisconnect)
63516353

63526354
case <-time.After(time.Second):
63536355
t.Fatalf("did not receive failure")

htlcswitch/linkfailure.go

+5
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ const (
6464
// LinkFailureForceClose indicates that the channel should be force
6565
// closed.
6666
LinkFailureForceClose
67+
68+
// LinkFailureDisconnect indicates that we should disconnect in an
69+
// attempt to recycle the connection. This can be useful if we think a
70+
// TCP connection or state machine is stalled.
71+
LinkFailureDisconnect
6772
)
6873

6974
// LinkFailureError encapsulates an error that will make us fail the current

peer/brontide.go

+7
Original file line numberDiff line numberDiff line change
@@ -3142,6 +3142,13 @@ func (p *Brontide) handleLinkFailure(failure linkFailureReport) {
31423142
"remote peer: %v", err)
31433143
}
31443144
}
3145+
3146+
// If the failure action is disconnect, then we'll execute that now. If
3147+
// we had to send an error above, it was a sync call, so we expect the
3148+
// message to be flushed on the wire by now.
3149+
if failure.linkErr.FailureAction == htlcswitch.LinkFailureDisconnect {
3150+
p.Disconnect(fmt.Errorf("link requested disconnect"))
3151+
}
31453152
}
31463153

31473154
// tryLinkShutdown attempts to fetch a target link from the switch, calls

0 commit comments

Comments
 (0)