Skip to content

Commit 9a2e727

Browse files
committed
Give peers which are sending us messages longer to respond to ping
See comment for rationale.
1 parent 2bf39a6 commit 9a2e727

File tree

2 files changed

+87
-24
lines changed

2 files changed

+87
-24
lines changed

lightning-background-processor/src/lib.rs

+5-4
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ use lightning::chain::chainmonitor::{ChainMonitor, Persist};
1414
use lightning::chain::keysinterface::{Sign, KeysInterface};
1515
use lightning::ln::channelmanager::ChannelManager;
1616
use lightning::ln::msgs::{ChannelMessageHandler, RoutingMessageHandler};
17-
use lightning::ln::peer_handler::{PeerManager, SocketDescriptor};
18-
use lightning::ln::peer_handler::CustomMessageHandler;
17+
use lightning::ln::peer_handler::{CustomMessageHandler, PeerManager, SocketDescriptor};
18+
use lightning::ln::peer_handler;
1919
use lightning::routing::network_graph::NetGraphMsgHandler;
2020
use lightning::util::events::{Event, EventHandler, EventsProvider};
2121
use lightning::util::logger::Logger;
@@ -236,8 +236,9 @@ impl BackgroundProcessor {
236236
// timer, we should have disconnected all sockets by now (and they're probably
237237
// dead anyway), so disconnect them by calling `timer_tick_occurred()` twice.
238238
log_trace!(logger, "Awoke after more than double our ping timer, disconnecting peers.");
239-
peer_manager.timer_tick_occurred();
240-
peer_manager.timer_tick_occurred();
239+
for _ in 0..peer_handler::MAX_BUFFER_DRAIN_TICK_INTERVALS + 1 {
240+
peer_manager.timer_tick_occurred();
241+
}
241242
last_ping_call = Instant::now();
242243
} else if last_ping_call.elapsed().as_secs() > PING_TIMER {
243244
log_trace!(logger, "Calling PeerManager's timer_tick_occurred");

lightning/src/ln/peer_handler.rs

+82-20
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,24 @@ const OUTBOUND_BUFFER_LIMIT_READ_PAUSE: usize = 10;
294294
/// the peer.
295295
const OUTBOUND_BUFFER_LIMIT_DROP_GOSSIP: usize = 20;
296296

297+
/// If we've sent a ping, and are still awaiting a response, we (or our peer) may need to churn our
298+
/// (or their) way through the socket receive buffer before receiving the ping.
299+
///
300+
/// On a fairly old Arm64 board, with Linux defaults, this can take as long as 20 seconds, not
301+
/// including any network delays or outbound traffic.
302+
///
303+
/// Thus, to avoid needlessly disconnecting a peer, we allow a peer to take this many timer ticks
304+
/// to respond to a ping, as long as they send us at least one message during each tick or if we
305+
/// sent a lot of messages, ensuring we aren't actually just disconnected. With a timer tick
306+
/// interval of five seconds, this translates to about 30 seconds.
307+
pub const MAX_BUFFER_DRAIN_TICK_INTERVALS: i8 = 6;
308+
309+
/// This is the minimum number of messages we expect a peer to be able to handle within one timer
310+
/// tick. Once we have sent this many messages since the last ping, we send a ping right away to
311+
/// ensures we don't just fill up our send buffer and leave the peer with too many messages to
312+
/// process before the next ping.
313+
pub const BUFFER_DRAIN_MSGS_PER_TICK: usize = 32;
314+
297315
struct Peer {
298316
channel_encryptor: PeerChannelEncryptor,
299317
their_node_id: Option<PublicKey>,
@@ -309,7 +327,9 @@ struct Peer {
309327

310328
sync_status: InitSyncTracker,
311329

312-
awaiting_pong: bool,
330+
msgs_sent_since_pong: usize,
331+
awaiting_pong_tick_intervals: i8,
332+
received_message_since_timer_tick: bool,
313333
}
314334

315335
impl Peer {
@@ -551,7 +571,9 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
551571

552572
sync_status: InitSyncTracker::NoSyncRequested,
553573

554-
awaiting_pong: false,
574+
msgs_sent_since_pong: 0,
575+
awaiting_pong_tick_intervals: 0,
576+
received_message_since_timer_tick: false,
555577
}).is_some() {
556578
panic!("PeerManager driver duplicated descriptors!");
557579
};
@@ -589,7 +611,9 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
589611

590612
sync_status: InitSyncTracker::NoSyncRequested,
591613

592-
awaiting_pong: false,
614+
msgs_sent_since_pong: 0,
615+
awaiting_pong_tick_intervals: 0,
616+
received_message_since_timer_tick: false,
593617
}).is_some() {
594618
panic!("PeerManager driver duplicated descriptors!");
595619
};
@@ -598,7 +622,7 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
598622

599623
fn do_attempt_write_data(&self, descriptor: &mut Descriptor, peer: &mut Peer) {
600624
while !peer.awaiting_write_event {
601-
if peer.pending_outbound_buffer.len() < OUTBOUND_BUFFER_LIMIT_READ_PAUSE {
625+
if peer.pending_outbound_buffer.len() < OUTBOUND_BUFFER_LIMIT_READ_PAUSE && peer.msgs_sent_since_pong < BUFFER_DRAIN_MSGS_PER_TICK {
602626
match peer.sync_status {
603627
InitSyncTracker::NoSyncRequested => {},
604628
InitSyncTracker::ChannelsSyncing(c) if c < 0xffff_ffff_ffff_ffff => {
@@ -643,6 +667,9 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
643667
},
644668
}
645669
}
670+
if peer.msgs_sent_since_pong >= BUFFER_DRAIN_MSGS_PER_TICK {
671+
self.maybe_send_extra_ping(peer);
672+
}
646673

647674
if {
648675
let next_buff = match peer.pending_outbound_buffer.front() {
@@ -725,6 +752,7 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
725752
let encoded_message = buffer.0;
726753

727754
log_trace!(self.logger, "Enqueueing message {:?} to {}", message, log_pubkey!(peer.their_node_id.unwrap()));
755+
peer.msgs_sent_since_pong += 1;
728756
peer.pending_outbound_buffer.push_back(peer.channel_encryptor.encrypt_message(&encoded_message[..]));
729757
}
730758

@@ -919,6 +947,7 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
919947
message: wire::Message<<<CMH as core::ops::Deref>::Target as wire::CustomMessageReader>::CustomMessage>
920948
) -> Result<Option<wire::Message<<<CMH as core::ops::Deref>::Target as wire::CustomMessageReader>::CustomMessage>>, MessageHandlingError> {
921949
log_trace!(self.logger, "Received message {:?} from {}", message, log_pubkey!(peer.their_node_id.unwrap()));
950+
peer.received_message_since_timer_tick = true;
922951

923952
// Need an Init as first message
924953
if let wire::Message::Init(_) = message {
@@ -982,7 +1011,8 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
9821011
}
9831012
},
9841013
wire::Message::Pong(_msg) => {
985-
peer.awaiting_pong = false;
1014+
peer.awaiting_pong_tick_intervals = 0;
1015+
peer.msgs_sent_since_pong = 0;
9861016
},
9871017

9881018
// Channel messages:
@@ -1103,7 +1133,7 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
11031133
!peer.should_forward_channel_announcement(msg.contents.short_channel_id) {
11041134
continue
11051135
}
1106-
if peer.pending_outbound_buffer.len() > OUTBOUND_BUFFER_LIMIT_DROP_GOSSIP {
1136+
if peer.pending_outbound_buffer.len() > OUTBOUND_BUFFER_LIMIT_DROP_GOSSIP || peer.msgs_sent_since_pong > BUFFER_DRAIN_MSGS_PER_TICK * 2 {
11071137
log_trace!(self.logger, "Skipping broadcast message to {:?} as its outbound buffer is full", peer.their_node_id);
11081138
continue;
11091139
}
@@ -1115,6 +1145,7 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
11151145
continue;
11161146
}
11171147
peer.pending_outbound_buffer.push_back(peer.channel_encryptor.encrypt_message(&encoded_msg[..]));
1148+
peer.msgs_sent_since_pong += 1;
11181149
}
11191150
},
11201151
wire::Message::NodeAnnouncement(ref msg) => {
@@ -1126,7 +1157,7 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
11261157
!peer.should_forward_node_announcement(msg.contents.node_id) {
11271158
continue
11281159
}
1129-
if peer.pending_outbound_buffer.len() > OUTBOUND_BUFFER_LIMIT_DROP_GOSSIP {
1160+
if peer.pending_outbound_buffer.len() > OUTBOUND_BUFFER_LIMIT_DROP_GOSSIP || peer.msgs_sent_since_pong > BUFFER_DRAIN_MSGS_PER_TICK * 2 {
11301161
log_trace!(self.logger, "Skipping broadcast message to {:?} as its outbound buffer is full", peer.their_node_id);
11311162
continue;
11321163
}
@@ -1137,6 +1168,7 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
11371168
continue;
11381169
}
11391170
peer.pending_outbound_buffer.push_back(peer.channel_encryptor.encrypt_message(&encoded_msg[..]));
1171+
peer.msgs_sent_since_pong += 1;
11401172
}
11411173
},
11421174
wire::Message::ChannelUpdate(ref msg) => {
@@ -1148,14 +1180,15 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
11481180
!peer.should_forward_channel_announcement(msg.contents.short_channel_id) {
11491181
continue
11501182
}
1151-
if peer.pending_outbound_buffer.len() > OUTBOUND_BUFFER_LIMIT_DROP_GOSSIP {
1183+
if peer.pending_outbound_buffer.len() > OUTBOUND_BUFFER_LIMIT_DROP_GOSSIP || peer.msgs_sent_since_pong > BUFFER_DRAIN_MSGS_PER_TICK * 2 {
11521184
log_trace!(self.logger, "Skipping broadcast message to {:?} as its outbound buffer is full", peer.their_node_id);
11531185
continue;
11541186
}
11551187
if except_node.is_some() && peer.their_node_id.as_ref() == except_node {
11561188
continue;
11571189
}
11581190
peer.pending_outbound_buffer.push_back(peer.channel_encryptor.encrypt_message(&encoded_msg[..]));
1191+
peer.msgs_sent_since_pong += 1;
11591192
}
11601193
},
11611194
_ => debug_assert!(false, "We shouldn't attempt to forward anything but gossip messages"),
@@ -1431,6 +1464,20 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
14311464
}
14321465
}
14331466

1467+
/// This is called when we're blocked on sending additional gossip messages until we receive a
1468+
/// pong. If we aren't waiting on a pong, we take this opportunity to send a ping (setting
1469+
/// `awaiting_pong_tick_intervals` to a special flag value to indicate this).
1470+
fn maybe_send_extra_ping(&self, peer: &mut Peer) {
1471+
if peer.awaiting_pong_tick_intervals == 0 {
1472+
peer.awaiting_pong_tick_intervals = -1;
1473+
let ping = msgs::Ping {
1474+
ponglen: 0,
1475+
byteslen: 64,
1476+
};
1477+
self.enqueue_message(peer, &ping);
1478+
}
1479+
}
1480+
14341481
/// Send pings to each peer and disconnect those which did not respond to the last round of
14351482
/// pings.
14361483
///
@@ -1451,7 +1498,14 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
14511498
let mut descriptors_needing_disconnect = Vec::new();
14521499

14531500
peers.retain(|descriptor, peer| {
1454-
if peer.awaiting_pong {
1501+
if !peer.channel_encryptor.is_ready_for_encryption() {
1502+
// The peer needs to complete its handshake before we can exchange messages
1503+
return true;
1504+
}
1505+
1506+
if (peer.awaiting_pong_tick_intervals > 0 && !peer.received_message_since_timer_tick)
1507+
|| peer.awaiting_pong_tick_intervals > MAX_BUFFER_DRAIN_TICK_INTERVALS
1508+
{
14551509
descriptors_needing_disconnect.push(descriptor.clone());
14561510
match peer.their_node_id {
14571511
Some(node_id) => {
@@ -1468,21 +1522,26 @@ impl<Descriptor: SocketDescriptor, CM: Deref, RM: Deref, L: Deref, CMH: Deref> P
14681522
return false;
14691523
}
14701524

1471-
if !peer.channel_encryptor.is_ready_for_encryption() {
1472-
// The peer needs to complete its handshake before we can exchange messages
1525+
peer.received_message_since_timer_tick = false;
1526+
if peer.awaiting_pong_tick_intervals == -1 {
1527+
// Magic value set in `maybe_send_extra_ping`.
1528+
peer.awaiting_pong_tick_intervals = 1;
1529+
return true;
1530+
}
1531+
1532+
if peer.awaiting_pong_tick_intervals > 0 {
1533+
peer.awaiting_pong_tick_intervals += 1;
14731534
return true;
14741535
}
14751536

1537+
peer.awaiting_pong_tick_intervals = 1;
14761538
let ping = msgs::Ping {
14771539
ponglen: 0,
14781540
byteslen: 64,
14791541
};
14801542
self.enqueue_message(peer, &ping);
1543+
self.do_attempt_write_data(&mut (descriptor.clone()), &mut *peer);
14811544

1482-
let mut descriptor_clone = descriptor.clone();
1483-
self.do_attempt_write_data(&mut descriptor_clone, peer);
1484-
1485-
peer.awaiting_pong = true;
14861545
true
14871546
});
14881547

@@ -1641,11 +1700,14 @@ mod tests {
16411700
// than can fit into a peer's buffer).
16421701
let (mut fd_a, mut fd_b) = establish_connection(&peers[0], &peers[1]);
16431702

1644-
// Make each peer to read the messages that the other peer just wrote to them.
1645-
peers[0].process_events();
1646-
peers[1].read_event(&mut fd_b, &fd_a.outbound_data.lock().unwrap().split_off(0)).unwrap();
1647-
peers[1].process_events();
1648-
peers[0].read_event(&mut fd_a, &fd_b.outbound_data.lock().unwrap().split_off(0)).unwrap();
1703+
// Make each peer to read the messages that the other peer just wrote to them. Note that
1704+
// due to the max-messagse-before-bing limits this may take a few iterations to complete.
1705+
for _ in 0..10 {
1706+
peers[0].process_events();
1707+
peers[1].read_event(&mut fd_b, &fd_a.outbound_data.lock().unwrap().split_off(0)).unwrap();
1708+
peers[1].process_events();
1709+
peers[0].read_event(&mut fd_a, &fd_b.outbound_data.lock().unwrap().split_off(0)).unwrap();
1710+
}
16491711

16501712
// Check that each peer has received the expected number of channel updates and channel
16511713
// announcements.

0 commit comments

Comments
 (0)