From 4fd949f674b65cc1964443649ff82a67b0ffe34d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 26 Jun 2022 16:14:12 +0930 Subject: [PATCH 1/2] connectd: don't insist on ping replies when other traffic is flowing. Got complaints about us hanging up on some nodes because they don't respond to pings in a timely manner (e.g. ACINQ?), but that turned out to be something else. Nonetheless, we've had reports in the past of LND badly prioritizing gossip traffic, and thus important messages can get queued behind gossip dumps! Signed-off-by: Rusty Russell Changelog-Changed: connectd: give busy peers more time to respond to pings. --- connectd/connectd.c | 1 + connectd/connectd.h | 3 +++ connectd/multiplex.c | 26 ++++++++++++++++++-------- tests/test_connection.py | 17 ++++++++++------- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/connectd/connectd.c b/connectd/connectd.c index 07a952732188..48d027e54211 100644 --- a/connectd/connectd.c +++ b/connectd/connectd.c @@ -310,6 +310,7 @@ static struct peer *new_peer(struct daemon *daemon, peer->ready_to_die = false; peer->active = false; peer->peer_outq = msg_queue_new(peer, false); + peer->last_recv_time = time_now(); #if DEVELOPER peer->dev_writes_enabled = NULL; diff --git a/connectd/connectd.h b/connectd/connectd.h index 9af5f815d7ca..db5ec2022120 100644 --- a/connectd/connectd.h +++ b/connectd/connectd.h @@ -87,6 +87,9 @@ struct peer { /* Random ping timer, to detect dead connections. */ struct oneshot *ping_timer; + /* Last time we received traffic */ + struct timeabs last_recv_time; + #if DEVELOPER bool dev_read_enabled; /* If non-NULL, this counts down; 0 means disable */ diff --git a/connectd/multiplex.c b/connectd/multiplex.c index e99c78e6e214..0e52671654fc 100644 --- a/connectd/multiplex.c +++ b/connectd/multiplex.c @@ -423,16 +423,23 @@ static void set_ping_timer(struct peer *peer) static void send_ping(struct peer *peer) { - /* Already have a ping in flight? */ - if (peer->expecting_pong != PONG_UNEXPECTED) { - status_peer_debug(&peer->id, "Last ping unreturned: hanging up"); - if (peer->to_peer) - io_close(peer->to_peer); - return; + /* If it's still sending us traffic, maybe ping reply is backed up? + * That's OK, ping is just to make sure it's still alive, and clearly + * it is. */ + if (time_before(peer->last_recv_time, + timeabs_sub(time_now(), time_from_sec(60)))) { + /* Already have a ping in flight? */ + if (peer->expecting_pong != PONG_UNEXPECTED) { + status_peer_debug(&peer->id, "Last ping unreturned: hanging up"); + if (peer->to_peer) + io_close(peer->to_peer); + return; + } + + inject_peer_msg(peer, take(make_ping(NULL, 1, 0))); + peer->expecting_pong = PONG_EXPECTED_PROBING; } - inject_peer_msg(peer, take(make_ping(NULL, 1, 0))); - peer->expecting_pong = PONG_EXPECTED_PROBING; set_ping_timer(peer); } @@ -991,6 +998,9 @@ static struct io_plan *read_body_from_peer_done(struct io_conn *peer_conn, if (!IFDEV(peer->dev_read_enabled, true)) return read_hdr_from_peer(peer_conn, peer); + /* We got something! */ + peer->last_recv_time = time_now(); + /* Don't process packets while we're closing */ if (peer->ready_to_die) return read_hdr_from_peer(peer_conn, peer); diff --git a/tests/test_connection.py b/tests/test_connection.py index feed3ed939e7..b6cf02b7cd50 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -3848,15 +3848,18 @@ def test_ping_timeout(node_factory): l1, l2 = node_factory.get_nodes(2, opts=[{'dev-no-reconnect': None, 'disconnect': l1_disconnects}, - {}]) + {'dev-no-ping-timer': None}]) l1.rpc.connect(l2.info['id'], 'localhost', l2.port) - # Takes 15-45 seconds, then another to try second ping - # Because of ping timer randomness we don't know which side hangs up first - wait_for(lambda: l1.rpc.listpeers(l2.info['id'])['peers'] == [], - timeout=45 + 45 + 5) - wait_for(lambda: (l1.daemon.is_in_log('Last ping unreturned: hanging up') - or l2.daemon.is_in_log('Last ping unreturned: hanging up'))) + # This can take 10 seconds (dev-fast-gossip means timer fires every 5 seconds) + l1.daemon.wait_for_log('seeker: startup peer finished', timeout=15) + # Ping timers runs at 15-45 seconds, *but* only fires if also 60 seconds + # after previous traffic. + l1.daemon.wait_for_log('dev_disconnect: xWIRE_PING', timeout=60 + 45 + 5) + + # Next pign will cause hangup + l1.daemon.wait_for_log('Last ping unreturned: hanging up', timeout=45 + 5) + wait_for(lambda: l1.rpc.listpeers(l2.info['id'])['peers'] == []) @pytest.mark.openchannel('v1') From 30c1346d53a0183820e29b40c0416a52aabb9220 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jun 2022 09:48:48 +0930 Subject: [PATCH 2/2] pytest: fix flake in test_node_reannounce Fixes 25699994e5941fb165cf379ff3a688981bcdaddb (pytest: fix flake in test_node_reannounce). Converting ->set->list does not give deterministic order, so we can end up with the two lists differing due to order: ``` May send its own announcement *twice*, since it always spams us. msgs2 = list(set(msgs2)) > assert msgs == msgs2 E AssertionError: assert ['01012ff5580...000000000000'] == ['01014973d81...000000000000'] E At index 0 diff: '01012ff55800f5b9492021372d74df4d6547bb0d32aec8d4c932a8c3b044e4bd983c429154e73091b0a2aff1cf9bbf16b37e6e9dd10ce4c2d949217366472acd341b0007800000080269a262bbd1750266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c035180266e453454e494f524245414d000000000000000000000000000000000000000000000000' != '01014973d8160dd8fc28e8fb25c40b9d5c68aed8dfb36af9fc13e4d2040fb3718553051a188ce98239c0bed138e1f8713a64acc7de98c183c9597fa58bf37f0b89bb0007800000080269a262bbd16c022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59022d2253494c454e544152544953542d336333626132392d6d6f6464656400000000000000' E Full diff: E [ E + '01012ff55800f5b9492021372d74df4d6547bb0d32aec8d4c932a8c3b044e4bd983c429154e73091b0a2aff1cf9bbf16b37e6e9dd10ce4c2d949217366472acd341b0007800000080269a262bbd1750266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c035180266e453454e494f524245414d000000000000000000000000000000000000000000000000', E '01014973d8160dd8fc28e8fb25c40b9d5c68aed8dfb36af9fc13e4d2040fb3718553051a188ce98239c0bed138e1f8713a64acc7de98c183c9597fa58bf37f0b89bb0007800000080269a262bbd16c022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59022d2253494c454e544152544953542d336333626132392d6d6f6464656400000000000000', E - '01012ff55800f5b9492021372d74df4d6547bb0d32aec8d4c932a8c3b044e4bd983c429154e73091b0a2aff1cf9bbf16b37e6e9dd10ce4c2d949217366472acd341b0007800000080269a262bbd1750266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c035180266e453454e494f524245414d000000000000000000000000000000000000000000000000', E ] `` Signed-off-by: Rusty Russell --- tests/test_gossip.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_gossip.py b/tests/test_gossip.py index 1c94e185e4fc..aa67ffe76322 100644 --- a/tests/test_gossip.py +++ b/tests/test_gossip.py @@ -1281,8 +1281,7 @@ def test_node_reannounce(node_factory, bitcoind, chainparams): filters=['0109', '0107', '0102', '0100', '0012']) # May send its own announcement *twice*, since it always spams us. - msgs2 = list(set(msgs2)) - assert msgs == msgs2 + assert set(msgs) == set(msgs2) # Won't have queued up another one, either. assert not l1.daemon.is_in_log('node_announcement: delaying') @@ -1308,8 +1307,7 @@ def test_node_reannounce(node_factory, bitcoind, chainparams): # channel_announcement and channel_updates. # And pings. filters=['0109', '0107', '0102', '0100', '0012']) - msgs2 = list(set(msgs2)) - assert msgs != msgs2 + assert set(msgs) != set(msgs2) def test_gossipwith(node_factory):