connectd: fix accidental handling of old reconnections.

We had multiple reports of channels being unilaterally closed because
it seemed like the peer was sending old revocation numbers.

Turns out, it was actually old reestablish messages! When we have a
reconnection, we would put the new connection aside, and tell lightningd
to close the current connection: when it did, we would restart
processing of the initial reconnection.

However, we could end up with *multiple* "reconnecting" connections,
while waiting for an existing connection to close. Though the
connections were long gone, there could still be messages queued
(particularly the channel_reestablish message, which comes early on).

Eventually, a normal reconnection would cause us to process one of
these reconnecting connections, and channeld would see the (perhaps
very old!) messages, and get confused.

(I have a test which triggers this, but it also hangs the connect
command, due to other issues we will fix in the next release...)

Fixes: ElementsProject#5240
Signed-off-by: Rusty Russell <[email protected]>
whitslack · May 16, 2022 · 1c495ca · 1c495ca
1 parent 9039c9c
commit 1c495ca
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 24 deletions.
diff --git a/connectd/connectd.c b/connectd/connectd.c
@@ -211,36 +211,29 @@ static void peer_connected_in(struct daemon *daemon,
 	tal_free(connect);
 }
 
-/*~ This is an ad-hoc marshalling structure where we store arguments so we
- * can call peer_connected again. */
-struct peer_reconnected {
-	struct daemon *daemon;
-	struct node_id id;
-	struct wireaddr_internal addr;
-	const struct wireaddr *remote_addr;
-	struct crypto_state cs;
-	const u8 *their_features;
-	bool incoming;
-};
-
 /*~ For simplicity, lightningd only ever deals with a single connection per
  * peer.  So if we already know about a peer, we tell lightning to disconnect
  * the old one and retry once it does. */
 static struct io_plan *retry_peer_connected(struct io_conn *conn,
 					    struct peer_reconnected *pr)
 {
-	struct io_plan *plan;
-
 	/*~ As you can see, we've had issues with this code before :( */
 	status_peer_debug(&pr->id, "processing now old peer gone");
 
-	/*~ Usually the pattern is to return this directly, but we have to free
-	 * our temporary structure. */
-	plan = peer_connected(conn, pr->daemon, &pr->id, &pr->addr,
+	/* If this fails (still waiting), pr will be freed, so reparent onto
+	 * tmpctx so it gets freed either way. */
+	tal_steal(tmpctx, pr);
+
+	/*~ Usually the pattern is to return this directly. */
+	return peer_connected(conn, pr->daemon, &pr->id, &pr->addr,
 			      pr->remote_addr,
 			      &pr->cs, take(pr->their_features), pr->incoming);
-	tal_free(pr);
-	return plan;
+}
+
+/*~ A common use for destructors is to remove themselves from a data structure */
+static void destroy_peer_reconnected(struct peer_reconnected *pr)
+{
+	peer_reconnected_htable_del(&pr->daemon->reconnected, pr);
 }
 
 /*~ If we already know about this peer, we tell lightningd and it disconnects
@@ -259,6 +252,13 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,
 
 	status_peer_debug(id, "reconnect");
 
+	/* If we have a previous reconnection, we replace it. */
+	pr = peer_reconnected_htable_get(&daemon->reconnected, id);
+	if (pr) {
+		peer_reconnected_htable_del(&daemon->reconnected, pr);
+		tal_free(pr);
+	}
+
 	/* Tell master to kill it: will send peer_disconnect */
 	msg = towire_connectd_reconnected(NULL, id);
 	daemon_conn_send(daemon->master, take(msg));
@@ -271,6 +271,8 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,
 	pr->addr = *addr;
 	pr->remote_addr = tal_dup_or_null(pr, struct wireaddr, remote_addr);
 	pr->incoming = incoming;
+	peer_reconnected_htable_add(&daemon->reconnected, pr);
+	tal_add_destructor(pr, destroy_peer_reconnected);
 
 	/*~ Note that tal_dup_talarr() will do handle the take() of features
 	 * (turning it into a simply tal_steal() in those cases). */
@@ -280,11 +282,7 @@ static struct io_plan *peer_reconnected(struct io_conn *conn,
 	 * the peer set.  When someone calls `io_wake()` on that address, it
 	 * will call retry_peer_connected above. */
 	return io_wait(conn, peer_htable_get(&daemon->peers, id),
-			/*~ The notleak() wrapper is a DEVELOPER-mode hack so
-			 * that our memory leak detection doesn't consider 'pr'
-			 * (which is not referenced from our code) to be a
-			 * memory leak. */
-		       retry_peer_connected, notleak(pr));
+		       retry_peer_connected, pr);
 }
 
 /*~ When we free a peer, we remove it from the daemon's hashtable */
@@ -1981,6 +1979,7 @@ static void dev_connect_memleak(struct daemon *daemon, const u8 *msg)
 	/* Now delete daemon and those which it has pointers to. */
 	memleak_remove_region(memtable, daemon, sizeof(daemon));
 	memleak_remove_htable(memtable, &daemon->peers.raw);
+	memleak_remove_htable(memtable, &daemon->reconnected.raw);
 
 	found_leak = dump_memleak(memtable, memleak_status_broken);
 	daemon_conn_send(daemon->master,
@@ -2127,6 +2126,7 @@ int main(int argc, char *argv[])
 	/* Allocate and set up our simple top-level structure. */
 	daemon = tal(NULL, struct daemon);
 	peer_htable_init(&daemon->peers);
+	peer_reconnected_htable_init(&daemon->reconnected);
 	memleak_add_helper(daemon, memleak_daemon_cb);
 	list_head_init(&daemon->connecting);
 	timers_init(&daemon->timers, time_mono());

diff --git a/connectd/connectd.h b/connectd/connectd.h
@@ -126,6 +126,37 @@ HTABLE_DEFINE_TYPE(struct peer,
 		   peer_eq_node_id,
 		   peer_htable);
 
+/*~ This is an ad-hoc marshalling structure where we store arguments so we
+ * can call peer_connected again. */
+struct peer_reconnected {
+	struct daemon *daemon;
+	struct node_id id;
+	struct wireaddr_internal addr;
+	const struct wireaddr *remote_addr;
+	struct crypto_state cs;
+	const u8 *their_features;
+	bool incoming;
+};
+
+static const struct node_id *
+peer_reconnected_keyof(const struct peer_reconnected *pr)
+{
+	return &pr->id;
+}
+
+static bool peer_reconnected_eq_node_id(const struct peer_reconnected *pr,
+					const struct node_id *id)
+{
+	return node_id_eq(&pr->id, id);
+}
+
+/*~ This defines 'struct peer_reconnected_htable'. */
+HTABLE_DEFINE_TYPE(struct peer_reconnected,
+		   peer_reconnected_keyof,
+		   node_id_hash,
+		   peer_reconnected_eq_node_id,
+		   peer_reconnected_htable);
+
 /*~ This is the global state, like `struct lightningd *ld` in lightningd. */
 struct daemon {
 	/* Who am I? */
@@ -142,6 +173,9 @@ struct daemon {
 	 * have disconnected. */
 	struct peer_htable peers;
 
+	/* Peers which have reconnected, waiting for us to kill existing conns */
+	struct peer_reconnected_htable reconnected;
+
 	/* Peers we are trying to reach */
 	struct list_head connecting;