Skip to content

Commit

Permalink
gossipd: handle overflowing query properly (avoid slow 100% CPU reports)
Browse files Browse the repository at this point in the history
Don't do this:
  (gdb) bt
  #0  0x00007f37ae667c40 in ?? () from /lib/x86_64-linux-gnu/libz.so.1
  ElementsProject#1  0x00007f37ae668b38 in ?? () from /lib/x86_64-linux-gnu/libz.so.1
  ElementsProject#2  0x00007f37ae669907 in deflate () from /lib/x86_64-linux-gnu/libz.so.1
  ElementsProject#3  0x00007f37ae674c65 in compress2 () from /lib/x86_64-linux-gnu/libz.so.1
  ElementsProject#4  0x000000000040cfe3 in zencode_scids (ctx=0xc1f118, scids=0x2599bc49 "\a\325{", len=176320) at gossipd/gossipd.c:218
  ElementsProject#5  0x000000000040d0b3 in encode_short_channel_ids_end (encoded=0x7fff8f98d9f0, max_bytes=65490) at gossipd/gossipd.c:236
  ElementsProject#6  0x000000000040dd28 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290511, number_of_blocks=8) at gossipd/gossipd.c:576
  ElementsProject#7  0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290511, number_of_blocks=16) at gossipd/gossipd.c:595
  ElementsProject#8  0x000000000040ddee in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290495, number_of_blocks=32) at gossipd/gossipd.c:596
  ElementsProject#9  0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290495, number_of_blocks=64) at gossipd/gossipd.c:595
  ElementsProject#10 0x000000000040ddee in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290431, number_of_blocks=128) at gossipd/gossipd.c:596
  ElementsProject#11 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290431, number_of_blocks=256) at gossipd/gossipd.c:595
  ElementsProject#12 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290431, number_of_blocks=512) at gossipd/gossipd.c:595
  ElementsProject#13 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290431, number_of_blocks=1024) at gossipd/gossipd.c:595
  ElementsProject#14 0x000000000040ddee in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=2047) at gossipd/gossipd.c:596
  ElementsProject#15 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=4095) at gossipd/gossipd.c:595
  ElementsProject#16 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=8191) at gossipd/gossipd.c:595
  ElementsProject#17 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=16382) at gossipd/gossipd.c:595
  ElementsProject#18 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=32764) at gossipd/gossipd.c:595
  ElementsProject#19 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=65528) at gossipd/gossipd.c:595
  ElementsProject#20 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=131056) at gossipd/gossipd.c:595
  ElementsProject#21 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=262112) at gossipd/gossipd.c:595
  ElementsProject#22 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=524225) at gossipd/gossipd.c:595
  ElementsProject#23 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=1048450) at gossipd/gossipd.c:595
  ElementsProject#24 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=2096900) at gossipd/gossipd.c:595
  ElementsProject#25 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=4193801) at gossipd/gossipd.c:595
  ElementsProject#26 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=8387603) at gossipd/gossipd.c:595
  ElementsProject#27 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=16775207) at gossipd/gossipd.c:595
  ElementsProject#28 0x000000000040ddee in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=33550414) at gossipd/gossipd.c:596
  ElementsProject#29 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=67100829) at gossipd/gossipd.c:595
  ElementsProject#30 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=134201659) at gossipd/gossipd.c:595
  ElementsProject#31 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=268403318) at gossipd/gossipd.c:595
  ElementsProject#32 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=536806636) at gossipd/gossipd.c:595
  ElementsProject#33 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=1073613273) at gossipd/gossipd.c:595
  ElementsProject#34 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=2147226547) at gossipd/gossipd.c:595
  ElementsProject#35 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=4294453094) at gossipd/gossipd.c:595
  ElementsProject#36 0x000000000040df26 in handle_query_channel_range (peer=0x3868fc8, msg=0x37e0678 "\001\ao\342\214\n\266\361\263r\301\246\242F\256c\367O\223\036\203e\341Z\b\234h\326\031") at gossipd/gossipd.c:625

The cause was that converting a block number to an scid truncates it
at 24 bits.  When we look through the index from (truncated number) to
(real end number) we get every channel, which is too large to encode,
so we iterate again.

This fixes both that problem, and also the issue that we'd end up
dividing into many empty sections until we get to the highest block
number.  Instead, we just tack the empty blocks on to the end of the
final query.

(My initial version requested 0xFFFFFFFE blocks, but the dev code
which records what blocks were returned can't make a bitmap that big
on 32 bit).

Reported-by: George Vaccaro
Signed-off-by: Rusty Russell <[email protected]>
  • Loading branch information
rustyrussell authored and arowser committed Feb 15, 2019
1 parent 9e4d405 commit 846c60c
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 16 deletions.
32 changes: 32 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,38 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [Unreleased]


### Added

- plugins: fully enabled, and ready for you to write some!
- lightning-cli: `help <cmd>` finds man pages even if `make install` not run.
- JSON API: `waitsendpay` now has an `erring_direction` field.
- JSON API: `listpeers` now has a `direction` field in `channels`.
- JSON API: `listchannels` now takes a `source` option to filter by node id.

### Changed

- The `short_channel_id` separator has been changed to be `x` to match the specification.

### Deprecated

Note: You should always set `allow-deprecated-apis=false` to test for
changes.

### Removed

### Fixed

- Protocol: handling `query_channel_range` for large numbers of blocks
(eg. 4 billion) was slow due to a bug.

### Security

## [0.6.3] - 2019-01-09: "The Smallblock Conspiracy"

This release named by @molxyz and [@ctrlbreak](https://twitter.com/ctrlbreak).
Expand Down
60 changes: 44 additions & 16 deletions gossipd/gossipd.c
Original file line number Diff line number Diff line change
Expand Up @@ -659,9 +659,14 @@ static void reply_channel_range(struct peer *peer,
/*~ When we need to send an array of channels, it might go over our 64k packet
* size. If it doesn't fit, we recurse, splitting in two, etc. Each message
* indicates what blocks it contains, so the recipient knows when we're
* finished. */
* finished.
*
* tail_blocks is the empty blocks at the end, in case they asked for all
* blocks to 4 billion.
*/
static void queue_channel_ranges(struct peer *peer,
u32 first_blocknum, u32 number_of_blocks)
u32 first_blocknum, u32 number_of_blocks,
u32 tail_blocks)
{
struct routing_state *rstate = peer->daemon->rstate;
u8 *encoded = encode_short_channel_ids_start(tmpctx);
Expand Down Expand Up @@ -704,7 +709,8 @@ static void queue_channel_ranges(struct peer *peer,

/* If we can encode that, fine: send it */
if (encode_short_channel_ids_end(&encoded, max_encoded_bytes)) {
reply_channel_range(peer, first_blocknum, number_of_blocks,
reply_channel_range(peer, first_blocknum,
number_of_blocks + tail_blocks,
encoded);
return;
}
Expand All @@ -717,22 +723,26 @@ static void queue_channel_ranges(struct peer *peer,
first_blocknum);
return;
}
status_debug("queue_channel_ranges full: splitting %u+%u and %u+%u",
status_debug("queue_channel_ranges full: splitting %u+%u and %u+%u(+%u)",
first_blocknum,
number_of_blocks / 2,
first_blocknum + number_of_blocks / 2,
number_of_blocks - number_of_blocks / 2);
queue_channel_ranges(peer, first_blocknum, number_of_blocks / 2);
number_of_blocks - number_of_blocks / 2,
tail_blocks);
queue_channel_ranges(peer, first_blocknum, number_of_blocks / 2, 0);
queue_channel_ranges(peer, first_blocknum + number_of_blocks / 2,
number_of_blocks - number_of_blocks / 2);
number_of_blocks - number_of_blocks / 2,
tail_blocks);
}

/*~ The peer can ask for all channels in a series of blocks. We reply with one
* or more messages containing the short_channel_ids. */
static u8 *handle_query_channel_range(struct peer *peer, const u8 *msg)
{
struct routing_state *rstate = peer->daemon->rstate;
struct bitcoin_blkid chain_hash;
u32 first_blocknum, number_of_blocks;
u32 first_blocknum, number_of_blocks, tail_blocks;
struct short_channel_id last_scid;

if (!fromwire_query_channel_range(msg, &chain_hash,
&first_blocknum, &number_of_blocks)) {
Expand All @@ -751,14 +761,25 @@ static u8 *handle_query_channel_range(struct peer *peer, const u8 *msg)
return NULL;
}

/* This checks for 32-bit overflow! */
if (first_blocknum + number_of_blocks < first_blocknum) {
return towire_errorfmt(peer, NULL,
"query_channel_range overflow %u+%u",
first_blocknum, number_of_blocks);
}

queue_channel_ranges(peer, first_blocknum, number_of_blocks);
/* If they ask for number_of_blocks UINTMAX, and we have to divide
* and conquer, we'll do a lot of unnecessary work. Cap it at the
* last value we have, then send an empty reply. */
if (uintmap_last(&rstate->chanmap, &last_scid.u64)) {
u32 last_block = short_channel_id_blocknum(&last_scid);

/* u64 here avoids overflow on number_of_blocks
UINTMAX for example */
if ((u64)first_blocknum + number_of_blocks > last_block) {
tail_blocks = first_blocknum + number_of_blocks
- last_block - 1;
number_of_blocks -= tail_blocks;
} else
tail_blocks = 0;
} else
tail_blocks = 0;

queue_channel_ranges(peer, first_blocknum, number_of_blocks,
tail_blocks);
return NULL;
}

Expand Down Expand Up @@ -2292,6 +2313,13 @@ static struct io_plan *query_channel_range(struct io_conn *conn,
goto fail;
}

/* Check for overflow on 32-bit machines! */
if (BITMAP_NWORDS(number_of_blocks) < number_of_blocks / BITMAP_WORD_BITS) {
status_broken("query_channel_range: huge number_of_blocks (%u) not supported",
number_of_blocks);
goto fail;
}

status_debug("sending query_channel_range for blocks %u+%u",
first_blocknum, number_of_blocks);
msg = towire_query_channel_range(NULL, &daemon->chain_hash,
Expand Down
13 changes: 13 additions & 0 deletions tests/test_gossip.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,9 @@ def test_gossip_query_channel_range(node_factory, bitcoind):
first=0,
num=1000000)

# Turns out it sends: 0+53, 53+26, 79+13, 92+7, 99+3, 102+2, 104+1, 105+999895
l1.daemon.wait_for_logs([r'\[IN\] 0108'] * 8)

# It should definitely have split
assert ret['final_first_block'] != 0 or ret['final_num_blocks'] != 1000000
assert ret['final_complete']
Expand All @@ -648,6 +651,16 @@ def test_gossip_query_channel_range(node_factory, bitcoind):
assert ret['short_channel_ids'][1] == scid23
l2.daemon.wait_for_log('queue_channel_ranges full: splitting')

# Test overflow case doesn't split forever; should still only get 8 for this
ret = l1.rpc.dev_query_channel_range(id=l2.info['id'],
first=1,
num=429496000)
l1.daemon.wait_for_logs([r'\[IN\] 0108'] * 8)

# And no more!
time.sleep(1)
assert not l1.daemon.is_in_log(r'\[IN\] 0108', start=l1.daemon.logsearch_start)

# This should actually be large enough for zlib to kick in!
l3.fund_channel(l4, 10**5)
bitcoind.generate_block(5)
Expand Down

0 comments on commit 846c60c

Please sign in to comment.