-
Notifications
You must be signed in to change notification settings - Fork 889
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix reading of RLE encoded boolean data from parquet files with V2 page headers #13707
Changes from all commits
0f51ef1
20d9d25
05171bd
c02a4b9
39a78b1
a27ec2a
09b1f30
2b2a8f7
694dac2
40d8ffb
7e7c180
c87d759
24cd888
c6d63e8
68313b6
bb7a20d
944adbb
4bf1b70
f637051
24cd8a4
0323628
94c8f73
fcf9db0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -871,45 +871,65 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, | |
level_type lvl) | ||
{ | ||
int32_t len; | ||
int level_bits = s->col.level_bits[lvl]; | ||
Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding | ||
: s->page.repetition_level_encoding; | ||
int const level_bits = s->col.level_bits[lvl]; | ||
auto const encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding | ||
: s->page.repetition_level_encoding; | ||
|
||
auto start = cur; | ||
if (level_bits == 0) { | ||
len = 0; | ||
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value | ||
s->initial_rle_value[lvl] = 0; | ||
s->lvl_start[lvl] = cur; | ||
s->abs_lvl_start[lvl] = cur; | ||
} else if (encoding == Encoding::RLE) { | ||
// V2 only uses RLE encoding, so only perform check here | ||
if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { | ||
len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; | ||
} else if (cur + 4 < end) { | ||
len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); | ||
cur += 4; | ||
} else { | ||
len = 0; | ||
s->error = 2; | ||
} | ||
s->abs_lvl_start[lvl] = cur; | ||
if (!s->error) { | ||
uint32_t run = get_vlq32(cur, end); | ||
s->initial_rle_run[lvl] = run; | ||
if (!(run & 1)) { | ||
int v = (cur < end) ? cur[0] : 0; | ||
|
||
auto init_rle = [s, lvl, end, level_bits](uint8_t const* cur, uint8_t const* end) { | ||
uint32_t const run = get_vlq32(cur, end); | ||
s->initial_rle_run[lvl] = run; | ||
if (!(run & 1)) { | ||
if (cur < end) { | ||
int v = cur[0]; | ||
cur++; | ||
if (level_bits > 8) { | ||
v |= ((cur < end) ? cur[0] : 0) << 8; | ||
cur++; | ||
} | ||
s->initial_rle_value[lvl] = v; | ||
} else { | ||
s->initial_rle_value[lvl] = 0; | ||
} | ||
s->lvl_start[lvl] = cur; | ||
} | ||
s->lvl_start[lvl] = cur; | ||
|
||
if (cur > end) { s->error = 2; } | ||
}; | ||
|
||
// this is a little redundant. if level_bits == 0, then nothing should be encoded | ||
// for the level, but some V2 files in the wild violate this and encode the data anyway. | ||
// thus we will handle V2 headers separately. | ||
if ((s->page.flags & PAGEINFO_FLAGS_V2) != 0) { | ||
// V2 only uses RLE encoding so no need to check encoding | ||
len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; | ||
s->abs_lvl_start[lvl] = cur; | ||
if (len == 0) { | ||
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value | ||
s->initial_rle_value[lvl] = 0; | ||
s->lvl_start[lvl] = cur; | ||
} else { | ||
init_rle(cur, cur + len); | ||
} | ||
} else if (level_bits == 0) { | ||
len = 0; | ||
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value | ||
s->initial_rle_value[lvl] = 0; | ||
s->lvl_start[lvl] = cur; | ||
s->abs_lvl_start[lvl] = cur; | ||
} else if (encoding == Encoding::RLE) { // V1 header with RLE encoding | ||
if (cur + 4 < end) { | ||
len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); | ||
cur += 4; | ||
s->abs_lvl_start[lvl] = cur; | ||
init_rle(cur, cur + len); | ||
// add back the 4 bytes for the length | ||
len += 4; | ||
} else { | ||
len = 0; | ||
s->error = 2; | ||
} | ||
} else if (encoding == Encoding::BIT_PACKED) { | ||
len = (s->page.num_input_values * level_bits + 7) >> 3; | ||
s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run | ||
|
@@ -1247,7 +1267,13 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, | |
s->dict_val = 0; | ||
if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } | ||
break; | ||
case Encoding::RLE: s->dict_run = 0; break; | ||
case Encoding::RLE: { | ||
// first 4 bytes are length of RLE data | ||
int const len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); | ||
cur += 4; | ||
if (cur + len > end) { s->error = 2; } | ||
s->dict_run = 0; | ||
Comment on lines
+1271
to
+1275
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. File this under "how did this ever work?". What's the story here - how did RLE decoding work if we were starting 4 bytes off? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'm guessing you never ran into RLE encoded bool data before 😉 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some light summer reading 😉 Near as I can tell booleans will only be encoded with RLE with V2 writers (although it seems arrow-rs might allow RLE w/ V1, but not by default). So it likely is true that this code has never been exercised. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like the encoder has RLE boolean support but it's disabled. If I enable it, it writes RLE encoded boolean data, but lacks the 4 bytes of length required by the Parquet spec. So I guess that answers the question of how did this ever work. Once this PR and #13751 are merged I'll submit a PR to enable RLE encoding for booleans when writing V2 files. |
||
} break; | ||
default: | ||
s->error = 1; // Unsupported encoding | ||
break; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it always going to be safe to do this? That is, is it always the case that for V2 headers, there will be a valid varint, regardless of whether
level_bits
is 0 or not?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
init_rle is only called if the number of bytes for the level is non-zero (
len != 0
). So at this point there should be some bytes to read. The problem file actually only encodes the RLE length, but not the RLE value (which is then assumed to be 0). That's why the test forcur < end
ininit_rle
FWIW I'm planning to refactor the
def/rep_lvl_bytes
into an array to match other bits of thePageInfo
struct. Then this bit of logic gets a little clearer.