Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix reading of RLE encoded boolean data from parquet files with V2 page headers #13707

Merged
merged 23 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
0f51ef1
initial pass
etseidl Jul 13, 2023
20d9d25
Merge branch 'rapidsai:branch-23.08' into feature/fix_rle_bool
etseidl Jul 14, 2023
05171bd
Merge branch 'rapidsai:branch-23.08' into feature/fix_rle_bool
etseidl Jul 17, 2023
c02a4b9
reduce some redundancy
etseidl Jul 17, 2023
39a78b1
fix for rle literal run with no value encoded
etseidl Jul 17, 2023
a27ec2a
add python read test
etseidl Jul 17, 2023
09b1f30
Merge branch 'branch-23.08' into feature/fix_rle_bool
etseidl Jul 17, 2023
2b2a8f7
make encoding const
etseidl Jul 17, 2023
694dac2
format
etseidl Jul 17, 2023
40d8ffb
Merge branch 'rapidsai:branch-23.08' into feature/fix_rle_bool
etseidl Jul 17, 2023
7e7c180
Merge branch 'rapidsai:branch-23.08' into feature/fix_rle_bool
etseidl Jul 17, 2023
c87d759
Merge branch 'rapidsai:branch-23.08' into feature/fix_rle_bool
etseidl Jul 18, 2023
24cd888
test error value before calling init_rle
etseidl Jul 18, 2023
c6d63e8
Merge branch 'branch-23.08' into feature/fix_rle_bool
hyperbolic2346 Jul 19, 2023
68313b6
Merge branch 'rapidsai:branch-23.08' into feature/fix_rle_bool
etseidl Jul 21, 2023
bb7a20d
implement suggestion from review
etseidl Jul 21, 2023
944adbb
redo V1 RLE a bit
etseidl Jul 21, 2023
4bf1b70
treat flags like a mask
etseidl Jul 21, 2023
f637051
Merge branch 'branch-23.08' into feature/fix_rle_bool
hyperbolic2346 Jul 25, 2023
24cd8a4
Merge branch 'rapidsai:branch-23.08' into feature/fix_rle_bool
etseidl Jul 25, 2023
0323628
Merge branch 'branch-23.08' into feature/fix_rle_bool
etseidl Jul 26, 2023
94c8f73
Merge branch 'branch-23.08' into feature/fix_rle_bool
etseidl Jul 26, 2023
fcf9db0
Update python/cudf/cudf/tests/test_parquet.py
galipremsagar Jul 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 54 additions & 28 deletions cpp/src/io/parquet/page_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -871,45 +871,65 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s,
level_type lvl)
{
int32_t len;
int level_bits = s->col.level_bits[lvl];
Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding
: s->page.repetition_level_encoding;
int const level_bits = s->col.level_bits[lvl];
auto const encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding
: s->page.repetition_level_encoding;

auto start = cur;
if (level_bits == 0) {
len = 0;
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value
s->initial_rle_value[lvl] = 0;
s->lvl_start[lvl] = cur;
s->abs_lvl_start[lvl] = cur;
} else if (encoding == Encoding::RLE) {
// V2 only uses RLE encoding, so only perform check here
if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) {
len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes;
} else if (cur + 4 < end) {
len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
cur += 4;
} else {
len = 0;
s->error = 2;
}
s->abs_lvl_start[lvl] = cur;
if (!s->error) {
uint32_t run = get_vlq32(cur, end);
s->initial_rle_run[lvl] = run;
if (!(run & 1)) {
int v = (cur < end) ? cur[0] : 0;

auto init_rle = [s, lvl, end, level_bits](uint8_t const* cur, uint8_t const* end) {
uint32_t const run = get_vlq32(cur, end);
s->initial_rle_run[lvl] = run;
if (!(run & 1)) {
if (cur < end) {
int v = cur[0];
cur++;
if (level_bits > 8) {
v |= ((cur < end) ? cur[0] : 0) << 8;
cur++;
}
s->initial_rle_value[lvl] = v;
} else {
s->initial_rle_value[lvl] = 0;
}
s->lvl_start[lvl] = cur;
}
s->lvl_start[lvl] = cur;

if (cur > end) { s->error = 2; }
};

// this is a little redundant. if level_bits == 0, then nothing should be encoded
// for the level, but some V2 files in the wild violate this and encode the data anyway.
// thus we will handle V2 headers separately.
if ((s->page.flags & PAGEINFO_FLAGS_V2) != 0) {
// V2 only uses RLE encoding so no need to check encoding
len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes;
s->abs_lvl_start[lvl] = cur;
if (len == 0) {
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value
s->initial_rle_value[lvl] = 0;
s->lvl_start[lvl] = cur;
} else {
init_rle(cur, cur + len);
}
Comment on lines +912 to +914
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it always going to be safe to do this? That is, is it always the case that for V2 headers, there will be a valid varint, regardless of whether level_bits is 0 or not?

Copy link
Contributor Author

@etseidl etseidl Jul 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

init_rle is only called if the number of bytes for the level is non-zero (len != 0). So at this point there should be some bytes to read. The problem file actually only encodes the RLE length, but not the RLE value (which is then assumed to be 0). That's why the test for cur < end in init_rle

FWIW I'm planning to refactor the def/rep_lvl_bytes into an array to match other bits of the PageInfo struct. Then this bit of logic gets a little clearer.

} else if (level_bits == 0) {
len = 0;
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value
s->initial_rle_value[lvl] = 0;
s->lvl_start[lvl] = cur;
s->abs_lvl_start[lvl] = cur;
} else if (encoding == Encoding::RLE) { // V1 header with RLE encoding
if (cur + 4 < end) {
len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
cur += 4;
s->abs_lvl_start[lvl] = cur;
init_rle(cur, cur + len);
// add back the 4 bytes for the length
len += 4;
} else {
len = 0;
s->error = 2;
}
} else if (encoding == Encoding::BIT_PACKED) {
len = (s->page.num_input_values * level_bits + 7) >> 3;
s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run
Expand Down Expand Up @@ -1247,7 +1267,13 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
s->dict_val = 0;
if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; }
break;
case Encoding::RLE: s->dict_run = 0; break;
case Encoding::RLE: {
// first 4 bytes are length of RLE data
int const len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
cur += 4;
if (cur + len > end) { s->error = 2; }
s->dict_run = 0;
Comment on lines +1271 to +1275
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

File this under "how did this ever work?". What's the story here - how did RLE decoding work if we were starting 4 bytes off?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

File this under "how did this ever work?". What's the story here - how did RLE decoding work if we were starting 4 bytes off?

I'm guessing you never ran into RLE encoded bool data before 😉

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some light summer reading 😉 Near as I can tell booleans will only be encoded with RLE with V2 writers (although it seems arrow-rs might allow RLE w/ V1, but not by default). So it likely is true that this code has never been exercised.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like the encoder has RLE boolean support but it's disabled. If I enable it, it writes RLE encoded boolean data, but lacks the 4 bytes of length required by the Parquet spec. So I guess that answers the question of how did this ever work.

Once this PR and #13751 are merged I'll submit a PR to enable RLE encoding for booleans when writing V2 files.

} break;
default:
s->error = 1; // Unsupported encoding
break;
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/parquet/page_hdr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ __global__ void __launch_bounds__(128)
// definition levels
bs->page.chunk_row += bs->page.num_rows;
bs->page.num_rows = 0;
bs->page.flags = 0;
// zero out V2 info
bs->page.num_nulls = 0;
bs->page.def_lvl_bytes = 0;
Expand All @@ -395,7 +396,6 @@ __global__ void __launch_bounds__(128)
case PageType::DATA_PAGE:
index_out = num_dict_pages + data_page_count;
data_page_count++;
bs->page.flags = 0;
// this computation is only valid for flat schemas. for nested schemas,
// they will be recomputed in the preprocess step by examining repetition and
// definition levels
Expand All @@ -405,7 +405,7 @@ __global__ void __launch_bounds__(128)
case PageType::DATA_PAGE_V2:
index_out = num_dict_pages + data_page_count;
data_page_count++;
bs->page.flags = 0;
bs->page.flags |= PAGEINFO_FLAGS_V2;
values_found += bs->page.num_input_values;
// V2 only uses RLE, so it was removed from the header
bs->page.definition_level_encoding = Encoding::RLE;
Expand All @@ -414,7 +414,7 @@ __global__ void __launch_bounds__(128)
case PageType::DICTIONARY_PAGE:
index_out = dictionary_page_count;
dictionary_page_count++;
bs->page.flags = PAGEINFO_FLAGS_DICTIONARY;
bs->page.flags |= PAGEINFO_FLAGS_DICTIONARY;
break;
default: index_out = -1; break;
}
Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/parquet/parquet_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ namespace gpu {
*/
enum {
PAGEINFO_FLAGS_DICTIONARY = (1 << 0), // Indicates a dictionary page
PAGEINFO_FLAGS_V2 = (1 << 1), // V2 page header
};

/**
Expand Down
Binary file not shown.
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2538,6 +2538,15 @@ def test_parquet_reader_binary_decimal(datadir):
assert_eq(expect, got)


def test_parquet_reader_rle_boolean(datadir):
fname = datadir / "rle_boolean_encoding.parquet"

expect = pd.read_parquet(fname)
got = cudf.read_parquet(fname)

assert_eq(expect, got)


# testing a specific bug-fix/edge case.
# specifically: int a parquet file containing a particular way of representing
# a list column in a schema, the cudf reader was confusing
Expand Down