Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add decoder for DELTA_BYTE_ARRAY to Parquet reader #14101

Merged
merged 88 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from 79 commits
Commits
Show all changes
88 commits
Select commit Hold shift + click to select a range
1a6f842
add decoder for DELTA_BYTE_ARRAY
etseidl Sep 13, 2023
d4a5ae5
switch to scoped enum
etseidl Sep 13, 2023
e83c281
Merge branch 'branch-23.10' into decode_delta_byte_arr
etseidl Sep 14, 2023
050d0c6
Merge branch 'rapidsai:branch-23.10' into decode_delta_byte_arr
etseidl Sep 14, 2023
8bf69a3
Merge branch 'rapidsai:branch-23.10' into decode_delta_byte_arr
etseidl Sep 15, 2023
54fecc6
Merge branch 'rapidsai:branch-23.10' into decode_delta_byte_arr
etseidl Sep 18, 2023
dab0d40
Merge branch 'rapidsai:branch-23.10' into decode_delta_byte_arr
etseidl Sep 19, 2023
25a42eb
need to copy string data on default stream
etseidl Sep 20, 2023
15167b2
Merge branch 'rapidsai:branch-23.10' into decode_delta_byte_arr
etseidl Sep 20, 2023
fac8c0b
Merge branch 'branch-23.10' into decode_delta_byte_arr
etseidl Sep 21, 2023
36619e3
Merge branch 'rapidsai:branch-23.10' into decode_delta_byte_arr
etseidl Sep 27, 2023
b8dcbb5
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Sep 27, 2023
46d8ea2
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Sep 27, 2023
edda85b
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Sep 27, 2023
c207433
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Sep 28, 2023
ce79d18
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Sep 28, 2023
9b2a67e
set error code in delta byte array decoder
etseidl Sep 28, 2023
af5fcf6
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Sep 29, 2023
20dc29f
redo BitAnd and BitOr
etseidl Sep 29, 2023
a638fd7
Merge branch 'decode_delta_byte_arr' of github.com:etseidl/cudf into …
etseidl Sep 29, 2023
cba2858
use snake case for names
etseidl Sep 29, 2023
c52481e
add test
etseidl Sep 29, 2023
5a1da6b
cleanups
etseidl Oct 2, 2023
1dd1d7f
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Oct 3, 2023
bee540e
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Oct 3, 2023
fad139c
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Oct 4, 2023
d7cf17a
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Oct 4, 2023
af9fb24
add some documentation
etseidl Oct 4, 2023
63e3740
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Oct 4, 2023
c8e3938
more cleanup
etseidl Oct 4, 2023
0332923
more cleanup
etseidl Oct 4, 2023
f5f2bd5
a little more cleanup
etseidl Oct 4, 2023
3f66cd8
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Oct 6, 2023
5f2a7d4
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Oct 9, 2023
2c30646
clean up some docstrings and move where str_bytes is initialized
etseidl Oct 9, 2023
c2dbb47
revert move of str_bytes reset...broke chunked reads
etseidl Oct 9, 2023
d0b01d0
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Oct 10, 2023
b24ec79
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Oct 11, 2023
a2ed863
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Oct 16, 2023
7ab1e1a
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Oct 18, 2023
71b979c
remove redundant comment
etseidl Oct 18, 2023
5ce72a4
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Oct 20, 2023
cb15ace
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Oct 20, 2023
c3ea439
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Oct 20, 2023
46d179a
fix test for supported encodings lost in merge
etseidl Oct 20, 2023
c898194
Merge branch 'branch-23.12' into decode_delta_byte_arr
vuule Oct 20, 2023
cf356c4
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Oct 26, 2023
f30a5c8
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Oct 27, 2023
ddacf38
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Oct 30, 2023
29df5bd
move description of DELTA_BYTE_ARRAY to where it belongs. add some
etseidl Oct 30, 2023
21d7f9b
typo caught by review
etseidl Nov 1, 2023
ebef98d
update calculation of temp_bytes
etseidl Nov 1, 2023
3f2b26e
Merge branch 'decode_delta_byte_arr' of github.com:etseidl/cudf into …
etseidl Nov 1, 2023
cb2d7ce
fix comment
etseidl Nov 1, 2023
5c1549b
add some explanation for what is going on in ComputePageStringSizes
etseidl Nov 1, 2023
e508e3a
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Nov 1, 2023
bad9f6a
use device_uvector
etseidl Nov 1, 2023
75e23fc
implement a TODO
etseidl Nov 1, 2023
300f492
use exec_policy_nosync when calculating string sizs
etseidl Nov 1, 2023
9dff818
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Nov 1, 2023
3a0099b
Merge branch 'decode_delta_byte_arr' of github.com:etseidl/cudf into …
etseidl Nov 1, 2023
e618ad0
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Nov 7, 2023
cd15adf
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Nov 7, 2023
fe7e5a2
fix typo
etseidl Nov 7, 2023
9556bc2
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Nov 8, 2023
e9baf16
add delta binary roundtrip test
etseidl Nov 8, 2023
59c9ef1
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Nov 8, 2023
4e1160a
test a larger range of rows around critical numbers for the delta dec…
etseidl Nov 8, 2023
4affd2f
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Nov 8, 2023
734023b
add struct of list test for delta
etseidl Nov 9, 2023
c39990e
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Nov 9, 2023
8b6ed5e
future proof test by adding param for string encoding
etseidl Nov 9, 2023
42c960f
Merge branch 'branch-23.12' into decode_delta_byte_arr
etseidl Nov 11, 2023
5e4b2dd
fix some comments
etseidl Nov 13, 2023
8338207
a few fixes suggested in review
etseidl Nov 13, 2023
40dbd59
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Nov 13, 2023
982285c
Merge branch 'rapidsai:branch-23.12' into decode_delta_byte_arr
etseidl Nov 14, 2023
8bf6edc
add sanity check to failing test
etseidl Nov 14, 2023
ea4c283
fix overrun caused by large mini-blocks
etseidl Nov 14, 2023
cf5abe5
check that the mini-block size is not too large to handle
etseidl Nov 14, 2023
b697cb8
smack redundant statement
etseidl Nov 15, 2023
07b2f79
add const
etseidl Nov 15, 2023
ea9eac0
Merge branch 'decode_delta_byte_arr' of github.com:etseidl/cudf into …
etseidl Nov 15, 2023
2004caa
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Nov 15, 2023
883a236
remove some unnecessary syncthreads calls
etseidl Nov 15, 2023
0844d8e
Revert "remove some unnecessary syncthreads calls"
etseidl Nov 16, 2023
7a2e546
Merge remote-tracking branch 'origin/branch-23.12' into decode_delta_…
etseidl Nov 16, 2023
2f83f8a
add a TODO to remove some syncthreads calls
etseidl Nov 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 39 additions & 14 deletions cpp/src/io/parquet/delta_binary.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,11 @@ namespace cudf::io::parquet::detail {
// per mini-block. While encoding, the lowest delta value is subtracted from all the deltas in the
// block to ensure that all encoded values are positive. The deltas for each mini-block are bit
// packed using the same encoding as the RLE/Bit-Packing Hybrid encoder.
//
// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY
// columns. For each element in a sequence of strings, a prefix length from the preceding string
// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are
// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix
// lengths, followed by the concatenated suffix data.

// we decode one mini-block at a time. max mini-block size seen is 64.
// The first pass decodes `values_per_mb` values, and then the second pass does another
// batch of size `values_per_mb`. The largest value for values_per_miniblock among the
// major writers seems to be 64, so 2 * 64 should be good. We save the first value separately
// since it is not encoded in the first mini-block.
constexpr int delta_rolling_buf_size = 128;

/**
Expand Down Expand Up @@ -90,7 +87,8 @@ struct delta_binary_decoder {
uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a
// multiple of 32
uleb128_t value_count; // total values encoded in the block
zigzag128_t last_value; // last value decoded, initialized to first_value from header
zigzag128_t first_value; // initial value, stored in the header
zigzag128_t last_value; // last value decoded

uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32
uint32_t current_value_idx; // current value index, initialized to 0 at start of block
Expand All @@ -102,6 +100,13 @@ struct delta_binary_decoder {

uleb128_t value[delta_rolling_buf_size]; // circular buffer of delta values

// returns the value stored in the `value` array at index
// `rolling_index<delta_rolling_buf_size>(idx)`. If `idx` is `0`, then return `first_value`.
constexpr zigzag128_t value_at(size_type idx)
{
return idx == 0 ? first_value : value[rolling_index<delta_rolling_buf_size>(idx)];
}

// returns the number of values encoded in the block data. when all_values is true,
// account for the first value in the header. otherwise just count the values encoded
// in the mini-block data.
Expand Down Expand Up @@ -145,7 +150,8 @@ struct delta_binary_decoder {
block_size = get_uleb128(d_start, d_end);
mini_block_count = get_uleb128(d_start, d_end);
value_count = get_uleb128(d_start, d_end);
last_value = get_zz128(d_start, d_end);
first_value = get_zz128(d_start, d_end);
last_value = first_value;

current_value_idx = 0;
values_per_mb = block_size / mini_block_count;
Expand Down Expand Up @@ -179,19 +185,38 @@ struct delta_binary_decoder {
}
}

// given start/end pointers in the data, find the end of the binary encoded block. when done,
// `this` will be initialized with the correct start and end positions. returns the end, which is
// start of data/next block. should only be called from thread 0.
inline __device__ uint8_t const* find_end_of_block(uint8_t const* start, uint8_t const* end)
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
{
// read block header
init_binary_block(start, end);

// test for no encoded values. a single value will be in the block header.
if (value_count <= 1) { return block_start; }

// read mini-block headers and skip over data
while (current_value_idx < num_encoded_values(false)) {
setup_next_mini_block(false);
}
// calculate the correct end of the block
auto const* const new_end = cur_mb == 0 ? block_start : cur_mb_start;
// re-init block with correct end
init_binary_block(start, new_end);
return new_end;
}

// decode the current mini-batch of deltas, and convert to values.
// called by all threads in a warp, currently only one warp supported.
inline __device__ void calc_mini_block_values(int lane_id)
{
using cudf::detail::warp_size;
if (current_value_idx >= value_count) { return; }

// need to save first value from header on first pass
// need to account for the first value from header on first pass
if (current_value_idx == 0) {
if (lane_id == 0) {
current_value_idx++;
value[0] = last_value;
}
if (lane_id == 0) { current_value_idx++; }
__syncwarp();
if (current_value_idx >= value_count) { return; }
}
Expand Down
11 changes: 8 additions & 3 deletions cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -449,8 +449,13 @@ __global__ void __launch_bounds__(decode_block_size)
int out_thread0;
[[maybe_unused]] null_count_back_copier _{s, t};

if (!setupLocalPageInfo(
s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_GENERAL}, true)) {
if (!setupLocalPageInfo(s,
&pages[page_idx],
chunks,
min_row,
num_rows,
mask_filter{decode_kernel_mask::GENERAL},
true)) {
return;
}

Expand Down Expand Up @@ -603,7 +608,7 @@ __global__ void __launch_bounds__(decode_block_size)
}

struct mask_tform {
__device__ uint32_t operator()(PageInfo const& p) { return p.kernel_mask; }
__device__ uint32_t operator()(PageInfo const& p) { return static_cast<uint32_t>(p.kernel_mask); }
};

} // anonymous namespace
Expand Down
12 changes: 10 additions & 2 deletions cpp/src/io/parquet/page_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -991,8 +991,15 @@ struct all_types_filter {
* @brief Functor for setupLocalPageInfo that takes a mask of allowed types.
*/
struct mask_filter {
int mask;
__device__ inline bool operator()(PageInfo const& page) { return (page.kernel_mask & mask) != 0; }
uint32_t mask;

__device__ mask_filter(uint32_t m) : mask(m) {}
__device__ mask_filter(decode_kernel_mask m) : mask(static_cast<uint32_t>(m)) {}

__device__ inline bool operator()(PageInfo const& page)
{
return BitAnd(mask, page.kernel_mask) != 0;
}
};

/**
Expand Down Expand Up @@ -1306,6 +1313,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
s->dict_run = 0;
} break;
case Encoding::DELTA_BINARY_PACKED:
case Encoding::DELTA_BYTE_ARRAY:
// nothing to do, just don't error
break;
default: {
Expand Down
Loading