Skip to content

Commit

Permalink
Remove support for skip_rows / num_rows options in the parquet reader. (
Browse files Browse the repository at this point in the history
rapidsai#11503)

Removes support for the skip_rows / num_rows options in the parquet reader. Users retain control over what gets read by selecting row groups.

Did some before/after benchmarking. As expected, this doesn't change much except for a minor boost in list reading (due to simplification of the preprocessing step). Most of the ways the row bounds affected the code were in the page setup process (which made the logic slippery to think through); they didn't do much in the actual process of decoding. A selection of before/after benchmarks (all input files ~512 MB):

```
ParquetRead/integral_buffer_input/29/1000/32/0/1/manual_time
Before:  bytes_per_second=31.4564G/s
After:   bytes_per_second=31.58G/s

ParquetRead/floats_buffer_input/31/1000/32/0/1/manual_time
Before:  bytes_per_second=49.2819G/s
After:   bytes_per_second=49.7408G/s

ParquetRead/string_file_input/23/1000/32/0/0/manual_time
Before:  bytes_per_second=24.634G/s
After:   bytes_per_second=24.6563G/s

ParquetRead/string_buffer_input/23/0/1/0/1/manual_time
Before:  bytes_per_second=5.03313G/s
After:   bytes_per_second=5.03535G/s

ParquetRead/list_buffer_input/24/0/1/1/1/manual_time
Before:  bytes_per_second=1.11488G/s
After:   bytes_per_second=1.31447G/s
```

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Yunsong Wang (https://github.com/PointKernel)

URL: rapidsai#11503
  • Loading branch information
nvdbaranec authored Aug 11, 2022
1 parent 87a5e6a commit d39b957
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 646 deletions.
34 changes: 4 additions & 30 deletions cpp/benchmarks/io/parquet/parquet_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ void BM_parq_read_varying_input(benchmark::State& state)
std::vector<std::string> get_col_names(cudf::io::source_info const& source)
{
cudf_io::parquet_reader_options const read_options =
cudf_io::parquet_reader_options::builder(source).num_rows(1);
cudf_io::parquet_reader_options::builder(source);
return cudf_io::read_parquet(read_options).metadata.column_names;
}

Expand Down Expand Up @@ -113,9 +113,8 @@ void BM_parq_read_varying_options(benchmark::State& state)
.use_pandas_metadata(use_pandas_metadata)
.timestamp_type(ts_type);

auto const num_row_groups = data_size / (128 << 20);
cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
auto mem_stats_logger = cudf::memory_stats_logger();
auto const num_row_groups = data_size / (128 << 20);
auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
try_drop_l3_cache();
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
Expand All @@ -133,11 +132,7 @@ void BM_parq_read_varying_options(benchmark::State& state)
}
read_options.set_row_groups({row_groups_to_read});
} break;
case row_selection::NROWS:
read_options.set_skip_rows(chunk * chunk_row_cnt);
read_options.set_num_rows(chunk_row_cnt);
if (is_last_chunk) read_options.set_num_rows(-1);
break;
case row_selection::NROWS: [[fallthrough]];
default: CUDF_FAIL("Unsupported row selection method");
}

Expand Down Expand Up @@ -186,24 +181,3 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection)

// row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file and
// determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
BENCHMARK_DEFINE_F(ParquetRead, row_selection)
(::benchmark::State& state) { BM_parq_read_varying_options(state); }
BENCHMARK_REGISTER_F(ParquetRead, row_selection)
->ArgsProduct({{int32_t(column_selection::ALL)},
{int32_t(row_selection::NROWS)},
{1, 4},
{0b01}, // defaults
{int32_t(cudf::type_id::EMPTY)}})
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_DEFINE_F(ParquetRead, misc_options)
(::benchmark::State& state) { BM_parq_read_varying_options(state); }
BENCHMARK_REGISTER_F(ParquetRead, misc_options)
->ArgsProduct({{int32_t(column_selection::ALL)},
{int32_t(row_selection::NROWS)},
{1},
{0b01, 0b00, 0b11, 0b010},
{int32_t(cudf::type_id::EMPTY), int32_t(cudf::type_id::TIMESTAMP_NANOSECONDS)}})
->Unit(benchmark::kMillisecond)
->UseManualTime();
74 changes: 0 additions & 74 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,6 @@ class parquet_reader_options {

// List of individual row groups to read (ignored if empty)
std::vector<std::vector<size_type>> _row_groups;
// Number of rows to skip from the start
size_type _skip_rows = 0;
// Number of rows to read; -1 is all
size_type _num_rows = -1;

// Whether to store string data as categorical type
bool _convert_strings_to_categories = false;
Expand Down Expand Up @@ -133,20 +129,6 @@ class parquet_reader_options {
return _convert_binary_to_strings;
}

/**
* @brief Returns number of rows to skip from the start.
*
* @return Number of rows to skip from the start
*/
[[nodiscard]] size_type get_skip_rows() const { return _skip_rows; }

/**
* @brief Returns number of rows to read.
*
* @return Number of rows to read
*/
[[nodiscard]] size_type get_num_rows() const { return _num_rows; }

/**
* @brief Returns names of column to be read, if set.
*
Expand Down Expand Up @@ -182,10 +164,6 @@ class parquet_reader_options {
*/
void set_row_groups(std::vector<std::vector<size_type>> row_groups)
{
if ((!row_groups.empty()) and ((_skip_rows != 0) or (_num_rows != -1))) {
CUDF_FAIL("row_groups can't be set along with skip_rows and num_rows");
}

_row_groups = std::move(row_groups);
}

Expand Down Expand Up @@ -214,34 +192,6 @@ class parquet_reader_options {
_convert_binary_to_strings = std::move(val);
}

/**
* @brief Sets number of rows to skip.
*
* @param val Number of rows to skip from start
*/
void set_skip_rows(size_type val)
{
if ((val != 0) and (!_row_groups.empty())) {
CUDF_FAIL("skip_rows can't be set along with a non-empty row_groups");
}

_skip_rows = val;
}

/**
* @brief Sets number of rows to read.
*
* @param val Number of rows to read after skip
*/
void set_num_rows(size_type val)
{
if ((val != -1) and (!_row_groups.empty())) {
CUDF_FAIL("num_rows can't be set along with a non-empty row_groups");
}

_num_rows = val;
}

/**
* @brief Sets timestamp_type used to cast timestamp columns.
*
Expand Down Expand Up @@ -332,30 +282,6 @@ class parquet_reader_options_builder {
return *this;
}

/**
* @brief Sets number of rows to skip.
*
* @param val Number of rows to skip from start
* @return this for chaining
*/
parquet_reader_options_builder& skip_rows(size_type val)
{
options.set_skip_rows(val);
return *this;
}

/**
* @brief Sets number of rows to read.
*
* @param val Number of rows to read after skip
* @return this for chaining
*/
parquet_reader_options_builder& num_rows(size_type val)
{
options.set_num_rows(val);
return *this;
}

/**
* @brief timestamp_type used to cast timestamp columns.
*
Expand Down
Loading

0 comments on commit d39b957

Please sign in to comment.