Skip to content

Commit

Permalink
Merge pull request #10099 from rapidsai/branch-22.02
Browse files Browse the repository at this point in the history
[gpuCI] Forward-merge branch-22.02 to branch-22.04 [skip gpuci]
  • Loading branch information
GPUtester authored Jan 20, 2022
2 parents 4045727 + 53a31d1 commit 57ff6f5
Show file tree
Hide file tree
Showing 9 changed files with 200 additions and 64 deletions.
19 changes: 14 additions & 5 deletions cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "cudf/io/types.hpp"
#include <benchmark/benchmark.h>

#include <benchmarks/common/generate_benchmark_input.hpp>
Expand Down Expand Up @@ -65,8 +66,14 @@ void BM_orc_write_varying_inout(benchmark::State& state)

void BM_orc_write_varying_options(benchmark::State& state)
{
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const enable_stats = state.range(1) != 0;
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
// Statistics granularity is the second benchmark argument. The registration's
// ArgsProduct supplies exactly two argument lists (compression, statistics),
// so the only valid range indices are 0 and 1; index 2 would read out of bounds.
auto const stats_freq = [&] {
  switch (state.range(1)) {
    case 0: return cudf::io::STATISTICS_NONE;
    case 1: return cudf::io::ORC_STATISTICS_STRIPE;
    default: return cudf::io::ORC_STATISTICS_ROW_GROUP;
  }
}();

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
Expand All @@ -85,7 +92,7 @@ void BM_orc_write_varying_options(benchmark::State& state)
cudf_io::orc_writer_options const options =
cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression)
.enable_statistics(enable_stats);
.enable_statistics(stats_freq);
cudf_io::write_orc(options);
}

Expand Down Expand Up @@ -113,6 +120,8 @@ BENCHMARK_DEFINE_F(OrcWrite, writer_options)
BENCHMARK_REGISTER_F(OrcWrite, writer_options)
->ArgsProduct({{int32_t(cudf::io::compression_type::NONE),
int32_t(cudf::io::compression_type::SNAPPY)},
{0, 1}})
{int32_t{cudf::io::STATISTICS_NONE},
int32_t{cudf::io::ORC_STATISTICS_STRIPE},
int32_t{cudf::io::ORC_STATISTICS_ROW_GROUP}}})
->Unit(benchmark::kMillisecond)
->UseManualTime();
84 changes: 62 additions & 22 deletions cpp/include/cudf/io/orc.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -434,6 +434,18 @@ table_with_metadata read_orc(
*/
class orc_writer_options_builder;

/**
* @brief Constants to disambiguate statistics terminology for ORC.
*
* ORC refers to its finest granularity of row-grouping as "row group",
* which corresponds to Parquet "pages".
* Similarly, ORC's "stripe" corresponds to a Parquet "row group".
* The following constants disambiguate the terminology for the statistics
* collected at each level.
*/
static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;

/**
* @brief Settings to use for `write_orc()`.
*/
Expand All @@ -442,8 +454,8 @@ class orc_writer_options {
sink_info _sink;
// Specify the compression format to use
compression_type _compression = compression_type::AUTO;
// Enable writing column statistics
bool _enable_statistics = true;
// Specify frequency of statistics collection
statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
// Maximum size of each stripe (unless smaller than a single row group)
size_t _stripe_size_bytes = default_stripe_size_bytes;
// Maximum number of rows in stripe (unless smaller than a single row group)
Expand Down Expand Up @@ -501,7 +513,15 @@ class orc_writer_options {
/**
* @brief Whether writing column statistics is enabled/disabled.
*/
[[nodiscard]] bool is_enabled_statistics() const { return _enable_statistics; }
[[nodiscard]] bool is_enabled_statistics() const
{
return _stats_freq != statistics_freq::STATISTICS_NONE;
}

/**
* @brief Returns frequency of statistics collection.
*/
[[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }

/**
* @brief Returns maximum stripe size, in bytes.
Expand Down Expand Up @@ -550,11 +570,16 @@ class orc_writer_options {
void set_compression(compression_type comp) { _compression = comp; }

/**
* @brief Enable/Disable writing column statistics.
* @brief Choose granularity of statistics collection.
*
* @param val Boolean value to enable/disable statistics.
* The granularity can be set to:
* - cudf::io::STATISTICS_NONE: No statistics are collected.
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
* - cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
*
* @param val Frequency of statistics collection.
*/
void enable_statistics(bool val) { _enable_statistics = val; }
void enable_statistics(statistics_freq val) { _stats_freq = val; }

/**
* @brief Sets the maximum stripe size, in bytes.
Expand Down Expand Up @@ -647,14 +672,19 @@ class orc_writer_options_builder {
}

/**
* @brief Enable/Disable writing column statistics.
* @brief Choose granularity of column statistics to be written
*
* The granularity can be set to:
* - cudf::io::STATISTICS_NONE: No statistics are collected.
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
* - cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
*
* @param val Boolean value to enable/disable.
* @param val Level of statistics collection.
* @return this for chaining.
*/
orc_writer_options_builder& enable_statistics(bool val)
orc_writer_options_builder& enable_statistics(statistics_freq val)
{
options._enable_statistics = val;
options._stats_freq = val;
return *this;
}

Expand Down Expand Up @@ -775,8 +805,8 @@ class chunked_orc_writer_options {
sink_info _sink;
// Specify the compression format to use
compression_type _compression = compression_type::AUTO;
// Enable writing column statistics
bool _enable_statistics = true;
// Specify granularity of statistics collection
statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
// Maximum size of each stripe (unless smaller than a single row group)
size_t _stripe_size_bytes = default_stripe_size_bytes;
// Maximum number of rows in stripe (unless smaller than a single row group)
Expand Down Expand Up @@ -825,9 +855,9 @@ class chunked_orc_writer_options {
[[nodiscard]] compression_type get_compression() const { return _compression; }

/**
* @brief Whether writing column statistics is enabled/disabled.
* @brief Returns granularity of statistics collection.
*/
[[nodiscard]] bool is_enabled_statistics() const { return _enable_statistics; }
[[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }

/**
* @brief Returns maximum stripe size, in bytes.
Expand Down Expand Up @@ -871,11 +901,16 @@ class chunked_orc_writer_options {
void set_compression(compression_type comp) { _compression = comp; }

/**
* @brief Enable/Disable writing column statistics.
* @brief Choose granularity of statistics collection
*
* The granularity can be set to:
* - cudf::io::STATISTICS_NONE: No statistics are collected.
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
* - cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
*
* @param val Boolean value to enable/disable.
* @param val Frequency of statistics collection.
*/
void enable_statistics(bool val) { _enable_statistics = val; }
void enable_statistics(statistics_freq val) { _stats_freq = val; }

/**
* @brief Sets the maximum stripe size, in bytes.
Expand Down Expand Up @@ -958,14 +993,19 @@ class chunked_orc_writer_options_builder {
}

/**
* @brief Enable/Disable writing column statistics.
* @brief Choose granularity of statistics collection
*
* The granularity can be set to:
* - cudf::io::STATISTICS_NONE: No statistics are collected.
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
* - cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
*
* @param val Boolean value to enable/disable.
* @param val Frequency of statistics collection.
* @return this for chaining.
*/
chunked_orc_writer_options_builder& enable_statistics(bool val)
chunked_orc_writer_options_builder& enable_statistics(statistics_freq val)
{
options._enable_statistics = val;
options._stats_freq = val;
return *this;
}

Expand Down
44 changes: 27 additions & 17 deletions cpp/src/io/orc/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1063,15 +1063,15 @@ void set_stat_desc_leaf_cols(device_span<orc_column_device_view const> columns,
}

writer::impl::encoded_statistics writer::impl::gather_statistic_blobs(
bool are_statistics_enabled,
statistics_freq stats_freq,
orc_table_view const& orc_table,
file_segmentation const& segmentation)
{
auto const num_rowgroup_blobs = segmentation.rowgroups.count();
auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns();
auto const num_file_blobs = orc_table.num_columns();
auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs;

auto const num_rowgroup_blobs = segmentation.rowgroups.count();
auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns();
auto const num_file_blobs = orc_table.num_columns();
auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs;
auto const are_statistics_enabled = stats_freq != statistics_freq::STATISTICS_NONE;
if (not are_statistics_enabled or num_stat_blobs == 0) { return {}; }

hostdevice_vector<stats_column_desc> stat_desc(orc_table.num_columns(), stream);
Expand Down Expand Up @@ -1164,17 +1164,27 @@ writer::impl::encoded_statistics writer::impl::gather_statistic_blobs(

hostdevice_vector<uint8_t> blobs(
stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 1].num_chunks, stream);
gpu::orc_encode_statistics(
blobs.device_ptr(), stat_merge.device_ptr(), stat_chunks.data(), num_stat_blobs, stream);
// Skip rowgroup blobs when encoding, if chosen granularity is coarser than "ROW_GROUP".
auto const is_granularity_rowgroup = stats_freq == ORC_STATISTICS_ROW_GROUP;
auto const num_skip = is_granularity_rowgroup ? 0 : num_rowgroup_blobs;
gpu::orc_encode_statistics(blobs.device_ptr(),
stat_merge.device_ptr(num_skip),
stat_chunks.data() + num_skip,
num_stat_blobs - num_skip,
stream);
stat_merge.device_to_host(stream);
blobs.device_to_host(stream, true);

std::vector<ColStatsBlob> rowgroup_blobs(num_rowgroup_blobs);
for (size_t i = 0; i < num_rowgroup_blobs; i++) {
auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk);
auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks;
rowgroup_blobs[i].assign(stat_begin, stat_end);
}
auto rowgroup_blobs = [&]() -> std::vector<ColStatsBlob> {
if (not is_granularity_rowgroup) { return {}; }
std::vector<ColStatsBlob> rowgroup_blobs(num_rowgroup_blobs);
for (size_t i = 0; i < num_rowgroup_blobs; i++) {
auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk);
auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks;
rowgroup_blobs[i].assign(stat_begin, stat_end);
}
return rowgroup_blobs;
}();

std::vector<ColStatsBlob> stripe_blobs(num_stripe_blobs);
for (size_t i = 0; i < num_stripe_blobs; i++) {
Expand Down Expand Up @@ -1351,7 +1361,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()},
row_index_stride{options.get_row_index_stride()},
compression_kind_(to_orc_compression(options.get_compression())),
enable_statistics_(options.is_enabled_statistics()),
stats_freq_(options.get_statistics_freq()),
single_write_mode(mode == SingleWriteMode::YES),
kv_meta(options.get_key_value_metadata()),
out_sink_(std::move(sink))
Expand All @@ -1372,7 +1382,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()},
row_index_stride{options.get_row_index_stride()},
compression_kind_(to_orc_compression(options.get_compression())),
enable_statistics_(options.is_enabled_statistics()),
stats_freq_(options.get_statistics_freq()),
single_write_mode(mode == SingleWriteMode::YES),
kv_meta(options.get_key_value_metadata()),
out_sink_(std::move(sink))
Expand Down Expand Up @@ -1954,7 +1964,7 @@ void writer::impl::write(table_view const& table)

ProtobufWriter pbw_(&buffer_);

auto const statistics = gather_statistic_blobs(enable_statistics_, orc_table, segmentation);
auto const statistics = gather_statistic_blobs(stats_freq_, orc_table, segmentation);

// Write stripes
std::vector<std::future<void>> write_tasks;
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/orc/writer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,13 +293,13 @@ class writer::impl {
/**
* @brief Returns column statistics encoded in ORC protobuf format.
*
* @param are_statistics_enabled True if statistics are to be included in the output file
* @param statistics_freq Frequency of statistics to be included in the output file
* @param orc_table Table information to be written
* @param columns List of columns
* @param segmentation stripe and rowgroup ranges
* @return The statistic blobs
*/
encoded_statistics gather_statistic_blobs(bool are_statistics_enabled,
encoded_statistics gather_statistic_blobs(statistics_freq statistics_freq,
orc_table_view const& orc_table,
file_segmentation const& segmentation);

Expand Down Expand Up @@ -365,8 +365,8 @@ class writer::impl {
size_t compression_blocksize_ = DEFAULT_COMPRESSION_BLOCKSIZE;
CompressionKind compression_kind_ = CompressionKind::NONE;

bool enable_dictionary_ = true;
bool enable_statistics_ = true;
bool enable_dictionary_ = true;
statistics_freq stats_freq_ = ORC_STATISTICS_ROW_GROUP;

// Overall file metadata. Filled in during the process and written during write_chunked_end()
cudf::io::orc::FileFooter ff;
Expand Down
4 changes: 2 additions & 2 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1733,7 +1733,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin(
chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink)
.metadata(&metadata)
.compression(static_cast<compression_type>(j_compression))
.enable_statistics(true)
.enable_statistics(ORC_STATISTICS_ROW_GROUP)
.key_value_metadata(kv_metadata)
.build();
auto writer_ptr = std::make_unique<cudf::io::orc_chunked_writer>(opts);
Expand Down Expand Up @@ -1776,7 +1776,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(
chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink)
.metadata(&metadata)
.compression(static_cast<compression_type>(j_compression))
.enable_statistics(true)
.enable_statistics(ORC_STATISTICS_ROW_GROUP)
.key_value_metadata(kv_metadata)
.build();
auto writer_ptr = std::make_unique<cudf::io::orc_chunked_writer>(opts);
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import io
import sys
Expand Down Expand Up @@ -74,7 +74,7 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
data_handle=OrcWriter,
params={
"compression": [None, "snappy"],
"enable_statistics": [True, False],
"enable_statistics": ["NONE", "STRIPE", "ROWGROUP"],
},
)
def orc_writer_test(pdf, compression, enable_statistics):
Expand Down
Loading

0 comments on commit 57ff6f5

Please sign in to comment.