
Update groupby::hash to use new row operators for keys (#10770)
Related to #8039 and #10181 

Contributes to #10186 

This PR updates `groupby::hash` to use the new row operators. It removes the current "flattened nested column" logic and allows `groupby::hash` to handle `LIST` and `STRUCT` keys. The work also includes small cleanups such as removing unnecessary template parameters and unused arguments.

This is a breaking change: the updated `groupby::hash` treats inner nulls as equal when top-level nulls are excluded, while the current behavior treats inner nulls as **unequal**.
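
For context, here is a minimal sketch (not part of this commit) of the newly supported case: grouping by a `STRUCT` key column whose child contains nulls. It assumes the public `cudf::groupby` API and the `cudf::test` column wrappers that the benchmark added below also uses; the data values are illustrative only, and the grouping comment reflects the behavior change described above.

```cpp
// Minimal sketch, not part of this commit: grouping by a STRUCT<int, int> key column.
// Builds against libcudf and its test utilities.
#include <cudf_test/column_wrapper.hpp>

#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <vector>

int main()
{
  // Two child columns; the second has nulls in rows 0 and 1.
  cudf::test::fixed_width_column_wrapper<int32_t> child0{1, 1, 2, 2};
  cudf::test::fixed_width_column_wrapper<int32_t> child1{{7, 7, 3, 4}, {0, 0, 1, 1}};
  cudf::test::structs_column_wrapper struct_keys{{child0, child1}};

  cudf::test::fixed_width_column_wrapper<int32_t> values{10, 20, 30, 40};

  // Top-level nulls are excluded by default (null_policy::EXCLUDE).
  cudf::groupby::groupby gb{cudf::table_view{{struct_keys}}};

  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = values;
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());

  // Per the behavior change described above, rows 0 and 1 ({1, null}) now compare equal
  // and form a single group (three groups in total); previously inner nulls compared
  // unequal and those rows formed separate groups.
  auto [unique_keys, results] = gb.aggregate(requests);
  return 0;
}
```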

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Nghia Truong (https://github.com/ttnghia)
  - Devavret Makkar (https://github.com/devavret)

URL: #10770
PointKernel authored May 25, 2022
1 parent 6a64ce1 commit dbd2b08
Showing 11 changed files with 459 additions and 158 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
@@ -198,13 +198,13 @@ ConfigureBench(
groupby/group_sum.cu
groupby/group_nth.cu
groupby/group_shift.cu
-groupby/group_struct.cu
+groupby/group_struct_values.cpp
groupby/group_no_requests.cu
groupby/group_scan.cu
groupby/group_rank_benchmark.cu
)

-ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu)
+ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu groupby/group_struct_keys.cpp)

# ##################################################################################################
# * hashing benchmark -----------------------------------------------------------------------------
101 changes: 101 additions & 0 deletions cpp/benchmarks/groupby/group_struct_keys.cpp
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/groupby.hpp>

#include <nvbench/nvbench.cuh>

#include <random>

void bench_groupby_struct_keys(nvbench::state& state)
{
cudf::rmm_pool_raii pool_raii;

using Type = int;
using column_wrapper = cudf::test::fixed_width_column_wrapper<Type>;
std::default_random_engine generator;
std::uniform_int_distribution<int> distribution(0, 100);

const cudf::size_type n_rows{static_cast<cudf::size_type>(state.get_int64("NumRows"))};
const cudf::size_type n_cols{1};
const cudf::size_type depth{static_cast<cudf::size_type>(state.get_int64("Depth"))};
const bool nulls{static_cast<bool>(state.get_int64("Nulls"))};

// Create columns with values in the range [0, 100]
std::vector<column_wrapper> columns;
columns.reserve(n_cols);
std::generate_n(std::back_inserter(columns), n_cols, [&]() {
auto const elements = cudf::detail::make_counting_transform_iterator(
0, [&](auto row) { return distribution(generator); });
if (!nulls) return column_wrapper(elements, elements + n_rows);
auto valids =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 10 != 0; });
return column_wrapper(elements, elements + n_rows, valids);
});

std::vector<std::unique_ptr<cudf::column>> cols;
std::transform(columns.begin(), columns.end(), std::back_inserter(cols), [](column_wrapper& col) {
return col.release();
});

std::vector<std::unique_ptr<cudf::column>> child_cols = std::move(cols);
// Wrap the keys in STRUCT columns, one layer per requested depth, each with its own validity
for (int i = 0; i < depth; i++) {
std::vector<bool> struct_validity;
std::uniform_int_distribution<int> bool_distribution(0, 100 * (i + 1));
std::generate_n(
std::back_inserter(struct_validity), n_rows, [&]() { return bool_distribution(generator); });
cudf::test::structs_column_wrapper struct_col(std::move(child_cols), struct_validity);
child_cols = std::vector<std::unique_ptr<cudf::column>>{};
child_cols.push_back(struct_col.release());
}
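// After the loop, `child_cols` holds a single STRUCT column nested `depth` levels deep
// (for depth == 0 it still holds the original flat key columns).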

data_profile profile;
profile.set_null_frequency(std::nullopt);
profile.set_cardinality(0);
profile.set_distribution_params<int64_t>(
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);

auto const keys_table = cudf::table(std::move(child_cols));
auto const vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{n_rows}, profile);

cudf::groupby::groupby gb_obj(keys_table.view());

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals_table->get_column(0).view();
requests[0].aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());

// Set up nvbench default stream
auto stream = rmm::cuda_stream_default;
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
}

NVBENCH_BENCH(bench_groupby_struct_keys)
.set_name("groupby_struct_keys")
.add_int64_power_of_two_axis("NumRows", {10, 16, 20})
.add_int64_axis("Depth", {0, 1, 8})
.add_int64_axis("Nulls", {0, 1});
cpp/benchmarks/groupby/group_struct.cu → cpp/benchmarks/groupby/group_struct_values.cpp
File renamed without changes.
5 changes: 2 additions & 3 deletions cpp/include/cudf/detail/groupby.hpp
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2019-2021, NVIDIA CORPORATION.
+* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -31,13 +31,12 @@ namespace hash {
* @brief Indicates if a set of aggregation requests can be satisfied with a
* hash-based groupby implementation.
*
-* @param keys The table of keys
* @param requests The set of columns to aggregate and the aggregations to
* perform
* @return true A hash-based groupby can be used
* @return false A hash-based groupby cannot be used
*/
-bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests);
+bool can_use_hash_groupby(host_span<aggregation_request const> requests);

// Hash-based groupby
std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
14 changes: 2 additions & 12 deletions cpp/src/groupby/groupby.cu
@@ -65,25 +65,15 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
-using namespace cudf::structs::detail;
-
// If sort groupby has been called once on this groupby object, then
// always use sort groupby from now on. Because once keys are sorted,
// all the aggs that can be done by hash groupby are efficiently done by
// sort groupby as well.
// Only use hash groupby if the keys aren't sorted and all requests can be
// satisfied with a hash implementation
if (_keys_are_sorted == sorted::NO and not _helper and
-detail::hash::can_use_hash_groupby(_keys, requests)) {
-// Optionally flatten nested key columns.
-auto flattened = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE);
-auto flattened_keys = flattened.flattened_columns();
-auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); };
-CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type),
-"Unsupported groupby key type does not support equality comparison");
-auto [grouped_keys, results] =
-detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr);
-return std::pair(unflatten_nested_columns(std::move(grouped_keys), _keys), std::move(results));
+detail::hash::can_use_hash_groupby(requests)) {
+return detail::hash::groupby(_keys, requests, _include_null_keys, stream, mr);
} else {
return sort_aggregate(requests, stream, mr);
}
