
Update groupby::hash to use new row operators for keys (#10770)
Related to #8039 and #10181 

Contributes to #10186 

This PR updates `groupby::hash` to use the new row operators. It removes the current "flattened nested column" logic and allows `groupby::hash` to handle `LIST` and `STRUCT` keys. The work also includes small cleanups such as removing unnecessary template parameters and unused arguments.

This is a breaking change: the updated `groupby::hash` treats inner nulls as equal when top-level nulls are excluded, while the current behavior treats inner nulls as **unequal**.
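
For context, here is a minimal sketch (not part of this commit) of the newly supported case: grouping by a `STRUCT` key column whose child contains nulls. It assumes the public `cudf::groupby` API and the `cudf::test` column wrappers that the benchmark added below also uses; the data values are illustrative only, and the grouping comment reflects the behavior change described above.

```cpp
// Minimal sketch, not part of this commit: grouping by a STRUCT<int, int> key column.
// Builds against libcudf and its test utilities.
#include <cudf_test/column_wrapper.hpp>

#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <vector>

int main()
{
  // Two child columns; the second has nulls in rows 0 and 1.
  cudf::test::fixed_width_column_wrapper<int32_t> child0{1, 1, 2, 2};
  cudf::test::fixed_width_column_wrapper<int32_t> child1{{7, 7, 3, 4}, {0, 0, 1, 1}};
  cudf::test::structs_column_wrapper struct_keys{{child0, child1}};

  cudf::test::fixed_width_column_wrapper<int32_t> values{10, 20, 30, 40};

  // Top-level nulls are excluded by default (null_policy::EXCLUDE).
  cudf::groupby::groupby gb{cudf::table_view{{struct_keys}}};

  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = values;
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());

  // Per the behavior change described above, rows 0 and 1 ({1, null}) now compare equal
  // and form a single group (three groups in total); previously inner nulls compared
  // unequal and those rows formed separate groups.
  auto [unique_keys, results] = gb.aggregate(requests);
  return 0;
}
```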

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Nghia Truong (https://github.com/ttnghia)
  - Devavret Makkar (https://github.com/devavret)

URL: #10770
PointKernel authored May 25, 2022
1 parent 6a64ce1 commit dbd2b08
Showing 11 changed files with 459 additions and 158 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
@@ -198,13 +198,13 @@ ConfigureBench(
groupby/group_sum.cu
groupby/group_nth.cu
groupby/group_shift.cu
-groupby/group_struct.cu
+groupby/group_struct_values.cpp
groupby/group_no_requests.cu
groupby/group_scan.cu
groupby/group_rank_benchmark.cu
)

-ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu)
+ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu groupby/group_struct_keys.cpp)

# ##################################################################################################
# * hashing benchmark -----------------------------------------------------------------------------
101 changes: 101 additions & 0 deletions cpp/benchmarks/groupby/group_struct_keys.cpp
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/groupby.hpp>

#include <nvbench/nvbench.cuh>

#include <random>

void bench_groupby_struct_keys(nvbench::state& state)
{
cudf::rmm_pool_raii pool_raii;

using Type = int;
using column_wrapper = cudf::test::fixed_width_column_wrapper<Type>;
std::default_random_engine generator;
std::uniform_int_distribution<int> distribution(0, 100);

const cudf::size_type n_rows{static_cast<cudf::size_type>(state.get_int64("NumRows"))};
const cudf::size_type n_cols{1};
const cudf::size_type depth{static_cast<cudf::size_type>(state.get_int64("Depth"))};
const bool nulls{static_cast<bool>(state.get_int64("Nulls"))};

// Create columns with values in the range [0, 100]
std::vector<column_wrapper> columns;
columns.reserve(n_cols);
std::generate_n(std::back_inserter(columns), n_cols, [&]() {
auto const elements = cudf::detail::make_counting_transform_iterator(
0, [&](auto row) { return distribution(generator); });
if (!nulls) return column_wrapper(elements, elements + n_rows);
auto valids =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 10 != 0; });
return column_wrapper(elements, elements + n_rows, valids);
});

std::vector<std::unique_ptr<cudf::column>> cols;
std::transform(columns.begin(), columns.end(), std::back_inserter(cols), [](column_wrapper& col) {
return col.release();
});

std::vector<std::unique_ptr<cudf::column>> child_cols = std::move(cols);
// Wrap the keys in STRUCT columns, one layer per requested depth, each with its own validity
for (int i = 0; i < depth; i++) {
std::vector<bool> struct_validity;
std::uniform_int_distribution<int> bool_distribution(0, 100 * (i + 1));
std::generate_n(
std::back_inserter(struct_validity), n_rows, [&]() { return bool_distribution(generator); });
cudf::test::structs_column_wrapper struct_col(std::move(child_cols), struct_validity);
child_cols = std::vector<std::unique_ptr<cudf::column>>{};
child_cols.push_back(struct_col.release());
}
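// After the loop, `child_cols` holds a single STRUCT column nested `depth` levels deep
// (for depth == 0 it still holds the original flat key columns).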

data_profile profile;
profile.set_null_frequency(std::nullopt);
profile.set_cardinality(0);
profile.set_distribution_params<int64_t>(
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);

auto const keys_table = cudf::table(std::move(child_cols));
auto const vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{n_rows}, profile);

cudf::groupby::groupby gb_obj(keys_table.view());

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals_table->get_column(0).view();
requests[0].aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());

// Set up nvbench default stream
auto stream = rmm::cuda_stream_default;
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
}

NVBENCH_BENCH(bench_groupby_struct_keys)
.set_name("groupby_struct_keys")
.add_int64_power_of_two_axis("NumRows", {10, 16, 20})
.add_int64_axis("Depth", {0, 1, 8})
.add_int64_axis("Nulls", {0, 1});
cpp/benchmarks/groupby/group_struct.cu → cpp/benchmarks/groupby/group_struct_values.cpp
File renamed without changes.
5 changes: 2 additions & 3 deletions cpp/include/cudf/detail/groupby.hpp
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2019-2021, NVIDIA CORPORATION.
+* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -31,13 +31,12 @@ namespace hash {
* @brief Indicates if a set of aggregation requests can be satisfied with a
* hash-based groupby implementation.
*
-* @param keys The table of keys
* @param requests The set of columns to aggregate and the aggregations to
* perform
* @return true A hash-based groupby can be used
* @return false A hash-based groupby cannot be used
*/
-bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests);
+bool can_use_hash_groupby(host_span<aggregation_request const> requests);

// Hash-based groupby
std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
14 changes: 2 additions & 12 deletions cpp/src/groupby/groupby.cu
@@ -65,25 +65,15 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
-using namespace cudf::structs::detail;
-
// If sort groupby has been called once on this groupby object, then
// always use sort groupby from now on. Because once keys are sorted,
// all the aggs that can be done by hash groupby are efficiently done by
// sort groupby as well.
// Only use hash groupby if the keys aren't sorted and all requests can be
// satisfied with a hash implementation
if (_keys_are_sorted == sorted::NO and not _helper and
-detail::hash::can_use_hash_groupby(_keys, requests)) {
-// Optionally flatten nested key columns.
-auto flattened = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE);
-auto flattened_keys = flattened.flattened_columns();
-auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); };
-CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type),
-"Unsupported groupby key type does not support equality comparison");
-auto [grouped_keys, results] =
-detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr);
-return std::pair(unflatten_nested_columns(std::move(grouped_keys), _keys), std::move(results));
+detail::hash::can_use_hash_groupby(requests)) {
+return detail::hash::groupby(_keys, requests, _include_null_keys, stream, mr);
} else {
return sort_aggregate(requests, stream, mr);
}
