Skip to content

Commit

Permalink
Add keep option to distinct nvbench (#16497)
Browse files Browse the repository at this point in the history
This PR adopts some work from @srinivasyadav18 with additional modifications. This is meant to complement #16484.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - Srinivas Yadav (https://github.com/srinivasyadav18)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Srinivas Yadav (https://github.com/srinivasyadav18)

URL: #16497
  • Loading branch information
bdice authored Aug 8, 2024
1 parent 792dd06 commit 1bbe440
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 32 deletions.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ ConfigureNVBench(
stream_compaction/distinct.cpp
stream_compaction/distinct_count.cpp
stream_compaction/stable_distinct.cpp
stream_compaction/stream_compaction_common.cpp
stream_compaction/unique.cpp
stream_compaction/unique_count.cpp
)
Expand Down
45 changes: 29 additions & 16 deletions cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/stream_compaction/stream_compaction_common.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/lists/list_view.hpp>
Expand All @@ -23,15 +24,29 @@

#include <nvbench/nvbench.cuh>

#include <limits>

NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

template <typename Type>
void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
{
cudf::size_type const num_rows = state.get_int64("NumRows");
cudf::size_type const num_rows = state.get_int64("NumRows");
auto const keep = get_keep(state.get_string("keep"));
cudf::size_type const cardinality = state.get_int64("cardinality");

if (cardinality > num_rows) {
state.skip("cardinality > num_rows");
return;
}

data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
data_profile profile = data_profile_builder()
.cardinality(cardinality)
.null_probability(0.01)
.distribution(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
std::numeric_limits<Type>::max());

auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);

Expand All @@ -40,27 +55,27 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::distinct(input_table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::distinct(
input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
using data_type = nvbench::type_list<int32_t, int64_t>;

NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000})
.add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000});

template <typename Type>
void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
auto const size = state.get_int64("ColumnSize");
auto const dtype = cudf::type_to_id<Type>();
double const null_probability = state.get_float64("null_probability");
auto const keep = get_keep(state.get_string("keep"));

auto builder = data_profile_builder().null_probability(null_probability);
if (dtype == cudf::type_id::LIST) {
Expand All @@ -80,17 +95,15 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::distinct(*table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result =
cudf::distinct(*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

NVBENCH_BENCH_TYPES(nvbench_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("distinct_list")
.set_type_axes_names({"Type"})
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_float64_axis("null_probability", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});
45 changes: 29 additions & 16 deletions cpp/benchmarks/stream_compaction/stable_distinct.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/stream_compaction/stream_compaction_common.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/lists/list_view.hpp>
Expand All @@ -23,15 +24,29 @@

#include <nvbench/nvbench.cuh>

#include <limits>

NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

template <typename Type>
void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)
{
cudf::size_type const num_rows = state.get_int64("NumRows");
cudf::size_type const num_rows = state.get_int64("NumRows");
auto const keep = get_keep(state.get_string("keep"));
cudf::size_type const cardinality = state.get_int64("cardinality");

if (cardinality > num_rows) {
state.skip("cardinality > num_rows");
return;
}

data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
data_profile profile = data_profile_builder()
.cardinality(cardinality)
.null_probability(0.01)
.distribution(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
std::numeric_limits<Type>::max());

auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);

Expand All @@ -40,27 +55,27 @@ void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::stable_distinct(input_table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::stable_distinct(
input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
using data_type = nvbench::type_list<int32_t, int64_t>;

NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("stable_distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000})
.add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000});

template <typename Type>
void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
auto const size = state.get_int64("ColumnSize");
auto const dtype = cudf::type_to_id<Type>();
double const null_probability = state.get_float64("null_probability");
auto const keep = get_keep(state.get_string("keep"));

auto builder = data_profile_builder().null_probability(null_probability);
if (dtype == cudf::type_id::LIST) {
Expand All @@ -80,17 +95,15 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::stable_distinct(*table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::stable_distinct(
*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("stable_distinct_list")
.set_type_axes_names({"Type"})
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_float64_axis("null_probability", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});
35 changes: 35 additions & 0 deletions cpp/benchmarks/stream_compaction/stream_compaction_common.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/stream_compaction/stream_compaction_common.hpp>

#include <cudf/stream_compaction.hpp>
#include <cudf/utilities/error.hpp>

/**
 * @brief Translate the benchmark "keep" axis string into a `cudf::duplicate_keep_option`.
 *
 * Recognized spellings are "any", "first", "last" and "none". Any other string is
 * reported through `CUDF_FAIL`.
 *
 * @param keep_str Name of the keep policy
 * @return The `cudf::duplicate_keep_option` enumerator matching `keep_str`
 */
cudf::duplicate_keep_option get_keep(std::string const& keep_str)
{
  using keep_option = cudf::duplicate_keep_option;

  // Guard clauses: return as soon as a known spelling matches.
  if (keep_str == "any") { return keep_option::KEEP_ANY; }
  if (keep_str == "first") { return keep_option::KEEP_FIRST; }
  if (keep_str == "last") { return keep_option::KEEP_LAST; }
  if (keep_str == "none") { return keep_option::KEEP_NONE; }

  // Nothing matched.
  CUDF_FAIL("Unsupported keep option.");
}
19 changes: 19 additions & 0 deletions cpp/benchmarks/stream_compaction/stream_compaction_common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/stream_compaction.hpp>

#include <string>

/**
 * @brief Convert a benchmark "keep" axis string into a `cudf::duplicate_keep_option`.
 *
 * The implementation accepts "any", "first", "last" and "none", mapping them to
 * `KEEP_ANY`, `KEEP_FIRST`, `KEEP_LAST` and `KEEP_NONE` respectively; any other
 * value is rejected via `CUDF_FAIL`.
 *
 * @param keep_str Name of the keep policy, as read from the nvbench string axis
 * @return The corresponding `cudf::duplicate_keep_option`
 */
cudf::duplicate_keep_option get_keep(std::string const& keep_str);

0 comments on commit 1bbe440

Please sign in to comment.